aboutsummaryrefslogtreecommitdiff
path: root/.venv/lib/python3.12/site-packages/bs4/tests/test_soup.py
diff options
context:
space:
mode:
Diffstat (limited to '.venv/lib/python3.12/site-packages/bs4/tests/test_soup.py')
-rw-r--r--.venv/lib/python3.12/site-packages/bs4/tests/test_soup.py602
1 files changed, 602 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_soup.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_soup.py
new file mode 100644
index 00000000..5f771a40
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_soup.py
@@ -0,0 +1,602 @@
+# -*- coding: utf-8 -*-
+"""Tests of Beautiful Soup as a whole."""
+
+import logging
+import pickle
+import pytest
+from typing import Iterable
+
+from bs4 import (
+ BeautifulSoup,
+ GuessedAtParserWarning,
+ dammit,
+)
+from bs4.builder import (
+ TreeBuilder,
+)
+from bs4.element import (
+ AttributeValueList,
+ XMLAttributeDict,
+ Comment,
+ PYTHON_SPECIFIC_ENCODINGS,
+ Tag,
+ NavigableString,
+)
+from bs4.filter import SoupStrainer
+from bs4.exceptions import (
+ ParserRejectedMarkup,
+)
+from bs4._warnings import (
+ MarkupResemblesLocatorWarning,
+)
+
+
+from . import (
+ default_builder,
+ LXML_PRESENT,
+ SoupTest,
+)
+import warnings
+from typing import Type
+
+
class TestConstructor(SoupTest):
    """Tests of the BeautifulSoup constructor: markup decoding, custom
    tree builders, and the configuration keyword arguments."""

    def test_short_unicode_input(self):
        # Unicode markup round-trips through parsing unchanged.
        data = "<h1>éé</h1>"
        soup = self.soup(data)
        assert "éé" == soup.h1.string

    def test_embedded_null(self):
        # An embedded null byte is preserved, not treated as a terminator.
        data = "<h1>foo\0bar</h1>"
        soup = self.soup(data)
        assert "foo\0bar" == soup.h1.string

    def test_exclude_encodings(self):
        # When the true encoding is explicitly excluded, the detector
        # settles on another candidate (windows-1252 here).
        utf8_data = "Räksmörgås".encode("utf-8")
        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
        assert "windows-1252" == soup.original_encoding

    def test_custom_builder_class(self):
        # Verify that you can pass in a custom Builder class and
        # it'll be instantiated with the appropriate keyword arguments.
        class Mock(object):
            # Minimal stand-in for a TreeBuilder: records the kwargs it
            # was constructed with and the markup fed to it.
            def __init__(self, **kwargs):
                self.called_with = kwargs
                self.is_xml = True
                self.store_line_numbers = False
                self.cdata_list_attributes = []
                self.preserve_whitespace_tags = []
                self.string_containers = {}
                self.attribute_dict_class = XMLAttributeDict
                self.attribute_value_list_class = AttributeValueList

            def initialize_soup(self, soup):
                pass

            def feed(self, markup):
                # Remember what we were fed so the test can check it.
                self.fed = markup

            def reset(self):
                pass

            def ignore(self, ignore):
                pass

            # These builder hooks are irrelevant here; alias them to a no-op.
            set_up_substitutions = can_be_empty_element = ignore

            def prepare_markup(self, *args, **kwargs):
                # Yield a single fixed (markup, encodings...) attempt.
                yield (
                    "prepared markup",
                    "original encoding",
                    "declared encoding",
                    "contains replacement characters",
                )

        kwargs = dict(
            var="value",
            # This is a deprecated BS3-era keyword argument, which
            # will be stripped out.
            convertEntities=True,
        )
        with warnings.catch_warnings(record=True):
            soup = BeautifulSoup("", builder=Mock, **kwargs)
        assert isinstance(soup.builder, Mock)
        assert dict(var="value") == soup.builder.called_with
        assert "prepared markup" == soup.builder.fed

        # You can also instantiate the TreeBuilder yourself. In this
        # case, that specific object is used and any keyword arguments
        # to the BeautifulSoup constructor are ignored.
        builder = Mock(**kwargs)
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup(
                "",
                builder=builder,
                ignored_value=True,
            )
        msg = str(w[0].message)
        assert msg.startswith(
            "Keyword arguments to the BeautifulSoup constructor will be ignored."
        )
        assert builder == soup.builder
        assert kwargs == builder.called_with

    def test_parser_markup_rejection(self):
        # If markup is completely rejected by the parser, an
        # explanatory ParserRejectedMarkup exception is raised.
        class Mock(TreeBuilder):
            def feed(self, *args, **kwargs):
                raise ParserRejectedMarkup("Nope.")

            def prepare_markup(self, markup, *args, **kwargs):
                # We're going to try two different ways of preparing this markup,
                # but feed() will reject both of them.
                yield markup, None, None, False
                yield markup, None, None, False


        with pytest.raises(ParserRejectedMarkup) as exc_info:
            BeautifulSoup("", builder=Mock)
        assert (
            "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help."
            in str(exc_info.value)
        )

    def test_cdata_list_attributes(self):
        # Most attribute values are represented as scalars, but the
        # HTML standard says that some attributes, like 'class' have
        # space-separated lists as values.
        markup = '<a id=" an id " class=" a class "></a>'
        soup = self.soup(markup)

        # Note that the spaces are stripped for 'class' but not for 'id'.
        a = soup.a
        assert " an id " == a["id"]
        assert ["a", "class"] == a["class"]

        # TreeBuilder takes an argument called 'multi_valued_attributes' which lets
        # you customize or disable this. As always, you can customize the TreeBuilder
        # by passing in a keyword argument to the BeautifulSoup constructor.
        soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
        assert " a class " == soup.a["class"]

        # Here are two ways of saying that `id` is a multi-valued
        # attribute in this context, but 'class' is not.
        for switcheroo in ({"*": "id"}, {"a": "id"}):
            with warnings.catch_warnings(record=True):
                # This will create a warning about not explicitly
                # specifying a parser, but we'll ignore it.
                soup = self.soup(
                    markup, builder=None, multi_valued_attributes=switcheroo
                )
            a = soup.a
            assert ["an", "id"] == a["id"]
            assert " a class " == a["class"]

    def test_replacement_classes(self):
        # Test the ability to pass in replacements for element classes
        # which will be used when building the tree.
        class TagPlus(Tag):
            pass

        class StringPlus(NavigableString):
            pass

        class CommentPlus(Comment):
            pass

        soup = self.soup(
            "<a><b>foo</b>bar</a><!--whee-->",
            element_classes={
                Tag: TagPlus,
                NavigableString: StringPlus,
                Comment: CommentPlus,
            },
        )

        # The tree was built with TagPlus, StringPlus, and CommentPlus objects,
        # rather than Tag, String, and Comment objects.
        assert all(
            isinstance(x, (TagPlus, StringPlus, CommentPlus)) for x in soup.descendants
        )

    def test_alternate_string_containers(self):
        # Test the ability to customize the string containers for
        # different types of tags.
        class PString(NavigableString):
            pass

        class BString(NavigableString):
            pass

        soup = self.soup(
            "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
            string_containers={
                "b": BString,
                "p": PString,
            },
        )

        # The string before the <p> tag is a regular NavigableString.
        assert isinstance(soup.div.contents[0], NavigableString)

        # The string inside the <p> tag, but not inside the <i> tag,
        # is a PString.
        assert isinstance(soup.p.contents[0], PString)

        # Every string inside the <b> tag is a BString, even the one that
        # was also inside an <i> tag.
        for s in soup.b.strings:
            assert isinstance(s, BString)

        # Now that parsing was complete, the string_container_stack
        # (where this information was kept) has been cleared out.
        assert [] == soup.string_container_stack

    @pytest.mark.parametrize("bad_markup", [1, False, lambda x: False])
    def test_invalid_markup_type(self, bad_markup):
        # Non-string, non-filehandle markup is rejected with a TypeError
        # whose message names the offending value.
        with pytest.raises(TypeError) as exc_info:
            BeautifulSoup(bad_markup, "html.parser")
        assert (
            f"Incoming markup is of an invalid type: {bad_markup!r}. Markup must be a string, a bytestring, or an open filehandle."
            in str(exc_info.value)
        )
+
+
class TestOutput(SoupTest):
    """Tests of serializing the soup back into strings and bytestrings."""

    @pytest.mark.parametrize(
        "eventual_encoding,actual_encoding",
        [
            ("utf-8", "utf-8"),
            ("utf-16", "utf-16"),
        ],
    )
    def test_decode_xml_declaration(self, eventual_encoding, actual_encoding):
        # Calling decode() on an XML document normally yields a
        # declaration naming the encoding you intend to use when the
        # document is eventually encoded as a bytestring.
        soup = self.soup("<tag></tag>")
        soup.is_xml = True
        expected = (
            '<?xml version="1.0" encoding="%s"?>\n<tag></tag>' % actual_encoding
        )
        assert soup.decode(eventual_encoding=eventual_encoding) == expected

    @pytest.mark.parametrize(
        "eventual_encoding", list(PYTHON_SPECIFIC_ENCODINGS) + [None]
    )
    def test_decode_xml_declaration_with_missing_or_python_internal_eventual_encoding(
        self, eventual_encoding
    ):
        # A Python-internal encoding passed to decode() -- or no
        # eventual_encoding at all -- leaves the declaration without
        # any encoding mention.
        soup = BeautifulSoup("<tag></tag>", "html.parser")
        soup.is_xml = True
        decoded = soup.decode(eventual_encoding=eventual_encoding)
        assert decoded == '<?xml version="1.0"?>\n<tag></tag>'

    def test(self):
        # BeautifulSoup subclasses Tag and extends the decode() method;
        # verify the other Tag methods which call decode() still work.
        soup = self.soup("<tag></tag>")
        assert soup.encode(encoding="utf-8") == b"<tag></tag>"
        assert soup.encode_contents(encoding="utf-8") == b"<tag></tag>"
        assert soup.decode_contents() == "<tag></tag>"
        assert soup.prettify() == "<tag>\n</tag>\n"
+
+
class TestWarnings(SoupTest):
    """Tests of the warnings BeautifulSoup issues at construction time."""

    # Note that some of the tests in this class create BeautifulSoup
    # objects directly rather than using self.soup(). That's
    # because SoupTest.soup is defined in a different file,
    # which will throw off the assertion in _assert_warning
    # that the code that triggered the warning is in the same
    # file as the test.

    def _assert_warning(
        self, caught: Iterable[warnings.WarningMessage], cls: Type[Warning]
    ) -> warnings.WarningMessage:
        """Return the first recorded warning whose message is of class
        `cls`, asserting that it was attributed to this file.

        :raises Exception: if no such warning was recorded.
        """
        # The parameter is deliberately not named `warnings`, which
        # would shadow the `warnings` module used elsewhere here.
        for w in caught:
            if isinstance(w.message, cls):
                assert w.filename == __file__
                return w
        raise Exception("%s warning not found in %r" % (cls, caught))

    def _assert_no_parser_specified(self, w: Iterable[warnings.WarningMessage]) -> None:
        """Verify that a GuessedAtParserWarning with the standard
        message was recorded."""
        warning = self._assert_warning(w, GuessedAtParserWarning)
        message = str(warning.message)
        assert message.startswith(GuessedAtParserWarning.MESSAGE[:60])

    def test_warning_if_no_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            BeautifulSoup("<a><b></b></a>")
        self._assert_no_parser_specified(w)

    def test_warning_if_parser_specified_too_vague(self):
        # "html" names a markup type, not a specific parser, so the
        # parser-guessing warning is still issued.
        with warnings.catch_warnings(record=True) as w:
            BeautifulSoup("<a><b></b></a>", "html")
        self._assert_no_parser_specified(w)

    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            self.soup("<a><b></b></a>")
        assert [] == w

    def test_warning_if_strainer_filters_everything(self):
        # A strainer that excludes the entire document triggers a
        # UserWarning.
        strainer = SoupStrainer(name="a", string="b")
        with warnings.catch_warnings(record=True) as w:
            self.soup("<a><b></b></a>", parse_only=strainer)
        warning = self._assert_warning(w, UserWarning)
        msg = str(warning.message)
        assert msg.startswith("The given value for parse_only will exclude everything:")

    def test_parseOnlyThese_renamed_to_parse_only(self):
        # The BS3-era keyword still works but raises DeprecationWarning.
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup(
                "<a><b></b></a>",
                "html.parser",
                parseOnlyThese=SoupStrainer("b"),
            )
        warning = self._assert_warning(w, DeprecationWarning)
        msg = str(warning.message)
        assert "parseOnlyThese" in msg
        assert "parse_only" in msg
        assert b"<b></b>" == soup.encode()

    def test_fromEncoding_renamed_to_from_encoding(self):
        # The BS3-era keyword still works but raises DeprecationWarning.
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = BeautifulSoup(utf8, "html.parser", fromEncoding="utf8")
        warning = self._assert_warning(w, DeprecationWarning)
        msg = str(warning.message)
        assert "fromEncoding" in msg
        assert "from_encoding" in msg
        assert "utf8" == soup.original_encoding

    def test_unrecognized_keyword_argument(self):
        with pytest.raises(TypeError):
            self.soup("<a>", no_such_argument=True)

    @pytest.mark.parametrize(
        "markup",
        [
            "markup.html",
            "markup.htm",
            "markup.HTML",
            "markup.txt",
            "markup.xhtml",
            "markup.xml",
            "/home/user/file.txt",
            # FIX(review): these two entries were fused into one string
            # by implicit concatenation (a missing comma), silently
            # dropping a test case; they are separate paths.
            r"c:\user\file.html",
            r"\\server\share\path\file.XhTml",
        ],
    )
    def test_resembles_filename_warning(self, markup):
        # A warning is issued if the "markup" looks like the name of
        # an HTML or text file, or a full path to a file on disk.
        with warnings.catch_warnings(record=True) as w:
            BeautifulSoup(markup, "html.parser")
        warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
        assert "looks more like a filename" in str(warning.message)

    @pytest.mark.parametrize(
        "markup",
        [
            "filename",
            "markuphtml",
            "markup.com",
            "",
            # Excluded due to an irrelevant file extension.
            "markup.js",
            "markup.jpg",
            "markup.markup",
            # Excluded due to the lack of any file extension.
            "/home/user/file",
            # FIX(review): another missing comma had fused these two
            # entries (and pulled in a stray ".html"); per the comment
            # above, both paths must lack a file extension.
            r"c:\user\file",
            r"\\server\share\path\file",
            # Excluded because of two consecutive slashes _and_ the
            # colon.
            "log message containing a url http://www.url.com/ right there.html",
            # Excluded for containing various characters or combinations
            # not usually found in filenames.
            "two consecutive spaces.html",
            "two//consecutive//slashes.html",
            "looks/like/a/filename/but/oops/theres/a#comment.html",
            "two\nlines.html",
            "contains?.html",
            "contains*.html",
            "contains#.html",
            "contains&.html",
            "contains;.html",
            "contains>.html",
            "contains<.html",
            "contains$.html",
            "contains|.html",
            "contains:.html",
            ":-at-the-front.html",
        ],
    )
    def test_resembles_filename_no_warning(self, markup):
        # The 'looks more like a filename' warning is not issued if
        # the markup looks like a bare string, a domain name, or a
        # file that's not an HTML file.
        with warnings.catch_warnings(record=True) as w:
            self.soup(markup)
        assert [] == w

    def test_url_warning_with_bytes_url(self):
        url = b"http://www.crummybytes.com/"
        with warnings.catch_warnings(record=True) as warning_list:
            BeautifulSoup(url, "html.parser")
        warning = self._assert_warning(warning_list, MarkupResemblesLocatorWarning)
        assert "looks more like a URL" in str(warning.message)
        # The URL itself must not be echoed back in the warning.
        assert url not in str(warning.message).encode("utf8")

    def test_url_warning_with_unicode_url(self):
        url = "http://www.crummyunicode.com/"
        with warnings.catch_warnings(record=True) as warning_list:
            # note - this url must differ from the bytes one otherwise
            # python's warnings system swallows the second warning
            BeautifulSoup(url, "html.parser")
        warning = self._assert_warning(warning_list, MarkupResemblesLocatorWarning)
        assert "looks more like a URL" in str(warning.message)
        assert url not in str(warning.message)

    def test_url_warning_with_bytes_and_space(self):
        # Here the markup contains something besides a URL, so no warning
        # is issued.
        with warnings.catch_warnings(record=True) as warning_list:
            self.soup(b"http://www.crummybytes.com/ is great")
        assert not any("looks more like a URL" in str(w.message) for w in warning_list)

    def test_url_warning_with_unicode_and_space(self):
        with warnings.catch_warnings(record=True) as warning_list:
            self.soup("http://www.crummyunicode.com/ is great")
        assert not any("looks more like a URL" in str(w.message) for w in warning_list)
+
+
class TestSelectiveParsing(SoupTest):
    """Tests of parsing only part of a document via a SoupStrainer."""

    def test_parse_with_soupstrainer(self):
        # Only <b> tags (with their contents) survive the strainer.
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        only_b = SoupStrainer("b")
        soup = self.soup(markup, parse_only=only_b)
        expected = b"<b>Yes</b><b>Yes <c>Yes</c></b>"
        assert expected == soup.encode()
+
+
class TestNewTag(SoupTest):
    """Test the BeautifulSoup.new_tag() method."""

    def test_new_tag(self):
        soup = self.soup("")
        tag = soup.new_tag("foo", string="txt", bar="baz", attrs={"name": "a name"})
        assert isinstance(tag, Tag)
        assert tag.name == "foo"
        assert "txt" == tag.string
        # Keyword attributes and the attrs dict are merged.
        assert tag.attrs == dict(bar="baz", name="a name")
        assert tag.parent is None

        # The string argument may be omitted or explicitly None...
        tag = soup.new_tag("foo")
        assert tag.string is None
        tag = soup.new_tag("foo", string=None)
        assert tag.string is None

        # ...or given as the empty string.
        tag = soup.new_tag("foo", string="")
        assert "" == tag.string

    @pytest.mark.skipif(
        not LXML_PRESENT, reason="lxml not installed, cannot parse XML document"
    )
    def test_xml_tag_inherits_self_closing_rules_from_builder(self):
        # In XML, any tag with no contents is serialized as an
        # empty-element tag -- <br> and <p> alike.
        xml_soup = BeautifulSoup("", "xml")
        for name in ("br", "p"):
            tag = xml_soup.new_tag(name)
            assert tag.encode() == ("<%s/>" % name).encode()

    def test_tag_inherits_self_closing_rules_from_builder(self):
        # The HTML builder uses HTML's rules about which tags are
        # empty-element tags, and new tags reflect those rules:
        # <br> is empty-element, <p> is not.
        html_soup = BeautifulSoup("", "html.parser")
        assert html_soup.new_tag("br").encode() == b"<br/>"
        assert html_soup.new_tag("p").encode() == b"<p></p>"
+
+
class TestNewString(SoupTest):
    """Test the BeautifulSoup.new_string() method."""

    def test_new_string_creates_navigablestring(self):
        soup = self.soup("")
        result = soup.new_string("foo")
        assert isinstance(result, NavigableString)
        assert result == "foo"

    def test_new_string_can_create_navigablestring_subclass(self):
        # A second argument picks the NavigableString subclass to build.
        soup = self.soup("")
        result = soup.new_string("foo", Comment)
        assert isinstance(result, Comment)
        assert result == "foo"
+
+
class TestPickle(SoupTest):
    """Tests of pickling the BeautifulSoup object itself."""

    def test_normal_pickle(self):
        soup = self.soup("<a>some markup</a>")
        clone = pickle.loads(pickle.dumps(soup))
        assert clone.a.string == "some markup"

    def test_pickle_with_no_builder(self):
        # Regression test: pickling used to fail when the builder
        # attribute wasn't set.
        soup = self.soup("some markup")
        soup.builder = None
        clone = pickle.loads(pickle.dumps(soup))
        assert clone.string == "some markup"
+
+
class TestEncodingConversion(SoupTest):
    """Tests of Beautiful Soup's ability to decode input from, and
    encode output to, various character encodings."""

    def setup_method(self):
        self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        assert (
            self.utf8_data
            == b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>'
        )

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        original_chardet = dammit._chardet_dammit
        logging.disable(logging.WARNING)
        try:
            # Disable chardet, which would otherwise realize that the
            # ASCII is ASCII.
            def disabled_chardet(markup):
                return None

            dammit._chardet_dammit = disabled_chardet
            ascii_bytes = b"<foo>a</foo>"
            soup = self.soup(ascii_bytes)
            decoded = soup.decode()
            assert isinstance(decoded, str)
            assert self.document_for(ascii_bytes.decode()) == decoded
            assert "utf-8" == soup.original_encoding.lower()
        finally:
            # Restore logging and the real chardet hook no matter what.
            logging.disable(logging.NOTSET)
            dammit._chardet_dammit = original_chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup = self.soup(self.unicode_data)
        assert self.unicode_data == soup.decode()
        assert "Sacr\xe9 bleu!" == soup.foo.string
        assert soup.original_encoding is None

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup = self.soup(self.utf8_data)
        assert self.unicode_data == soup.decode()
        assert "Sacr\xe9 bleu!" == soup.foo.string

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup = self.soup(self.unicode_data)
        assert self.utf8_data == soup.encode("utf-8")