Diffstat (limited to '.venv/lib/python3.12/site-packages/bs4/tests')
35 files changed, 7111 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/__init__.py b/.venv/lib/python3.12/site-packages/bs4/tests/__init__.py new file mode 100644 index 00000000..b36f3f38 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/__init__.py @@ -0,0 +1,1305 @@ +# encoding: utf-8 +"""Helper classes for tests.""" + +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +import pickle +import importlib.util +import copy +import warnings +import pytest +from bs4 import BeautifulSoup +from bs4.element import ( + AttributeValueList, + CharsetMetaAttributeValue, + Comment, + ContentMetaAttributeValue, + Doctype, + PageElement, + PYTHON_SPECIFIC_ENCODINGS, + Script, + Stylesheet, + Tag, +) +from bs4.filter import SoupStrainer +from bs4.builder import ( + XMLParsedAsHTMLWarning, +) +from bs4._typing import _IncomingMarkup + +from bs4.builder import TreeBuilder +from bs4.builder._htmlparser import HTMLParserTreeBuilder + +from typing import ( + Any, + Iterable, + Optional, + Tuple, + Type, +) + +# Some tests depend on specific third-party libraries. We use +# @pytest.mark.skipif on the following conditionals to skip them +# if the libraries are not installed. +try: + from soupsieve import SelectorSyntaxError + + SOUP_SIEVE_PRESENT = True +except ImportError: + SOUP_SIEVE_PRESENT = False + +HTML5LIB_PRESENT = importlib.util.find_spec("html5lib") is not None + +try: + import lxml.etree + LXML_PRESENT = True + LXML_VERSION = lxml.etree.LXML_VERSION +except ImportError: + LXML_PRESENT = False + LXML_VERSION = (0,) + +default_builder: Type[TreeBuilder] = HTMLParserTreeBuilder + +BAD_DOCUMENT: str = """A bare string +<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"> +<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd"> +<div><![CDATA[A CDATA section where it doesn't belong]]></div> +<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div> +<div>A <meta> tag</div> +<div>A <br> tag that supposedly has contents.</br></div> +<div>AT&T</div> +<div><textarea>Within a textarea, markup like <b> tags and <&<& should be treated as literal</textarea></div> +<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div> +<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div> +<div><a href="http://example.com/</a> that attribute value never got closed</div> +<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div> +<! This document starts with a bogus declaration ><div>a</div> +<div>This document contains <!an incomplete declaration <div>(do you see it?)</div> +<div>This document ends with <!an incomplete declaration +<div><a style={height:21px;}>That attribute value was bogus</a></div> +<! 
DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace +<div><table><td nowrap>That boolean attribute had no value</td></table></div> +<div>Here's a nonexistent entity: &#foo; (do you see it?)</div> +<div>This document ends before the entity finishes: &gt +<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p> +<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b> +<div><table><tr><td>Here's a table</td></tr></table></div> +<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div> +<div>This tag contains nothing but whitespace: <b> </b></div> +<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div> +<div><table><div>This table contains bare markup</div></table></div> +<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div> +<div>This document contains a <!DOCTYPE surprise>surprise doctype</div> +<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div> +<div><our\u2603>Tag name contains Unicode characters</our\u2603></div> +<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +""" + + +class SoupTest(object): + @property + def default_builder(self) -> Type[TreeBuilder]: + return default_builder + + def soup(self, markup: _IncomingMarkup, **kwargs: Any) -> BeautifulSoup: + """Build a Beautiful Soup object from markup.""" + builder = kwargs.pop("builder", self.default_builder) + return BeautifulSoup(markup, builder=builder, **kwargs) + + def document_for(self, markup: str, **kwargs: Any) -> str: + """Turn an HTML fragment into a document. + + The details depend on the builder. + """ + return self.default_builder(**kwargs).test_fragment_to_document(markup) + + def assert_soup( + self, to_parse: _IncomingMarkup, compare_parsed_to: Optional[str] = None + ) -> None: + """Parse some markup using Beautiful Soup and verify that + the output markup is as expected. + """ + builder = self.default_builder + obj = BeautifulSoup(to_parse, builder=builder) + if compare_parsed_to is None: + assert isinstance(to_parse, str) + compare_parsed_to = to_parse + + # Verify that the documents come out the same. + assert obj.decode() == self.document_for(compare_parsed_to) + + # Also run some checks on the BeautifulSoup object itself: + + # Verify that every tag that was opened was eventually closed. + + # There are no tags in the open tag counter. + assert all(v == 0 for v in list(obj.open_tag_counter.values())) + + # The only tag in the tag stack is the one for the root + # document. + assert [obj.ROOT_TAG_NAME] == [x.name for x in obj.tagStack] + + assertSoupEquals = assert_soup + + def assertConnectedness(self, element: Tag) -> None: + """Ensure that next_element and previous_element are properly + set for all descendants of the given element. 
+ """ + earlier = None + for e in element.descendants: + if earlier: + assert e == earlier.next_element + assert earlier == e.previous_element + earlier = e + + def linkage_validator( + self, el: Tag, _recursive_call: bool = False + ) -> Optional[PageElement]: + """Ensure proper linkage throughout the document.""" + descendant = None + # Document element should have no previous element or previous sibling. + # It also shouldn't have a next sibling. + if el.parent is None: + assert ( + el.previous_element is None + ), "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + el, el.previous_element, None + ) + assert ( + el.previous_sibling is None + ), "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + el, el.previous_sibling, None + ) + assert ( + el.next_sibling is None + ), "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( + el, el.next_sibling, None + ) + + idx = 0 + child = None + last_child = None + last_idx = len(el.contents) - 1 + for child in el.contents: + descendant = None + + # Parent should link next element to its first child + # That child should have no previous sibling + if idx == 0: + if el.parent is not None: + assert ( + el.next_element is child + ), "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( + el, el.next_element, child + ) + assert ( + child.previous_element is el + ), "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + child, child.previous_element, el + ) + assert ( + child.previous_sibling is None + ), "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format( + child, child.previous_sibling, None + ) + + # If not the first child, previous index should link as sibling to this index + # Previous element should match the last index or the last bubbled up descendant + else: + assert ( + child.previous_sibling is el.contents[idx - 1] + ), "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format( + child, child.previous_sibling, el.contents[idx - 1] + ) + assert ( + el.contents[idx - 1].next_sibling is child + ), "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + el.contents[idx - 1], el.contents[idx - 1].next_sibling, child + ) + + if last_child is not None: + assert ( + child.previous_element is last_child + ), "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format( + child, child.previous_element, last_child, child.parent.contents + ) + assert ( + last_child.next_element is child + ), "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + last_child, last_child.next_element, child + ) + + if isinstance(child, Tag) and child.contents: + descendant = self.linkage_validator(child, True) + assert descendant is not None + # A bubbled up descendant should have no next siblings + assert ( + descendant.next_sibling is None + ), "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + descendant, descendant.next_sibling, None + ) + + # Mark last child as either the bubbled up descendant or the current child + if descendant is not None: + last_child = descendant + else: + last_child = child + + # If last child, there are no next siblings + if idx == last_idx: + assert ( + child.next_sibling is None + ), "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_sibling, None + ) + idx += 1 + + child = descendant if descendant is not None else child + if child is None: + child = el + + if not _recursive_call and child is not None: + target: Optional[Tag] = el + while True: + if target is None: + assert ( + child.next_element is None + ), 
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_element, None + ) + break + elif target.next_sibling is not None: + assert ( + child.next_element is target.next_sibling + ), "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_element, target.next_sibling + ) + break + target = target.parent + + # We are done, so nothing to return + return None + else: + # Return the child to the recursive caller + return child + + def assert_selects(self, tags: Iterable[Tag], should_match: Iterable[str]) -> None: + """Make sure that the given tags have the correct text. + + This is used in tests that define a bunch of tags, each + containing a single string, and then select certain strings by + some mechanism. + """ + assert [tag.string for tag in tags] == should_match + + def assert_selects_ids( + self, tags: Iterable[Tag], should_match: Iterable[str] + ) -> None: + """Make sure that the given tags have the correct IDs. + + This is used in tests that define a bunch of tags, each + containing a single string, and then select certain strings by + some mechanism. + """ + assert [tag["id"] for tag in tags] == should_match + + +class TreeBuilderSmokeTest(SoupTest): + # Tests that are common to HTML and XML tree builders. + + @pytest.mark.parametrize( + "multi_valued_attributes", [None, {}, dict(b=["class"]), {"*": ["notclass"]}] + ) + def test_attribute_not_multi_valued(self, multi_valued_attributes): + markup = '<html xmlns="http://www.w3.org/1999/xhtml"><a class="a b c"></html>' + soup = self.soup(markup, multi_valued_attributes=multi_valued_attributes) + assert soup.a["class"] == "a b c" + + @pytest.mark.parametrize( + "multi_valued_attributes", [dict(a=["class"]), {"*": ["class"]}] + ) + def test_attribute_multi_valued(self, multi_valued_attributes): + markup = '<a class="a b c">' + soup = self.soup(markup, multi_valued_attributes=multi_valued_attributes) + assert soup.a["class"] == ["a", "b", "c"] + + def test_invalid_doctype(self): + # We don't have an official opinion on how these are parsed, + # but they shouldn't crash any of the parsers. + markup = "<![if word]>content<![endif]>" + self.soup(markup) + markup = "<!DOCTYPE html]ff>" + self.soup(markup) + + def test_doctype_filtered(self): + markup = "<!DOCTYPE html>\n<html>\n</html>" + soup = self.soup(markup, parse_only=SoupStrainer(name="html")) + assert not any(isinstance(x, Doctype) for x in soup.descendants) + + def test_custom_attribute_dict_class(self): + class MyAttributeDict(dict): + def __setitem__(self, key: str, value: Any): + # Ignore the provided value and substitute a + # hard-coded one. 
+ super().__setitem__(key, "OVERRIDDEN") + + markup = '<a attr1="val1" attr2="val2">f</a>' + builder = self.default_builder(attribute_dict_class=MyAttributeDict) + soup = self.soup(markup, builder=builder) + tag = soup.a + assert isinstance(tag.attrs, MyAttributeDict) + assert "OVERRIDDEN" == tag["attr1"] + tag["attr3"] = True + assert "OVERRIDDEN" == tag["attr3"] + + expect = '<a attr1="OVERRIDDEN" attr2="OVERRIDDEN" attr3="OVERRIDDEN">f</a>' + assert expect == tag.decode() + + def test_custom_attribute_value_list_class(self): + class MyCustomAttributeValueList(AttributeValueList): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.append("extra") + + builder = self.default_builder( + multi_valued_attributes={"*": set(["attr2"])}, + attribute_value_list_class=MyCustomAttributeValueList, + ) + markup = '<a attr1="val1" attr2="val2">f</a>' + soup = self.soup(markup, builder=builder) + tag = soup.a + assert tag["attr1"] == "val1" + assert tag["attr2"] == ["val2", "extra"] + assert isinstance(tag["attr2"], MyCustomAttributeValueList) + + +class HTMLTreeBuilderSmokeTest(TreeBuilderSmokeTest): + """A basic test of a treebuilder's competence. + + Any HTML treebuilder, present or future, should be able to pass + these tests. With invalid markup, there's room for interpretation, + and different parsers can handle it differently. But with the + markup in these tests, there's not much room for interpretation. + """ + + def test_empty_element_tags(self): + """Verify that all HTML4 and HTML5 empty element (aka void element) tags + are handled correctly. + """ + for name in [ + "area", + "base", + "br", + "col", + "embed", + "hr", + "img", + "input", + "keygen", + "link", + "menuitem", + "meta", + "param", + "source", + "track", + "wbr", + "spacer", + "frame", + ]: + soup = self.soup("") + new_tag = soup.new_tag(name) + assert new_tag.is_empty_element is True + + self.assert_soup("<br/><br/><br/>", "<br/><br/><br/>") + self.assert_soup("<br /><br /><br />", "<br/><br/><br/>") + + def test_special_string_containers(self): + soup = self.soup("<style>Some CSS</style><script>Some Javascript</script>") + assert isinstance(soup.style.string, Stylesheet) + assert isinstance(soup.script.string, Script) + + soup = self.soup("<style><!--Some CSS--></style>") + assert isinstance(soup.style.string, Stylesheet) + # The contents of the style tag resemble an HTML comment, but + # they're not treated as a comment. + assert soup.style.string == "<!--Some CSS-->" + assert isinstance(soup.style.string, Stylesheet) + + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + tree = self.soup("<a><b>foo</a>") + dumped = pickle.dumps(tree, 2) + loaded = pickle.loads(dumped) + assert loaded.__class__ == BeautifulSoup + assert loaded.decode() == tree.decode() + + def assertDoctypeHandled(self, doctype_fragment: str) -> None: + """Assert that a given doctype string is handled correctly.""" + doctype_str, soup = self._document_with_doctype(doctype_fragment) + + # Make sure a Doctype object was created. + doctype = soup.contents[0] + assert doctype.__class__ == Doctype + assert doctype == doctype_fragment + assert soup.encode("utf8")[: len(doctype_str)] == doctype_str + + # Make sure that the doctype was correctly associated with the + # parse tree and that the rest of the document parsed. 
+ assert soup.p is not None + assert soup.p.contents[0] == "foo" + + def _document_with_doctype( + self, doctype_fragment: str, doctype_string: str = "DOCTYPE" + ) -> Tuple[bytes, BeautifulSoup]: + """Generate and parse a document with the given doctype.""" + doctype = "<!%s %s>" % (doctype_string, doctype_fragment) + markup = doctype + "\n<p>foo</p>" + soup = self.soup(markup) + return doctype.encode("utf8"), soup + + def test_normal_doctypes(self): + """Make sure normal, everyday HTML doctypes are handled correctly.""" + self.assertDoctypeHandled("html") + self.assertDoctypeHandled( + 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"' + ) + + def test_empty_doctype(self): + soup = self.soup("<!DOCTYPE>") + doctype = soup.contents[0] + assert "" == doctype.strip() + + def test_mixed_case_doctype(self): + # A lowercase or mixed-case doctype becomes a Doctype. + for doctype_fragment in ("doctype", "DocType"): + doctype_str, soup = self._document_with_doctype("html", doctype_fragment) + + # Make sure a Doctype object was created and that the DOCTYPE + # is uppercase. + doctype = soup.contents[0] + assert doctype.__class__ == Doctype + assert doctype == "html" + assert soup.encode("utf8")[: len(doctype_str)] == b"<!DOCTYPE html>" + + # Make sure that the doctype was correctly associated with the + # parse tree and that the rest of the document parsed. + assert soup.p.contents[0] == "foo" + + def test_public_doctype_with_url(self): + doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' + self.assertDoctypeHandled(doctype) + + def test_system_doctype(self): + self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') + + def test_namespaced_system_doctype(self): + # We can handle a namespaced doctype with a system ID. + self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') + + def test_namespaced_public_doctype(self): + # Test a namespaced doctype with a public id. + self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') + + def test_real_xhtml_document(self): + """A real XHTML document should come out more or less the same as it went in.""" + markup = b"""<?xml version="1.0" encoding="utf-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"> +<html xmlns="http://www.w3.org/1999/xhtml"> +<head><title>Hello.</title></head> +<body>Goodbye.</body> +</html>""" + with warnings.catch_warnings(record=True) as w: + soup = self.soup(markup) + assert soup.encode("utf-8").replace(b"\n", b"") == markup.replace(b"\n", b"") + + # No warning was issued about parsing an XML document as HTML, + # because XHTML is both. + assert w == [] + + def test_namespaced_html(self): + # When a namespaced XML document is parsed as HTML it should + # be treated as HTML with weird tag names. + markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>""" + with warnings.catch_warnings(record=True) as w: + soup = self.soup(markup) + + assert 2 == len(soup.find_all("ns1:foo")) + + # n.b. no "you're parsing XML as HTML" warning was given + # because there was no XML declaration. + assert [] == w + + def test_detect_xml_parsed_as_html(self): + # A warning is issued when parsing an XML document as HTML, + # but basic stuff should still work. 
+ markup = b"""<?xml version="1.0" encoding="utf-8"?><tag>string</tag>""" + with warnings.catch_warnings(record=True) as w: + soup = self.soup(markup) + assert soup.tag.string == "string" + [warning] = w + assert isinstance(warning.message, XMLParsedAsHTMLWarning) + assert str(warning.message) == XMLParsedAsHTMLWarning.MESSAGE + + # NOTE: the warning is not issued if the document appears to + # be XHTML (tested with test_real_xhtml_document in the + # superclass) or if there is no XML declaration (tested with + # test_namespaced_html in the superclass). + + def test_processing_instruction(self): + # We test both Unicode and bytestring to verify that + # process_markup correctly sets processing_instruction_class + # even when the markup is already Unicode and there is no + # need to process anything. + markup = """<?PITarget PIContent?>""" + soup = self.soup(markup) + assert markup == soup.decode() + + markup = b"""<?PITarget PIContent?>""" + soup = self.soup(markup) + assert markup == soup.encode("utf8") + + def test_deepcopy(self): + """Make sure you can copy the tree builder. + + This is important because the builder is part of a + BeautifulSoup object, and we want to be able to copy that. + """ + copy.deepcopy(self.default_builder) + + def test_p_tag_is_never_empty_element(self): + """A <p> tag is never designated as an empty-element tag. + + Even if the markup shows it as an empty-element tag, it + shouldn't be presented that way. + """ + soup = self.soup("<p/>") + assert not soup.p.is_empty_element + assert str(soup.p) == "<p></p>" + + def test_unclosed_tags_get_closed(self): + """A tag that's not closed by the end of the document should be closed. + + This applies to all tags except empty-element tags. + """ + self.assert_soup("<p>", "<p></p>") + self.assert_soup("<b>", "<b></b>") + + self.assert_soup("<br>", "<br/>") + + def test_br_is_always_empty_element_tag(self): + """A <br> tag is designated as an empty-element tag. + + Some parsers treat <br></br> as one <br/> tag, some parsers as + two tags, but it should always be an empty-element tag. + """ + soup = self.soup("<br></br>") + assert soup.br.is_empty_element + assert str(soup.br) == "<br/>" + + def test_nested_formatting_elements(self): + self.assert_soup("<em><em></em></em>") + + def test_double_head(self): + html = """<!DOCTYPE html> +<html> +<head> +<title>Ordinary HEAD element test</title> +</head> +<script type="text/javascript"> +alert("Help!"); +</script> +<body> +Hello, world! +</body> +</html> +""" + soup = self.soup(html) + assert "text/javascript" == soup.find("script")["type"] + + def test_comment(self): + # Comments are represented as Comment objects. + markup = "<p>foo<!--foobar-->baz</p>" + self.assert_soup(markup) + + soup = self.soup(markup) + comment = soup.find(string="foobar") + assert comment.__class__ == Comment + + # The comment is properly integrated into the tree. + foo = soup.find(string="foo") + assert comment == foo.next_element + baz = soup.find(string="baz") + assert comment == baz.previous_element + + def test_preserved_whitespace_in_pre_and_textarea(self): + """Whitespace must be preserved in <pre> and <textarea> tags, + even if that would mean not prettifying the markup. 
+ """ + pre_markup = "<pre>a z</pre>\n" + textarea_markup = "<textarea> woo\nwoo </textarea>\n" + self.assert_soup(pre_markup) + self.assert_soup(textarea_markup) + + soup = self.soup(pre_markup) + assert soup.pre.prettify() == pre_markup + + soup = self.soup(textarea_markup) + assert soup.textarea.prettify() == textarea_markup + + soup = self.soup("<textarea></textarea>") + assert soup.textarea.prettify() == "<textarea></textarea>\n" + + def test_nested_inline_elements(self): + """Inline elements can be nested indefinitely.""" + b_tag = "<b>Inside a B tag</b>" + self.assert_soup(b_tag) + + nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>" + self.assert_soup(nested_b_tag) + + double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>" + self.assert_soup(double_nested_b_tag) + + def test_nested_block_level_elements(self): + """Block elements can be nested.""" + soup = self.soup("<blockquote><p><b>Foo</b></p></blockquote>") + blockquote = soup.blockquote + assert blockquote.p.b.string == "Foo" + assert blockquote.b.string == "Foo" + + def test_correctly_nested_tables(self): + """One table can go inside another one.""" + markup = ( + '<table id="1">' + "<tr>" + "<td>Here's another table:" + '<table id="2">' + "<tr><td>foo</td></tr>" + "</table></td>" + ) + + self.assert_soup( + markup, + '<table id="1"><tr><td>Here\'s another table:' + '<table id="2"><tr><td>foo</td></tr></table>' + "</td></tr></table>", + ) + + self.assert_soup( + "<table><thead><tr><td>Foo</td></tr></thead>" + "<tbody><tr><td>Bar</td></tr></tbody>" + "<tfoot><tr><td>Baz</td></tr></tfoot></table>" + ) + + def test_multivalued_attribute_with_whitespace(self): + # Whitespace separating the values of a multi-valued attribute + # should be ignored. + + markup = '<div class=" foo bar "></a>' + soup = self.soup(markup) + assert ["foo", "bar"] == soup.div["class"] + + # If you search by the literal name of the class it's like the whitespace + # wasn't there. + assert soup.div == soup.find("div", class_="foo bar") + + def test_deeply_nested_multivalued_attribute(self): + # html5lib can set the attributes of the same tag many times + # as it rearranges the tree. This has caused problems with + # multivalued attributes. + markup = '<table><div><div class="css"></div></div></table>' + soup = self.soup(markup) + assert ["css"] == soup.div.div["class"] + + def test_multivalued_attribute_on_html(self): + # html5lib uses a different API to set the attributes ot the + # <html> tag. This has caused problems with multivalued + # attributes. + markup = '<html class="a b"></html>' + soup = self.soup(markup) + assert ["a", "b"] == soup.html["class"] + + def test_angle_brackets_in_attribute_values_are_escaped(self): + self.assert_soup('<a b="<a>"></a>', '<a b="<a>"></a>') + + def test_strings_resembling_character_entity_references(self): + # "&T" and "&p" look like incomplete character entities, but they are + # not. + self.assert_soup( + "<p>• AT&T is in the s&p 500</p>", + "<p>\u2022 AT&T is in the s&p 500</p>", + ) + + def test_apos_entity(self): + self.assert_soup( + "<p>Bob's Bar</p>", + "<p>Bob's Bar</p>", + ) + + def test_entities_in_foreign_document_encoding(self): + # “ and ” are invalid numeric entities referencing + # Windows-1252 characters. - references a character common + # to Windows-1252 and Unicode, and ☃ references a + # character only found in Unicode. + # + # All of these entities should be converted to Unicode + # characters. 
+ markup = "<p>“Hello” -☃</p>" + soup = self.soup(markup) + assert "“Hello†-☃" == soup.p.string + + def test_entities_in_attributes_converted_to_unicode(self): + expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' + self.assert_soup('<p id="piñata"></p>', expect) + self.assert_soup('<p id="piñata"></p>', expect) + self.assert_soup('<p id="piñata"></p>', expect) + self.assert_soup('<p id="piñata"></p>', expect) + + def test_entities_in_text_converted_to_unicode(self): + expect = "<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>" + self.assert_soup("<p>piñata</p>", expect) + self.assert_soup("<p>piñata</p>", expect) + self.assert_soup("<p>piñata</p>", expect) + self.assert_soup("<p>piñata</p>", expect) + + def test_quot_entity_converted_to_quotation_mark(self): + self.assert_soup( + "<p>I said "good day!"</p>", '<p>I said "good day!"</p>' + ) + + def test_out_of_range_entity(self): + expect = "\N{REPLACEMENT CHARACTER}" + self.assert_soup("�", expect) + self.assert_soup("�", expect) + self.assert_soup("�", expect) + + def test_multipart_strings(self): + "Mostly to prevent a recurrence of a bug in the html5lib treebuilder." + soup = self.soup("<html><h2>\nfoo</h2><p></p></html>") + assert "p" == soup.h2.string.next_element.name + assert "p" == soup.p.name + self.assertConnectedness(soup) + + def test_invalid_html_entity(self): + # The html.parser treebuilder can't distinguish between an + # invalid HTML entity with a semicolon and an invalid HTML + # entity with no semicolon (see its subclass for the tested + # behavior). But the other treebuilders can. + markup = "<p>a &nosuchentity b</p>" + soup = self.soup(markup) + assert "<p>a &nosuchentity b</p>" == soup.p.decode() + + markup = "<p>a &nosuchentity; b</p>" + soup = self.soup(markup) + assert "<p>a &nosuchentity; b</p>" == soup.p.decode() + + def test_head_tag_between_head_and_body(self): + "Prevent recurrence of a bug in the html5lib treebuilder." + content = """<html><head></head> + <link></link> + <body>foo</body> +</html> +""" + soup = self.soup(content) + assert soup.html.body is not None + self.assertConnectedness(soup) + + def test_multiple_copies_of_a_tag(self): + "Prevent recurrence of a bug in the html5lib treebuilder." + content = """<!DOCTYPE html> +<html> + <body> + <article id="a" > + <div><a href="1"></div> + <footer> + <a href="2"></a> + </footer> + </article> + </body> +</html> +""" + soup = self.soup(content) + self.assertConnectedness(soup.article) + + def test_basic_namespaces(self): + """Parsers don't need to *understand* namespaces, but at the + very least they should not choke on namespaces or lose + data.""" + + markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>' + soup = self.soup(markup) + assert markup == soup.encode() + assert "http://www.w3.org/1999/xhtml" == soup.html["xmlns"] + assert "http://www.w3.org/1998/Math/MathML" == soup.html["xmlns:mathml"] + assert "http://www.w3.org/2000/svg" == soup.html["xmlns:svg"] + + def test_multivalued_attribute_value_becomes_list(self): + markup = b'<a class="foo bar">' + soup = self.soup(markup) + assert ["foo", "bar"] == soup.a["class"] + + # + # Generally speaking, tests below this point are more tests of + # Beautiful Soup than tests of the tree builders. But parsers are + # weird, so we run these tests separately for every tree builder + # to detect any differences between them. 
+ # + + def test_can_parse_unicode_document(self): + # A seemingly innocuous document... but it's in Unicode! And + # it contains characters that can't be represented in the + # encoding found in the declaration! The horror! + markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>' + soup = self.soup(markup) + assert "Sacr\xe9 bleu!" == soup.body.string + + def test_soupstrainer(self): + """Parsers should be able to work with SoupStrainers.""" + strainer = SoupStrainer("b") + soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>", parse_only=strainer) + assert soup.decode() == "<b>bold</b>" + + def test_single_quote_attribute_values_become_double_quotes(self): + self.assert_soup("<foo attr='bar'></foo>", '<foo attr="bar"></foo>') + + def test_attribute_values_with_nested_quotes_are_left_alone(self): + text = """<foo attr='bar "brawls" happen'>a</foo>""" + self.assert_soup(text) + + def test_attribute_values_with_double_nested_quotes_get_quoted(self): + text = """<foo attr='bar "brawls" happen'>a</foo>""" + soup = self.soup(text) + soup.foo["attr"] = 'Brawls happen at "Bob\'s Bar"' + self.assert_soup( + soup.foo.decode(), + """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""", + ) + + def test_ampersand_in_attribute_value_gets_escaped(self): + self.assert_soup( + '<this is="really messed up & stuff"></this>', + '<this is="really messed up &amp; stuff"></this>', + ) + + self.assert_soup( + '<a href="http://example.org?a=1&b=2;3">foo</a>', + '<a href="http://example.org?a=1&amp;b=2;3">foo</a>', + ) + + def test_escaped_ampersand_in_attribute_value_is_left_alone(self): + self.assert_soup('<a href="http://example.org?a=1&amp;b=2;3"></a>') + + def test_entities_in_strings_converted_during_parsing(self): + # Both XML and HTML entities are converted to Unicode characters + # during parsing. + text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>" + expected = ( + "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>" + ) + self.assert_soup(text, expected) + + def test_smart_quotes_converted_on_the_way_in(self): + # Microsoft smart quotes are converted to Unicode characters during + # parsing. + quote = b"<p>\x91Foo\x92</p>" + soup = self.soup(quote, from_encoding="windows-1252") + assert ( + soup.p.string + == "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}" + ) + + def test_non_breaking_spaces_converted_on_the_way_in(self): + soup = self.soup("<a>&nbsp;&nbsp;</a>") + assert soup.a.string == "\N{NO-BREAK SPACE}" * 2 + + def test_entities_converted_on_the_way_out(self): + text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>" + expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode( + "utf-8" + ) + soup = self.soup(text) + assert soup.p.encode("utf-8") == expected + + def test_real_iso_8859_document(self): + # Smoke test of interrelated functionality, using an + # easy-to-understand document. + + # Here it is in Unicode. Note that it claims to be in ISO-8859-1. + unicode_html = '<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>' + + # That's because we're going to encode it into ISO-8859-1, + # and use that to test. + iso_latin_html = unicode_html.encode("iso-8859-1") + + # Parse the ISO-8859-1 HTML. + soup = self.soup(iso_latin_html) + + # Encode it to UTF-8. + result = soup.encode("utf-8") + + # What do we expect the result to look like? Well, it would + # look like unicode_html, except that the META tag would say + # UTF-8 instead of ISO-8859-1. 
+ expected = unicode_html.replace("ISO-8859-1", "utf-8") + + # And, of course, it would be in UTF-8, not Unicode. + expected = expected.encode("utf-8") + + # Ta-da! + assert result == expected + + def test_real_shift_jis_document(self): + # Smoke test to make sure the parser can handle a document in + # Shift-JIS encoding, without choking. + shift_jis_html = ( + b"<html><head></head><body><pre>" + b"\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f" + b"\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c" + b"\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B" + b"</pre></body></html>" + ) + unicode_html = shift_jis_html.decode("shift-jis") + soup = self.soup(unicode_html) + + # Make sure the parse tree is correctly encoded to various + # encodings. + assert soup.encode("utf-8") == unicode_html.encode("utf-8") + assert soup.encode("euc_jp") == unicode_html.encode("euc_jp") + + def test_real_hebrew_document(self): + # A real-world test to make sure we can convert ISO-8859-8 (a + # Hebrew encoding) to UTF-8. + hebrew_document = b"<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>" + soup = self.soup(hebrew_document, from_encoding="iso8859-8") + # Some tree builders call it iso8859-8, others call it iso-8859-8. + # That's not a difference we really care about. + assert soup.original_encoding in ("iso8859-8", "iso-8859-8") + assert soup.encode("utf-8") == ( + hebrew_document.decode("iso8859-8").encode("utf-8") + ) + + def test_meta_tag_reflects_current_encoding(self): + # Here's the <meta> tag saying that a document is + # encoded in Shift-JIS. + meta_tag = ( + '<meta content="text/html; charset=x-sjis" ' 'http-equiv="Content-type"/>' + ) + + # Here's a document incorporating that meta tag. + shift_jis_html = ( + "<html><head>\n%s\n" + '<meta http-equiv="Content-language" content="ja"/>' + "</head><body>Shift-JIS markup goes here." + ) % meta_tag + soup = self.soup(shift_jis_html) + + # Parse the document, and the charset is seemingly unaffected. + parsed_meta = soup.find("meta", {"http-equiv": "Content-type"}) + content = parsed_meta["content"] + assert "text/html; charset=x-sjis" == content + + # But that value is actually a ContentMetaAttributeValue object. + assert isinstance(content, ContentMetaAttributeValue) + + # And it will take on a value that reflects its current + # encoding. + assert "text/html; charset=utf8" == content.substitute_encoding("utf8") + + # No matter how the <meta> tag is encoded, its charset attribute + # will always be accurate. + assert b"charset=utf8" in parsed_meta.encode("utf8") + assert b"charset=shift-jis" in parsed_meta.encode("shift-jis") + + # For the rest of the story, see TestSubstitutions in + # test_tree.py. + + def test_html5_style_meta_tag_reflects_current_encoding(self): + # Here's the <meta> tag saying that a document is + # encoded in Shift-JIS. + meta_tag = '<meta id="encoding" charset="x-sjis" />' + + # Here's a document incorporating that meta tag. + shift_jis_html = ( + "<html><head>\n%s\n" + '<meta http-equiv="Content-language" content="ja"/>' + "</head><body>Shift-JIS markup goes here." + ) % meta_tag + soup = self.soup(shift_jis_html) + + # Parse the document, and the charset is seemingly unaffected. + parsed_meta = soup.find("meta", id="encoding") + charset = parsed_meta["charset"] + assert "x-sjis" == charset + + # But that value is actually a CharsetMetaAttributeValue object. 
+ assert isinstance(charset, CharsetMetaAttributeValue) + + # And it will take on a value that reflects its current + # encoding. + assert "utf8" == charset.substitute_encoding("utf8") + + # No matter how the <meta> tag is encoded, its charset attribute + # will always be accurate. + assert b'charset="utf8"' in parsed_meta.encode("utf8") + assert b'charset="shift-jis"' in parsed_meta.encode("shift-jis") + + def test_python_specific_encodings_not_used_in_charset(self): + # You can encode an HTML document using a Python-specific + # encoding, but that encoding won't be mentioned _inside_ the + # resulting document. Instead, the document will appear to + # have no encoding. + for markup in [ + b'<meta charset="utf8"></head>' b'<meta id="encoding" charset="utf-8" />' + ]: + soup = self.soup(markup) + for encoding in PYTHON_SPECIFIC_ENCODINGS: + if encoding in ( + "idna", + "mbcs", + "oem", + "undefined", + "string_escape", + "string-escape", + ): + # For one reason or another, these will raise an + # exception if we actually try to use them, so don't + # bother. + continue + encoded = soup.encode(encoding) + assert b'meta charset=""' in encoded + assert encoding.encode("ascii") not in encoded + + def test_tag_with_no_attributes_can_have_attributes_added(self): + data = self.soup("<a>text</a>") + data.a["foo"] = "bar" + assert '<a foo="bar">text</a>' == data.a.decode() + + def test_closing_tag_with_no_opening_tag(self): + # Without BeautifulSoup.open_tag_counter, the </span> tag will + # cause _popToTag to be called over and over again as we look + # for a <span> tag that wasn't there. The result is that 'text2' + # will show up outside the body of the document. + soup = self.soup("<body><div><p>text1</p></span>text2</div></body>") + assert "<body><div><p>text1</p>text2</div></body>" == soup.body.decode() + + def test_worst_case(self): + """Test the worst case (currently) for linking issues.""" + + soup = self.soup(BAD_DOCUMENT) + self.linkage_validator(soup) + + +class XMLTreeBuilderSmokeTest(TreeBuilderSmokeTest): + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + tree = self.soup("<a><b>foo</a>") + dumped = pickle.dumps(tree, 2) + loaded = pickle.loads(dumped) + assert loaded.__class__ == BeautifulSoup + assert loaded.decode() == tree.decode() + + def test_docstring_generated(self): + soup = self.soup("<root/>") + assert soup.encode() == b'<?xml version="1.0" encoding="utf-8"?>\n<root/>' + + def test_xml_declaration(self): + markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>""" + soup = self.soup(markup) + assert markup == soup.encode("utf8") + + def test_python_specific_encodings_not_used_in_xml_declaration(self): + # You can encode an XML document using a Python-specific + # encoding, but that encoding won't be mentioned _inside_ the + # resulting document. + markup = b"""<?xml version="1.0"?>\n<foo/>""" + soup = self.soup(markup) + for encoding in PYTHON_SPECIFIC_ENCODINGS: + if encoding in ( + "idna", + "mbcs", + "oem", + "undefined", + "string_escape", + "string-escape", + ): + # For one reason or another, these will raise an + # exception if we actually try to use them, so don't + # bother. 
+ continue + encoded = soup.encode(encoding) + assert b'<?xml version="1.0"?>' in encoded + assert encoding.encode("ascii") not in encoded + + def test_processing_instruction(self): + markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>""" + soup = self.soup(markup) + assert markup == soup.encode("utf8") + + def test_real_xhtml_document(self): + """A real XHTML document should come out *exactly* the same as it went in.""" + markup = b"""<?xml version="1.0" encoding="utf-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"> +<html xmlns="http://www.w3.org/1999/xhtml"> +<head><title>Hello.</title></head> +<body>Goodbye.</body> +</html>""" + soup = self.soup(markup) + assert soup.encode("utf-8") == markup + + def test_nested_namespaces(self): + doc = b"""<?xml version="1.0" encoding="utf-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> +<parent xmlns="http://ns1/"> +<child xmlns="http://ns2/" xmlns:ns3="http://ns3/"> +<grandchild ns3:attr="value" xmlns="http://ns4/"/> +</child> +</parent>""" + soup = self.soup(doc) + assert doc == soup.encode() + + def test_formatter_processes_script_tag_for_xml_documents(self): + doc = """ + <script type="text/javascript"> + </script> +""" + soup = BeautifulSoup(doc, "lxml-xml") + # lxml would have stripped this while parsing, but we can add + # it later. + soup.script.string = 'console.log("< < hey > > ");' + encoded = soup.encode() + assert b"< < hey > >" in encoded + + def test_can_parse_unicode_document(self): + markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>' + soup = self.soup(markup) + assert "Sacr\xe9 bleu!" == soup.root.string + + def test_can_parse_unicode_document_beginning_with_bom(self): + markup = '\N{BYTE ORDER MARK}<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>' + soup = self.soup(markup) + assert "Sacr\xe9 bleu!" == soup.root.string + + def test_popping_namespaced_tag(self): + markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>' + soup = self.soup(markup) + assert str(soup.rss) == markup + + def test_docstring_includes_correct_encoding(self): + soup = self.soup("<root/>") + assert ( + soup.encode("latin1") == b'<?xml version="1.0" encoding="latin1"?>\n<root/>' + ) + + def test_large_xml_document(self): + """A large XML document should come out the same as it went in.""" + markup = ( + b'<?xml version="1.0" encoding="utf-8"?>\n<root>' + + b"0" * (2**12) + + b"</root>" + ) + soup = self.soup(markup) + assert soup.encode("utf-8") == markup + + def test_tags_are_empty_element_if_and_only_if_they_are_empty(self): + self.assert_soup("<p>", "<p/>") + self.assert_soup("<p>foo</p>") + + def test_namespaces_are_preserved(self): + markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>' + soup = self.soup(markup) + root = soup.root + assert "http://example.com/" == root["xmlns:a"] + assert "http://example.net/" == root["xmlns:b"] + + def test_closing_namespaced_tag(self): + markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>' + soup = self.soup(markup) + assert str(soup.p) == markup + + def test_namespaced_attributes(self): + markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>' + soup = self.soup(markup) + assert str(soup.foo) == markup + + def test_namespaced_attributes_xml_namespace(self): + markup = '<foo xml:lang="fr">bar</foo>' + soup = self.soup(markup) + assert str(soup.foo) == markup + + def test_find_by_prefixed_name(self): + doc = """<?xml version="1.0" encoding="utf-8"?> +<Document xmlns="http://example.com/ns0" + xmlns:ns1="http://example.com/ns1" + xmlns:ns2="http://example.com/ns2"> + <ns1:tag>foo</ns1:tag> + <ns1:tag>bar</ns1:tag> + <ns2:tag key="value">baz</ns2:tag> +</Document> +""" + soup = self.soup(doc) + + # There are three <tag> tags. + assert 3 == len(soup.find_all("tag")) + + # But two of them are ns1:tag and one of them is ns2:tag. + assert 2 == len(soup.find_all("ns1:tag")) + assert 1 == len(soup.find_all("ns2:tag")) + + assert 1 == len(soup.find_all("ns2:tag", key="value")) + assert 3 == len(soup.find_all(["ns1:tag", "ns2:tag"])) + + def test_copy_tag_preserves_namespace(self): + xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?> +<w:document xmlns:w="http://example.com/ns0"/>""" + + soup = self.soup(xml) + tag = soup.document + duplicate = copy.copy(tag) + + # The two tags have the same namespace prefix. + assert tag.prefix == duplicate.prefix + + def test_worst_case(self): + """Test the worst case (currently) for linking issues.""" + + soup = self.soup(BAD_DOCUMENT) + self.linkage_validator(soup) + + +class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): + """Smoke test for a tree builder that supports HTML5.""" + + def test_real_xhtml_document(self): + # Since XHTML is not HTML5, HTML5 parsers are not tested to handle + # XHTML documents in any particular way. 
+ pass + + def test_html_tags_have_namespace(self): + markup = "<a>" + soup = self.soup(markup) + assert "http://www.w3.org/1999/xhtml" == soup.a.namespace + + def test_svg_tags_have_namespace(self): + markup = "<svg><circle/></svg>" + soup = self.soup(markup) + namespace = "http://www.w3.org/2000/svg" + assert namespace == soup.svg.namespace + assert namespace == soup.circle.namespace + + def test_mathml_tags_have_namespace(self): + markup = "<math><msqrt>5</msqrt></math>" + soup = self.soup(markup) + namespace = "http://www.w3.org/1998/Math/MathML" + assert namespace == soup.math.namespace + assert namespace == soup.msqrt.namespace + + def test_xml_declaration_becomes_comment(self): + markup = '<?xml version="1.0" encoding="utf-8"?><html></html>' + soup = self.soup(markup) + assert isinstance(soup.contents[0], Comment) + assert soup.contents[0] == '?xml version="1.0" encoding="utf-8"?' + assert "html" == soup.contents[0].next_element.name diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-4670634698080256.testcase b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-4670634698080256.testcase new file mode 100644 index 00000000..4828f8a4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-4670634698080256.testcase @@ -0,0 +1 @@ + ÿÿ ÿ <css
\ No newline at end of file diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-4818336571064320.testcase b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-4818336571064320.testcase new file mode 100644 index 00000000..b34be8b1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-4818336571064320.testcase @@ -0,0 +1 @@ +ÿ<!DOCTyPEV PUBLIC'''Ð'
\ No newline at end of file diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-4999465949331456.testcase b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-4999465949331456.testcase new file mode 100644 index 00000000..dbeed3f5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-4999465949331456.testcase @@ -0,0 +1 @@ +)<a><math><TR><a><mI><a><p><a>
\ No newline at end of file diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5000587759190016.testcase b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5000587759190016.testcase Binary files differ new file mode 100644 index 00000000..8a585ce9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5000587759190016.testcase diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5167584867909632.testcase b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5167584867909632.testcase Binary files differ new file mode 100644 index 00000000..0fe66dd2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5167584867909632.testcase diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5270998950477824.testcase b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5270998950477824.testcase Binary files differ new file mode 100644 index 00000000..fd411427 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5270998950477824.testcase diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5375146639360000.testcase b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5375146639360000.testcase new file mode 100644 index 00000000..6248b2c5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5375146639360000.testcase @@ -0,0 +1 @@ +ÿ 
><applet></applet><applet></applet><apple|><applet><applet><appl›„><applet><applet></applet></applet></applet></applet><applet></applet><apple>t<applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet>et><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><azplet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><plet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><a
pplet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet></applet></applet></applet></applet></appt></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet><<meta charset=utf-8>
\ No newline at end of file diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5492400320282624.testcase b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5492400320282624.testcase Binary files differ new file mode 100644 index 00000000..107da539 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5492400320282624.testcase diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5703933063462912.testcase b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5703933063462912.testcase new file mode 100644 index 00000000..367106c7 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5703933063462912.testcase @@ -0,0 +1,2 @@ + +<![
\ No newline at end of file diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5843991618256896.testcase b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5843991618256896.testcase new file mode 100644 index 00000000..b8536ef0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5843991618256896.testcase @@ -0,0 +1 @@ +-<math><sElect><mi><sElect><sElect>
\ No newline at end of file diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5984173902397440.testcase b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5984173902397440.testcase Binary files differnew file mode 100644 index 00000000..d8b549c5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5984173902397440.testcase diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-6124268085182464.testcase b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-6124268085182464.testcase new file mode 100644 index 00000000..123e56d4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-6124268085182464.testcase @@ -0,0 +1 @@ +)<math><math><math><math><math><math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math>
<annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul
>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math>
<annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&)<math><math><annotation-xul>&
\ No newline at end of file diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-6241471367348224.testcase b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-6241471367348224.testcase new file mode 100644 index 00000000..2831c484 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-6241471367348224.testcase @@ -0,0 +1 @@ +ñ<table><svg><html>
\ No newline at end of file diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-6306874195312640.testcase b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-6306874195312640.testcase new file mode 100644 index 00000000..b60a250c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-6306874195312640.testcase @@ -0,0 +1 @@ +- ÿÿ <math><select><mi><select><select>t
\ No newline at end of file diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-6450958476902400.testcase b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-6450958476902400.testcase Binary files differnew file mode 100644 index 00000000..a823d557 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-6450958476902400.testcase diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-6600557255327744.testcase b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-6600557255327744.testcase Binary files differnew file mode 100644 index 00000000..65af44d8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-6600557255327744.testcase diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/crash-0d306a50c8ed8bcd0785b67000fcd5dea1d33f08.testcase b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/crash-0d306a50c8ed8bcd0785b67000fcd5dea1d33f08.testcase Binary files differnew file mode 100644 index 00000000..5559adbb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/crash-0d306a50c8ed8bcd0785b67000fcd5dea1d33f08.testcase diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/crash-ffbdfa8a2b26f13537b68d3794b0478a4090ee4a.testcase b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/crash-ffbdfa8a2b26f13537b68d3794b0478a4090ee4a.testcase Binary files differnew file mode 100644 index 00000000..88571155 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/fuzz/crash-ffbdfa8a2b26f13537b68d3794b0478a4090ee4a.testcase diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_builder.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_builder.py new file mode 100644 index 00000000..87d67587 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_builder.py @@ -0,0 +1,28 @@ +import pytest +from unittest.mock import patch +from bs4.builder import DetectsXMLParsedAsHTML + + +class TestDetectsXMLParsedAsHTML: + @pytest.mark.parametrize( + "markup,looks_like_xml", + [ + ("No xml declaration", False), + ("<html>obviously HTML</html", False), + ("<?xml ><html>Actually XHTML</html>", False), + ("<?xml> < html>Tricky XHTML</html>", False), + ("<?xml ><no-html-tag>", True), + ], + ) + def test_warn_if_markup_looks_like_xml(self, markup, looks_like_xml): + # Test of our ability to guess at whether markup looks XML-ish + # _and_ not HTML-ish. + with patch("bs4.builder.DetectsXMLParsedAsHTML._warn") as mock: + for data in markup, markup.encode("utf8"): + result = DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(data) + assert result == looks_like_xml + if looks_like_xml: + assert mock.called + else: + assert not mock.called + mock.reset_mock() diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_builder_registry.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_builder_registry.py new file mode 100644 index 00000000..ad4b5a9e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_builder_registry.py @@ -0,0 +1,139 @@ +"""Tests of the builder registry.""" + +import pytest +import warnings +from typing import Type + +from bs4 import BeautifulSoup +from bs4.builder import ( + builder_registry as registry, + TreeBuilder, + TreeBuilderRegistry, +) +from bs4.builder._htmlparser import HTMLParserTreeBuilder + +from . 
import ( + HTML5LIB_PRESENT, + LXML_PRESENT, +) + +if HTML5LIB_PRESENT: + from bs4.builder._html5lib import HTML5TreeBuilder + +if LXML_PRESENT: + from bs4.builder._lxml import ( + LXMLTreeBuilderForXML, + LXMLTreeBuilder, + ) + + +# TODO: Split out the lxml and html5lib tests into their own classes +# and gate with pytest.mark.skipIf. +class TestBuiltInRegistry(object): + """Test the built-in registry with the default builders registered.""" + + def test_combination(self): + assert registry.lookup("strict", "html") == HTMLParserTreeBuilder + if LXML_PRESENT: + assert registry.lookup("fast", "html") == LXMLTreeBuilder + assert registry.lookup("permissive", "xml") == LXMLTreeBuilderForXML + if HTML5LIB_PRESENT: + assert registry.lookup("html5lib", "html") == HTML5TreeBuilder + + def test_lookup_by_markup_type(self): + if LXML_PRESENT: + assert registry.lookup("html") == LXMLTreeBuilder + assert registry.lookup("xml") == LXMLTreeBuilderForXML + else: + assert registry.lookup("xml") is None + if HTML5LIB_PRESENT: + assert registry.lookup("html") == HTML5TreeBuilder + else: + assert registry.lookup("html") == HTMLParserTreeBuilder + + def test_named_library(self): + if LXML_PRESENT: + assert registry.lookup("lxml", "xml") == LXMLTreeBuilderForXML + assert registry.lookup("lxml", "html") == LXMLTreeBuilder + if HTML5LIB_PRESENT: + assert registry.lookup("html5lib") == HTML5TreeBuilder + + assert registry.lookup("html.parser") == HTMLParserTreeBuilder + + def test_beautifulsoup_constructor_does_lookup(self): + with warnings.catch_warnings(record=True): + # This will create a warning about not explicitly + # specifying a parser, but we'll ignore it. + + # You can pass in a string. + BeautifulSoup("", features="html") + # Or a list of strings. + BeautifulSoup("", features=["html", "fast"]) + pass + + # You'll get an exception if BS can't find an appropriate + # builder. + with pytest.raises(ValueError): + BeautifulSoup("", features="no-such-feature") + + +class TestRegistry(object): + """Test the TreeBuilderRegistry class in general.""" + + def setup_method(self): + self.registry = TreeBuilderRegistry() + + def builder_for_features(self, *feature_list: str) -> Type[TreeBuilder]: + cls = type( + "Builder_" + "_".join(feature_list), (object,), {"features": feature_list} + ) + + self.registry.register(cls) + return cls + + def test_register_with_no_features(self): + builder = self.builder_for_features() + + # Since the builder advertises no features, you can't find it + # by looking up features. + assert self.registry.lookup("foo") is None + + # But you can find it by doing a lookup with no features, if + # this happens to be the only registered builder. 
+ assert self.registry.lookup() == builder + + def test_register_with_features_makes_lookup_succeed(self): + builder = self.builder_for_features("foo", "bar") + assert self.registry.lookup("foo") is builder + assert self.registry.lookup("bar") is builder + + def test_lookup_fails_when_no_builder_implements_feature(self): + assert self.registry.lookup("baz") is None + + def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): + self.builder_for_features("foo") + builder2 = self.builder_for_features("bar") + assert self.registry.lookup() == builder2 + + def test_lookup_fails_when_no_tree_builders_registered(self): + assert self.registry.lookup() is None + + def test_lookup_gets_most_recent_builder_supporting_all_features(self): + self.builder_for_features("foo") + self.builder_for_features("bar") + has_both_early = self.builder_for_features("foo", "bar", "baz") + has_both_late = self.builder_for_features("foo", "bar", "quux") + self.builder_for_features("bar") + self.builder_for_features("foo") + + # There are two builders featuring 'foo' and 'bar', but + # the one that also features 'quux' was registered later. + assert self.registry.lookup("foo", "bar") == has_both_late + + # There is only one builder featuring 'foo', 'bar', and 'baz'. + assert self.registry.lookup("foo", "bar", "baz") == has_both_early + + def test_lookup_fails_when_cannot_reconcile_requested_features(self): + self.builder_for_features("foo", "bar") + self.builder_for_features("foo", "baz") + assert self.registry.lookup("bar", "baz") is None diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_css.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_css.py new file mode 100644 index 00000000..b1c42379 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_css.py @@ -0,0 +1,536 @@ +import pytest +import types + +from bs4 import ( + BeautifulSoup, + ResultSet, +) + +from typing import ( + Any, + List, + Tuple, + Type, +) + +from packaging.version import Version + +from . import ( + SoupTest, + SOUP_SIEVE_PRESENT, +) + +SOUPSIEVE_EXCEPTION_ON_UNSUPPORTED_PSEUDOCLASS: Type[Exception] +if SOUP_SIEVE_PRESENT: + from soupsieve import __version__, SelectorSyntaxError + + # Some behavior changes in soupsieve 2.6 that affects one of our + # tests. For the test to run under all versions of Python + # supported by Beautiful Soup (which includes versions of Python + # not supported by soupsieve 2.6) we need to check both behaviors. + SOUPSIEVE_EXCEPTION_ON_UNSUPPORTED_PSEUDOCLASS = SelectorSyntaxError + if Version(__version__) < Version("2.6"): + SOUPSIEVE_EXCEPTION_ON_UNSUPPORTED_PSEUDOCLASS = NotImplementedError + + +@pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed") +class TestCSSSelectors(SoupTest): + """Test basic CSS selector functionality. + + This functionality is implemented in soupsieve, which has a much + more comprehensive test suite, so this is basically an extra check + that soupsieve works as expected. 
+ """ + + HTML = """ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" +"http://www.w3.org/TR/html4/strict.dtd"> +<html> +<head> +<title>The title</title> +<link rel="stylesheet" href="blah.css" type="text/css" id="l1"> +</head> +<body> +<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag> +<div id="main" class="fancy"> +<div id="inner"> +<h1 id="header1">An H1</h1> +<p>Some text</p> +<p class="onep" id="p1">Some more text</p> +<h2 id="header2">An H2</h2> +<p class="class1 class2 class3" id="pmulti">Another</p> +<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a> +<h2 id="header3">Another H2</h2> +<a id="me" href="http://simonwillison.net/" rel="me">me</a> +<span class="s1"> +<a href="#" id="s1a1">span1a1</a> +<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a> +<span class="span2"> +<a href="#" id="s2a1">span2a1</a> +</span> +<span class="span3"></span> +<custom-dashed-tag class="dashed" id="dash2"/> +<div data-tag="dashedvalue" id="data1"/> +</span> +</div> +<x id="xid"> +<z id="zida"/> +<z id="zidab"/> +<z id="zidac"/> +</x> +<y id="yid"> +<z id="zidb"/> +</y> +<p lang="en" id="lang-en">English</p> +<p lang="en-gb" id="lang-en-gb">English UK</p> +<p lang="en-us" id="lang-en-us">English US</p> +<p lang="fr" id="lang-fr">French</p> +</div> + +<div id="footer"> +</div> +""" + + def setup_method(self): + self._soup = BeautifulSoup(self.HTML, "html.parser") + + def assert_css_selects( + self, selector: str, expected_ids: List[str], **kwargs: Any + ) -> None: + results = self._soup.select(selector, **kwargs) + assert isinstance(results, ResultSet) + el_ids = [el["id"] for el in results] + el_ids.sort() + expected_ids.sort() + assert expected_ids == el_ids, "Selector %s, expected [%s], got [%s]" % ( + selector, + ", ".join(expected_ids), + ", ".join(el_ids), + ) + + assertSelect = assert_css_selects + + def assert_css_select_multiple(self, *tests: Tuple[str, List[str]]): + for selector, expected_ids in tests: + self.assert_css_selects(selector, expected_ids) + + def test_precompiled(self): + sel = self._soup.css.compile("div") + + els = self._soup.select(sel) + assert len(els) == 4 + for div in els: + assert div.name == "div" + + el = self._soup.select_one(sel) + assert "main" == el["id"] + + def test_one_tag_one(self): + els = self._soup.select("title") + assert len(els) == 1 + assert els[0].name == "title" + assert els[0].contents == ["The title"] + + def test_one_tag_many(self): + els = self._soup.select("div") + assert len(els) == 4 + for div in els: + assert div.name == "div" + + el = self._soup.select_one("div") + assert "main" == el["id"] + + def test_select_one_returns_none_if_no_match(self): + match = self._soup.select_one("nonexistenttag") + assert None is match + + def test_tag_in_tag_one(self): + self.assert_css_selects("div div", ["inner", "data1"]) + + def test_tag_in_tag_many(self): + for selector in ("html div", "html body div", "body div"): + self.assert_css_selects(selector, ["data1", "main", "inner", "footer"]) + + def test_limit(self): + self.assert_css_selects("html div", ["main"], limit=1) + self.assert_css_selects("html body div", ["inner", "main"], limit=2) + self.assert_css_selects( + "body div", ["data1", "main", "inner", "footer"], limit=10 + ) + + def test_tag_no_match(self): + assert len(self._soup.select("del")) == 0 + + def test_invalid_tag(self): + with pytest.raises(SelectorSyntaxError): + self._soup.select("tag%t") + + def test_select_dashed_tag_ids(self): + self.assert_css_selects("custom-dashed-tag", 
["dash1", "dash2"]) + + def test_select_dashed_by_id(self): + dashed = self._soup.select('custom-dashed-tag[id="dash2"]') + assert dashed[0].name == "custom-dashed-tag" + assert dashed[0]["id"] == "dash2" + + def test_dashed_tag_text(self): + assert self._soup.select("body > custom-dashed-tag")[0].text == "Hello there." + + def test_select_dashed_matches_find_all(self): + assert self._soup.select("custom-dashed-tag") == self._soup.find_all( + "custom-dashed-tag" + ) + + def test_header_tags(self): + self.assert_css_select_multiple( + ("h1", ["header1"]), + ("h2", ["header2", "header3"]), + ) + + def test_class_one(self): + for selector in (".onep", "p.onep", "html p.onep"): + els = self._soup.select(selector) + assert len(els) == 1 + assert els[0].name == "p" + assert els[0]["class"] == ["onep"] + + def test_class_mismatched_tag(self): + els = self._soup.select("div.onep") + assert len(els) == 0 + + def test_one_id(self): + for selector in ("div#inner", "#inner", "div div#inner"): + self.assert_css_selects(selector, ["inner"]) + + def test_bad_id(self): + els = self._soup.select("#doesnotexist") + assert len(els) == 0 + + def test_items_in_id(self): + els = self._soup.select("div#inner p") + assert len(els) == 3 + for el in els: + assert el.name == "p" + assert els[1]["class"] == ["onep"] + assert not els[0].has_attr("class") + + def test_a_bunch_of_emptys(self): + for selector in ("div#main del", "div#main div.oops", "div div#main"): + assert len(self._soup.select(selector)) == 0 + + def test_multi_class_support(self): + for selector in ( + ".class1", + "p.class1", + ".class2", + "p.class2", + ".class3", + "p.class3", + "html p.class2", + "div#inner .class2", + ): + self.assert_css_selects(selector, ["pmulti"]) + + def test_multi_class_selection(self): + for selector in (".class1.class3", ".class3.class2", ".class1.class2.class3"): + self.assert_css_selects(selector, ["pmulti"]) + + def test_child_selector(self): + self.assert_css_selects(".s1 > a", ["s1a1", "s1a2"]) + self.assert_css_selects(".s1 > a span", ["s1a2s1"]) + + def test_child_selector_id(self): + self.assert_css_selects(".s1 > a#s1a2 span", ["s1a2s1"]) + + def test_attribute_equals(self): + self.assert_css_select_multiple( + ('p[class="onep"]', ["p1"]), + ('p[id="p1"]', ["p1"]), + ('[class="onep"]', ["p1"]), + ('[id="p1"]', ["p1"]), + ('link[rel="stylesheet"]', ["l1"]), + ('link[type="text/css"]', ["l1"]), + ('link[href="blah.css"]', ["l1"]), + ('link[href="no-blah.css"]', []), + ('[rel="stylesheet"]', ["l1"]), + ('[type="text/css"]', ["l1"]), + ('[href="blah.css"]', ["l1"]), + ('[href="no-blah.css"]', []), + ('p[href="no-blah.css"]', []), + ('[href="no-blah.css"]', []), + ) + + def test_attribute_tilde(self): + self.assert_css_select_multiple( + ('p[class~="class1"]', ["pmulti"]), + ('p[class~="class2"]', ["pmulti"]), + ('p[class~="class3"]', ["pmulti"]), + ('[class~="class1"]', ["pmulti"]), + ('[class~="class2"]', ["pmulti"]), + ('[class~="class3"]', ["pmulti"]), + ('a[rel~="friend"]', ["bob"]), + ('a[rel~="met"]', ["bob"]), + ('[rel~="friend"]', ["bob"]), + ('[rel~="met"]', ["bob"]), + ) + + def test_attribute_startswith(self): + self.assert_css_select_multiple( + ('[rel^="style"]', ["l1"]), + ('link[rel^="style"]', ["l1"]), + ('notlink[rel^="notstyle"]', []), + ('[rel^="notstyle"]', []), + ('link[rel^="notstyle"]', []), + ('link[href^="bla"]', ["l1"]), + ('a[href^="http://"]', ["bob", "me"]), + ('[href^="http://"]', ["bob", "me"]), + ('[id^="p"]', ["pmulti", "p1"]), + ('[id^="m"]', ["me", "main"]), + ('div[id^="m"]', 
["main"]), + ('a[id^="m"]', ["me"]), + ('div[data-tag^="dashed"]', ["data1"]), + ) + + def test_attribute_endswith(self): + self.assert_css_select_multiple( + ('[href$=".css"]', ["l1"]), + ('link[href$=".css"]', ["l1"]), + ('link[id$="1"]', ["l1"]), + ( + '[id$="1"]', + ["data1", "l1", "p1", "header1", "s1a1", "s2a1", "s1a2s1", "dash1"], + ), + ('div[id$="1"]', ["data1"]), + ('[id$="noending"]', []), + ) + + def test_attribute_contains(self): + self.assert_css_select_multiple( + # From test_attribute_startswith + ('[rel*="style"]', ["l1"]), + ('link[rel*="style"]', ["l1"]), + ('notlink[rel*="notstyle"]', []), + ('[rel*="notstyle"]', []), + ('link[rel*="notstyle"]', []), + ('link[href*="bla"]', ["l1"]), + ('[href*="http://"]', ["bob", "me"]), + ('[id*="p"]', ["pmulti", "p1"]), + ('div[id*="m"]', ["main"]), + ('a[id*="m"]', ["me"]), + # From test_attribute_endswith + ('[href*=".css"]', ["l1"]), + ('link[href*=".css"]', ["l1"]), + ('link[id*="1"]', ["l1"]), + ( + '[id*="1"]', + [ + "data1", + "l1", + "p1", + "header1", + "s1a1", + "s1a2", + "s2a1", + "s1a2s1", + "dash1", + ], + ), + ('div[id*="1"]', ["data1"]), + ('[id*="noending"]', []), + # New for this test + ('[href*="."]', ["bob", "me", "l1"]), + ('a[href*="."]', ["bob", "me"]), + ('link[href*="."]', ["l1"]), + ('div[id*="n"]', ["main", "inner"]), + ('div[id*="nn"]', ["inner"]), + ('div[data-tag*="edval"]', ["data1"]), + ) + + def test_attribute_exact_or_hypen(self): + self.assert_css_select_multiple( + ('p[lang|="en"]', ["lang-en", "lang-en-gb", "lang-en-us"]), + ('[lang|="en"]', ["lang-en", "lang-en-gb", "lang-en-us"]), + ('p[lang|="fr"]', ["lang-fr"]), + ('p[lang|="gb"]', []), + ) + + def test_attribute_exists(self): + self.assert_css_select_multiple( + ("[rel]", ["l1", "bob", "me"]), + ("link[rel]", ["l1"]), + ("a[rel]", ["bob", "me"]), + ("[lang]", ["lang-en", "lang-en-gb", "lang-en-us", "lang-fr"]), + ("p[class]", ["p1", "pmulti"]), + ("[blah]", []), + ("p[blah]", []), + ("div[data-tag]", ["data1"]), + ) + + def test_quoted_space_in_selector_name(self): + html = """<div style="display: wrong">nope</div> + <div style="display: right">yes</div> + """ + soup = BeautifulSoup(html, "html.parser") + [chosen] = soup.select('div[style="display: right"]') + assert "yes" == chosen.string + + def test_unsupported_pseudoclass(self): + with pytest.raises(SOUPSIEVE_EXCEPTION_ON_UNSUPPORTED_PSEUDOCLASS): + self._soup.select("a:no-such-pseudoclass") + + with pytest.raises(SelectorSyntaxError): + self._soup.select("a:nth-of-type(a)") + + def test_nth_of_type(self): + # Try to select first paragraph + els = self._soup.select("div#inner p:nth-of-type(1)") + assert len(els) == 1 + assert els[0].string == "Some text" + + # Try to select third paragraph + els = self._soup.select("div#inner p:nth-of-type(3)") + assert len(els) == 1 + assert els[0].string == "Another" + + # Try to select (non-existent!) fourth paragraph + els = self._soup.select("div#inner p:nth-of-type(4)") + assert len(els) == 0 + + # Zero will select no tags. + els = self._soup.select("div p:nth-of-type(0)") + assert len(els) == 0 + + def test_nth_of_type_direct_descendant(self): + els = self._soup.select("div#inner > p:nth-of-type(1)") + assert len(els) == 1 + assert els[0].string == "Some text" + + def test_id_child_selector_nth_of_type(self): + self.assert_css_selects("#inner > p:nth-of-type(2)", ["p1"]) + + def test_select_on_element(self): + # Other tests operate on the tree; this operates on an element + # within the tree. 
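+        # [Editor's illustration, not part of upstream bs4: Tag.select()
+        # scopes the CSS search to the tag's own subtree, e.g.
+        #     main = soup.find("div", id="main")
+        #     main.select("div")   # descendants of #main only
+        # so the sibling <div id="footer"> can never match here.]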
+ inner = self._soup.find("div", id="main") + selected = inner.select("div") + # The <div id="inner"> tag was selected. The <div id="footer"> + # tag was not. + self.assert_selects_ids(selected, ["inner", "data1"]) + + def test_overspecified_child_id(self): + self.assert_css_selects(".fancy #inner", ["inner"]) + self.assert_css_selects(".normal #inner", []) + + def test_adjacent_sibling_selector(self): + self.assert_css_selects("#p1 + h2", ["header2"]) + self.assert_css_selects("#p1 + h2 + p", ["pmulti"]) + self.assert_css_selects("#p1 + #header2 + .class1", ["pmulti"]) + assert [] == self._soup.select("#p1 + p") + + def test_general_sibling_selector(self): + self.assert_css_selects("#p1 ~ h2", ["header2", "header3"]) + self.assert_css_selects("#p1 ~ #header2", ["header2"]) + self.assert_css_selects("#p1 ~ h2 + a", ["me"]) + self.assert_css_selects('#p1 ~ h2 + [rel="me"]', ["me"]) + assert [] == self._soup.select("#inner ~ h2") + + def test_dangling_combinator(self): + with pytest.raises(SelectorSyntaxError): + self._soup.select("h1 >") + + def test_sibling_combinator_wont_select_same_tag_twice(self): + self.assert_css_selects("p[lang] ~ p", ["lang-en-gb", "lang-en-us", "lang-fr"]) + + # Test the selector grouping operator (the comma) + def test_multiple_select(self): + self.assert_css_selects("x, y", ["xid", "yid"]) + + def test_multiple_select_with_no_space(self): + self.assert_css_selects("x,y", ["xid", "yid"]) + + def test_multiple_select_with_more_space(self): + self.assert_css_selects("x, y", ["xid", "yid"]) + + def test_multiple_select_duplicated(self): + self.assert_css_selects("x, x", ["xid"]) + + def test_multiple_select_sibling(self): + self.assert_css_selects("x, y ~ p[lang=fr]", ["xid", "lang-fr"]) + + def test_multiple_select_tag_and_direct_descendant(self): + self.assert_css_selects("x, y > z", ["xid", "zidb"]) + + def test_multiple_select_direct_descendant_and_tags(self): + self.assert_css_selects( + "div > x, y, z", ["xid", "yid", "zida", "zidb", "zidab", "zidac"] + ) + + def test_multiple_select_indirect_descendant(self): + self.assert_css_selects( + "div x,y, z", ["xid", "yid", "zida", "zidb", "zidab", "zidac"] + ) + + def test_invalid_multiple_select(self): + with pytest.raises(SelectorSyntaxError): + self._soup.select(",x, y") + with pytest.raises(SelectorSyntaxError): + self._soup.select("x,,y") + + def test_multiple_select_attrs(self): + self.assert_css_selects("p[lang=en], p[lang=en-gb]", ["lang-en", "lang-en-gb"]) + + def test_multiple_select_ids(self): + self.assert_css_selects( + "x, y > z[id=zida], z[id=zidab], z[id=zidb]", ["xid", "zidb", "zidab"] + ) + + def test_multiple_select_nested(self): + self.assert_css_selects("body > div > x, y > z", ["xid", "zidb"]) + + def test_select_duplicate_elements(self): + # When markup contains duplicate elements, a multiple select + # will find all of them. + markup = '<div class="c1"/><div class="c2"/><div class="c1"/>' + soup = BeautifulSoup(markup, "html.parser") + selected = soup.select(".c1, .c2") + assert 3 == len(selected) + + # Verify that find_all finds the same elements, though because + # of an implementation detail it finds them in a different + # order. 
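+        # [Editor's note, not part of upstream bs4: because the two APIs
+        # return the elements in different orders, the loop below checks
+        # membership instead of comparing soup.select(".c1, .c2") and
+        # soup.find_all(class_=["c1", "c2"]) as lists.]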
+        for element in soup.find_all(class_=["c1", "c2"]):
+            assert element in selected
+
+    def test_closest(self):
+        inner = self._soup.find("div", id="inner")
+        closest = inner.css.closest("div[id=main]")
+        assert closest == self._soup.find("div", id="main")
+
+    def test_match(self):
+        inner = self._soup.find("div", id="inner")
+        main = self._soup.find("div", id="main")
+        assert inner.css.match("div[id=main]") is False
+        assert main.css.match("div[id=main]") is True
+
+    def test_iselect(self):
+        gen = self._soup.css.iselect("h2")
+        assert isinstance(gen, types.GeneratorType)
+        [header2, header3] = gen
+        assert header2["id"] == "header2"
+        assert header3["id"] == "header3"
+
+    def test_filter(self):
+        inner = self._soup.find("div", id="inner")
+        results = inner.css.filter("h2")
+        assert len(inner.css.filter("h2")) == 2
+
+        results = inner.css.filter("h2[id=header3]")
+        assert isinstance(results, ResultSet)
+        [result] = results
+        assert result["id"] == "header3"
+
+    def test_escape(self):
+        m = self._soup.css.escape
+        assert m(".foo#bar") == "\\.foo\\#bar"
+        assert m("()[]{}") == "\\(\\)\\[\\]\\{\\}"
+        assert m(".foo") == self._soup.css.escape(".foo")
diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_dammit.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_dammit.py
new file mode 100644
index 00000000..ca554fea
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_dammit.py
@@ -0,0 +1,433 @@
+# encoding: utf-8
+import pytest
+import logging
+import warnings
+import bs4
+from bs4 import BeautifulSoup
+from bs4.dammit import (
+    EntitySubstitution,
+    EncodingDetector,
+    UnicodeDammit,
+)
+
+
+class TestUnicodeDammit(object):
+    """Standalone tests of UnicodeDammit."""
+
+    def test_unicode_input(self):
+        markup = "I'm already Unicode! \N{SNOWMAN}"
+        dammit = UnicodeDammit(markup)
+        assert dammit.unicode_markup == markup
+
+    @pytest.mark.parametrize(
+        "smart_quotes_to,expect_converted",
+        [
+            (None, "\u2018\u2019\u201c\u201d"),
+            ("xml", "&#x2018;&#x2019;&#x201C;&#x201D;"),
+            ("html", "&lsquo;&rsquo;&ldquo;&rdquo;"),
+            ("ascii", "''" + '""'),
+        ],
+    )
+    def test_smart_quotes_to(self, smart_quotes_to, expect_converted):
+        """Verify the functionality of the smart_quotes_to argument
+        to the UnicodeDammit constructor."""
+        markup = b"<foo>\x91\x92\x93\x94</foo>"
+        converted = UnicodeDammit(
+            markup,
+            known_definite_encodings=["windows-1252"],
+            smart_quotes_to=smart_quotes_to,
+        ).unicode_markup
+        assert converted == "<foo>{}</foo>".format(expect_converted)
+
+    def test_detect_utf8(self):
+        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
+        dammit = UnicodeDammit(utf8)
+        assert dammit.original_encoding.lower() == "utf-8"
+        assert dammit.unicode_markup == "Sacr\xe9 bleu! \N{SNOWMAN}"
+
+    def test_convert_hebrew(self):
+        hebrew = b"\xed\xe5\xec\xf9"
+        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
+        assert dammit.original_encoding.lower() == "iso-8859-8"
+        assert dammit.unicode_markup == "\u05dd\u05d5\u05dc\u05e9"
+
+    def test_dont_see_smart_quotes_where_there_are_none(self):
+        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
+        dammit = UnicodeDammit(utf_8)
+        assert dammit.original_encoding.lower() == "utf-8"
+        assert dammit.unicode_markup.encode("utf-8") == utf_8
+
+    def test_ignore_inappropriate_codecs(self):
+        utf8_data = "Räksmörgås".encode("utf-8")
+        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
+        assert dammit.original_encoding.lower() == "utf-8"
+
+    def test_ignore_invalid_codecs(self):
+        utf8_data = "Räksmörgås".encode("utf-8")
+        for bad_encoding in [".utf8", "...", "utF---16.!"]:
+            dammit = UnicodeDammit(utf8_data, [bad_encoding])
+            assert dammit.original_encoding.lower() == "utf-8"
+
+    def test_exclude_encodings(self):
+        # This is UTF-8.
+        utf8_data = "Räksmörgås".encode("utf-8")
+
+        # But if we exclude UTF-8 from consideration, the guess is
+        # Windows-1252.
+        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
+        assert dammit.original_encoding.lower() == "windows-1252"
+
+        # And if we exclude that, there is no valid guess at all.
+        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8", "windows-1252"])
+        assert dammit.original_encoding is None
+
+
+class TestEncodingDetector(object):
+    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(
+        self,
+    ):
+        detected = EncodingDetector(b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
+        encodings = list(detected.encodings)
+        assert "utf-\N{REPLACEMENT CHARACTER}" in encodings
+
+    def test_detect_html5_style_meta_tag(self):
+        for data in (
+            b'<html><meta charset="euc-jp" /></html>',
+            b"<html><meta charset='euc-jp' /></html>",
+            b"<html><meta charset=euc-jp /></html>",
+            b"<html><meta charset=euc-jp/></html>",
+        ):
+            dammit = UnicodeDammit(data, is_html=True)
+            assert "euc-jp" == dammit.original_encoding
+
+    def test_last_ditch_entity_replacement(self):
+        # This is a UTF-8 document that contains bytestrings
+        # completely incompatible with UTF-8 (ie. encoded with some other
+        # encoding).
+        #
+        # Since there is no consistent encoding for the document,
+        # Unicode, Dammit will eventually encode the document as UTF-8
+        # and encode the incompatible characters as REPLACEMENT
+        # CHARACTER.
+        #
+        # If chardet is installed, it will detect that the document
+        # can be converted into ISO-8859-1 without errors. This happens
+        # to be the wrong encoding, but it is a consistent encoding, so the
+        # code we're testing here won't run.
+        #
+        # So we temporarily disable chardet if it's present.
+        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
+<html><b>\330\250\330\252\330\261</b>
+<i>\310\322\321\220\312\321\355\344</i></html>"""
+        chardet = bs4.dammit._chardet_dammit
+        logging.disable(logging.WARNING)
+        try:
+
+            def noop(str):
+                return None
+
+            bs4.dammit._chardet_dammit = noop
+            dammit = UnicodeDammit(doc)
+            assert True is dammit.contains_replacement_characters
+            assert "\ufffd" in dammit.unicode_markup
+
+            soup = BeautifulSoup(doc, "html.parser")
+            assert soup.contains_replacement_characters
+        finally:
+            logging.disable(logging.NOTSET)
+            bs4.dammit._chardet_dammit = chardet
+
+    def test_byte_order_mark_removed(self):
+        # A document written in UTF-16LE will have its byte order marker stripped.
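+        # [Editor's sketch, not part of upstream bs4: b"\xff\xfe" is the
+        # UTF-16LE byte-order mark. UnicodeDammit sniffs it to pick the
+        # encoding and then strips it, so for example
+        #     UnicodeDammit(b"\xff\xfeh\x00i\x00").unicode_markup   # "hi"
+        # never contains the BOM.]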
+        data = b"\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00"
+        dammit = UnicodeDammit(data)
+        assert "<a>áé</a>" == dammit.unicode_markup
+        assert "utf-16le" == dammit.original_encoding
+
+    def test_known_definite_versus_user_encodings(self):
+        # The known_definite_encodings are used before sniffing the
+        # byte-order mark; the user_encodings are used afterwards.
+
+        # Here's a document in UTF-16LE.
+        data = b"\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00"
+        dammit = UnicodeDammit(data)
+
+        # We can process it as UTF-16 by passing it in as a known
+        # definite encoding.
+        before = UnicodeDammit(data, known_definite_encodings=["utf-16"])
+        assert "utf-16" == before.original_encoding
+
+        # If we pass UTF-8 as a user encoding, it's not even
+        # tried--the encoding sniffed from the byte-order mark takes
+        # precedence.
+        after = UnicodeDammit(data, user_encodings=["utf-8"])
+        assert "utf-16le" == after.original_encoding
+        assert ["utf-16le"] == [x[0] for x in after.tried_encodings]
+
+        # Here's a document in ISO-8859-8.
+        hebrew = b"\xed\xe5\xec\xf9"
+        dammit = UnicodeDammit(
+            hebrew, known_definite_encodings=["utf-8"], user_encodings=["iso-8859-8"]
+        )
+
+        # The known_definite_encodings don't work, BOM sniffing does
+        # nothing (it only works for a few UTF encodings), but one of
+        # the user_encodings does work.
+        assert "iso-8859-8" == dammit.original_encoding
+        assert ["utf-8", "iso-8859-8"] == [x[0] for x in dammit.tried_encodings]
+
+    def test_deprecated_override_encodings(self):
+        # override_encodings is a deprecated alias for
+        # known_definite_encodings.
+        hebrew = b"\xed\xe5\xec\xf9"
+        with warnings.catch_warnings(record=True) as w:
+            dammit = UnicodeDammit(
+                hebrew,
+                known_definite_encodings=["shift-jis"],
+                override_encodings=["utf-8"],
+                user_encodings=["iso-8859-8"],
+            )
+        [warning] = w
+        message = warning.message
+        assert isinstance(message, DeprecationWarning)
+        assert warning.filename == __file__
+        assert "iso-8859-8" == dammit.original_encoding
+
+        # known_definite_encodings and override_encodings were tried
+        # before user_encodings.
+        assert ["shift-jis", "utf-8", "iso-8859-8"] == (
+            [x[0] for x in dammit.tried_encodings]
+        )
+
+    def test_detwingle(self):
+        # Here's a UTF8 document.
+        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
+
+        # Here's a Windows-1252 document.
+        windows_1252 = (
+            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
+            "\N{RIGHT DOUBLE QUOTATION MARK}"
+        ).encode("windows_1252")
+
+        # Through some unholy alchemy, they've been stuck together.
+        doc = utf8 + windows_1252 + utf8
+
+        # The document can't be turned into UTF-8:
+        with pytest.raises(UnicodeDecodeError):
+            doc.decode("utf8")
+
+        # Unicode, Dammit thinks the whole document is Windows-1252,
+        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"
+
+        # But if we run it through fix_embedded_windows_1252, it's fixed:
+        fixed = UnicodeDammit.detwingle(doc)
+        assert "☃☃☃“Hi, I like Windows!”☃☃☃" == fixed.decode("utf8")
+
+    def test_detwingle_ignores_multibyte_characters(self):
+        # Each of these characters has a UTF-8 representation ending
+        # in \x93. \x93 is a smart quote if interpreted as
+        # Windows-1252. But our code knows to skip over multibyte
+        # UTF-8 characters, so they'll survive the process unscathed.
+        for tricky_unicode_char in (
+            "\N{LATIN SMALL LIGATURE OE}",  # 2-byte char '\xc5\x93'
+            "\N{LATIN SUBSCRIPT SMALL LETTER X}",  # 3-byte char '\xe2\x82\x93'
+            "\xf0\x90\x90\x93",  # This is a CJK character, not sure which one.
+        ):
+            input = tricky_unicode_char.encode("utf8")
+            assert input.endswith(b"\x93")
+            output = UnicodeDammit.detwingle(input)
+            assert output == input
+
+    def test_find_declared_encoding(self):
+        # Test our ability to find a declared encoding inside an
+        # XML or HTML document.
+        #
+        # Even if the document comes in as Unicode, it may be
+        # interesting to know what encoding was claimed
+        # originally.
+
+        html_unicode = '<html><head><meta charset="utf-8"></head></html>'
+        html_bytes = html_unicode.encode("ascii")
+
+        xml_unicode = '<?xml version="1.0" encoding="ISO-8859-1" ?>'
+        xml_bytes = xml_unicode.encode("ascii")
+
+        m = EncodingDetector.find_declared_encoding
+        assert m(html_unicode, is_html=False) is None
+        assert "utf-8" == m(html_unicode, is_html=True)
+        assert "utf-8" == m(html_bytes, is_html=True)
+
+        assert "iso-8859-1" == m(xml_unicode)
+        assert "iso-8859-1" == m(xml_bytes)
+
+        # Normally, only the first few kilobytes of a document are checked for
+        # an encoding.
+        spacer = b" " * 5000
+        assert m(spacer + html_bytes) is None
+        assert m(spacer + xml_bytes) is None
+
+        # But you can tell find_declared_encoding to search an entire
+        # HTML document.
+        assert (
+            m(spacer + html_bytes, is_html=True, search_entire_document=True) == "utf-8"
+        )
+
+        # The XML encoding declaration has to be the very first thing
+        # in the document. We'll allow whitespace before the document
+        # starts, but nothing else.
+        assert m(xml_bytes, search_entire_document=True) == "iso-8859-1"
+        assert m(b" " + xml_bytes, search_entire_document=True) == "iso-8859-1"
+        assert m(b"a" + xml_bytes, search_entire_document=True) is None
+
+
+class TestEntitySubstitution(object):
+    """Standalone tests of the EntitySubstitution class."""
+
+    def setup_method(self):
+        self.sub = EntitySubstitution
+
+    @pytest.mark.parametrize(
+        "original,substituted",
+        [
+            # Basic case. Unicode characters corresponding to named
+            # HTML entities are substituted; others are not.
+            ("foo\u2200\N{SNOWMAN}\u00f5bar", "foo&forall;\N{SNOWMAN}&otilde;bar"),
+            # MS smart quotes are a common source of frustration, so we
+            # give them a special test.
+            ("‘’foo“”", "&lsquo;&rsquo;foo&ldquo;&rdquo;"),
+        ],
+    )
+    def test_substitute_html(self, original, substituted):
+        assert self.sub.substitute_html(original) == substituted
+
+    def test_html5_entity(self):
+        for entity, u in (
+            # A few spot checks of our ability to recognize
+            # special character sequences and convert them
+            # to named entities.
+            ("&models;", "\u22a7"),
+            ("&Nfr;", "\U0001d511"),
+            ("&ngeqq;", "\u2267\u0338"),
+            ("&not;", "\xac"),
+            ("&Not;", "\u2aec"),
+            # We _could_ convert | to &verbarr;, but we don't, because
+            # | is an ASCII character.
+            ("|", "|"),
+            # Similarly for the fj ligature, which we could convert to
+            # &fjlig;, but we don't.
+            ("fj", "fj"),
+            # We do convert _these_ ASCII characters to HTML entities,
+            # because that's required to generate valid HTML.
+            ("&gt;", ">"),
+            ("&lt;", "<"),
+        ):
+            template = "3 %s 4"
+            raw = template % u
+            with_entities = template % entity
+            assert self.sub.substitute_html(raw) == with_entities
+
+    def test_html5_entity_with_variation_selector(self):
+        # Some HTML5 entities correspond either to a single-character
+        # Unicode sequence _or_ to the same character plus U+FE00,
+        # VARIATION SELECTOR 1. We can handle this.
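+        # [Editor's note, not part of upstream bs4: U+FE00 VARIATION
+        # SELECTOR-1 requests an alternate glyph for the preceding
+        # character, so "\u2294" and "\u2294\ufe00" are distinct sequences
+        # that map to the distinct HTML5 entities &sqcup; and &sqcups;.]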
+        data = "fjords \u2294 penguins"
+        markup = "fjords &sqcup; penguins"
+        assert self.sub.substitute_html(data) == markup
+
+        data = "fjords \u2294\ufe00 penguins"
+        markup = "fjords &sqcups; penguins"
+        assert self.sub.substitute_html(data) == markup
+
+    def test_xml_conversion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
+        s = 'Welcome to "my bar"'
+        assert self.sub.substitute_xml(s, False) == s
+
+    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
+        assert self.sub.substitute_xml("Welcome", True) == '"Welcome"'
+        assert self.sub.substitute_xml("Bob's Bar", True) == '"Bob\'s Bar"'
+
+    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(
+        self,
+    ):
+        s = 'Welcome to "my bar"'
+        assert self.sub.substitute_xml(s, True) == "'Welcome to \"my bar\"'"
+
+    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(
+        self,
+    ):
+        s = 'Welcome to "Bob\'s Bar"'
+        assert self.sub.substitute_xml(s, True) == '"Welcome to &quot;Bob\'s Bar&quot;"'
+
+    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
+        quoted = 'Welcome to "Bob\'s Bar"'
+        assert self.sub.substitute_xml(quoted) == quoted
+
+    def test_xml_quoting_handles_angle_brackets(self):
+        assert self.sub.substitute_xml("foo<bar>") == "foo&lt;bar&gt;"
+
+    def test_xml_quoting_handles_ampersands(self):
+        assert self.sub.substitute_xml("AT&T") == "AT&amp;T"
+
+    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
+        assert self.sub.substitute_xml("&Aacute;T&T") == "&amp;Aacute;T&amp;T"
+
+    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
+        assert (
+            self.sub.substitute_xml_containing_entities("&Aacute;T&T")
+            == "&Aacute;T&amp;T"
+        )
+
+    def test_quotes_not_html_substituted(self):
+        """There's no need to do this except inside attribute values."""
+        text = 'Bob\'s "bar"'
+        assert self.sub.substitute_html(text) == text
+
+    @pytest.mark.parametrize(
+        "markup, old",
+        [
+            ("foo & bar", "foo &amp; bar"),
+            ("foo&", "foo&amp;"),
+            ("foo&&& bar", "foo&amp;&amp;&amp; bar"),
+            ("x=1&y=2", "x=1&amp;y=2"),
+            ("&123", "&amp;123"),
+            ("&abc", "&amp;abc"),
+            ("foo &0 bar", "foo &amp;0 bar"),
+            ("foo &lolwat bar", "foo &amp;lolwat bar"),
+        ],
+    )
+    def test_unambiguous_ampersands_not_escaped(self, markup, old):
+        assert self.sub.substitute_html(markup) == old
+        assert self.sub.substitute_html5_raw(markup) == markup
+
+    @pytest.mark.parametrize(
+        "markup,html,html5,html5raw",
+        [
+            ("&divide;", "&amp;divide;", "&amp;divide;", "&divide;"),
+            ("&nonesuch;", "&amp;nonesuch;", "&amp;nonesuch;", "&amp;nonesuch;"),
+            ("&#247;", "&amp;#247;", "&amp;#247;", "&amp;#247;"),
+            ("&#xa1;", "&amp;#xa1;", "&amp;#xa1;", "&amp;#xa1;"),
+        ],
+    )
+    def test_when_entity_ampersands_are_escaped(self, markup, html, html5, html5raw):
+        # The html and html5 formatters always escape the ampersand
+        # that begins an entity reference, because they assume
+        # Beautiful Soup has already converted any unescaped entity references
+        # to Unicode characters.
+        #
+        # The html5_raw formatter does not escape the ampersand that
+        # begins a recognized HTML entity, because it does not
+        # fit the HTML5 definition of an ambiguous ampersand.
+        #
+        # The html5_raw formatter does escape the ampersands in front
+        # of unrecognized named entities, as well as numeric and
+        # hexadecimal entities, because they do fit the definition.
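+        # [Editor's illustration, not part of upstream bs4: taking the
+        # first row above, "&divide;" becomes "&amp;divide;" under the
+        # html and html5 formatters, while html5_raw leaves it alone,
+        # because a recognized named entity does not contain an
+        # ambiguous ampersand.]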
+        assert self.sub.substitute_html(markup) == html
+        assert self.sub.substitute_html5(markup) == html5
+        assert self.sub.substitute_html5_raw(markup) == html5raw
+
+    @pytest.mark.parametrize(
+        "markup,expect", [("&nosuchentity;", "&amp;nosuchentity;")]
+    )
+    def test_ambiguous_ampersands_escaped(self, markup, expect):
+        assert self.sub.substitute_html(markup) == expect
+        assert self.sub.substitute_html5_raw(markup) == expect
diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_element.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_element.py
new file mode 100644
index 00000000..0861eb1c
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_element.py
@@ -0,0 +1,138 @@
+"""Tests of classes in element.py.
+
+The really big classes -- Tag, PageElement, and NavigableString --
+are tested in separate files.
+"""
+
+import pytest
+from bs4.element import (
+    HTMLAttributeDict,
+    XMLAttributeDict,
+    CharsetMetaAttributeValue,
+    ContentMetaAttributeValue,
+    NamespacedAttribute,
+    ResultSet,
+)
+
+class TestNamespacedAttribute:
+    def test_name_may_be_none_or_missing(self):
+        a = NamespacedAttribute("xmlns", None)
+        assert a == "xmlns"
+
+        a = NamespacedAttribute("xmlns", "")
+        assert a == "xmlns"
+
+        a = NamespacedAttribute("xmlns")
+        assert a == "xmlns"
+
+    def test_namespace_may_be_none_or_missing(self):
+        a = NamespacedAttribute(None, "tag")
+        assert a == "tag"
+
+        a = NamespacedAttribute("", "tag")
+        assert a == "tag"
+
+    def test_attribute_is_equivalent_to_colon_separated_string(self):
+        a = NamespacedAttribute("a", "b")
+        assert "a:b" == a
+
+    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
+        a = NamespacedAttribute("a", "b", "c")
+        b = NamespacedAttribute("a", "b", "c")
+        assert a == b
+
+        # The actual namespace is not considered.
+        c = NamespacedAttribute("a", "b", None)
+        assert a == c
+
+        # But name and prefix are important.
+        d = NamespacedAttribute("a", "z", "c")
+        assert a != d
+
+        e = NamespacedAttribute("z", "b", "c")
+        assert a != e
+
+
+class TestAttributeValueWithCharsetSubstitution:
+    """Certain attributes are designed to have the charset of the
+    final document substituted into their value.
+    """
+
+    def test_charset_meta_attribute_value(self):
+        # The value of a CharsetMetaAttributeValue is whatever
+        # encoding the string is in.
+        value = CharsetMetaAttributeValue("euc-jp")
+        assert "euc-jp" == value
+        assert "euc-jp" == value.original_value
+        assert "utf8" == value.substitute_encoding("utf8")
+        assert "ascii" == value.substitute_encoding("ascii")
+
+        # If the target encoding is a Python internal encoding,
+        # no encoding will be mentioned in the output HTML.
+        assert "" == value.substitute_encoding("palmos")
+
+    def test_content_meta_attribute_value(self):
+        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
+        assert "text/html; charset=euc-jp" == value
+        assert "text/html; charset=euc-jp" == value.original_value
+        assert "text/html; charset=utf8" == value.substitute_encoding("utf8")
+        assert "text/html; charset=ascii" == value.substitute_encoding("ascii")
+
+        # If the target encoding is a Python internal encoding, the
+        # charset argument will be omitted altogether.
+        assert "text/html" == value.substitute_encoding("palmos")
+
+
+class TestAttributeDicts:
+    def test_xml_attribute_value_handling(self):
+        # Verify that attribute values are processed according to the
+        # XML spec's rules.
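+        # [Editor's sketch, not part of upstream bs4: XMLAttributeDict
+        # stringifies scalar values on assignment, e.g.
+        #     d = XMLAttributeDict()
+        #     d["width"] = 100   # stored as the string "100"
+        # while None becomes the empty string, as asserted below.]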
+ d = XMLAttributeDict() + d["v"] = 100 + assert d["v"] == "100" + d["v"] = 100.123 + assert d["v"] == "100.123" + + # This preserves Beautiful Soup's old behavior in the absence of + # guidance from the spec. + d["v"] = False + assert d["v"] is False + + d["v"] = True + assert d["v"] is True + + d["v"] = None + assert d["v"] == "" + + def test_html_attribute_value_handling(self): + # Verify that attribute values are processed according to the + # HTML spec's rules. + d = HTMLAttributeDict() + d["v"] = 100 + assert d["v"] == "100" + d["v"] = 100.123 + assert d["v"] == "100.123" + + d["v"] = False + assert "v" not in d + + d["v"] = None + assert "v" not in d + + d["v"] = True + assert d["v"] == "v" + + attribute = NamespacedAttribute("prefix", "name", "namespace") + d[attribute] = True + assert d[attribute] == "name" + + +class TestResultSet: + def test_getattr_exception(self): + rs = ResultSet(None) + with pytest.raises(AttributeError) as e: + rs.name + assert ( + """ResultSet object has no attribute "name". You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?""" + == str(e.value) + ) diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_filter.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_filter.py new file mode 100644 index 00000000..63b291ee --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_filter.py @@ -0,0 +1,674 @@ +import pytest +import re +import warnings + +from . import ( + SoupTest, +) +from typing import ( + Callable, + Optional, + Tuple, +) +from bs4.element import Tag +from bs4.filter import ( + AttributeValueMatchRule, + ElementFilter, + MatchRule, + SoupStrainer, + StringMatchRule, + TagNameMatchRule, +) +from bs4._typing import _RawAttributeValues + + +class TestElementFilter(SoupTest): + def test_default_behavior(self): + # An unconfigured ElementFilter matches absolutely everything. + selector = ElementFilter() + assert not selector.excludes_everything + assert selector.includes_everything + soup = self.soup("<a>text</a>") + tag = soup.a + string = tag.string + assert True is selector.match(soup) + assert True is selector.match(tag) + assert True is selector.match(string) + assert soup.find(selector).name == "a" + + # And allows any incoming markup to be turned into PageElements. + assert True is selector.allow_tag_creation(None, "tag", None) + assert True is selector.allow_string_creation("some string") + + def test_setup_with_match_function(self): + # Configure an ElementFilter with a match function and + # we can no longer state with certainty that it includes everything. + selector = ElementFilter(lambda x: False) + assert not selector.includes_everything + + def test_match(self): + def m(pe): + return pe.string == "allow" or (isinstance(pe, Tag) and pe.name == "allow") + + soup = self.soup("<allow>deny</allow>allow<deny>deny</deny>") + allow_tag = soup.allow + allow_string = soup.find(string="allow") + deny_tag = soup.deny + deny_string = soup.find(string="deny") + + selector = ElementFilter(match_function=m) + assert True is selector.match(allow_tag) + assert True is selector.match(allow_string) + assert False is selector.match(deny_tag) + assert False is selector.match(deny_string) + + # Since only the match function was provided, there is + # no effect on tag or string creation. 
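+        # [Editor's note, not part of upstream bs4: parse_only accepts an
+        # ElementFilter; anything the filter refuses to create is dropped
+        # at parse time. This filter only defines match_function, so
+        # creation is unrestricted and the whole document survives.]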
+ soup = self.soup("<a>text</a>", parse_only=selector) + assert "text" == soup.a.string + + def test_allow_tag_creation(self): + # By default, ElementFilter.allow_tag_creation allows everything. + filter = ElementFilter() + f = filter.allow_tag_creation + assert True is f("allow", "ignore", {}) + assert True is f("ignore", "allow", {}) + assert True is f(None, "ignore", {"allow": "1"}) + assert True is f("no", "no", {"no": "nope"}) + + # You can customize this behavior by overriding + # allow_tag_creation in a subclass. + class MyFilter(ElementFilter): + def allow_tag_creation( + self, + nsprefix: Optional[str], + name: str, + attrs: Optional[_RawAttributeValues], + ): + return ( + nsprefix == "allow" + or name == "allow" + or (attrs is not None and "allow" in attrs) + ) + + filter = MyFilter() + f = filter.allow_tag_creation + assert True is f("allow", "ignore", {}) + assert True is f("ignore", "allow", {}) + assert True is f(None, "ignore", {"allow": "1"}) + assert False is f("no", "no", {"no": "nope"}) + + # Test the customized ElementFilter as a value for parse_only. + soup = self.soup( + "<deny>deny</deny> <allow>deny</allow> allow", parse_only=filter + ) + + # The <deny> tag was filtered out, but there was no effect on + # the strings, since only allow_tag_creation_function was + # overridden. + assert "deny <allow>deny</allow> allow" == soup.decode() + + # Similarly, since match_function was not defined, this + # ElementFilter matches everything. + assert soup.find(filter) == "deny" + + def test_allow_string_creation(self): + # By default, ElementFilter.allow_string_creation allows everything. + filter = ElementFilter() + f = filter.allow_string_creation + assert True is f("allow") + assert True is f("deny") + assert True is f("please allow") + + # You can customize this behavior by overriding allow_string_creation + # in a subclass. + class MyFilter(ElementFilter): + def allow_string_creation(self, s: str): + return s == "allow" + + filter = MyFilter() + f = filter.allow_string_creation + assert True is f("allow") + assert False is f("deny") + assert False is f("please allow") + + # Test the customized ElementFilter as a value for parse_only. + soup = self.soup( + "<deny>deny</deny> <allow>deny</allow> allow", parse_only=filter + ) + + # All incoming strings other than "allow" (even whitespace) + # were filtered out, but there was no effect on the tags, + # since only allow_string_creation_function was defined. + assert "<deny>deny</deny><allow>deny</allow>" == soup.decode() + + # Similarly, since match_function was not defined, this + # ElementFilter matches everything. 
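+        # [Editor's note, not part of upstream bs4: find() accepts an
+        # ElementFilter directly. With no match_function defined, the
+        # filter matches every element, so find() returns the first
+        # element in document order, the <deny> tag.]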
+ assert soup.find(filter).name == "deny" + + +class TestMatchRule(SoupTest): + def _tuple( + self, rule: MatchRule + ) -> Tuple[Optional[str], Optional[str], Optional[Callable], Optional[bool]]: + return ( + rule.string, + rule.pattern.pattern if rule.pattern else None, + rule.function, + rule.present, + ) + + @staticmethod + def tag_function(x: Tag) -> bool: + return False + + @staticmethod + def string_function(x: str) -> bool: + return False + + @pytest.mark.parametrize( + "constructor_args, constructor_kwargs, result", + [ + # String + ([], dict(string="a"), ("a", None, None, None)), + ( + [], + dict(string="\N{SNOWMAN}".encode("utf8")), + ("\N{SNOWMAN}", None, None, None), + ), + # Regular expression + ([], dict(pattern=re.compile("a")), (None, "a", None, None)), + ([], dict(pattern="b"), (None, "b", None, None)), + ([], dict(pattern=b"c"), (None, "c", None, None)), + # Function + ([], dict(function=tag_function), (None, None, tag_function, None)), + ([], dict(function=string_function), (None, None, string_function, None)), + # Boolean + ([], dict(present=True), (None, None, None, True)), + # With positional arguments rather than keywords + (("a", None, None, None), {}, ("a", None, None, None)), + ((None, "b", None, None), {}, (None, "b", None, None)), + ((None, None, tag_function, None), {}, (None, None, tag_function, None)), + ((None, None, None, True), {}, (None, None, None, True)), + ], + ) + def test_constructor(self, constructor_args, constructor_kwargs, result): + rule = MatchRule(*constructor_args, **constructor_kwargs) + assert result == self._tuple(rule) + + def test_empty_match_not_allowed(self): + with pytest.raises( + ValueError, + match="Either string, pattern, function, present, or exclude_everything must be provided.", + ): + MatchRule() + + def test_full_match_not_allowed(self): + with pytest.raises( + ValueError, + match="At most one of string, pattern, function, present, and exclude_everything must be provided.", + ): + MatchRule("a", "b", self.tag_function, True) + + @pytest.mark.parametrize( + "rule_kwargs, match_against, result", + [ + (dict(string="a"), "a", True), + (dict(string="a"), "ab", False), + (dict(pattern="a"), "a", True), + (dict(pattern="a"), "ab", True), + (dict(pattern="^a$"), "a", True), + (dict(pattern="^a$"), "ab", False), + (dict(present=True), "any random value", True), + (dict(present=True), None, False), + (dict(present=False), "any random value", False), + (dict(present=False), None, True), + (dict(function=lambda x: x.upper() == x), "UPPERCASE", True), + (dict(function=lambda x: x.upper() == x), "lowercase", False), + (dict(function=lambda x: x.lower() == x), "UPPERCASE", False), + (dict(function=lambda x: x.lower() == x), "lowercase", True), + ], + ) + def test_matches_string(self, rule_kwargs, match_against, result): + rule = MatchRule(**rule_kwargs) + assert rule.matches_string(match_against) == result + + +class TestTagNameMatchRule(SoupTest): + @pytest.mark.parametrize( + "rule_kwargs, tag_kwargs, result", + [ + (dict(string="a"), dict(name="a"), True), + (dict(string="a"), dict(name="ab"), False), + (dict(pattern="a"), dict(name="a"), True), + (dict(pattern="a"), dict(name="ab"), True), + (dict(pattern="^a$"), dict(name="a"), True), + (dict(pattern="^a$"), dict(name="ab"), False), + # This isn't very useful, but it will work. 
+            (dict(present=True), dict(name="any random value"), True),
+            (dict(present=False), dict(name="any random value"), False),
+            (
+                dict(function=lambda t: t.name in t.attrs),
+                dict(name="id", attrs=dict(id="a")),
+                True,
+            ),
+            (
+                dict(function=lambda t: t.name in t.attrs),
+                dict(name="id", attrs={"class": "a"}),
+                False,
+            ),
+        ],
+    )
+    def test_matches_tag(self, rule_kwargs, tag_kwargs, result):
+        rule = TagNameMatchRule(**rule_kwargs)
+        tag = Tag(**tag_kwargs)
+        assert rule.matches_tag(tag) == result
+
+
+# AttributeValueMatchRule and StringMatchRule have the same
+# logic as MatchRule.
+
+
+class TestSoupStrainer(SoupTest):
+
+    def test_constructor_string_deprecated_text_argument(self):
+        with warnings.catch_warnings(record=True) as w:
+            strainer = SoupStrainer(text="text")
+            assert strainer.text == "text"
+        [w1, w2] = w
+        msg = str(w1.message)
+        assert w1.filename == __file__
+        assert (
+            msg
+            == "As of version 4.11.0, the 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead."
+        )
+
+        msg = str(w2.message)
+        assert w2.filename == __file__
+        assert (
+            msg
+            == "Access to deprecated property text. (Look at .string_rules instead) -- Deprecated since version 4.13.0."
+        )
+
+    def test_search_tag_deprecated(self):
+        strainer = SoupStrainer(name="a")
+        with warnings.catch_warnings(record=True) as w:
+            assert False is strainer.search_tag("b", {})
+        [w1] = w
+        msg = str(w1.message)
+        assert w1.filename == __file__
+        assert (
+            msg
+            == "Call to deprecated method search_tag. (Replaced by allow_tag_creation) -- Deprecated since version 4.13.0."
+        )
+
+    def test_search_deprecated(self):
+        strainer = SoupStrainer(name="a")
+        soup = self.soup("<a></a><b></b>")
+        with warnings.catch_warnings(record=True) as w:
+            assert soup.a == strainer.search(soup.a)
+            assert None is strainer.search(soup.b)
+        [w1, w2] = w
+        msg = str(w1.message)
+        assert msg == str(w2.message)
+        assert w1.filename == __file__
+        assert (
+            msg
+            == "Call to deprecated method search. (Replaced by match) -- Deprecated since version 4.13.0."
+        )
+
+    # Dummy function used within tests.
+    def _match_function(x):
+        pass
+
+    def test_constructor_default(self):
+        # The default SoupStrainer matches all tags, and only tags.
+        strainer = SoupStrainer()
+        [name_rule] = strainer.name_rules
+        assert True == name_rule.present
+        assert 0 == len(strainer.attribute_rules)
+        assert 0 == len(strainer.string_rules)
+
+    def test_constructor(self):
+        strainer = SoupStrainer(
+            "tagname",
+            {"attr1": "value"},
+            string=self._match_function,
+            attr2=["value1", False],
+        )
+        [name_rule] = strainer.name_rules
+        assert name_rule == TagNameMatchRule(string="tagname")
+
+        [attr1_rule] = strainer.attribute_rules.pop("attr1")
+        assert attr1_rule == AttributeValueMatchRule(string="value")
+
+        [attr2_rule1, attr2_rule2] = strainer.attribute_rules.pop("attr2")
+        assert attr2_rule1 == AttributeValueMatchRule(string="value1")
+        assert attr2_rule2 == AttributeValueMatchRule(present=False)
+
+        assert not strainer.attribute_rules
+
+        [string_rule] = strainer.string_rules
+        assert string_rule == StringMatchRule(function=self._match_function)
+
+    def test_scalar_attrs_becomes_class_restriction(self):
+        # For the sake of convenience, passing a scalar value as
+        # ``attrs`` results in a restriction on the 'class' attribute.
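+        # Equivalently (an illustrative sketch): a scalar ``attrs``
+        # builds the same rule set as the class_ shortcut shown below.
+        assert (
+            SoupStrainer(attrs="x").attribute_rules
+            == SoupStrainer(class_="x").attribute_rules
+        )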
+ strainer = SoupStrainer(attrs="mainbody") + assert [] == strainer.name_rules + assert [] == strainer.string_rules + assert {"class": [AttributeValueMatchRule(string="mainbody")]} == ( + strainer.attribute_rules + ) + + def test_constructor_class_attribute(self): + # The 'class' HTML attribute is also treated specially because + # it's a Python reserved word. Passing in "class_" as a + # keyword argument results in a restriction on the 'class' + # attribute. + strainer = SoupStrainer(class_="mainbody") + assert [] == strainer.name_rules + assert [] == strainer.string_rules + assert {"class": [AttributeValueMatchRule(string="mainbody")]} == ( + strainer.attribute_rules + ) + + # But if you pass in "class_" as part of the ``attrs`` dict + # it's not changed. (Otherwise there'd be no way to actually put + # a restriction on an attribute called "class_".) + strainer = SoupStrainer(attrs=dict(class_="mainbody")) + assert [] == strainer.name_rules + assert [] == strainer.string_rules + assert {"class_": [AttributeValueMatchRule(string="mainbody")]} == ( + strainer.attribute_rules + ) + + def test_constructor_with_overlapping_attributes(self): + # If you specify the same attribute in args and **kwargs, you end up + # with two different AttributeValueMatchRule objects. + + # This happens whether you use the 'class' shortcut on attrs... + strainer = SoupStrainer(attrs="class1", class_="class2") + rule1, rule2 = strainer.attribute_rules["class"] + assert rule1.string == "class1" + assert rule2.string == "class2" + + # Or explicitly specify the same attribute twice. + strainer = SoupStrainer(attrs={"id": "id1"}, id="id2") + rule1, rule2 = strainer.attribute_rules["id"] + assert rule1.string == "id1" + assert rule2.string == "id2" + + @pytest.mark.parametrize( + "obj, result", + [ + ("a", MatchRule(string="a")), + (b"a", MatchRule(string="a")), + (True, MatchRule(present=True)), + (False, MatchRule(present=False)), + (re.compile("a"), MatchRule(pattern=re.compile("a"))), + (_match_function, MatchRule(function=_match_function)), + # Pass in a list and get back a list of rules. + (["a", b"b"], [MatchRule(string="a"), MatchRule(string="b")]), + ( + [re.compile("a"), _match_function], + [ + MatchRule(pattern=re.compile("a")), + MatchRule(function=_match_function), + ], + ), + # Anything that doesn't fit is converted to a string. + (100, MatchRule(string="100")), + ], + ) + def test__make_match_rules(self, obj, result): + actual = list(SoupStrainer._make_match_rules(obj, MatchRule)) + # Helper to reduce the number of single-item lists in the + # parameters. + if len(actual) == 1: + [actual] = actual + assert result == actual + + @pytest.mark.parametrize( + "cls, result", + [ + (AttributeValueMatchRule, AttributeValueMatchRule(string="a")), + (StringMatchRule, StringMatchRule(string="a")), + ], + ) + def test__make_match_rules_different_classes(self, cls, result): + actual = cls(string="a") + assert actual == result + + def test__make_match_rules_nested_list(self): + # If you pass a nested list into _make_match_rules, it's + # turned into a restriction that excludes everything, to avoid the + # possibility of an infinite recursion. + + # Create a self-referential object. 
+ selfref = [] + selfref.append(selfref) + + with warnings.catch_warnings(record=True) as w: + rules = SoupStrainer._make_match_rules(["a", selfref, "b"], MatchRule) + assert list(rules) == [MatchRule(string="a"), MatchRule(exclude_everything=True), MatchRule(string="b")] + + [warning] = w + # Don't check the filename because the stacklevel is + # designed for normal use and we're testing the private + # method directly. + msg = str(warning.message) + assert ( + msg + == "Ignoring nested list [[...]] to avoid the possibility of infinite recursion." + ) + + def tag_matches( + self, + strainer: SoupStrainer, + name: str, + attrs: Optional[_RawAttributeValues] = None, + string: Optional[str] = None, + prefix: Optional[str] = None, + ) -> bool: + # Create a Tag with the given prefix, name and attributes, + # then make sure that strainer.matches_tag and allow_tag_creation + # both approve it. + tag = Tag(prefix=prefix, name=name, attrs=attrs) + if string: + tag.string = string + return strainer.matches_tag(tag) and strainer.allow_tag_creation( + prefix, name, attrs + ) + + def test_matches_tag_with_only_string(self): + # A SoupStrainer that only has StringMatchRules won't ever + # match a Tag. + strainer = SoupStrainer(string=["a string", re.compile("string")]) + tag = Tag(name="b", attrs=dict(id="1")) + tag.string = "a string" + assert not strainer.matches_tag(tag) + + # There has to be a TagNameMatchRule or an + # AttributeValueMatchRule as well. + strainer.name_rules.append(TagNameMatchRule(string="b")) + assert strainer.matches_tag(tag) + + strainer.name_rules = [] + strainer.attribute_rules["id"] = [AttributeValueMatchRule("1")] + assert strainer.matches_tag(tag) + + def test_matches_tag_with_prefix(self): + # If a tag has an attached namespace prefix, the tag's name is + # tested both with and without the prefix. + kwargs = dict(name="a", prefix="ns") + + assert self.tag_matches(SoupStrainer(name="a"), **kwargs) + assert self.tag_matches(SoupStrainer(name="ns:a"), **kwargs) + assert not self.tag_matches(SoupStrainer(name="ns2:a"), **kwargs) + + def test_one_name_rule_must_match(self): + # If there are TagNameMatchRule, at least one must match. + kwargs = dict(name="b") + + assert self.tag_matches(SoupStrainer(name="b"), **kwargs) + assert not self.tag_matches(SoupStrainer(name="c"), **kwargs) + assert self.tag_matches(SoupStrainer(name=["c", "d", "d", "b"]), **kwargs) + assert self.tag_matches( + SoupStrainer(name=[re.compile("c-f"), re.compile("[ab]$")]), **kwargs + ) + + def test_one_attribute_rule_must_match_for_each_attribute(self): + # If there is one or more AttributeValueMatchRule for a given + # attribute, at least one must match that attribute's + # value. This is true for *every* attribute -- just matching one + # attribute isn't enough. + kwargs = dict(name="b", attrs={"class": "main", "id": "1"}) + + # 'class' and 'id' match + assert self.tag_matches( + SoupStrainer( + class_=["other", "main"], id=["20", "a", re.compile("^[0-9]")] + ), + **kwargs, + ) + + # 'class' and 'id' are present and 'data' attribute is missing + assert self.tag_matches( + SoupStrainer(class_=True, id=True, data=False), **kwargs + ) + + # 'id' matches, 'class' does not. 
+ assert not self.tag_matches(SoupStrainer(class_=["other"], id=["2"]), **kwargs) + + # 'class' matches, 'id' does not + assert not self.tag_matches(SoupStrainer(class_=["main"], id=["2"]), **kwargs) + + # 'class' and 'id' match but 'data' attribute is missing + assert not self.tag_matches( + SoupStrainer(class_=["main"], id=["1"], data=True), **kwargs + ) + + def test_match_against_multi_valued_attribute(self): + # If an attribute has multiple values, only one of them + # has to match the AttributeValueMatchRule. + kwargs = dict(name="b", attrs={"class": ["main", "big"]}) + assert self.tag_matches(SoupStrainer(attrs="main"), **kwargs) + assert self.tag_matches(SoupStrainer(attrs="big"), **kwargs) + assert self.tag_matches(SoupStrainer(attrs=["main", "big"]), **kwargs) + assert self.tag_matches(SoupStrainer(attrs=["big", "small"]), **kwargs) + assert not self.tag_matches(SoupStrainer(attrs=["small", "smaller"]), **kwargs) + + def test_match_against_multi_valued_attribute_as_string(self): + # If an attribute has multiple values, you can treat the entire + # thing as one string during a match. + kwargs = dict(name="b", attrs={"class": ["main", "big"]}) + assert self.tag_matches(SoupStrainer(attrs="main big"), **kwargs) + + # But you can't put them in any order; it's got to be the + # order they are present in the Tag, which basically means the + # order they were originally present in the document. + assert not self.tag_matches(SoupStrainer(attrs=["big main"]), **kwargs) + + def test_one_string_rule_must_match(self): + # If there's a TagNameMatchRule and/or an + # AttributeValueMatchRule, then the StringMatchRule is _not_ + # ignored, and must match as well. + tag = Tag(name="b", attrs=dict(id="1")) + tag.string = "A string" + + assert SoupStrainer(name="b", string="A string").matches_tag(tag) + assert not SoupStrainer(name="a", string="A string").matches_tag(tag) + assert not SoupStrainer(name="a", string="Wrong string").matches_tag(tag) + assert SoupStrainer(id="1", string="A string").matches_tag(tag) + assert not SoupStrainer(id="2", string="A string").matches_tag(tag) + assert not SoupStrainer(id="1", string="Wrong string").matches_tag(tag) + + assert SoupStrainer(name="b", id="1", string="A string").matches_tag(tag) + + # If there are multiple string rules, only one needs to match. + assert SoupStrainer( + name="b", + id="1", + string=["Wrong string", "Also wrong", re.compile("string")], + ).matches_tag(tag) + + def test_allowing_tag_implies_allowing_its_contents(self): + markup = "<a><b>one string<div>another string</div></b></a>" + + # Letting the <b> tag through implies parsing the <div> tag + # and both strings, even though they wouldn't match the + # SoupStrainer on their own. + assert ( + "<b>one string<div>another string</div></b>" + == self.soup(markup, parse_only=SoupStrainer(name="b")).decode() + ) + + @pytest.mark.parametrize( + "soupstrainer", + [ + SoupStrainer(name="b", string="one string"), + SoupStrainer(name="div", string="another string"), + ], + ) + def test_parse_only_combining_tag_and_string(self, soupstrainer): + # If you pass parse_only a SoupStrainer that contains both tag + # restrictions and string restrictions, you get no results, + # because the string restrictions can't be evaluated during + # the parsing process, and the tag restrictions eliminate + # any strings from consideration. 
+        #
+        # We can detect this ahead of time, and warn about it,
+        # thanks to SoupStrainer.excludes_everything
+        markup = "<a><b>one string<div>another string</div></b></a>"
+
+        with warnings.catch_warnings(record=True) as w:
+            assert soupstrainer.excludes_everything
+            assert "" == self.soup(markup, parse_only=soupstrainer).decode()
+        [warning] = w
+        assert warning.filename == __file__
+        assert str(warning.message).startswith(
+            "The given value for parse_only will exclude everything:"
+        )
+
+        # The average SoupStrainer has excludes_everything=False
+        assert not SoupStrainer().excludes_everything
+
+    def test_documentation_examples(self):
+        """Medium-weight real-world tests based on the Beautiful Soup
+        documentation.
+        """
+        html_doc = """<html><head><title>The Dormouse's story</title></head>
+<body>
+<p class="title"><b>The Dormouse's story</b></p>
+
+<p class="story">Once upon a time there were three little sisters; and their names were
+<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+and they lived at the bottom of a well.</p>
+
+<p class="story">...</p>
+"""
+        only_a_tags = SoupStrainer("a")
+        only_tags_with_id_link2 = SoupStrainer(id="link2")
+
+        def is_short_string(string):
+            return string is not None and len(string) < 10
+
+        only_short_strings = SoupStrainer(string=is_short_string)
+
+        a_soup = self.soup(html_doc, parse_only=only_a_tags)
+        assert (
+            '<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a><a class="sister" href="http://example.com/lacie" id="link2">Lacie</a><a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>'
+            == a_soup.decode()
+        )
+
+        id_soup = self.soup(html_doc, parse_only=only_tags_with_id_link2)
+        assert (
+            '<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>'
+            == id_soup.decode()
+        )
+        string_soup = self.soup(html_doc, parse_only=only_short_strings)
+        assert "\n\n\nElsie,\nLacie and\nTillie\n...\n" == string_soup.decode()
diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_formatter.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_formatter.py
new file mode 100644
index 00000000..0b840c58
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_formatter.py
@@ -0,0 +1,170 @@
+import pytest
+
+from bs4.element import Tag
+from bs4.formatter import (
+    Formatter,
+    HTMLFormatter,
+    XMLFormatter,
+)
+from . import SoupTest
+
+
+class TestFormatter(SoupTest):
+    def test_default_attributes(self):
+        # Test the default behavior of Formatter.attributes().
+        formatter = Formatter()
+        tag = Tag(name="tag")
+        tag["b"] = "1"
+        tag["a"] = "2"
+
+        # Attributes come out sorted by name. In Python 3, attributes
+        # normally come out of a dictionary in the order they were
+        # added.
+        assert [("a", "2"), ("b", "1")] == formatter.attributes(tag)
+
+        # This works even if Tag.attrs is None, though this shouldn't
+        # normally happen.
+        tag.attrs = None
+        assert [] == formatter.attributes(tag)
+
+        assert " " == formatter.indent
+
+    def test_sort_attributes(self):
+        # Test the ability to override Formatter.attributes() to,
+        # e.g., disable the normal sorting of attributes.
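+        # Baseline first (an illustrative sketch, restating
+        # test_default_attributes above): the stock Formatter always
+        # yields attributes sorted by name, so any other order or any
+        # filtering requires a subclass.
+        plain_tag = Tag(name="p")
+        plain_tag["cval"], plain_tag["aval"] = "1", "2"
+        assert [("aval", "2"), ("cval", "1")] == Formatter().attributes(plain_tag)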
+        class UnsortedFormatter(Formatter):
+            def attributes(self, tag):
+                self.called_with = tag
+                for k, v in sorted(tag.attrs.items()):
+                    if k == "ignore":
+                        continue
+                    yield k, v
+
+        soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
+        formatter = UnsortedFormatter()
+        decoded = soup.decode(formatter=formatter)
+
+        # attributes() was called on the <p> tag. It filtered out one
+        # attribute and sorted the other two.
+        assert formatter.called_with == soup.p
+        assert '<p aval="2" cval="1"></p>' == decoded
+
+    def test_empty_attributes_are_booleans(self):
+        # Test the behavior of empty_attributes_are_booleans as well
+        # as which Formatters have it enabled.
+
+        for name in ("html", "minimal", None):
+            formatter = HTMLFormatter.REGISTRY[name]
+            assert False is formatter.empty_attributes_are_booleans
+
+        formatter = XMLFormatter.REGISTRY[None]
+        assert False is formatter.empty_attributes_are_booleans
+
+        formatter = HTMLFormatter.REGISTRY["html5"]
+        assert True is formatter.empty_attributes_are_booleans
+
+        # Verify that the constructor sets the value.
+        formatter = Formatter(empty_attributes_are_booleans=True)
+        assert True is formatter.empty_attributes_are_booleans
+
+        # Now demonstrate what it does to markup.
+        for markup in ("<option selected></option>", '<option selected=""></option>'):
+            soup = self.soup(markup)
+            for formatter in ("html", "minimal", "xml", None):
+                assert b'<option selected=""></option>' == soup.option.encode(
+                    formatter=formatter
+                )
+            assert b"<option selected></option>" == soup.option.encode(
+                formatter="html5"
+            )
+
+    @pytest.mark.parametrize(
+        "indent,expect",
+        [
+            (None, "<a>\n<b>\ntext\n</b>\n</a>\n"),
+            (-1, "<a>\n<b>\ntext\n</b>\n</a>\n"),
+            (0, "<a>\n<b>\ntext\n</b>\n</a>\n"),
+            ("", "<a>\n<b>\ntext\n</b>\n</a>\n"),
+            (1, "<a>\n <b>\n  text\n </b>\n</a>\n"),
+            (2, "<a>\n  <b>\n    text\n  </b>\n</a>\n"),
+            ("\t", "<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>\n"),
+            ("abc", "<a>\nabc<b>\nabcabctext\nabc</b>\n</a>\n"),
+            # Some invalid inputs -- the default behavior is used.
+            (object(), "<a>\n <b>\n  text\n </b>\n</a>\n"),
+            (b"bytes", "<a>\n <b>\n  text\n </b>\n</a>\n"),
+        ],
+    )
+    def test_indent(self, indent, expect):
+        # Pretty-print a tree with a Formatter set to
+        # indent in a certain way and verify the results.
+        soup = self.soup("<a><b>text</b></a>")
+        formatter = Formatter(indent=indent)
+        assert soup.prettify(formatter=formatter) == expect
+
+        # Pretty-printing only happens with prettify(), not
+        # encode().
+        assert soup.encode(formatter=formatter) != expect
+
+    def test_default_indent_value(self):
+        formatter = Formatter()
+        assert formatter.indent == " "
+
+    @pytest.mark.parametrize(
+        "formatter,expect",
+        [
+            (HTMLFormatter(indent=1), "<p>\n a\n</p>\n"),
+            (HTMLFormatter(indent=2), "<p>\n  a\n</p>\n"),
+            (XMLFormatter(indent=1), "<p>\n a\n</p>\n"),
+            (XMLFormatter(indent="\t"), "<p>\n\ta\n</p>\n"),
+        ],
+    )
+    def test_indent_subclasses(self, formatter, expect):
+        soup = self.soup("<p>a</p>")
+        assert expect == soup.p.prettify(formatter=formatter)
+
+    @pytest.mark.parametrize(
+        "s,expect_html,expect_html5",
+        [
+            # The html5 formatter is much less aggressive about escaping ampersands
+            # than the html formatter.
+            ("foo & bar", "foo &amp; bar", "foo & bar"),
+            ("foo&", "foo&amp;", "foo&"),
+            ("foo&&& bar", "foo&amp;&amp;&amp; bar", "foo&&& bar"),
+            ("x=1&y=2", "x=1&amp;y=2", "x=1&y=2"),
+            ("&123", "&amp;123", "&123"),
+            ("&abc", "&amp;abc", "&abc"),
+            ("foo &0 bar", "foo &amp;0 bar", "foo &0 bar"),
+            ("foo &lolwat bar", "foo &amp;lolwat bar", "foo &lolwat bar"),
+            # But both formatters escape what the HTML5 spec considers ambiguous ampersands.
+            ("&nosuchentity;", "&amp;nosuchentity;", "&amp;nosuchentity;"),
+        ],
+    )
+    def test_entity_substitution(self, s, expect_html, expect_html5):
+        assert HTMLFormatter.REGISTRY["html"].substitute(s) == expect_html
+        assert HTMLFormatter.REGISTRY["html5"].substitute(s) == expect_html5
+        assert HTMLFormatter.REGISTRY["html5-4.12"].substitute(s) == expect_html
+
+    def test_entity_round_trip(self):
+        # This is more an explanatory test and a way to avoid regressions than a test of functionality.
+
+        markup = "<p>Some division signs: ÷ &divide; &#247; &#xf7;. These are made with: ÷ &amp;divide; &amp;#247;</p>"
+        soup = self.soup(markup)
+        assert (
+            "Some division signs: ÷ ÷ ÷ ÷. These are made with: ÷ &divide; &#247;"
+            == soup.p.string
+        )
+
+        # Oops, I forgot to mention the entity.
+        soup.p.string = soup.p.string + " &#xf7;"
+
+        assert (
+            "Some division signs: ÷ ÷ ÷ ÷. These are made with: ÷ &divide; &#247; &#xf7;"
+            == soup.p.string
+        )
+
+        expect = "<p>Some division signs: &divide; &divide; &divide; &divide;. These are made with: &divide; &amp;divide; &amp;#247; &amp;#xf7;</p>"
+        assert expect == soup.p.decode(formatter="html")
+        assert expect == soup.p.decode(formatter="html5")
+
+        markup = "<p>a & b</p>"
+        soup = self.soup(markup)
+        assert "<p>a &amp; b</p>" == soup.p.decode(formatter="html")
+        assert "<p>a &amp; b</p>" == soup.p.decode(formatter="html5")
diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_fuzz.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_fuzz.py
new file mode 100644
index 00000000..f5b0990d
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_fuzz.py
@@ -0,0 +1,181 @@
+"""This file contains test cases reported by third parties using
+fuzzing tools, primarily from Google's oss-fuzz project. Some of these
+represent real problems with Beautiful Soup, but many are problems in
+libraries that Beautiful Soup depends on, and many of the test cases
+represent different ways of triggering the same problem.
+
+Grouping these test cases together makes it easy to see which test
+cases represent the same problem, and puts the test cases in close
+proximity to code that can trigger the problems.
+"""
+
+import os
+import importlib
+import pytest
+from bs4 import (
+    BeautifulSoup,
+    ParserRejectedMarkup,
+)
+
+try:
+    from soupsieve.util import SelectorSyntaxError
+
+    has_lxml = importlib.util.find_spec("lxml")
+    has_html5lib = importlib.util.find_spec("html5lib")
+    fully_fuzzable = has_lxml is not None and has_html5lib is not None
+except ImportError:
+    fully_fuzzable = False
+
+
+@pytest.mark.skipif(
+    not fully_fuzzable, reason="Prerequisites for fuzz tests are not installed."
+)
+class TestFuzz(object):
+    # Test case markup files from fuzzers are given this extension so
+    # they can be included in builds.
+    TESTCASE_SUFFIX = ".testcase"
+
+    # Copied 20230512 from
+    # https://github.com/google/oss-fuzz/blob/4ac6a645a197a695fe76532251feb5067076b3f3/projects/bs4/bs4_fuzzer.py
+    #
+    # Copying the code lets us precisely duplicate the behavior of
+    # oss-fuzz. The downside is that this code changes over time, so
+    # multiple copies of the code must be kept around to run against
+    # older tests. I'm not sure what to do about this, but I may
+    # retire old tests after a time.
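+    # To make the byte-splitting convention below concrete (an
+    # illustrative sketch of the oss-fuzz input format, mirroring the
+    # copied harness):
+    #
+    #     data = self.__markup(filename)              # raw fuzzer bytes
+    #     idx = int(data[0]) % len(parsers)           # byte 0 picks a parser
+    #     css_selector, data = data[1:10], data[10:]  # next 9 bytes: CSS selector
+    #     soup = BeautifulSoup(data[1:], features=parsers[idx])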
+ def fuzz_test_with_css(self, filename: str) -> None: + data = self.__markup(filename) + parsers = ["lxml-xml", "html5lib", "html.parser", "lxml"] + try: + idx = int(data[0]) % len(parsers) + except ValueError: + return + + css_selector, data = data[1:10], data[10:] + + try: + soup = BeautifulSoup(data[1:], features=parsers[idx]) + except ParserRejectedMarkup: + return + except ValueError: + return + + list(soup.find_all(True)) + try: + soup.css.select(css_selector.decode("utf-8", "replace")) + except SelectorSyntaxError: + return + soup.prettify() + + # This class of error has been fixed by catching a less helpful + # exception from html.parser and raising ParserRejectedMarkup + # instead. + @pytest.mark.parametrize( + "filename", + [ + "clusterfuzz-testcase-minimized-bs4_fuzzer-5703933063462912", + "crash-ffbdfa8a2b26f13537b68d3794b0478a4090ee4a", + ], + ) + def test_rejected_markup(self, filename): + markup = self.__markup(filename) + with pytest.raises(ParserRejectedMarkup): + BeautifulSoup(markup, "html.parser") + + # This class of error has to do with very deeply nested documents + # which overflow the Python call stack when the tree is converted + # to a string. This is an issue with Beautiful Soup which was fixed + # as part of [bug=1471755]. + # + # These test cases are in the older format that doesn't specify + # which parser to use or give a CSS selector. + @pytest.mark.parametrize( + "filename", + [ + "clusterfuzz-testcase-minimized-bs4_fuzzer-5984173902397440", + "clusterfuzz-testcase-minimized-bs4_fuzzer-5167584867909632", + "clusterfuzz-testcase-minimized-bs4_fuzzer-6124268085182464", + "clusterfuzz-testcase-minimized-bs4_fuzzer-6450958476902400", + ], + ) + def test_deeply_nested_document_without_css(self, filename): + # Parsing the document and encoding it back to a string is + # sufficient to demonstrate that the overflow problem has + # been fixed. + markup = self.__markup(filename) + BeautifulSoup(markup, "html.parser").encode() + + # This class of error has to do with very deeply nested documents + # which overflow the Python call stack when the tree is converted + # to a string. This is an issue with Beautiful Soup which was fixed + # as part of [bug=1471755]. + @pytest.mark.parametrize( + "filename", + [ + "clusterfuzz-testcase-minimized-bs4_fuzzer-5000587759190016", + "clusterfuzz-testcase-minimized-bs4_fuzzer-5375146639360000", + "clusterfuzz-testcase-minimized-bs4_fuzzer-5492400320282624", + ], + ) + def test_deeply_nested_document(self, filename): + self.fuzz_test_with_css(filename) + + @pytest.mark.parametrize( + "filename", + [ + "clusterfuzz-testcase-minimized-bs4_fuzzer-4670634698080256", + "clusterfuzz-testcase-minimized-bs4_fuzzer-5270998950477824", + ], + ) + def test_soupsieve_errors(self, filename): + self.fuzz_test_with_css(filename) + + # This class of error represents problems with html5lib's parser, + # not Beautiful Soup. I use + # https://github.com/html5lib/html5lib-python/issues/568 to notify + # the html5lib developers of these issues. + # + # These test cases are in the older format that doesn't specify + # which parser to use or give a CSS selector. 
+ @pytest.mark.skip(reason="html5lib-specific problems") + @pytest.mark.parametrize( + "filename", + [ + # b"""ÿ<!DOCTyPEV PUBLIC'''Ã'""" + "clusterfuzz-testcase-minimized-bs4_fuzzer-4818336571064320", + # b')<a><math><TR><a><mI><a><p><a>' + "clusterfuzz-testcase-minimized-bs4_fuzzer-4999465949331456", + # b'-<math><sElect><mi><sElect><sElect>' + "clusterfuzz-testcase-minimized-bs4_fuzzer-5843991618256896", + # b'ñ<table><svg><html>' + "clusterfuzz-testcase-minimized-bs4_fuzzer-6241471367348224", + # <TABLE>, some ^@ characters, some <math> tags. + "clusterfuzz-testcase-minimized-bs4_fuzzer-6600557255327744", + # Nested table + "crash-0d306a50c8ed8bcd0785b67000fcd5dea1d33f08", + ], + ) + def test_html5lib_parse_errors_without_css(self, filename): + markup = self.__markup(filename) + print(BeautifulSoup(markup, "html5lib").encode()) + + # This class of error represents problems with html5lib's parser, + # not Beautiful Soup. I use + # https://github.com/html5lib/html5lib-python/issues/568 to notify + # the html5lib developers of these issues. + @pytest.mark.skip(reason="html5lib-specific problems") + @pytest.mark.parametrize( + "filename", + [ + # b'- \xff\xff <math>\x10<select><mi><select><select>t' + "clusterfuzz-testcase-minimized-bs4_fuzzer-6306874195312640", + ], + ) + def test_html5lib_parse_errors(self, filename): + self.fuzz_test_with_css(filename) + + def __markup(self, filename: str) -> bytes: + if not filename.endswith(self.TESTCASE_SUFFIX): + filename += self.TESTCASE_SUFFIX + this_dir = os.path.split(__file__)[0] + path = os.path.join(this_dir, "fuzz", filename) + return open(path, "rb").read() diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_html5lib.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_html5lib.py new file mode 100644 index 00000000..593c12bd --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_html5lib.py @@ -0,0 +1,264 @@ +"""Tests to ensure that the html5lib tree builder generates good trees.""" + +import pytest +import warnings + +from bs4 import BeautifulSoup +from bs4.filter import SoupStrainer +from . import ( + HTML5LIB_PRESENT, + HTML5TreeBuilderSmokeTest, +) + + +@pytest.mark.skipif( + not HTML5LIB_PRESENT, + reason="html5lib seems not to be present, not testing its tree builder.", +) +class TestHTML5LibBuilder(HTML5TreeBuilderSmokeTest): + """See ``HTML5TreeBuilderSmokeTest``.""" + + @property + def default_builder(self): + from bs4.builder import HTML5TreeBuilder + + return HTML5TreeBuilder + + def test_soupstrainer(self): + # The html5lib tree builder does not support parse_only. 
+ strainer = SoupStrainer("b") + markup = "<p>A <b>bold</b> statement.</p>" + with warnings.catch_warnings(record=True) as w: + soup = BeautifulSoup(markup, "html5lib", parse_only=strainer) + assert soup.decode() == self.document_for(markup) + + [warning] = w + assert warning.filename == __file__ + assert "the html5lib tree builder doesn't support parse_only" in str( + warning.message + ) + + def test_correctly_nested_tables(self): + """html5lib inserts <tbody> tags where other parsers don't.""" + markup = ( + '<table id="1">' + "<tr>" + "<td>Here's another table:" + '<table id="2">' + "<tr><td>foo</td></tr>" + "</table></td>" + ) + + self.assert_soup( + markup, + '<table id="1"><tbody><tr><td>Here\'s another table:' + '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>' + "</td></tr></tbody></table>", + ) + + self.assert_soup( + "<table><thead><tr><td>Foo</td></tr></thead>" + "<tbody><tr><td>Bar</td></tr></tbody>" + "<tfoot><tr><td>Baz</td></tr></tfoot></table>" + ) + + def test_xml_declaration_followed_by_doctype(self): + markup = """<?xml version="1.0" encoding="utf-8"?> +<!DOCTYPE html> +<html> + <head> + </head> + <body> + <p>foo</p> + </body> +</html>""" + soup = self.soup(markup) + # Verify that we can reach the <p> tag; this means the tree is connected. + assert b"<p>foo</p>" == soup.p.encode() + + def test_reparented_markup(self): + markup = "<p><em>foo</p>\n<p>bar<a></a></em></p>" + soup = self.soup(markup) + assert ( + "<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>" + == soup.body.decode() + ) + assert 2 == len(soup.find_all("p")) + + def test_reparented_markup_ends_with_whitespace(self): + markup = "<p><em>foo</p>\n<p>bar<a></a></em></p>\n" + soup = self.soup(markup) + assert ( + "<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>" + == soup.body.decode() + ) + assert 2 == len(soup.find_all("p")) + + def test_reparented_markup_containing_identical_whitespace_nodes(self): + """Verify that we keep the two whitespace nodes in this + document distinct when reparenting the adjacent <tbody> tags. + """ + markup = "<table> <tbody><tbody><ims></tbody> </table>" + soup = self.soup(markup) + space1, space2 = soup.find_all(string=" ") + tbody1, tbody2 = soup.find_all("tbody") + assert space1.next_element is tbody1 + assert tbody2.next_element is space2 + + def test_reparented_markup_containing_children(self): + markup = ( + "<div><a>aftermath<p><noscript>target</noscript>aftermath</a></p></div>" + ) + soup = self.soup(markup) + noscript = soup.noscript + assert "target" == noscript.next_element + target = soup.find(string="target") + + # The 'aftermath' string was duplicated; we want the second one. + final_aftermath = soup.find_all(string="aftermath")[-1] + + # The <noscript> tag was moved beneath a copy of the <a> tag, + # but the 'target' string within is still connected to the + # (second) 'aftermath' string. 
+        assert final_aftermath == target.next_element
+        assert target == final_aftermath.previous_element
+
+    def test_processing_instruction(self):
+        """Processing instructions become comments."""
+        markup = b"""<?PITarget PIContent?>"""
+        soup = self.soup(markup)
+        assert str(soup).startswith("<!--?PITarget PIContent?-->")
+
+    def test_cloned_multivalue_node(self):
+        markup = b"""<a class="my_class"><p></a>"""
+        soup = self.soup(markup)
+        a1, a2 = soup.find_all("a")
+        assert a1 == a2
+        assert a1 is not a2
+
+    def test_foster_parenting(self):
+        markup = b"""<table><td></tbody>A"""
+        soup = self.soup(markup)
+        assert (
+            "<body>A<table><tbody><tr><td></td></tr></tbody></table></body>"
+            == soup.body.decode()
+        )
+
+    def test_extraction(self):
+        """
+        Test that extraction does not destroy the tree.
+
+        https://bugs.launchpad.net/beautifulsoup/+bug/1782928
+        """
+
+        markup = """
+<html><head></head>
+<style>
+</style><script></script><body><p>hello</p></body></html>
+"""
+        soup = self.soup(markup)
+        [s.extract() for s in soup("script")]
+        [s.extract() for s in soup("style")]
+
+        assert len(soup.find_all("p")) == 1
+
+    def test_empty_comment(self):
+        """
+        Test that empty comment does not break structure.
+
+        https://bugs.launchpad.net/beautifulsoup/+bug/1806598
+        """
+
+        markup = """
+<html>
+<body>
+<form>
+<!----><input type="text">
+</form>
+</body>
+</html>
+"""
+        soup = self.soup(markup)
+        inputs = []
+        for form in soup.find_all("form"):
+            inputs.extend(form.find_all("input"))
+        assert len(inputs) == 1
+
+    def test_tracking_line_numbers(self):
+        # The html5lib TreeBuilder keeps track of line number and
+        # position of each element.
+        markup = "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
+        soup = self.soup(markup)
+        assert 2 == soup.p.sourceline
+        assert 5 == soup.p.sourcepos
+        assert "sourceline" == soup.p.find("sourceline").name
+
+        # You can deactivate this behavior.
+        soup = self.soup(markup, store_line_numbers=False)
+        assert None is soup.p.sourceline
+        assert None is soup.p.sourcepos
+
+    def test_special_string_containers(self):
+        # The html5lib tree builder doesn't support this standard feature,
+        # because there's no way of knowing, when a string is created,
+        # where in the tree it will eventually end up.
+        pass
+
+    def test_html5_attributes(self):
+        # The html5lib TreeBuilder can convert any entity named in
+        # the HTML5 spec to a sequence of Unicode characters, and
+        # convert those Unicode characters to a (potentially
+        # different) named entity on the way out.
+        #
+        # This is a copy of the same test from
+        # HTMLParserTreeBuilderSmokeTest. It's not in the superclass
+        # because the lxml HTML TreeBuilder _doesn't_ work this way.
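+        # One concrete case first (an illustrative sketch, taken from
+        # the table below): "&there4;" parses to "\u2234", and
+        # formatter="html" emits a named entity again on the way out.
+        concrete = self.soup("<div>&there4;</div>")
+        assert "\u2234" == concrete.div.string
+        assert b"<div>&there4;</div>" == concrete.div.encode(formatter="html")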
+        for input_element, output_unicode, output_element in (
+            ("&RightArrowLeftArrow;", "\u21c4", b"&rlarr;"),
+            ("&models;", "\u22a7", b"&models;"),
+            ("&Nfr;", "\U0001d511", b"&Nfr;"),
+            ("&ngeqq;", "\u2267\u0338", b"&ngeqq;"),
+            ("&not;", "\xac", b"&not;"),
+            ("&Not;", "\u2aec", b"&Not;"),
+            ("&quot;", '"', b'"'),
+            ("&there4;", "\u2234", b"&there4;"),
+            ("&therefore;", "\u2234", b"&there4;"),
+            ("&Therefore;", "\u2234", b"&there4;"),
+            ("&fjlig;", "fj", b"fj"),
+            ("&sqcup;", "\u2294", b"&sqcup;"),
+            ("&sqcups;", "\u2294\ufe00", b"&sqcups;"),
+            ("&apos;", "'", b"'"),
+            ("&verbar;", "|", b"|"),
+        ):
+            markup = "<div>%s</div>" % input_element
+            div = self.soup(markup).div
+            without_element = div.encode()
+            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
+            assert without_element == expect
+
+            with_element = div.encode(formatter="html")
+            expect = b"<div>%s</div>" % output_element
+            assert with_element == expect
+
+    @pytest.mark.parametrize(
+        "name,value",
+        [("document_declared_encoding", "utf8"), ("exclude_encodings", ["utf8"])],
+    )
+    def test_prepare_markup_warnings(self, name, value):
+        # html5lib doesn't support a couple of the common arguments to
+        # prepare_markup.
+        builder = self.default_builder()
+        kwargs = {name: value}
+        with warnings.catch_warnings(record=True) as w:
+            list(builder.prepare_markup("a", **kwargs))
+        [warning] = w
+        msg = str(warning.message)
+        assert (
+            msg
+            == f"You provided a value for {name}, but the html5lib tree builder doesn't support {name}."
+        )
+
+    def test_doctype_filtered(self):
+        # Since the html5lib parser doesn't support parse_only, this standard
+        # smoke-test test can't be run.
+        pass
diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_htmlparser.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_htmlparser.py
new file mode 100644
index 00000000..b2bd07fc
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_htmlparser.py
@@ -0,0 +1,161 @@
+"""Tests to ensure that the html.parser tree builder generates good
+trees."""
+
+import pickle
+import pytest
+from bs4.builder._htmlparser import (
+    _DuplicateAttributeHandler,
+    BeautifulSoupHTMLParser,
+    HTMLParserTreeBuilder,
+)
+from bs4.exceptions import ParserRejectedMarkup
+from typing import Any
+from . import HTMLTreeBuilderSmokeTest
+
+
+class TestHTMLParserTreeBuilder(HTMLTreeBuilderSmokeTest):
+    default_builder = HTMLParserTreeBuilder
+
+    def test_rejected_input(self):
+        # Python's html.parser will occasionally reject markup,
+        # especially when there is a problem with the initial DOCTYPE
+        # declaration. Different versions of Python sound the alarm in
+        # different ways, but Beautiful Soup consistently raises
+        # errors as ParserRejectedMarkup exceptions.
+        bad_markup = [
+            # https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
+            # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
+            # https://github.com/python/cpython/issues/81928
+            b"\n<![\xff\xfe\xfe\xcd\x00",
+            # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
+            # https://github.com/python/cpython/issues/78661
+            #
+            b"<![n\x00",
+            b"<![UNKNOWN[]]>",
+        ]
+        for markup in bad_markup:
+            with pytest.raises(ParserRejectedMarkup):
+                self.soup(markup)
+
+    def test_namespaced_system_doctype(self):
+        # html.parser can't handle namespaced doctypes, so skip this one.
+        pass
+
+    def test_namespaced_public_doctype(self):
+        # html.parser can't handle namespaced doctypes, so skip this one.
+        pass
+
+    def test_builder_is_pickled(self):
+        """Unlike most tree builders, HTMLParserTreeBuilder is picklable
+        and will be restored after pickling.
+        """
+        tree = self.soup("<a><b>foo</a>")
+        dumped = pickle.dumps(tree, 2)
+        loaded = pickle.loads(dumped)
+        assert isinstance(loaded.builder, type(tree.builder))
+
+    def test_redundant_empty_element_closing_tags(self):
+        self.assert_soup("<br></br><br></br><br></br>", "<br/><br/><br/>")
+        self.assert_soup("</br></br></br>", "")
+
+    def test_empty_element(self):
+        # This verifies that any buffered data present when the parser
+        # finishes working is handled.
+        self.assert_soup("foo &# bar", "foo &amp;# bar")
+
+    def test_tracking_line_numbers(self):
+        # The html.parser TreeBuilder keeps track of line number and
+        # position of each element.
+        markup = "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
+        soup = self.soup(markup)
+        assert 2 == soup.p.sourceline
+        assert 3 == soup.p.sourcepos
+        assert "sourceline" == soup.p.find("sourceline").name
+
+        # You can deactivate this behavior.
+        soup = self.soup(markup, store_line_numbers=False)
+        assert None is soup.p.sourceline
+        assert None is soup.p.sourcepos
+
+    def test_on_duplicate_attribute(self):
+        # The html.parser tree builder has a variety of ways of
+        # handling a tag that contains the same attribute multiple times.
+
+        markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'
+
+        # If you don't provide any particular value for
+        # on_duplicate_attribute, later values replace earlier values.
+        soup = self.soup(markup)
+        assert "url3" == soup.a["href"]
+        assert ["cls"] == soup.a["class"]
+        assert "id" == soup.a["id"]
+
+        # You can also get this behavior explicitly.
+        def assert_attribute(
+            on_duplicate_attribute: _DuplicateAttributeHandler, expected: Any
+        ) -> None:
+            soup = self.soup(markup, on_duplicate_attribute=on_duplicate_attribute)
+            assert soup.a is not None
+            assert expected == soup.a["href"]
+
+            # Verify that non-duplicate attributes are treated normally.
+            assert ["cls"] == soup.a["class"]
+            assert "id" == soup.a["id"]
+
+        assert_attribute(None, "url3")
+        assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")
+
+        # You can ignore subsequent values in favor of the first.
+        assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")
+
+        # And you can pass in a callable that does whatever you want.
+        def accumulate(attrs, key, value):
+            if not isinstance(attrs[key], list):
+                attrs[key] = [attrs[key]]
+            attrs[key].append(value)
+
+        assert_attribute(accumulate, ["url1", "url2", "url3"])
+
+    def test_html5_attributes(self):
+        # The html.parser TreeBuilder can convert any entity named in
+        # the HTML5 spec to a sequence of Unicode characters, and
+        # convert those Unicode characters to a (potentially
+        # different) named entity on the way out.
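+        # For instance (an illustrative sketch, taken from the table
+        # below): "&fjlig;" decodes to the two characters "fj", which
+        # have no entity of their own on the way back out.
+        concrete = self.soup("<div>&fjlig;</div>")
+        assert "fj" == concrete.div.string
+        assert b"<div>fj</div>" == concrete.div.encode(formatter="html")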
+        for input_element, output_unicode, output_element in (
+            ("&RightArrowLeftArrow;", "\u21c4", b"&rlarr;"),
+            ("&models;", "\u22a7", b"&models;"),
+            ("&Nfr;", "\U0001d511", b"&Nfr;"),
+            ("&ngeqq;", "\u2267\u0338", b"&ngeqq;"),
+            ("&not;", "\xac", b"&not;"),
+            ("&Not;", "\u2aec", b"&Not;"),
+            ("&quot;", '"', b'"'),
+            ("&there4;", "\u2234", b"&there4;"),
+            ("&therefore;", "\u2234", b"&there4;"),
+            ("&Therefore;", "\u2234", b"&there4;"),
+            ("&fjlig;", "fj", b"fj"),
+            ("&sqcup;", "\u2294", b"&sqcup;"),
+            ("&sqcups;", "\u2294\ufe00", b"&sqcups;"),
+            ("&apos;", "'", b"'"),
+            ("&verbar;", "|", b"|"),
+        ):
+            markup = "<div>%s</div>" % input_element
+            div = self.soup(markup).div
+            without_element = div.encode()
+            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
+            assert without_element == expect
+
+            with_element = div.encode(formatter="html")
+            expect = b"<div>%s</div>" % output_element
+            assert with_element == expect
+
+    def test_invalid_html_entity(self):
+        # The html.parser treebuilder can't distinguish between an invalid
+        # HTML entity with a semicolon and an invalid HTML entity with no
+        # semicolon.
+        markup = "<p>a &nosuchentity b</p>"
+        soup = self.soup(markup)
+        assert "<p>a &amp;nosuchentity b</p>" == soup.p.decode()
+
+        markup = "<p>a &nosuchentity; b</p>"
+        soup = self.soup(markup)
+        assert "<p>a &amp;nosuchentity b</p>" == soup.p.decode()
diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_lxml.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_lxml.py
new file mode 100644
index 00000000..04a0ee88
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_lxml.py
@@ -0,0 +1,196 @@
+"""Tests to ensure that the lxml tree builder generates good trees."""
+
+import pickle
+import pytest
+import warnings
+from . import LXML_PRESENT, LXML_VERSION
+
+if LXML_PRESENT:
+    from bs4.builder._lxml import LXMLTreeBuilder, LXMLTreeBuilderForXML
+
+from bs4 import (
+    BeautifulStoneSoup,
+)
+from . import (
+    HTMLTreeBuilderSmokeTest,
+    XMLTreeBuilderSmokeTest,
+    SOUP_SIEVE_PRESENT,
+)
+
+
+@pytest.mark.skipif(
+    not LXML_PRESENT,
+    reason="lxml seems not to be present, not testing its tree builder.",
+)
+class TestLXMLTreeBuilder(HTMLTreeBuilderSmokeTest):
+    """See ``HTMLTreeBuilderSmokeTest``."""
+
+    @property
+    def default_builder(self):
+        return LXMLTreeBuilder
+
+    def test_out_of_range_entity(self):
+        self.assert_soup("<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
+        self.assert_soup("<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
+        self.assert_soup("<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
+
+    def test_entities_in_foreign_document_encoding(self):
+        # We can't implement this case correctly because by the time we
+        # hear about markup like "&#147;", it's been (incorrectly) converted
+        # into a string like u'\x93'
+        pass
+
+    # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
+    # test if an old version of lxml is installed.
+
+    @pytest.mark.skipif(
+        not LXML_PRESENT or LXML_VERSION < (2, 3, 5, 0),
+        reason="Skipping doctype test for old version of lxml to avoid segfault.",
+    )
+    def test_empty_doctype(self):
+        soup = self.soup("<!DOCTYPE>")
+        doctype = soup.contents[0]
+        assert "" == doctype.strip()
+
+    def test_beautifulstonesoup_is_xml_parser(self):
+        # Make sure that the deprecated BSS class uses an xml builder
+        # if one is installed.
+        with warnings.catch_warnings(record=True) as w:
+            soup = BeautifulStoneSoup("<b />")
+        assert "<b/>" == str(soup.b)
+        [warning] = w
+        assert warning.filename == __file__
+        assert "The BeautifulStoneSoup class was deprecated" in str(warning.message)
+
+    def test_tracking_line_numbers(self):
+        # The lxml TreeBuilder cannot keep track of line numbers from
+        # the original markup. Even if you ask for line numbers, we
+        # don't have 'em.
+        #
+        # However, for consistency with other parsers, Tag.sourceline
+        # and Tag.sourcepos are always set to None, rather than being
+        # available as an alias for find().
+        soup = self.soup(
+            "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>",
+            store_line_numbers=True,
+        )
+        assert None is soup.p.sourceline
+        assert None is soup.p.sourcepos
+
+
+@pytest.mark.skipif(
+    not LXML_PRESENT,
+    reason="lxml seems not to be present, not testing its XML tree builder.",
+)
+class TestLXMLXMLTreeBuilder(XMLTreeBuilderSmokeTest):
+    """See ``HTMLTreeBuilderSmokeTest``."""
+
+    @property
+    def default_builder(self):
+        return LXMLTreeBuilderForXML
+
+    def test_namespace_indexing(self):
+        soup = self.soup(
+            '<?xml version="1.1"?>\n'
+            "<root>"
+            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
+            '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</prefix:tag2>'
+            '<prefix2:tag3 xmlns:prefix2="http://another-namespace.com">'
+            '<subtag xmlns="http://another-unprefixed-namespace.com">'
+            '<subsubtag xmlns="http://yet-another-unprefixed-namespace.com">'
+            "</prefix2:tag3>"
+            "</root>"
+        )
+
+        # The BeautifulSoup object includes every namespace prefix
+        # defined in the entire document. This is the default set of
+        # namespaces used by soupsieve.
+        #
+        # Un-prefixed namespaces are not included, and if a given
+        # prefix is defined twice, only the first prefix encountered
+        # in the document shows up here.
+        assert soup._namespaces == {
+            "xml": "http://www.w3.org/XML/1998/namespace",
+            "prefix": "http://prefixed-namespace.com",
+            "prefix2": "http://another-namespace.com",
+        }
+
+        # A Tag object includes only the namespace prefixes
+        # that were in scope when it was parsed.
+
+        # We do not track un-prefixed namespaces as we can only hold
+        # one (the first one), and it will be recognized as the
+        # default namespace by soupsieve, even when operating from a
+        # tag with a different un-prefixed namespace.
+        assert soup.tag._namespaces == {
+            "xml": "http://www.w3.org/XML/1998/namespace",
+        }
+
+        assert soup.tag2._namespaces == {
+            "prefix": "http://prefixed-namespace.com",
+            "xml": "http://www.w3.org/XML/1998/namespace",
+        }
+
+        assert soup.subtag._namespaces == {
+            "prefix2": "http://another-namespace.com",
+            "xml": "http://www.w3.org/XML/1998/namespace",
+        }
+
+        assert soup.subsubtag._namespaces == {
+            "prefix2": "http://another-namespace.com",
+            "xml": "http://www.w3.org/XML/1998/namespace",
+        }
+
+    @pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed")
+    def test_namespace_interaction_with_select_and_find(self):
+        # Demonstrate how namespaces interact with select* and
+        # find* methods.
+
+        soup = self.soup(
+            '<?xml version="1.1"?>\n'
+            "<root>"
+            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
+            '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</tag>'
+            '<subtag xmlns:prefix="http://another-namespace-same-prefix.com">'
+            "<prefix:tag3>"
+            "</subtag>"
+            "</root>"
+        )
+
+        # soupselect uses namespace URIs.
+        assert soup.select_one("tag").name == "tag"
+        assert soup.select_one("prefix|tag2").name == "tag2"
+
+        # If a prefix is declared more than once, only the first usage
+        # is registered with the BeautifulSoup object.
+        assert soup.select_one("prefix|tag3") is None
+
+        # But you can always explicitly specify a namespace dictionary.
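+        # The namespaces argument is just a prefix -> URI mapping, so a
+        # hand-built dict works as well (an illustrative sketch, using
+        # the URI declared on <subtag> in the markup above):
+        handmade = {"prefix": "http://another-namespace-same-prefix.com"}
+        assert soup.select_one("prefix|tag3", namespaces=handmade).name == "tag3"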
+ assert ( + soup.select_one("prefix|tag3", namespaces=soup.subtag._namespaces).name + == "tag3" + ) + + # And a Tag (as opposed to the BeautifulSoup object) will + # have a set of default namespaces scoped to that Tag. + assert soup.subtag.select_one("prefix|tag3").name == "tag3" + + # the find() methods aren't fully namespace-aware; they just + # look at prefixes. + assert soup.find("tag").name == "tag" + assert soup.find("prefix:tag2").name == "tag2" + assert soup.find("prefix:tag3").name == "tag3" + assert soup.subtag.find("prefix:tag3").name == "tag3" + + def test_pickle_restores_builder(self): + # The lxml TreeBuilder is not picklable, so when unpickling + # a document created with it, a new TreeBuilder of the + # appropriate class is created. + soup = self.soup("<a>some markup</a>") + assert isinstance(soup.builder, self.default_builder) + pickled = pickle.dumps(soup) + unpickled = pickle.loads(pickled) + + assert "some markup" == unpickled.a.string + assert unpickled.builder != soup.builder + assert isinstance(unpickled.builder, self.default_builder) diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_navigablestring.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_navigablestring.py new file mode 100644 index 00000000..3e33469f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_navigablestring.py @@ -0,0 +1,144 @@ +import pytest + +from bs4.element import ( + CData, + Comment, + Declaration, + Doctype, + NavigableString, + RubyParenthesisString, + RubyTextString, + Script, + Stylesheet, + TemplateString, +) + +from . import SoupTest + + +class TestNavigableString(SoupTest): + def test_text_acquisition_methods(self): + # These methods are intended for use against Tag, but they + # work on NavigableString as well, + + s = NavigableString("fee ") + cdata = CData("fie ") + comment = Comment("foe ") + + assert "fee " == s.get_text() + assert "fee " == s.string + assert "fee" == s.get_text(strip=True) + assert ["fee "] == list(s.strings) + assert ["fee"] == list(s.stripped_strings) + assert ["fee "] == list(s._all_strings()) + + assert "fie " == cdata.get_text() + assert "fie " == cdata.string + assert "fie" == cdata.get_text(strip=True) + assert ["fie "] == list(cdata.strings) + assert ["fie"] == list(cdata.stripped_strings) + assert ["fie "] == list(cdata._all_strings()) + + # Since a Comment isn't normally considered 'text', + # these methods generally do nothing. + assert "" == comment.get_text() + assert [] == list(comment.strings) + assert [] == list(comment.stripped_strings) + assert [] == list(comment._all_strings()) + + # Unless you specifically say that comments are okay. + assert "foe" == comment.get_text(strip=True, types=Comment) + assert "foe " == comment.get_text(types=(Comment, NavigableString)) + + def test_string_has_immutable_name_property(self): + # string.name is defined as None and can't be modified + string = self.soup("s").string + assert None is string.name + with pytest.raises(AttributeError): + string.name = "foo" + + +class TestNavigableStringSubclasses(SoupTest): + def test_cdata(self): + # None of the current builders turn CDATA sections into CData + # objects, but you can create them manually. + soup = self.soup("") + cdata = CData("foo") + soup.insert(1, cdata) + assert str(soup) == "<![CDATA[foo]]>" + assert soup.find(string="foo") == "foo" + assert soup.contents[0] == "foo" + + def test_cdata_is_never_formatted(self): + """Text inside a CData object is passed into the formatter. + + But the return value is ignored. 
+        """
+
+        self.count = 0
+
+        def increment(*args):
+            self.count += 1
+            return "BITTER FAILURE"
+
+        soup = self.soup("")
+        cdata = CData("<><><>")
+        soup.insert(1, cdata)
+        assert b"<![CDATA[<><><>]]>" == soup.encode(formatter=increment)
+        assert 1 == self.count
+
+    def test_doctype_ends_in_newline(self):
+        # Unlike other NavigableString subclasses, a DOCTYPE always ends
+        # in a newline.
+        doctype = Doctype("foo")
+        soup = self.soup("")
+        soup.insert(1, doctype)
+        assert soup.encode() == b"<!DOCTYPE foo>\n"
+
+    def test_declaration(self):
+        d = Declaration("foo")
+        assert "<?foo?>" == d.output_ready()
+
+    def test_default_string_containers(self):
+        # In some cases, we use different NavigableString subclasses for
+        # the same text in different tags.
+        soup = self.soup("<div>text</div><script>text</script><style>text</style>")
+        assert [NavigableString, Script, Stylesheet] == [
+            x.__class__ for x in soup.find_all(string=True)
+        ]
+
+        # The TemplateString is a little unusual because it's generally found
+        # _inside_ children of a <template> element, not a direct child of the
+        # <template> element.
+        soup = self.soup(
+            "<template>Some text<p>In a tag</p></template>Some text outside"
+        )
+        assert all(
+            isinstance(x, TemplateString)
+            for x in soup.template._all_strings(types=None)
+        )
+
+        # Once the <template> tag closed, we went back to using
+        # NavigableString.
+        outside = soup.template.next_sibling
+        assert isinstance(outside, NavigableString)
+        assert not isinstance(outside, TemplateString)
+
+        # The TemplateString is also unusual because it can contain
+        # NavigableString subclasses of _other_ types, such as
+        # Comment.
+        markup = b"<template>Some text<p>In a tag</p><!--with a comment--></template>"
+        soup = self.soup(markup)
+        assert markup == soup.template.encode("utf8")
+
+    def test_ruby_strings(self):
+        markup = "<ruby>漢 <rp>(</rp><rt>kan</rt><rp>)</rp> 字 <rp>(</rp><rt>ji</rt><rp>)</rp></ruby>"
+        soup = self.soup(markup)
+        assert isinstance(soup.rp.string, RubyParenthesisString)
+        assert isinstance(soup.rt.string, RubyTextString)
+
+        # Just as a demo, here's what this means for get_text usage.
+        assert "漢字" == soup.get_text(strip=True)
+        assert "漢(kan)字(ji)" == soup.get_text(
+            strip=True, types=(NavigableString, RubyTextString, RubyParenthesisString)
+        )
diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_pageelement.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_pageelement.py
new file mode 100644
index 00000000..91d57792
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_pageelement.py
@@ -0,0 +1,437 @@
+"""Tests of the bs4.element.PageElement class"""
+
+import copy
+import pickle
+import pytest
+import sys
+import warnings
+
+from bs4 import BeautifulSoup
+from bs4.element import (
+    AttributeValueList,
+    Comment,
+)
+from bs4.filter import SoupStrainer
+from . import (
import (
+    SoupTest,
+)
+
+
+class TestEncoding(SoupTest):
+    """Test the ability to encode objects into strings."""
+
+    def test_unicode_string_can_be_encoded(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        assert soup.b.string.encode("utf-8") == "\N{SNOWMAN}".encode("utf-8")
+
+    def test_tag_containing_unicode_string_can_be_encoded(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        assert soup.b.encode("utf-8") == html.encode("utf-8")
+
+    def test_encoding_substitutes_unrecognized_characters_by_default(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        assert soup.b.encode("ascii") == b"<b>&#9731;</b>"
+
+    def test_encoding_can_be_made_strict(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        with pytest.raises(UnicodeEncodeError):
+            soup.encode("ascii", errors="strict")
+
+    def test_decode_contents(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        assert "\N{SNOWMAN}" == soup.b.decode_contents()
+
+    def test_encode_contents(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        assert "\N{SNOWMAN}".encode("utf8") == soup.b.encode_contents(encoding="utf8")
+
+    def test_encode_deeply_nested_document(self):
+        # This test verifies that encoding a string doesn't involve
+        # any recursive function calls. If it did, this test would
+        # overflow the Python interpreter stack.
+        limit = sys.getrecursionlimit() + 1
+        markup = "<span>" * limit
+        soup = self.soup(markup)
+        encoded = soup.encode()
+        assert limit == encoded.count(b"<span>")
+
+    def test_deprecated_renderContents(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        with warnings.catch_warnings(record=True) as w:
+            soup.renderContents()
+            assert "\N{SNOWMAN}".encode("utf8") == soup.b.renderContents()
+        msgs = [str(warning.message) for warning in w]
+        assert all(
+            x
+            == "Call to deprecated method renderContents. (Replaced by encode_contents) -- Deprecated since version 4.0.0."
+            for x in msgs
+        )
+
+    def test_repr(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        assert html == repr(soup)
+
+
+class TestFormatters(SoupTest):
+    """Test the formatting feature, used by methods like decode() and
+    prettify(), and the formatters themselves.
+    """
+
+    def test_default_formatter_is_minimal(self):
+        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="minimal")
+        # The < is converted back into &lt; but the e-with-acute is left alone.
+        assert decoded == self.document_for(
+            "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        )
+
+    def test_formatter_html(self):
+        markup = (
+            "<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        )
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="html")
+        assert decoded == self.document_for(
+            "<br/><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"
+        )
+
+    def test_formatter_html5(self):
+        markup = (
+            "<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        )
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="html5")
+        assert decoded == self.document_for(
+            "<br><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"
+        )
+
+    def test_formatter_minimal(self):
+        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="minimal")
+        # The < is converted back into &lt; but the e-with-acute is left alone.
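The three built-in formatters exercised by these tests differ only in how aggressively they substitute entities on output. A minimal standalone sketch of the same behavior, assuming only bs4 and the stdlib html.parser (the comments describe the differences rather than exact output):

from bs4 import BeautifulSoup

soup = BeautifulSoup(
    "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>",
    "html.parser",
)

# "minimal" (the default) escapes only what HTML syntax requires: &, <, >.
print(soup.decode(formatter="minimal"))
# "html" additionally converts characters like é into named entities (&eacute;).
print(soup.decode(formatter="html"))
# "html5" behaves like "html" but writes void elements as <br> instead of <br/>.
print(soup.decode(formatter="html5"))
# formatter=None performs no substitution at all.
print(soup.decode(formatter=None))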
+        assert decoded == self.document_for(
+            "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        )
+
+    def test_formatter_null(self):
+        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter=None)
+        # Neither the angle brackets nor the e-with-acute are converted.
+        # This is not valid HTML, but it's what the user wanted.
+        assert decoded == self.document_for(
+            "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
+        )
+
+    def test_formatter_custom(self):
+        markup = "<b>&lt;foo&gt;</b><b>bar</b><br/>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter=lambda x: x.upper())
+        # Instead of normal entity conversion code, the custom
+        # callable is called on every string.
+        assert decoded == self.document_for("<b><FOO></b><b>BAR</b><br/>")
+
+    def test_formatter_is_run_on_attribute_values(self):
+        markup = '<a href="http://a.com?a=b&c=é">e</a>'
+        soup = self.soup(markup)
+        a = soup.a
+
+        expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'
+
+        assert expect_minimal == a.decode()
+        assert expect_minimal == a.decode(formatter="minimal")
+
+        expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
+        assert expect_html == a.decode(formatter="html")
+
+        assert markup == a.decode(formatter=None)
+        expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
+        assert expect_upper == a.decode(formatter=lambda x: x.upper())
+
+    def test_formatter_skips_script_tag_for_html_documents(self):
+        doc = """
+  <script type="text/javascript">
+   console.log("< < hey > > ");
+  </script>
+"""
+        encoded = BeautifulSoup(doc, "html.parser").encode()
+        assert b"< < hey > >" in encoded
+
+    def test_formatter_skips_style_tag_for_html_documents(self):
+        doc = """
+  <style type="text/css">
+   console.log("< < hey > > ");
+  </style>
+"""
+        encoded = BeautifulSoup(doc, "html.parser").encode()
+        assert b"< < hey > >" in encoded
+
+    def test_prettify_leaves_preformatted_text_alone(self):
+        soup = self.soup(
+            "<div>  foo  <pre>  \tbar\n  \n  </pre>  baz  <textarea> eee\nfff\t</textarea></div>"
+        )
+        # Everything outside the <pre> tag is reformatted, but everything
+        # inside is left alone.
+        assert (
+            "<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>\n"
+            == soup.div.prettify()
+        )
+
+    def test_prettify_handles_nested_string_literal_tags(self):
+        # Most of this markup is inside a <pre> tag, so prettify()
+        # only does three things to it:
+        # 1. Add a newline and a space between the <div> and the <pre>
+        # 2. Add a newline after the </pre>
+        # 3. Add a newline at the end.
+        #
+        # The contents of the <pre> tag are left completely alone. In
+        # particular, we don't start adding whitespace again once we
+        # encounter the first </pre> tag, because we know it's not
+        # the one that put us into string literal mode.
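A formatter does not have to be one of the named strings; any callable that takes one string works, and, as the attribute-value test above checks, it is run over attribute values as well as text nodes. A sketch with a hypothetical no_entities helper:

from bs4 import BeautifulSoup

def no_entities(s):
    # Called on every text node and every attribute value; returning
    # the string unchanged disables entity substitution entirely.
    return s

soup = BeautifulSoup('<a href="?a=b&c=d">&lt;text&gt;</a>', "html.parser")
print(soup.a.decode(formatter=no_entities))  # & and < are left alone
print(soup.a.decode(formatter="minimal"))    # &amp; and &lt; come back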
+        markup = """<div><pre><code>some
+<script><pre>code</pre></script> for you
+</code></pre></div>"""
+
+        expect = """<div>
+ <pre><code>some
+<script><pre>code</pre></script> for you
+</code></pre>
+</div>
+"""
+        soup = self.soup(markup)
+        assert expect == soup.div.prettify()
+
+    def test_prettify_accepts_formatter_function(self):
+        soup = BeautifulSoup("<html><body>foo</body></html>", "html.parser")
+        pretty = soup.prettify(formatter=lambda x: x.upper())
+        assert "FOO" in pretty
+
+    def test_prettify_outputs_unicode_by_default(self):
+        soup = self.soup("<a></a>")
+        assert str is type(soup.prettify())
+
+    def test_prettify_can_encode_data(self):
+        soup = self.soup("<a></a>")
+        assert bytes is type(soup.prettify("utf-8"))
+
+    def test_html_entity_substitution_off_by_default(self):
+        markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
+        soup = self.soup(markup)
+        encoded = soup.b.encode("utf-8")
+        assert encoded == markup.encode("utf-8")
+
+    def test_encoding_substitution(self):
+        # Here's the <meta> tag saying that a document is
+        # encoded in Shift-JIS.
+        meta_tag = (
+            '<meta content="text/html; charset=x-sjis" ' 'http-equiv="Content-type"/>'
+        )
+        soup = self.soup(meta_tag)
+
+        # Parse the document, and the charset appears unchanged.
+        assert soup.meta["content"] == "text/html; charset=x-sjis"
+
+        # Encode the document into some encoding, and the encoding is
+        # substituted into the meta tag.
+        utf_8 = soup.encode("utf-8")
+        assert b"charset=utf-8" in utf_8
+
+        euc_jp = soup.encode("euc_jp")
+        assert b"charset=euc_jp" in euc_jp
+
+        shift_jis = soup.encode("shift-jis")
+        assert b"charset=shift-jis" in shift_jis
+
+        utf_16_u = soup.encode("utf-16").decode("utf-16")
+        assert "charset=utf-16" in utf_16_u
+
+    def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
+        markup = (
+            '<head><meta content="text/html; charset=x-sjis" '
+            'http-equiv="Content-type"/></head><pre>foo</pre>'
+        )
+
+        # Beautiful Soup used to try to rewrite the meta tag even if the
+        # meta tag got filtered out by the strainer. This test makes
+        # sure that doesn't happen.
+        strainer = SoupStrainer("pre")
+        soup = self.soup(markup, parse_only=strainer)
+        assert soup.contents[0].name == "pre"
+
+
+class TestPersistence(SoupTest):
+    "Testing features like pickle and deepcopy."
+
+    def setup_method(self):
+        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
+"http://www.w3.org/TR/REC-html40/transitional.dtd">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
+<link rev="made" href="mailto:leonardr@segfault.org">
+<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
+<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
+<meta name="author" content="Leonard Richardson">
+</head>
+<body>
+<a href="foo">foo</a>
+<a href="foo"><b>bar</b></a>
+</body>
+</html>"""
+        self.tree = self.soup(self.page)
+
+    def test_pickle_and_unpickle_identity(self):
+        # Pickling a tree, then unpickling it, yields a tree identical
+        # to the original.
+        dumped = pickle.dumps(self.tree, 2)
+        loaded = pickle.loads(dumped)
+        assert loaded.__class__ == BeautifulSoup
+        assert loaded.decode() == self.tree.decode()
+
+    def test_deepcopy_identity(self):
+        # Making a deepcopy of a tree yields an identical tree.
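The charset rewriting verified above happens at serialization time: the parsed attribute is kept as a CharsetMetaAttributeValue that remembers the original declaration, and encode() writes the actual output encoding in its place. A condensed sketch:

from bs4 import BeautifulSoup

soup = BeautifulSoup(
    '<meta content="text/html; charset=x-sjis" http-equiv="Content-type"/>',
    "html.parser",
)
print(soup.meta["content"])  # parsed value unchanged: ...charset=x-sjis
print(soup.encode("utf-8"))  # output says charset=utf-8 instead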
+        copied = copy.deepcopy(self.tree)
+        assert copied.decode() == self.tree.decode()
+
+    def test_copy_deeply_nested_document(self):
+        # This test verifies that copy and deepcopy don't involve any
+        # recursive function calls. If they did, this test would
+        # overflow the Python interpreter stack.
+        limit = sys.getrecursionlimit() + 1
+        markup = "<span>" * limit
+
+        soup = self.soup(markup)
+
+        copy.copy(soup)
+        copy.deepcopy(soup)
+
+    def test_copy_preserves_encoding(self):
+        soup = BeautifulSoup(b"<p>&nbsp;</p>", "html.parser")
+        encoding = soup.original_encoding
+        copy = soup.__copy__()
+        assert "<p>\xa0</p>" == str(copy)
+        assert encoding == copy.original_encoding
+
+    def test_copy_preserves_builder_information(self):
+        tag = self.soup("<p></p>").p
+
+        # Simulate a tag obtained from a source file.
+        tag.sourceline = 10
+        tag.sourcepos = 33
+
+        copied = tag.__copy__()
+
+        # The TreeBuilder object is no longer available, but information
+        # obtained from it gets copied over to the new Tag object.
+        assert tag.sourceline == copied.sourceline
+        assert tag.sourcepos == copied.sourcepos
+        assert tag.can_be_empty_element == copied.can_be_empty_element
+        assert tag.cdata_list_attributes == copied.cdata_list_attributes
+        assert tag.preserve_whitespace_tags == copied.preserve_whitespace_tags
+        assert tag.interesting_string_types == copied.interesting_string_types
+
+    def test_unicode_pickle(self):
+        # A tree containing Unicode characters can be pickled.
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
+        loaded = pickle.loads(dumped)
+        assert loaded.decode() == soup.decode()
+
+    def test_copy_navigablestring_is_not_attached_to_tree(self):
+        html = "<b>Foo<a></a></b><b>Bar</b>"
+        soup = self.soup(html)
+        s1 = soup.find(string="Foo")
+        s2 = copy.copy(s1)
+        assert s1 == s2
+        assert None is s2.parent
+        assert None is s2.next_element
+        assert None is not s1.next_sibling
+        assert None is s2.next_sibling
+        assert None is s2.previous_element
+
+    def test_copy_navigablestring_subclass_has_same_type(self):
+        html = "<b><!--Foo--></b>"
+        soup = self.soup(html)
+        s1 = soup.string
+        s2 = copy.copy(s1)
+        assert s1 == s2
+        assert isinstance(s2, Comment)
+
+    def test_copy_entire_soup(self):
+        html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
+        soup = self.soup(html)
+        soup_copy = copy.copy(soup)
+        assert soup == soup_copy
+
+    def test_copy_tag_copies_contents(self):
+        html = "<div class='a b c'><b>Foo<a></a></b><b>Bar</b></div>end"
+        soup = self.soup(html)
+        div = soup.div
+        div_copy = copy.copy(div)
+
+        # The two tags look the same, and evaluate to equal.
+        assert str(div) == str(div_copy)
+        assert div == div_copy
+
+        # But they're not the same object.
+        assert div is not div_copy
+
+        # And they don't have the same relation to the parse tree. The
+        # copy is not associated with a parse tree at all.
+        assert None is div_copy.parent
+        assert None is div_copy.previous_element
+        assert None is div_copy.find(string="Bar").next_element
+        assert None is not div.find(string="Bar").next_element
+
+        # Modifying one of the tag's multi-valued attributes
+        # doesn't modify the other.
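As the assertions above establish, copying any PageElement yields an equal but detached object. A condensed sketch of the same behavior:

import copy

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div><b>Foo</b></div>", "html.parser")
b_copy = copy.copy(soup.b)

assert b_copy == soup.b        # same markup, so the two compare equal...
assert b_copy is not soup.b    # ...but they are distinct objects
assert b_copy.parent is None   # and the copy has no place in the tree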
+        assert div["class"] is not div_copy["class"]
+        div["class"].append("d")
+        assert "a b c d".split() == div["class"]
+        assert "a b c".split() == div_copy["class"]
+        assert isinstance(div_copy["class"], AttributeValueList)
+
+
+class TestEquality(SoupTest):
+
+    def test_comparison(self):
+        soup = self.soup("<a>string</a> <a>string</a>")
+        first_a, second_a = soup.find_all('a')
+        first_string, second_string = soup.find_all(string='string')
+
+        # Tags with the same markup are equal.
+        assert first_a == second_a
+
+        # NavigableStrings with the same content are equal, and also
+        # equal to a Python string with the same content...
+        assert first_string == second_string == "string"
+
+        # ...but not equivalent to a bytestring with the same content.
+        assert first_string != b"string"
+
+    def test_hash(self):
+        soup = self.soup("<a>string</a> <a>string</a>")
+        first_a, second_a = soup.find_all('a')
+        first_string, second_string = soup.find_all(string='string')
+
+        # Tags with the same markup hash to the same value.
+        assert hash(first_a) == hash(second_a)
+
+        # But they're not the same object.
+        assert id(first_a) != id(second_a)
+
+        # NavigableStrings with the same contents hash to the value of
+        # the contents.
+        assert hash(first_string) == hash(second_string) == hash("string")
diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_soup.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_soup.py
new file mode 100644
index 00000000..5f771a40
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_soup.py
@@ -0,0 +1,602 @@
+# -*- coding: utf-8 -*-
+"""Tests of Beautiful Soup as a whole."""
+
+import logging
+import pickle
+import pytest
+from typing import Iterable
+
+from bs4 import (
+    BeautifulSoup,
+    GuessedAtParserWarning,
+    dammit,
+)
+from bs4.builder import (
+    TreeBuilder,
+)
+from bs4.element import (
+    AttributeValueList,
+    XMLAttributeDict,
+    Comment,
+    PYTHON_SPECIFIC_ENCODINGS,
+    Tag,
+    NavigableString,
+)
+from bs4.filter import SoupStrainer
+from bs4.exceptions import (
+    ParserRejectedMarkup,
+)
+from bs4._warnings import (
+    MarkupResemblesLocatorWarning,
+)
+
+
+from . import (
+    default_builder,
+    LXML_PRESENT,
+    SoupTest,
+)
+import warnings
+from typing import Type
+
+
+class TestConstructor(SoupTest):
+    def test_short_unicode_input(self):
+        data = "<h1>éé</h1>"
+        soup = self.soup(data)
+        assert "éé" == soup.h1.string
+
+    def test_embedded_null(self):
+        data = "<h1>foo\0bar</h1>"
+        soup = self.soup(data)
+        assert "foo\0bar" == soup.h1.string
+
+    def test_exclude_encodings(self):
+        utf8_data = "Räksmörgås".encode("utf-8")
+        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
+        assert "windows-1252" == soup.original_encoding
+
+    def test_custom_builder_class(self):
+        # Verify that you can pass in a custom Builder class and
+        # it'll be instantiated with the appropriate keyword arguments.
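The exclude_encodings test above leans on encoding detection: when the obviously correct guess is ruled out, Beautiful Soup falls back to its next candidate (windows-1252 in that test; the exact fallback can vary with which detector library, if any, is installed). A sketch:

from bs4 import BeautifulSoup

data = "Räksmörgås".encode("utf-8")
soup = BeautifulSoup(data, "html.parser", exclude_encodings=["utf-8"])
# With UTF-8 ruled out, some other guess is used instead.
print(soup.original_encoding)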
+ class Mock(object): + def __init__(self, **kwargs): + self.called_with = kwargs + self.is_xml = True + self.store_line_numbers = False + self.cdata_list_attributes = [] + self.preserve_whitespace_tags = [] + self.string_containers = {} + self.attribute_dict_class = XMLAttributeDict + self.attribute_value_list_class = AttributeValueList + + def initialize_soup(self, soup): + pass + + def feed(self, markup): + self.fed = markup + + def reset(self): + pass + + def ignore(self, ignore): + pass + + set_up_substitutions = can_be_empty_element = ignore + + def prepare_markup(self, *args, **kwargs): + yield ( + "prepared markup", + "original encoding", + "declared encoding", + "contains replacement characters", + ) + + kwargs = dict( + var="value", + # This is a deprecated BS3-era keyword argument, which + # will be stripped out. + convertEntities=True, + ) + with warnings.catch_warnings(record=True): + soup = BeautifulSoup("", builder=Mock, **kwargs) + assert isinstance(soup.builder, Mock) + assert dict(var="value") == soup.builder.called_with + assert "prepared markup" == soup.builder.fed + + # You can also instantiate the TreeBuilder yourself. In this + # case, that specific object is used and any keyword arguments + # to the BeautifulSoup constructor are ignored. + builder = Mock(**kwargs) + with warnings.catch_warnings(record=True) as w: + soup = BeautifulSoup( + "", + builder=builder, + ignored_value=True, + ) + msg = str(w[0].message) + assert msg.startswith( + "Keyword arguments to the BeautifulSoup constructor will be ignored." + ) + assert builder == soup.builder + assert kwargs == builder.called_with + + def test_parser_markup_rejection(self): + # If markup is completely rejected by the parser, an + # explanatory ParserRejectedMarkup exception is raised. + class Mock(TreeBuilder): + def feed(self, *args, **kwargs): + raise ParserRejectedMarkup("Nope.") + + def prepare_markup(self, markup, *args, **kwargs): + # We're going to try two different ways of preparing this markup, + # but feed() will reject both of them. + yield markup, None, None, False + yield markup, None, None, False + + + with pytest.raises(ParserRejectedMarkup) as exc_info: + BeautifulSoup("", builder=Mock) + assert ( + "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help." + in str(exc_info.value) + ) + + def test_cdata_list_attributes(self): + # Most attribute values are represented as scalars, but the + # HTML standard says that some attributes, like 'class' have + # space-separated lists as values. + markup = '<a id=" an id " class=" a class "></a>' + soup = self.soup(markup) + + # Note that the spaces are stripped for 'class' but not for 'id'. + a = soup.a + assert " an id " == a["id"] + assert ["a", "class"] == a["class"] + + # TreeBuilder takes an argument called 'multi_valued_attributes' which lets + # you customize or disable this. As always, you can customize the TreeBuilder + # by passing in a keyword argument to the BeautifulSoup constructor. + soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None) + assert " a class " == soup.a["class"] + + # Here are two ways of saying that `id` is a multi-valued + # attribute in this context, but 'class' is not. + for switcheroo in ({"*": "id"}, {"a": "id"}): + with warnings.catch_warnings(record=True): + # This will create a warning about not explicitly + # specifying a parser, but we'll ignore it. 
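Stripped of the test scaffolding around it, the multi-valued attribute behavior this test exercises (and continues to exercise just below) comes down to the following sketch:

from bs4 import BeautifulSoup

markup = '<a id=" an id " class=" a class "></a>'

# By default 'class' is split on whitespace, while 'id' is kept verbatim.
soup = BeautifulSoup(markup, "html.parser")
assert soup.a["class"] == ["a", "class"]
assert soup.a["id"] == " an id "

# Passing multi_valued_attributes=None turns the splitting off entirely.
soup = BeautifulSoup(markup, "html.parser", multi_valued_attributes=None)
assert soup.a["class"] == " a class "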
+ soup = self.soup( + markup, builder=None, multi_valued_attributes=switcheroo + ) + a = soup.a + assert ["an", "id"] == a["id"] + assert " a class " == a["class"] + + def test_replacement_classes(self): + # Test the ability to pass in replacements for element classes + # which will be used when building the tree. + class TagPlus(Tag): + pass + + class StringPlus(NavigableString): + pass + + class CommentPlus(Comment): + pass + + soup = self.soup( + "<a><b>foo</b>bar</a><!--whee-->", + element_classes={ + Tag: TagPlus, + NavigableString: StringPlus, + Comment: CommentPlus, + }, + ) + + # The tree was built with TagPlus, StringPlus, and CommentPlus objects, + # rather than Tag, String, and Comment objects. + assert all( + isinstance(x, (TagPlus, StringPlus, CommentPlus)) for x in soup.descendants + ) + + def test_alternate_string_containers(self): + # Test the ability to customize the string containers for + # different types of tags. + class PString(NavigableString): + pass + + class BString(NavigableString): + pass + + soup = self.soup( + "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text", + string_containers={ + "b": BString, + "p": PString, + }, + ) + + # The string before the <p> tag is a regular NavigableString. + assert isinstance(soup.div.contents[0], NavigableString) + + # The string inside the <p> tag, but not inside the <i> tag, + # is a PString. + assert isinstance(soup.p.contents[0], PString) + + # Every string inside the <b> tag is a BString, even the one that + # was also inside an <i> tag. + for s in soup.b.strings: + assert isinstance(s, BString) + + # Now that parsing was complete, the string_container_stack + # (where this information was kept) has been cleared out. + assert [] == soup.string_container_stack + + @pytest.mark.parametrize("bad_markup", [1, False, lambda x: False]) + def test_invalid_markup_type(self, bad_markup): + with pytest.raises(TypeError) as exc_info: + BeautifulSoup(bad_markup, "html.parser") + assert ( + f"Incoming markup is of an invalid type: {bad_markup!r}. Markup must be a string, a bytestring, or an open filehandle." + in str(exc_info.value) + ) + + +class TestOutput(SoupTest): + @pytest.mark.parametrize( + "eventual_encoding,actual_encoding", + [ + ("utf-8", "utf-8"), + ("utf-16", "utf-16"), + ], + ) + def test_decode_xml_declaration(self, eventual_encoding, actual_encoding): + # Most of the time, calling decode() on an XML document will + # give you a document declaration that mentions the encoding + # you intend to use when encoding the document as a + # bytestring. + soup = self.soup("<tag></tag>") + soup.is_xml = True + assert ( + f'<?xml version="1.0" encoding="{actual_encoding}"?>\n<tag></tag>' + == soup.decode(eventual_encoding=eventual_encoding) + ) + + @pytest.mark.parametrize( + "eventual_encoding", [x for x in PYTHON_SPECIFIC_ENCODINGS] + [None] + ) + def test_decode_xml_declaration_with_missing_or_python_internal_eventual_encoding( + self, eventual_encoding + ): + # But if you pass a Python internal encoding into decode(), or + # omit the eventual_encoding altogether, the document + # declaration won't mention any particular encoding. + soup = BeautifulSoup("<tag></tag>", "html.parser") + soup.is_xml = True + assert '<?xml version="1.0"?>\n<tag></tag>' == soup.decode( + eventual_encoding=eventual_encoding + ) + + def test(self): + # BeautifulSoup subclasses Tag and extends the decode() method. + # Make sure the other Tag methods which call decode() call + # it correctly. 
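The string_containers mechanism tested above maps tag names to NavigableString subclasses; every string parsed inside such a tag is built with the given class. A sketch using a hypothetical PString subclass:

from bs4 import BeautifulSoup
from bs4.element import NavigableString

class PString(NavigableString):
    pass

soup = BeautifulSoup(
    "<p>inside</p>outside",
    "html.parser",
    string_containers={"p": PString},
)
assert isinstance(soup.p.string, PString)          # inside <p>: PString
assert not isinstance(soup.contents[-1], PString)  # outside: plain NavigableString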
+        soup = self.soup("<tag></tag>")
+        assert b"<tag></tag>" == soup.encode(encoding="utf-8")
+        assert b"<tag></tag>" == soup.encode_contents(encoding="utf-8")
+        assert "<tag></tag>" == soup.decode_contents()
+        assert "<tag>\n</tag>\n" == soup.prettify()
+
+
+class TestWarnings(SoupTest):
+    # Note that some of the tests in this class create BeautifulSoup
+    # objects directly rather than using self.soup(). That's
+    # because SoupTest.soup is defined in a different file,
+    # which will throw off the assertion in _assert_warning
+    # that the code that triggered the warning is in the same
+    # file as the test.
+
+    def _assert_warning(
+        self, warnings: Iterable[warnings.WarningMessage], cls: Type[Warning]
+    ) -> warnings.WarningMessage:
+        for w in warnings:
+            if isinstance(w.message, cls):
+                assert w.filename == __file__
+                return w
+        raise Exception("%s warning not found in %r" % (cls, warnings))
+
+    def _assert_no_parser_specified(self, w: Iterable[warnings.WarningMessage]) -> None:
+        warning = self._assert_warning(w, GuessedAtParserWarning)
+        message = str(warning.message)
+        assert message.startswith(GuessedAtParserWarning.MESSAGE[:60])
+
+    def test_warning_if_no_parser_specified(self):
+        with warnings.catch_warnings(record=True) as w:
+            BeautifulSoup("<a><b></b></a>")
+        self._assert_no_parser_specified(w)
+
+    def test_warning_if_parser_specified_too_vague(self):
+        with warnings.catch_warnings(record=True) as w:
+            BeautifulSoup("<a><b></b></a>", "html")
+        self._assert_no_parser_specified(w)
+
+    def test_no_warning_if_explicit_parser_specified(self):
+        with warnings.catch_warnings(record=True) as w:
+            self.soup("<a><b></b></a>")
+        assert [] == w
+
+    def test_warning_if_strainer_filters_everything(self):
+        strainer = SoupStrainer(name="a", string="b")
+        with warnings.catch_warnings(record=True) as w:
+            self.soup("<a><b></b></a>", parse_only=strainer)
+        warning = self._assert_warning(w, UserWarning)
+        msg = str(warning.message)
+        assert msg.startswith("The given value for parse_only will exclude everything:")
+
+    def test_parseOnlyThese_renamed_to_parse_only(self):
+        with warnings.catch_warnings(record=True) as w:
+            soup = BeautifulSoup(
+                "<a><b></b></a>",
+                "html.parser",
+                parseOnlyThese=SoupStrainer("b"),
+            )
+        warning = self._assert_warning(w, DeprecationWarning)
+        msg = str(warning.message)
+        assert "parseOnlyThese" in msg
+        assert "parse_only" in msg
+        assert b"<b></b>" == soup.encode()
+
+    def test_fromEncoding_renamed_to_from_encoding(self):
+        with warnings.catch_warnings(record=True) as w:
+            utf8 = b"\xc3\xa9"
+            soup = BeautifulSoup(utf8, "html.parser", fromEncoding="utf8")
+        warning = self._assert_warning(w, DeprecationWarning)
+        msg = str(warning.message)
+        assert "fromEncoding" in msg
+        assert "from_encoding" in msg
+        assert "utf8" == soup.original_encoding
+
+    def test_unrecognized_keyword_argument(self):
+        with pytest.raises(TypeError):
+            self.soup("<a>", no_such_argument=True)
+
+    @pytest.mark.parametrize(
+        "markup",
+        [
+            "markup.html",
+            "markup.htm",
+            "markup.HTML",
+            "markup.txt",
+            "markup.xhtml",
+            "markup.xml",
+            "/home/user/file.txt",
+            r"c:\user\file.html",
+            r"\\server\share\path\file.XhTml",
+        ],
+    )
+    def test_resembles_filename_warning(self, markup):
+        # A warning is issued if the "markup" looks like the name of
+        # an HTML or text file, or a full path to a file on disk.
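The parser-guessing warning checked above is easy to trigger and just as easy to avoid: naming a parser explicitly silences GuessedAtParserWarning. A minimal sketch:

import warnings

from bs4 import BeautifulSoup, GuessedAtParserWarning

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    BeautifulSoup("<a><b></b></a>")                 # parser guessed: warns
    BeautifulSoup("<a><b></b></a>", "html.parser")  # parser named: silent

assert any(isinstance(w.message, GuessedAtParserWarning) for w in caught)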
+ with warnings.catch_warnings(record=True) as w: + BeautifulSoup(markup, "html.parser") + warning = self._assert_warning(w, MarkupResemblesLocatorWarning) + assert "looks more like a filename" in str(warning.message) + + @pytest.mark.parametrize( + "markup", + [ + "filename", + "markuphtml", + "markup.com", + "", + # Excluded due to an irrelevant file extension. + "markup.js", + "markup.jpg", + "markup.markup", + # Excluded due to the lack of any file extension. + "/home/user/file", + r"c:\user\file.html" r"\\server\share\path\file", + # Excluded because of two consecutive slashes _and_ the + # colon. + "log message containing a url http://www.url.com/ right there.html", + # Excluded for containing various characters or combinations + # not usually found in filenames. + "two consecutive spaces.html", + "two//consecutive//slashes.html", + "looks/like/a/filename/but/oops/theres/a#comment.html", + "two\nlines.html", + "contains?.html", + "contains*.html", + "contains#.html", + "contains&.html", + "contains;.html", + "contains>.html", + "contains<.html", + "contains$.html", + "contains|.html", + "contains:.html", + ":-at-the-front.html", + ], + ) + def test_resembles_filename_no_warning(self, markup): + # The 'looks more like a filename' warning is not issued if + # the markup looks like a bare string, a domain name, or a + # file that's not an HTML file. + with warnings.catch_warnings(record=True) as w: + self.soup(markup) + assert [] == w + + def test_url_warning_with_bytes_url(self): + url = b"http://www.crummybytes.com/" + with warnings.catch_warnings(record=True) as warning_list: + BeautifulSoup(url, "html.parser") + warning = self._assert_warning(warning_list, MarkupResemblesLocatorWarning) + assert "looks more like a URL" in str(warning.message) + assert url not in str(warning.message).encode("utf8") + + def test_url_warning_with_unicode_url(self): + url = "http://www.crummyunicode.com/" + with warnings.catch_warnings(record=True) as warning_list: + # note - this url must differ from the bytes one otherwise + # python's warnings system swallows the second warning + BeautifulSoup(url, "html.parser") + warning = self._assert_warning(warning_list, MarkupResemblesLocatorWarning) + assert "looks more like a URL" in str(warning.message) + assert url not in str(warning.message) + + def test_url_warning_with_bytes_and_space(self): + # Here the markup contains something besides a URL, so no warning + # is issued. 
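MarkupResemblesLocatorWarning, exercised above, fires when the markup argument looks like something you meant to open rather than parse. If the input really is a file, pass an open filehandle instead of its name; filehandles are accepted directly and never trigger the warning. A sketch (page.html is a hypothetical local file):

from bs4 import BeautifulSoup

with open("page.html") as fh:  # hypothetical file on disk
    soup = BeautifulSoup(fh, "html.parser")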
+        with warnings.catch_warnings(record=True) as warning_list:
+            self.soup(b"http://www.crummybytes.com/ is great")
+        assert not any("looks more like a URL" in str(w.message) for w in warning_list)
+
+    def test_url_warning_with_unicode_and_space(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            self.soup("http://www.crummyunicode.com/ is great")
+        assert not any("looks more like a URL" in str(w.message) for w in warning_list)
+
+
+class TestSelectiveParsing(SoupTest):
+    def test_parse_with_soupstrainer(self):
+        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
+        strainer = SoupStrainer("b")
+        soup = self.soup(markup, parse_only=strainer)
+        assert soup.encode() == b"<b>Yes</b><b>Yes <c>Yes</c></b>"
+
+
+class TestNewTag(SoupTest):
+    """Test the BeautifulSoup.new_tag() method."""
+
+    def test_new_tag(self):
+        soup = self.soup("")
+        new_tag = soup.new_tag("foo", string="txt", bar="baz", attrs={"name": "a name"})
+        assert isinstance(new_tag, Tag)
+        assert "foo" == new_tag.name
+        assert new_tag.string == "txt"
+        assert dict(bar="baz", name="a name") == new_tag.attrs
+        assert None is new_tag.parent
+
+        # string can be null
+        new_tag = soup.new_tag("foo")
+        assert None is new_tag.string
+        new_tag = soup.new_tag("foo", string=None)
+        assert None is new_tag.string
+
+        # Or the empty string
+        new_tag = soup.new_tag("foo", string="")
+        assert "" == new_tag.string
+
+    @pytest.mark.skipif(
+        not LXML_PRESENT, reason="lxml not installed, cannot parse XML document"
+    )
+    def test_xml_tag_inherits_self_closing_rules_from_builder(self):
+        xml_soup = BeautifulSoup("", "xml")
+        xml_br = xml_soup.new_tag("br")
+        xml_p = xml_soup.new_tag("p")
+
+        # Both the <br> and <p> tags are empty-element, just because
+        # they have no contents.
+        assert b"<br/>" == xml_br.encode()
+        assert b"<p/>" == xml_p.encode()
+
+    def test_tag_inherits_self_closing_rules_from_builder(self):
+        html_soup = BeautifulSoup("", "html.parser")
+        html_br = html_soup.new_tag("br")
+        html_p = html_soup.new_tag("p")
+
+        # The HTML builder uses HTML's rules about which tags are
+        # empty-element tags, and the new tags reflect these rules.
+        assert b"<br/>" == html_br.encode()
+        assert b"<p></p>" == html_p.encode()
+
+
+class TestNewString(SoupTest):
+    """Test the BeautifulSoup.new_string() method."""
+
+    def test_new_string_creates_navigablestring(self):
+        soup = self.soup("")
+        s = soup.new_string("foo")
+        assert "foo" == s
+        assert isinstance(s, NavigableString)
+
+    def test_new_string_can_create_navigablestring_subclass(self):
+        soup = self.soup("")
+        s = soup.new_string("foo", Comment)
+        assert "foo" == s
+        assert isinstance(s, Comment)
+
+
+class TestPickle(SoupTest):
+    # Test our ability to pickle the BeautifulSoup object itself.
+
+    def test_normal_pickle(self):
+        soup = self.soup("<a>some markup</a>")
+        pickled = pickle.dumps(soup)
+        unpickled = pickle.loads(pickled)
+        assert "some markup" == unpickled.a.string
+
+    def test_pickle_with_no_builder(self):
+        # We had a bug that prevented pickling from working if
+        # the builder wasn't set.
+        soup = self.soup("some markup")
+        soup.builder = None
+        pickled = pickle.dumps(soup)
+        unpickled = pickle.loads(pickled)
+        assert "some markup" == unpickled.string
+
+
+class TestEncodingConversion(SoupTest):
+    # Test Beautiful Soup's ability to decode and encode from various
+    # encodings.
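Pickling, covered by TestPickle above, round-trips the whole tree; as the earlier builder test showed, an unpicklable TreeBuilder is simply replaced by a fresh one of the right class on load. Sketch:

import pickle

from bs4 import BeautifulSoup

soup = BeautifulSoup("<a>some markup</a>", "html.parser")
restored = pickle.loads(pickle.dumps(soup))
assert restored.a.string == "some markup"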
+ + def setup_method(self): + self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>' + self.utf8_data = self.unicode_data.encode("utf-8") + # Just so you know what it looks like. + assert ( + self.utf8_data + == b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>' + ) + + def test_ascii_in_unicode_out(self): + # ASCII input is converted to Unicode. The original_encoding + # attribute is set to 'utf-8', a superset of ASCII. + chardet = dammit._chardet_dammit + logging.disable(logging.WARNING) + try: + + def noop(str): + return None + + # Disable chardet, which will realize that the ASCII is ASCII. + dammit._chardet_dammit = noop + ascii = b"<foo>a</foo>" + soup_from_ascii = self.soup(ascii) + unicode_output = soup_from_ascii.decode() + assert isinstance(unicode_output, str) + assert unicode_output == self.document_for(ascii.decode()) + assert soup_from_ascii.original_encoding.lower() == "utf-8" + finally: + logging.disable(logging.NOTSET) + dammit._chardet_dammit = chardet + + def test_unicode_in_unicode_out(self): + # Unicode input is left alone. The original_encoding attribute + # is not set. + soup_from_unicode = self.soup(self.unicode_data) + assert soup_from_unicode.decode() == self.unicode_data + assert soup_from_unicode.foo.string == "Sacr\xe9 bleu!" + assert soup_from_unicode.original_encoding is None + + def test_utf8_in_unicode_out(self): + # UTF-8 input is converted to Unicode. The original_encoding + # attribute is set. + soup_from_utf8 = self.soup(self.utf8_data) + assert soup_from_utf8.decode() == self.unicode_data + assert soup_from_utf8.foo.string == "Sacr\xe9 bleu!" + + def test_utf8_out(self): + # The internal data structures can be encoded as UTF-8. + soup_from_unicode = self.soup(self.unicode_data) + assert soup_from_unicode.encode("utf-8") == self.utf8_data diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_tag.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_tag.py new file mode 100644 index 00000000..b83e829b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_tag.py @@ -0,0 +1,241 @@ +import warnings +from bs4.element import ( + Comment, + NavigableString, +) +from . import SoupTest + + +class TestTag(SoupTest): + """Test various methods of Tag which aren't so complicated they + need their own classes. + """ + + def test__should_pretty_print(self): + # Test the rules about when a tag should be pretty-printed. + tag = self.soup("").new_tag("a_tag") + + # No list of whitespace-preserving tags -> pretty-print + tag._preserve_whitespace_tags = None + assert True is tag._should_pretty_print(0) + + # List exists but tag is not on the list -> pretty-print + tag.preserve_whitespace_tags = ["some_other_tag"] + assert True is tag._should_pretty_print(1) + + # Indent level is None -> don't pretty-print + assert False is tag._should_pretty_print(None) + + # Tag is on the whitespace-preserving list -> don't pretty-print + tag.preserve_whitespace_tags = ["some_other_tag", "a_tag"] + assert False is tag._should_pretty_print(1) + + def test_len(self): + """The length of a Tag is its number of children.""" + soup = self.soup("<top>1<b>2</b>3</top>") + + # The BeautifulSoup object itself contains one element: the + # <top> tag. + assert len(soup.contents) == 1 + assert len(soup) == 1 + + # The <top> tag contains three elements: the text node "1", the + # <b> tag, and the text node "3". 
+        assert len(soup.top) == 3
+        assert len(soup.top.contents) == 3
+
+    def test_member_access_invokes_find(self):
+        """Accessing a Python member .foo invokes find('foo')"""
+        soup = self.soup("<b><i></i></b>")
+        assert soup.b == soup.find("b")
+        assert soup.b.i == soup.find("b").find("i")
+        assert soup.a is None
+
+    def test_deprecated_member_access(self):
+        soup = self.soup("<b><i></i></b>")
+        with warnings.catch_warnings(record=True) as w:
+            tag = soup.bTag
+        assert soup.b == tag
+        assert (
+            '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")'
+            == str(w[0].message)
+        )
+
+    def test_has_attr(self):
+        """has_attr() checks for the presence of an attribute.
+
+        Please note: has_attr() is different from
+        __in__. has_attr() checks the tag's attributes and __in__
+        checks the tag's children.
+        """
+        soup = self.soup("<foo attr='bar'>")
+        assert soup.foo.has_attr("attr")
+        assert not soup.foo.has_attr("attr2")
+
+    def test_attributes_come_out_in_alphabetical_order(self):
+        markup = '<b a="1" z="5" m="3" f="2" y="4"></b>'
+        self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>')
+
+    def test_string(self):
+        # A Tag that contains only a text node makes that node
+        # available as .string.
+        soup = self.soup("<b>foo</b>")
+        assert soup.b.string == "foo"
+
+    def test_empty_tag_has_no_string(self):
+        # A Tag with no children has no .string.
+        soup = self.soup("<b></b>")
+        assert soup.b.string is None
+
+    def test_tag_with_multiple_children_has_no_string(self):
+        # A Tag with multiple children has no .string.
+        soup = self.soup("<a>foo<b></b><b></b></b>")
+        assert soup.b.string is None
+
+        soup = self.soup("<a>foo<b></b>bar</b>")
+        assert soup.b.string is None
+
+        # Even if all the children are strings, due to trickery,
+        # it won't work--but this would be a good optimization.
+        soup = self.soup("<a>foo</b>")
+        soup.a.insert(1, "bar")
+        assert soup.a.string is None
+
+    def test_tag_with_recursive_string_has_string(self):
+        # A Tag with a single child which has a .string inherits that
+        # .string.
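The .string rules these tests pin down (including the recursive case that continues just below): a tag whose only child is a string exposes it, a tag whose single child itself has a .string looks through to it, and anything else yields None. Sketch:

from bs4 import BeautifulSoup

assert BeautifulSoup("<b>one</b>", "html.parser").b.string == "one"
assert BeautifulSoup("<b>f<i>e</i>o</b>", "html.parser").b.string is None
# A single tag child with its own .string is looked through:
assert BeautifulSoup("<a><b>foo</b></a>", "html.parser").a.string == "foo"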
+ soup = self.soup("<a><b>foo</b></a>") + assert soup.a.string == "foo" + assert soup.string == "foo" + + def test_lack_of_string(self): + """Only a Tag containing a single text node has a .string.""" + soup = self.soup("<b>f<i>e</i>o</b>") + assert soup.b.string is None + + soup = self.soup("<b></b>") + assert soup.b.string is None + + def test_all_text(self): + """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated""" + soup = self.soup("<a>a<b>r</b> <r> t </r></a>") + assert soup.a.text == "ar t " + assert soup.a.get_text(strip=True) == "art" + assert soup.a.get_text(",") == "a,r, , t " + assert soup.a.get_text(",", strip=True) == "a,r,t" + + def test_get_text_ignores_special_string_containers(self): + soup = self.soup("foo<!--IGNORE-->bar") + assert soup.get_text() == "foobar" + + assert soup.get_text(types=(NavigableString, Comment)) == "fooIGNOREbar" + assert soup.get_text(types=None) == "fooIGNOREbar" + + soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar") + assert soup.get_text() == "foobar" + + def test_all_strings_ignores_special_string_containers(self): + soup = self.soup("foo<!--IGNORE-->bar") + assert ["foo", "bar"] == list(soup.strings) + + soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar") + assert ["foo", "bar"] == list(soup.strings) + + def test_string_methods_inside_special_string_container_tags(self): + # Strings inside tags like <script> are generally ignored by + # methods like get_text, because they're not what humans + # consider 'text'. But if you call get_text on the <script> + # tag itself, those strings _are_ considered to be 'text', + # because there's nothing else you might be looking for. + + style = self.soup("<div>a<style>Some CSS</style></div>") + template = self.soup( + "<div>a<template><p>Templated <b>text</b>.</p><!--With a comment.--></template></div>" + ) + script = self.soup("<div>a<script><!--a comment-->Some text</script></div>") + + assert style.div.get_text() == "a" + assert list(style.div.strings) == ["a"] + assert style.div.style.get_text() == "Some CSS" + assert list(style.div.style.strings) == ["Some CSS"] + + # The comment is not picked up here. That's because it was + # parsed into a Comment object, which is not considered + # interesting by template.strings. + assert template.div.get_text() == "a" + assert list(template.div.strings) == ["a"] + assert template.div.template.get_text() == "Templated text." + assert list(template.div.template.strings) == ["Templated ", "text", "."] + + # The comment is included here, because it didn't get parsed + # into a Comment object--it's part of the Script string. + assert script.div.get_text() == "a" + assert list(script.div.strings) == ["a"] + assert script.div.script.get_text() == "<!--a comment-->Some text" + assert list(script.div.script.strings) == ["<!--a comment-->Some text"] + + +class TestMultiValuedAttributes(SoupTest): + """Test the behavior of multi-valued attributes like 'class'. + + The values of such attributes are always presented as lists. 
+ """ + + def test_single_value_becomes_list(self): + soup = self.soup("<a class='foo'>") + assert ["foo"] == soup.a["class"] + + def test_multiple_values_becomes_list(self): + soup = self.soup("<a class='foo bar'>") + assert ["foo", "bar"] == soup.a["class"] + + def test_multiple_values_separated_by_weird_whitespace(self): + soup = self.soup("<a class='foo\tbar\nbaz'>") + assert ["foo", "bar", "baz"] == soup.a["class"] + + def test_attributes_joined_into_string_on_output(self): + soup = self.soup("<a class='foo\tbar'>") + assert b'<a class="foo bar"></a>' == soup.a.encode() + + def test_get_attribute_list(self): + soup = self.soup("<a id='abc def'>") + assert ["abc def"] == soup.a.get_attribute_list("id") + assert [] == soup.a.get_attribute_list("no such attribute") + + def test_accept_charset(self): + soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">') + assert ["ISO-8859-1", "UTF-8"] == soup.form["accept-charset"] + + def test_cdata_attribute_applying_only_to_one_tag(self): + data = '<a accept-charset="ISO-8859-1 UTF-8"></a>' + soup = self.soup(data) + # We saw in another test that accept-charset is a cdata-list + # attribute for the <form> tag. But it's not a cdata-list + # attribute for any other tag. + assert "ISO-8859-1 UTF-8" == soup.a["accept-charset"] + + def test_customization(self): + # It's possible to change which attributes of which tags + # are treated as multi-valued attributes. + # + # Here, 'id' is a multi-valued attribute and 'class' is not. + # + # TODO: This code is in the builder and should be tested there. + soup = self.soup( + '<a class="foo" id="bar">', multi_valued_attributes={"*": "id"} + ) + assert soup.a["class"] == "foo" + assert soup.a["id"] == ["bar"] + + def test_hidden_tag_is_invisible(self): + # Setting .hidden on a tag makes it invisible in output, but + # leaves its contents visible. + # + # This is not a documented or supported feature of Beautiful + # Soup (e.g. NavigableString doesn't support .hidden even + # though it could), but some people use it and it's not + # hurting anything to verify that it keeps working. + # + soup = self.soup('<div id="1"><span id="2">a string</span></div>') + soup.span.hidden = True + assert '<div id="1">a string</div>' == str(soup.div) diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_tree.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_tree.py new file mode 100644 index 00000000..06d62981 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_tree.py @@ -0,0 +1,1452 @@ +# -*- coding: utf-8 -*- +"""Tests for Beautiful Soup's tree traversal methods. + +The tree traversal methods are the main advantage of using Beautiful +Soup over just using a parser. + +Different parsers will build different Beautiful Soup trees given the +same markup, but all Beautiful Soup trees can be traversed with the +methods tested here. +""" + +import pytest +import re +import warnings +from bs4 import BeautifulSoup +from bs4.builder import builder_registry +from bs4.element import ( + AttributeResemblesVariableWarning, + CData, + Comment, + NavigableString, + Tag, +) +from bs4.filter import SoupStrainer +from . import ( + SoupTest, +) + + +class TestFind(SoupTest): + """Basic tests of the find() method. 
+    """
+
+    def test_find_tag(self):
+        soup = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>")
+        assert soup.find("b").string == "2"
+
+    def test_unicode_text_find(self):
+        soup = self.soup("<h1>Räksmörgås</h1>")
+        assert soup.find(string="Räksmörgås") == "Räksmörgås"
+
+    def test_unicode_attribute_find(self):
+        soup = self.soup('<h1 id="Räksmörgås">here it is</h1>')
+        str(soup)
+        assert "here it is" == soup.find(id="Räksmörgås").text
+
+    def test_find_everything(self):
+        """Test an optimization that finds all tags."""
+        soup = self.soup("<a>foo</a><b>bar</b>")
+        assert 2 == len(soup.find_all())
+
+    def test_find_everything_with_name(self):
+        """Test an optimization that finds all tags with a given name."""
+        soup = self.soup("<a>foo</a><b>bar</b><a>baz</a>")
+        assert 2 == len(soup.find_all("a"))
+
+    def test_find_with_no_arguments(self):
+        soup = self.soup("<div></div><p></p>")
+        assert "div" == soup.find().name
+        assert "div" == soup.find("p").find_previous_sibling().name
+        assert "p" == soup.find("div").find_next_sibling().name
+
+    def test_find_with_no_arguments_only_finds_tags(self):
+        soup = self.soup("text<div>text</div>text<p>text</p>")
+        assert "div" == soup.find().name
+        assert "div" == soup.find("p").find_previous_sibling().name
+        assert "p" == soup.find("div").find_next_sibling().name
+
+
+class TestFindAll(SoupTest):
+    """Basic tests of the find_all() method."""
+
+    def test_find_all_with_no_arguments_only_finds_tags(self):
+        soup = self.soup("<body>text<div>text</div>text<p>text</p></body>")
+        assert 2 == len(soup.body.find_all())
+        assert 1 == len(soup.find("p").find_previous_siblings())
+        assert 1 == len(soup.find("div").find_next_siblings())
+
+    def test_find_all_text_nodes(self):
+        """You can search the tree for text nodes."""
+        soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
+        # Exact match.
+        assert soup.find_all(string="bar") == ["bar"]
+
+        # Match any of a number of strings.
+        assert soup.find_all(string=["Foo", "bar"]) == ["Foo", "bar"]
+        # Match a regular expression.
+        assert soup.find_all(string=re.compile(".*")) == ["Foo", "bar", "\xbb"]
+        # Match anything.
+        assert soup.find_all(string=True) == ["Foo", "bar", "\xbb"]
+
+    def test_find_all_limit(self):
+        """You can limit the number of items returned by find_all."""
+        soup = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>")
+        self.assert_selects(soup.find_all("a", limit=3), ["1", "2", "3"])
+        self.assert_selects(soup.find_all("a", limit=1), ["1"])
+        self.assert_selects(soup.find_all("a", limit=10), ["1", "2", "3", "4", "5"])
+
+        # A limit of 0 means no limit.
+        self.assert_selects(soup.find_all("a", limit=0), ["1", "2", "3", "4", "5"])
+
+    def test_calling_a_tag_is_calling_findall(self):
+        soup = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>")
+        self.assert_selects(soup("a", limit=1), ["1"])
+        self.assert_selects(soup.b(id="foo"), ["3"])
+
+    def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(
+        self,
+    ):
+        soup = self.soup("<a></a>")
+        # Create a self-referential list.
+        selfref = []
+        selfref.append(selfref)
+
+        # Without special code in SoupStrainer, this would cause infinite
+        # recursion.
+        with warnings.catch_warnings(record=True) as w:
+            assert [] == soup.find_all(selfref)
+        [warning] = w
+        assert warning.filename == __file__
+        msg = str(warning.message)
+        assert (
+            msg
+            == "Ignoring nested list [[...]] to avoid the possibility of infinite recursion."
+ ) + + def test_find_all_resultset(self): + """All find_all calls return a ResultSet""" + soup = self.soup("<a></a>") + result = soup.find_all("a") + assert hasattr(result, "source") + + result = soup.find_all(True) + assert hasattr(result, "source") + + result = soup.find_all(string="foo") + assert hasattr(result, "source") + + +class TestFindAllBasicNamespaces(SoupTest): + def test_find_by_namespaced_name(self): + soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">') + assert "4" == soup.find("mathml:msqrt").string + assert "a" == soup.find(attrs={"svg:fill": "red"}).name + + +class TestFindAllByName(SoupTest): + """Test ways of finding tags by tag name.""" + + def setup_method(self) -> None: + self.tree = self.soup("""<a>First tag.</a> + <b>Second tag.</b> + <c>Third <a>Nested tag.</a> tag.</c>""") + + def test_find_all_by_tag_name(self): + # Find all the <a> tags. + self.assert_selects(self.tree.find_all("a"), ["First tag.", "Nested tag."]) + + def test_find_all_by_name_and_text(self): + self.assert_selects( + self.tree.find_all("a", string="First tag."), ["First tag."] + ) + + self.assert_selects( + self.tree.find_all("a", string=True), ["First tag.", "Nested tag."] + ) + + self.assert_selects( + self.tree.find_all("a", string=re.compile("tag")), + ["First tag.", "Nested tag."], + ) + + def test_find_all_on_non_root_element(self): + # You can call find_all on any node, not just the root. + self.assert_selects(self.tree.c.find_all("a"), ["Nested tag."]) + + def test_calling_element_invokes_find_all(self): + self.assert_selects(self.tree("a"), ["First tag.", "Nested tag."]) + + def test_find_all_by_tag_strainer(self): + self.assert_selects( + self.tree.find_all(SoupStrainer("a")), ["First tag.", "Nested tag."] + ) + + def test_find_all_by_tag_names(self): + self.assert_selects( + self.tree.find_all(["a", "b"]), ["First tag.", "Second tag.", "Nested tag."] + ) + + def test_find_all_by_tag_dict(self): + self.assert_selects( + self.tree.find_all({"a": True, "b": True}), + ["First tag.", "Second tag.", "Nested tag."], + ) + + def test_find_all_by_tag_re(self): + self.assert_selects( + self.tree.find_all(re.compile("^[ab]$")), + ["First tag.", "Second tag.", "Nested tag."], + ) + + def test_find_all_with_tags_matching_method(self): + # You can define an oracle method that determines whether + # a tag matches the search. + def id_matches_name(tag): + return tag.name == tag.get("id") + + tree = self.soup("""<a id="a">Match 1.</a> + <a id="1">Does not match.</a> + <b id="b">Match 2.</a>""") + + self.assert_selects(tree.find_all(id_matches_name), ["Match 1.", "Match 2."]) + + def test_find_with_multi_valued_attribute(self): + soup = self.soup( + "<div class='a b'>1</div><div class='a c'>2</div><div class='a d'>3</div>" + ) + r1 = soup.find("div", "a d") + r2 = soup.find("div", re.compile(r"a d")) + r3, r4 = soup.find_all("div", ["a b", "a d"]) + assert "3" == r1.string + assert "3" == r2.string + assert "1" == r3.string + assert "3" == r4.string + + +class TestFindAllByAttribute(SoupTest): + def test_find_all_by_attribute_name(self): + # You can pass in keyword arguments to find_all to search by + # attribute. + tree = self.soup(""" + <a id="first">Matching a.</a> + <a id="second"> + Non-matching <b id="first">Matching b.</b>a. 
+        </a>""")
+        self.assert_selects(tree.find_all(id="first"), ["Matching a.", "Matching b."])
+
+    def test_find_all_by_utf8_attribute_value(self):
+        peace = "םולש".encode("utf8")
+        data = '<a title="םולש"></a>'.encode("utf8")
+        soup = self.soup(data)
+        assert [soup.a] == soup.find_all(title=peace)
+        assert [soup.a] == soup.find_all(title=peace.decode("utf8"))
+        assert [soup.a] == soup.find_all(title=[peace, "something else"])
+
+    def test_find_all_by_attribute_dict(self):
+        # You can pass in a dictionary as the argument 'attrs'. This
+        # lets you search for attributes like 'name' (a fixed argument
+        # to find_all) and 'class' (a reserved word in Python.)
+        tree = self.soup("""
+                         <a name="name1" class="class1">Name match.</a>
+                         <a name="name2" class="class2">Class match.</a>
+                         <a name="name3" class="class3">Non-match.</a>
+                         <name1>A tag called 'name1'.</name1>
+                         """)
+
+        # This doesn't do what you want.
+        self.assert_selects(tree.find_all(name="name1"), ["A tag called 'name1'."])
+        # This does what you want.
+        self.assert_selects(tree.find_all(attrs={"name": "name1"}), ["Name match."])
+
+        self.assert_selects(tree.find_all(attrs={"class": "class2"}), ["Class match."])
+
+    def test_find_all_by_class(self):
+        tree = self.soup("""
+                         <a class="1">Class 1.</a>
+                         <a class="2">Class 2.</a>
+                         <b class="1">Class 1.</b>
+                         <c class="3 4">Class 3 and 4.</c>
+                         """)
+
+        # Passing in the class_ keyword argument will search against
+        # the 'class' attribute.
+        self.assert_selects(tree.find_all("a", class_="1"), ["Class 1."])
+        self.assert_selects(tree.find_all("c", class_="3"), ["Class 3 and 4."])
+        self.assert_selects(tree.find_all("c", class_="4"), ["Class 3 and 4."])
+
+        # Passing in a string to 'attrs' will also search the CSS class.
+        self.assert_selects(tree.find_all("a", "1"), ["Class 1."])
+        self.assert_selects(tree.find_all(attrs="1"), ["Class 1.", "Class 1."])
+        self.assert_selects(tree.find_all("c", "3"), ["Class 3 and 4."])
+        self.assert_selects(tree.find_all("c", "4"), ["Class 3 and 4."])
+
+    def test_find_by_class_when_multiple_classes_present(self):
+        tree = self.soup("<gar class='foo bar'>Found it</gar>")
+
+        f = tree.find_all("gar", class_=re.compile("o"))
+        self.assert_selects(f, ["Found it"])
+
+        f = tree.find_all("gar", class_=re.compile("a"))
+        self.assert_selects(f, ["Found it"])
+
+        # If the search fails to match the individual strings "foo" and "bar",
+        # it will be tried against the combined string "foo bar".
+        f = tree.find_all("gar", class_=re.compile("o b"))
+        self.assert_selects(f, ["Found it"])
+
+    def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
+        soup = self.soup("<a class='bar'>Found it</a>")
+
+        self.assert_selects(soup.find_all("a", re.compile("ba")), ["Found it"])
+
+        def big_attribute_value(value):
+            return len(value) > 3
+
+        self.assert_selects(soup.find_all("a", big_attribute_value), [])
+
+        def small_attribute_value(value):
+            return len(value) <= 3
+
+        self.assert_selects(soup.find_all("a", small_attribute_value), ["Found it"])
+
+    def test_find_all_with_string_for_attrs_finds_multiple_classes(self):
+        soup = self.soup('<a class="foo bar"></a><a class="foo"></a>')
+        a, a2 = soup.find_all("a")
+        assert [a, a2] == soup.find_all("a", "foo")
+        assert [a] == soup.find_all("a", "bar")
+
+        # If you specify the class as a string that contains a
+        # space, only that specific value will be found.
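The class-matching rules summarized in the comment above (and asserted immediately below) are: a single class_ value matches any element whose class list contains it, while a value containing whitespace must match the full class attribute string exactly, in order. Sketch:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<p class="foo bar"></p>', "html.parser")

assert soup.find_all("p", class_="foo")          # one value: matches
assert soup.find_all("p", class_="foo bar")      # exact full string: matches
assert not soup.find_all("p", class_="bar foo")  # different order: no match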
+ assert [a] == soup.find_all("a", class_="foo bar") + assert [a] == soup.find_all("a", "foo bar") + assert [] == soup.find_all("a", "bar foo") + + def test_find_all_by_attribute_soupstrainer(self): + tree = self.soup(""" + <a id="first">Match.</a> + <a id="second">Non-match.</a>""") + + strainer = SoupStrainer(attrs={"id": "first"}) + self.assert_selects(tree.find_all(strainer), ["Match."]) + + def test_find_all_with_missing_attribute(self): + # You can pass in None as the value of an attribute to find_all. + # This will match tags that do not have that attribute set. + tree = self.soup("""<a id="1">ID present.</a> + <a>No ID present.</a> + <a id="">ID is empty.</a>""") + self.assert_selects(tree.find_all("a", id=None), ["No ID present."]) + + def test_find_all_with_defined_attribute(self): + # You can pass in None as the value of an attribute to find_all. + # This will match tags that have that attribute set to any value. + tree = self.soup("""<a id="1">ID present.</a> + <a>No ID present.</a> + <a id="">ID is empty.</a>""") + self.assert_selects(tree.find_all(id=True), ["ID present.", "ID is empty."]) + + def test_find_all_with_numeric_attribute(self): + # If you search for a number, it's treated as a string. + tree = self.soup("""<a id=1>Unquoted attribute.</a> + <a id="1">Quoted attribute.</a>""") + + expected = ["Unquoted attribute.", "Quoted attribute."] + self.assert_selects(tree.find_all(id=1), expected) + self.assert_selects(tree.find_all(id="1"), expected) + + def test_find_all_with_list_attribute_values(self): + # You can pass a list of attribute values instead of just one, + # and you'll get tags that match any of the values. + tree = self.soup("""<a id="1">1</a> + <a id="2">2</a> + <a id="3">3</a> + <a>No ID.</a>""") + self.assert_selects(tree.find_all(id=["1", "3", "4"]), ["1", "3"]) + + # If you pass in an empty list, you get nothing. + self.assert_selects(tree.find_all(id=[]), []) + + def test_find_all_with_regular_expression_attribute_value(self): + # You can pass a regular expression as an attribute value, and + # you'll get tags whose values for that attribute match the + # regular expression. + tree = self.soup("""<a id="a">One a.</a> + <a id="aa">Two as.</a> + <a id="ab">Mixed as and bs.</a> + <a id="b">One b.</a> + <a>No ID.</a>""") + + self.assert_selects(tree.find_all(id=re.compile("^a+$")), ["One a.", "Two as."]) + + def test_find_by_name_and_containing_string(self): + soup = self.soup("<b>foo</b><b>bar</b><a>foo</a>") + a = soup.a + + assert [a] == soup.find_all("a", string="foo") + assert [] == soup.find_all("a", string="bar") + + def test_find_by_name_and_containing_string_when_string_is_buried(self): + soup = self.soup("<a>foo</a><a><b><c>foo</c></b></a>") + assert soup.find_all("a") == soup.find_all("a", string="foo") + + def test_find_by_attribute_and_containing_string(self): + soup = self.soup('<b id="1">foo</b><a id="2">foo</a>') + a = soup.a + + assert [a] == soup.find_all(id=2, string="foo") + assert [] == soup.find_all(id=1, string="bar") + + +class TestSmooth(SoupTest): + """Test Tag.smooth.""" + + def test_smooth(self): + soup = self.soup("<div>a</div>") + div = soup.div + div.append("b") + div.append("c") + div.append(Comment("Comment 1")) + div.append(Comment("Comment 2")) + div.append("d") + builder = self.default_builder() + span = Tag(soup, builder, "span") + span.append("1") + span.append("2") + div.append(span) + + # At this point the tree has a bunch of adjacent + # NavigableStrings. 
+        # NavigableStrings. This is normal, but it has no meaning in
+        # terms of HTML, so we may want to smooth things out for
+        # output.
+
+        # Since the <span> tag has two children, its .string is None.
+        assert None is div.span.string
+
+        assert 7 == len(div.contents)
+        div.smooth()
+        assert 5 == len(div.contents)
+
+        # The three strings at the beginning of div.contents have been
+        # merged into one string.
+        assert "abc" == div.contents[0]
+
+        # The call is recursive -- the <span> tag was also smoothed.
+        assert "12" == div.span.string
+
+        # The two comments have _not_ been merged, even though
+        # comments are strings. Merging comments would change the
+        # meaning of the HTML.
+        assert "Comment 1" == div.contents[1]
+        assert "Comment 2" == div.contents[2]
+
+
+class TestIndex(SoupTest):
+    """Test Tag.index"""
+
+    def test_index(self):
+        tree = self.soup("""<div>
+                            <a>Identical</a>
+                            <b>Not identical</b>
+                            <a>Identical</a>
+
+                            <c><d>Identical with child</d></c>
+                            <b>Also not identical</b>
+                            <c><d>Identical with child</d></c>
+                            </div>""")
+        div = tree.div
+        for i, element in enumerate(div.contents):
+            assert i == div.index(element)
+        with pytest.raises(ValueError):
+            tree.index(1)
+
+
+class TestParentOperations(SoupTest):
+    """Test navigation and searching through an element's parents."""
+
+    def setup_method(self) -> None:
+        self.tree = self.soup("""<ul id="empty"></ul>
+                                 <ul id="top">
+                                  <ul id="middle">
+                                   <ul id="bottom">
+                                    <b id="start">Start here</b>
+                                   </ul>
+                                  </ul>""")
+        self.start = self.tree.b
+
+    def test_parent(self):
+        assert self.start.parent["id"] == "bottom"
+        assert self.start.parent.parent["id"] == "middle"
+        assert self.start.parent.parent.parent["id"] == "top"
+
+    def test_parent_of_top_tag_is_soup_object(self):
+        top_tag = self.tree.contents[0]
+        assert top_tag.parent == self.tree
+
+    def test_soup_object_has_no_parent(self):
+        assert None is self.tree.parent
+
+    def test_find_parents(self):
+        self.assert_selects_ids(
+            self.start.find_parents("ul"), ["bottom", "middle", "top"]
+        )
+        self.assert_selects_ids(self.start.find_parents("ul", id="middle"), ["middle"])
+        assert self.start.find_parents(id="start") == []
+
+    def test_find_parent(self):
+        assert self.start.find_parent("ul")["id"] == "bottom"
+        assert self.start.find_parent("ul", id="top")["id"] == "top"
+
+        assert self.start.find_parent(id="start") is None
+
+    def test_parent_of_text_element(self):
+        text = self.tree.find(string="Start here")
+        assert text.parent.name == "b"
+
+    def test_text_element_find_parent(self):
+        text = self.tree.find(string="Start here")
+        assert text.find_parent("ul")["id"] == "bottom"
+
+    def test_parent_generator(self):
+        parents = [
+            parent["id"]
+            for parent in self.start.parents
+            if parent is not None and "id" in parent.attrs
+        ]
+        assert parents == ["bottom", "middle", "top"]
+
+    def test_self_and_parent_generator(self):
+        results = [
+            parent["id"]
+            for parent in self.start.self_and_parents
+            if parent is not None and "id" in parent.attrs
+        ]
+        assert results == ["start", "bottom", "middle", "top"]
+
+
+class ProximityTest(SoupTest):
+    def setup_method(self) -> None:
+        self.tree = self.soup(
+            '<html id="start"><head id="headtag"></head><body id="bodytag"><b id="1">One</b><b id="2">Two</b><b id="3">Three</b></body></html>'
+        )
+
+
+class TestNextOperations(ProximityTest):
+    def setup_method(self) -> None:
+        super(TestNextOperations, self).setup_method()
+        self.start = self.tree.b
+
+    def test_next(self):
+        assert self.start.next_element == "One"
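+        # (next_element follows parse order: a tag's first
+        # next_element is its own text child, and that text's
+        # next_element is whatever was parsed after it.)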
== "2" + + def test_next_of_last_item_is_none(self): + last = self.tree.find(string="Three") + assert last.next_element is None + + def test_next_of_root_is_none(self): + # The document root is outside the next/previous chain. + assert self.tree.next_element is None + + def test_find_all_next(self): + self.assert_selects(self.start.find_all_next("b"), ["Two", "Three"]) + self.start.find_all_next(id=3) + self.assert_selects(self.start.find_all_next(id=3), ["Three"]) + + def test_find_next(self): + assert self.start.find_next("b")["id"] == "2" + assert self.start.find_next(string="Three") == "Three" + + def test_find_next_for_text_element(self): + text = self.tree.find(string="One") + assert text.find_next("b").string == "Two" + self.assert_selects(text.find_all_next("b"), ["Two", "Three"]) + + def test_next_generators(self): + start = self.tree.find(string="Two") + successors = [node for node in start.next_elements] + # There are two successors: the final <b> tag and its text contents. + tag, contents = successors + assert tag["id"] == "3" + assert contents == "Three" + + successors2 = [node for node in start.self_and_next_elements] + assert successors2[1:] == successors + assert successors2[0] == start + + +class TestPreviousOperations(ProximityTest): + def setup_method(self) -> None: + super(TestPreviousOperations, self).setup_method() + self.end = self.tree.find(string="Three") + + def test_previous(self): + assert self.end.previous_element["id"] == "3" + assert self.end.previous_element.previous_element == "Two" + + def test_previous_of_first_item_is_none(self): + first = self.tree.find("html") + assert first.previous_element is None + + def test_previous_of_root_is_none(self): + # The document root is outside the next/previous chain. + assert self.tree.previous_element is None + + def test_find_all_previous(self): + # The <b> tag containing the "Three" node is the predecessor + # of the "Three" node itself, which is why "Three" shows up + # here. + self.assert_selects(self.end.find_all_previous("b"), ["Three", "Two", "One"]) + self.assert_selects(self.end.find_all_previous(id=1), ["One"]) + + def test_find_previous(self): + assert self.end.find_previous("b")["id"] == "3" + assert self.end.find_previous(string="One") == "One" + + def test_find_previous_for_text_element(self): + text = self.tree.find(string="Three") + assert text.find_previous("b").string == "Three" + self.assert_selects(text.find_all_previous("b"), ["Three", "Two", "One"]) + + def test_previous_generators(self): + start = self.tree.find("b", string="One") + self.assert_selects_ids(start.previous_elements, ["bodytag", "headtag", 'start']) + self.assert_selects_ids(start.self_and_previous_elements, ["1", "bodytag", "headtag", "start"]) + + +class SiblingTest(SoupTest): + def setup_method(self) -> None: + markup = """<html> + <span id="1"> + <span id="1.1"></span> + </span> + <span id="2"> + <span id="2.1"></span> + </span> + <span id="3"> + <span id="3.1"></span> + </span> + <span id="4"></span> + </html>""" + # All that whitespace looks good but makes the tests more + # difficult. Get rid of it. 
+ markup = re.compile(r"\n\s*").sub("", markup) + self.tree = self.soup(markup) + + +class TestNextSibling(SiblingTest): + def setup_method(self) -> None: + super(TestNextSibling, self).setup_method() + self.start = self.tree.find(id="1") + + def test_next_sibling_of_root_is_none(self): + assert self.tree.next_sibling is None + + def test_next_sibling(self): + assert self.start.next_sibling["id"] == "2" + assert self.start.next_sibling.next_sibling["id"] == "3" + + # Note the difference between next_sibling and next_element. + assert self.start.next_element["id"] == "1.1" + + def test_next_sibling_may_not_exist(self): + assert self.tree.html.next_sibling is None + + nested_span = self.tree.find(id="1.1") + assert nested_span.next_sibling is None + + last_span = self.tree.find(id="4") + assert last_span.next_sibling is None + + def test_find_next_sibling(self): + assert self.start.find_next_sibling("span")["id"] == "2" + + def test_next_siblings(self): + self.assert_selects_ids(self.start.find_next_siblings("span"), ["2", "3", "4"]) + + self.assert_selects_ids(self.start.find_next_siblings(id="3"), ["3"]) + + def test_next_siblings_generators(self): + self.assert_selects_ids(self.start.next_siblings, ["2", "3", "4"]) + self.assert_selects_ids(self.start.self_and_next_siblings, ["1", "2", "3", "4"]) + + def test_next_sibling_for_text_element(self): + soup = self.soup("Foo<b>bar</b>baz") + start = soup.find(string="Foo") + assert start.next_sibling.name == "b" + assert start.next_sibling.next_sibling == "baz" + + self.assert_selects(start.find_next_siblings("b"), ["bar"]) + assert start.find_next_sibling(string="baz") == "baz" + assert start.find_next_sibling(string="nonesuch") is None + + +class TestPreviousSibling(SiblingTest): + def setup_method(self) -> None: + super(TestPreviousSibling, self).setup_method() + self.end = self.tree.find(id="4") + + def test_previous_sibling_of_root_is_none(self): + assert self.tree.previous_sibling is None + + def test_previous_sibling(self): + assert self.end.previous_sibling["id"] == "3" + assert self.end.previous_sibling.previous_sibling["id"] == "2" + + # Note the difference between previous_sibling and previous_element. 
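+        # (previous_element descends into the previous sibling's last
+        # child -- here <span id="3.1"> -- while previous_sibling
+        # stays at the same level of the tree.)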
+        assert self.end.previous_element["id"] == "3.1"
+
+    def test_previous_sibling_may_not_exist(self):
+        assert self.tree.html.previous_sibling is None
+
+        nested_span = self.tree.find(id="1.1")
+        assert nested_span.previous_sibling is None
+
+        first_span = self.tree.find(id="1")
+        assert first_span.previous_sibling is None
+
+    def test_find_previous_sibling(self):
+        assert self.end.find_previous_sibling("span")["id"] == "3"
+
+    def test_previous_siblings(self):
+        self.assert_selects_ids(
+            self.end.find_previous_siblings("span"), ["3", "2", "1"]
+        )
+
+        self.assert_selects_ids(self.end.find_previous_siblings(id="1"), ["1"])
+
+    def test_previous_siblings_generators(self):
+        self.assert_selects_ids(self.end.previous_siblings, ["3", "2", "1"])
+        self.assert_selects_ids(
+            self.end.self_and_previous_siblings, ["4", "3", "2", "1"]
+        )
+
+    def test_previous_sibling_for_text_element(self):
+        soup = self.soup("Foo<b>bar</b>baz")
+        start = soup.find(string="baz")
+        assert start.previous_sibling.name == "b"
+        assert start.previous_sibling.previous_sibling == "Foo"
+
+        self.assert_selects(start.find_previous_siblings("b"), ["bar"])
+        assert start.find_previous_sibling(string="Foo") == "Foo"
+        assert start.find_previous_sibling(string="nonesuch") is None
+
+
+class TestTreeModification(SoupTest):
+    def test_attribute_modification(self):
+        soup = self.soup('<a id="1"></a>')
+        soup.a["id"] = 2
+        assert soup.decode() == self.document_for('<a id="2"></a>')
+        del soup.a["id"]
+        assert soup.decode() == self.document_for("<a></a>")
+        soup.a["id2"] = "foo"
+        assert soup.decode() == self.document_for('<a id2="foo"></a>')
+
+    def test_new_tag_creation(self):
+        builder = builder_registry.lookup("html")()
+        soup = self.soup("<body></body>", builder=builder)
+        a = Tag(soup, builder, "a")
+        ol = Tag(soup, builder, "ol")
+        a["href"] = "http://foo.com/"
+        soup.body.insert(0, a)
+        soup.body.insert(1, ol)
+        assert (
+            soup.body.encode()
+            == b'<body><a href="http://foo.com/"></a><ol></ol></body>'
+        )
+
+    def test_append_to_contents_moves_tag(self):
+        doc = """<p id="1">Don't leave me <b>here</b>.</p>
+                 <p id="2">Don't leave!</p>"""
+        soup = self.soup(doc)
+        second_para = soup.find(id="2")
+        bold = soup.b
+
+        # Move the <b> tag to the end of the second paragraph.
+        second_para.append(bold)
+
+        # The <b> tag is now a child of the second paragraph.
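+        # (append() moves the tag rather than copying it: the <b> is
+        # detached from the first paragraph before being attached to
+        # the second.)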
+        assert bold.parent == second_para
+
+        assert soup.decode() == self.document_for(
+            '<p id="1">Don\'t leave me .</p>\n' '<p id="2">Don\'t leave!<b>here</b></p>'
+        )
+
+    def test_insertion_returns_inserted_things(self):
+        soup = self.soup("<html></html>")
+        html = soup.find("html")
+        head = html.append(soup.new_tag("head"))
+        assert head.name == "head"
+
+        [title] = head.insert(0, soup.new_tag("title"))
+        assert title.name == "title"
+
+        text5 = title.append("5")
+        assert text5 == "5"
+        text34 = text5.insert_before("3", "4")
+        assert text34 == ["3", "4"]
+        text67 = text5.insert_after("6", "7")
+        assert text67 == ["6", "7"]
+        text89 = title.extend(["8", "9"])
+        assert text89 == ["8", "9"]
+        assert title.get_text() == "3456789"
+
+    def test_replace_with_returns_thing_that_was_replaced(self):
+        text = "<a></a><b><c></c></b>"
+        soup = self.soup(text)
+        a = soup.a
+        new_a = a.replace_with(soup.c)
+        assert a == new_a
+
+    def test_unwrap_returns_thing_that_was_replaced(self):
+        text = "<a><b></b><c></c></a>"
+        soup = self.soup(text)
+        a = soup.a
+        new_a = a.unwrap()
+        assert a == new_a
+
+    def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self):
+        soup = self.soup("<a><b>Foo</b></a><c>Bar</c>")
+        a = soup.a
+        a.extract()
+        assert None is a.parent
+        with pytest.raises(ValueError):
+            a.unwrap()
+        with pytest.raises(ValueError):
+            a.replace_with(soup.c)
+
+    def test_replace_tag_with_itself(self):
+        text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
+        soup = self.soup(text)
+        c = soup.c
+        result = soup.c.replace_with(c)
+        assert result == c
+        assert soup.decode() == self.document_for(text)
+
+    def test_replace_tag_with_its_parent_raises_exception(self):
+        text = "<a><b></b></a>"
+        soup = self.soup(text)
+        with pytest.raises(ValueError):
+            soup.b.replace_with(soup.a)
+
+    def test_insert_tag_into_itself_raises_exception(self):
+        text = "<a><b></b></a>"
+        soup = self.soup(text)
+        with pytest.raises(ValueError):
+            soup.a.insert(0, soup.a)
+
+    def test_insert_multiple_elements(self):
+        soup = self.soup("<p>And now, a word:</p><p>And we're back.</p>")
+        p2, p3 = soup.insert(
+            1, soup.new_tag("p", string="p2"), soup.new_tag("p", string="p3")
+        )
+        assert "p2" == p2.string
+        assert "p3" == p3.string
+
+        p1, p2, p3, p4 = list(soup.children)
+        assert "And now, a word:" == p1.string
+        assert "p2" == p2.string
+        assert "p3" == p3.string
+        assert "And we're back." == p4.string
+
+    def test_insert_beautifulsoup_object_inserts_children(self):
+        """Inserting one BeautifulSoup object into another actually inserts all
+        of its children -- you'll never combine BeautifulSoup objects.
+        """
+        soup = self.soup("<p>And now, a word:</p><p>And we're back.</p>")
+
+        text = "<p>p2</p><p>p3</p>"
+        to_insert = self.soup(text)
+        p2, p3 = soup.insert(1, to_insert)
+        assert "p2" == p2.string
+        assert "p3" == p3.string
+
+        for i in soup.descendants:
+            assert not isinstance(i, BeautifulSoup)
+
+        p1, p2, p3, p4 = list(soup.children)
+        assert "And now, a word:" == p1.string
+        assert "p2" == p2.string
+        assert "p3" == p3.string
+        assert "And we're back." == p4.string
+
+    def test_replace_with_maintains_next_element_throughout(self):
+        soup = self.soup("<p><a>one</a><b>three</b></p>")
+        a = soup.a
+        # Make it so the <a> tag has two text children.
+        a.insert(1, "two")
+
+        # Now replace each one with the empty string.
+        left, right = a.contents
+        left.replace_with("")
+        right.replace_with("")
+
+        # The <b> tag is still connected to the tree.
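+        # (Each replace_with() re-links the next_element chain as it
+        # goes, so the path from the root to <b> is never broken.)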
+ assert "three" == soup.b.string + + def test_replace_final_node(self): + soup = self.soup("<b>Argh!</b>") + soup.find(string="Argh!").replace_with("Hooray!") + new_text = soup.find(string="Hooray!") + b = soup.b + assert new_text.previous_element == b + assert new_text.parent == b + assert new_text.previous_element.next_element == new_text + assert new_text.next_element is None + + def test_consecutive_text_nodes(self): + # A builder should never create two consecutive text nodes, + # but if you insert one next to another, Beautiful Soup will + # handle it correctly. + soup = self.soup("<a><b>Argh!</b><c></c></a>") + soup.b.insert(1, "Hooray!") + + assert soup.decode() == self.document_for("<a><b>Argh!Hooray!</b><c></c></a>") + + new_text = soup.find(string="Hooray!") + assert new_text.previous_element == "Argh!" + assert new_text.previous_element.next_element == new_text + + assert new_text.previous_sibling == "Argh!" + assert new_text.previous_sibling.next_sibling == new_text + + assert new_text.next_sibling is None + assert new_text.next_element == soup.c + + def test_insert_string(self): + soup = self.soup("<a></a>") + soup.a.insert(0, "bar") + soup.a.insert(0, "foo") + # The string were added to the tag. + assert ["foo", "bar"] == soup.a.contents + # And they were converted to NavigableStrings. + assert soup.a.contents[0].next_element == "bar" + + def test_append(self): + soup = self.soup("<b>1</b>") + result = soup.b.append("2") + assert result == "2" + assert soup.b.decode() == "<b>12</b>" + + def test_insert_tag(self): + builder = self.default_builder() + soup = self.soup("<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder) + magic_tag = Tag(soup, builder, "magictag") + magic_tag.insert(0, "the") + soup.a.insert(1, magic_tag) + + assert soup.decode() == self.document_for( + "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>" + ) + + # Make sure all the relationships are hooked up correctly. 
+ b_tag = soup.b + assert b_tag.next_sibling == magic_tag + assert magic_tag.previous_sibling == b_tag + + find = b_tag.find(string="Find") + assert find.next_element == magic_tag + assert magic_tag.previous_element == find + + c_tag = soup.c + assert magic_tag.next_sibling == c_tag + assert c_tag.previous_sibling == magic_tag + + the = magic_tag.find(string="the") + assert the.parent == magic_tag + assert the.next_element == c_tag + assert c_tag.previous_element == the + + def test_insert_into_the_current_location(self): + data = "<a>b<c></c>d</a>" + soup = self.soup(data) + soup.a.insert(1, soup.c) + assert data == soup.decode() + + def test_append_child_thats_already_at_the_end(self): + data = "<a><b></b></a>" + soup = self.soup(data) + soup.a.append(soup.b) + assert data == soup.decode() + + def test_extend_with_a_list_of_elements(self): + data = "<a><b><c><d><e><f><g></g></f></e></d></c></b></a>" + soup = self.soup(data) + elements = [soup.g, soup.f, soup.e, soup.d, soup.c, soup.b] + soup.a.extend(elements) + assert "<a><g></g><f></f><e></e><d></d><c></c><b></b></a>" == soup.decode() + + def test_extend_with_a_list_of_strings(self): + data = "<a></a>" + soup = self.soup(data) + elements = ["b", "c", NavigableString("d"), "e"] + soup.a.extend(elements) + assert "<a>bcde</a>" == soup.decode() + + @pytest.mark.parametrize("get_tags", [lambda tag: tag, lambda tag: tag.contents]) + def test_extend_with_another_tags_contents(self, get_tags): + data = '<body><div id="d1"><a>1</a><a>2</a><a>3</a><a>4</a></div><div id="d2"></div></body>' + soup = self.soup(data) + d1 = soup.find("div", id="d1") + d2 = soup.find("div", id="d2") + tags = get_tags(d1) + d2.extend(tags) + assert '<div id="d1"></div>' == d1.decode() + assert '<div id="d2"><a>1</a><a>2</a><a>3</a><a>4</a></div>' == d2.decode() + + @pytest.mark.parametrize( + "string_source,result", + ( + [lambda soup: soup.a.string, "<a></a><b>1</b>"], + [lambda soup: "abcde", "<a>1</a><b>abcde</b>"], + ), + ) + def test_extend_with_a_single_non_tag_element(self, string_source, result): + data = "<div><a>1</a><b></b></div>" + soup = self.soup(data) + with warnings.catch_warnings(record=True) as w: + string = string_source(soup) + soup.b.extend(string) + assert soup.div.decode_contents() == result + [warning] = w + assert warning.filename == __file__ + msg = str(warning.message) + assert ( + msg + == "A single non-Tag item was passed into Tag.extend. Use Tag.append instead." + ) + + def test_move_tag_to_beginning_of_parent(self): + data = "<a><b></b><c></c><d></d></a>" + soup = self.soup(data) + soup.a.insert(0, soup.d) + assert "<a><d></d><b></b><c></c></a>" == soup.decode() + + def test_insert_works_on_empty_element_tag(self): + # This is a little strange, since most HTML parsers don't allow + # markup like this to come through. But in general, we don't + # know what the parser would or wouldn't have allowed, so + # I'm letting this succeed for now. + soup = self.soup("<br/>") + soup.br.insert(1, "Contents") + assert str(soup.br) == "<br>Contents</br>" + + def test_insert_before(self): + soup = self.soup("<a>foo</a><b>bar</b>") + soup.b.insert_before("BAZ") + soup.a.insert_before("QUUX") + assert soup.decode() == self.document_for("QUUX<a>foo</a>BAZ<b>bar</b>") + + soup.a.insert_before(soup.b) + assert soup.decode() == self.document_for("QUUX<b>bar</b><a>foo</a>BAZ") + + # Can't insert an element before itself. + b = soup.b + with pytest.raises(ValueError): + b.insert_before(b) + + # Can't insert before if an element has no parent. 
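+        # (extract() detaches the element and clears its .parent, so
+        # "before" no longer refers to any position in a tree.)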
+        b.extract()
+        with pytest.raises(ValueError):
+            b.insert_before("nope")
+
+        # Can insert an identical element
+        soup = self.soup("<a>")
+        soup.a.insert_before(soup.new_tag("a"))
+
+        # TODO: OK but what happens?
+
+    def test_insert_multiple_before(self):
+        soup = self.soup("<a>foo</a><b>bar</b>")
+        soup.b.insert_before("BAZ", " ", "QUUX")
+        soup.a.insert_before("QUUX", " ", "BAZ")
+        assert soup.decode() == self.document_for(
+            "QUUX BAZ<a>foo</a>BAZ QUUX<b>bar</b>"
+        )
+
+        soup.a.insert_before(soup.b, "FOO")
+        assert soup.decode() == self.document_for(
+            "QUUX BAZ<b>bar</b>FOO<a>foo</a>BAZ QUUX"
+        )
+
+    def test_insert_after(self):
+        soup = self.soup("<a>foo</a><b>bar</b>")
+        soup.b.insert_after("BAZ")
+        soup.a.insert_after("QUUX")
+        assert soup.decode() == self.document_for("<a>foo</a>QUUX<b>bar</b>BAZ")
+        soup.b.insert_after(soup.a)
+        assert soup.decode() == self.document_for("QUUX<b>bar</b><a>foo</a>BAZ")
+
+        # Can't insert an element after itself.
+        b = soup.b
+        with pytest.raises(ValueError):
+            b.insert_after(b)
+
+        # Can't insert after if an element has no parent.
+        b.extract()
+        with pytest.raises(ValueError):
+            b.insert_after("nope")
+
+        # Can insert an identical element
+        soup = self.soup("<a>")
+        soup.a.insert_after(soup.new_tag("a"))
+
+        # TODO: OK but what does it look like?
+
+    def test_insert_multiple_after(self):
+        soup = self.soup("<a>foo</a><b>bar</b>")
+        soup.b.insert_after("BAZ", " ", "QUUX")
+        soup.a.insert_after("QUUX", " ", "BAZ")
+        assert soup.decode() == self.document_for(
+            "<a>foo</a>QUUX BAZ<b>bar</b>BAZ QUUX"
+        )
+        soup.b.insert_after(soup.a, "FOO ")
+        assert soup.decode() == self.document_for(
+            "QUUX BAZ<b>bar</b><a>foo</a>FOO BAZ QUUX"
+        )
+
+    def test_insert_after_raises_exception_if_after_has_no_meaning(self):
+        soup = self.soup("")
+        tag = soup.new_tag("a")
+        string = soup.new_string("")
+        with pytest.raises(ValueError):
+            string.insert_after(tag)
+        with pytest.raises(NotImplementedError):
+            soup.insert_after(tag)
+        with pytest.raises(ValueError):
+            tag.insert_after(tag)
+
+    def test_insert_before_raises_exception_if_before_has_no_meaning(self):
+        soup = self.soup("")
+        tag = soup.new_tag("a")
+        string = soup.new_string("")
+        with pytest.raises(ValueError):
+            string.insert_before(tag)
+        with pytest.raises(NotImplementedError):
+            soup.insert_before(tag)
+        with pytest.raises(ValueError):
+            tag.insert_before(tag)
+
+    def test_replace_with(self):
+        soup = self.soup("<p>There's <b>no</b> business like <b>show</b> business</p>")
+        no, show = soup.find_all("b")
+        show.replace_with(no)
+        assert soup.decode() == self.document_for(
+            "<p>There's business like <b>no</b> business</p>"
+        )
+
+        assert show.parent is None
+        assert no.parent == soup.p
+        assert no.next_element == "no"
+        assert no.next_sibling == " business"
+
+    def test_replace_with_errors(self):
+        # Can't replace a tag that's not part of a tree.
+        a_tag = Tag(name="a")
+        with pytest.raises(ValueError):
+            a_tag.replace_with("won't work")
+
+        # Can't replace a tag with its parent.
+        a_tag = self.soup("<a><b></b></a>").a
+        with pytest.raises(ValueError):
+            a_tag.b.replace_with(a_tag)
+
+        # Or with a list that includes its parent.
+        with pytest.raises(ValueError):
+            a_tag.b.replace_with("string1", a_tag, "string2")
+
+    def test_replace_with_multiple(self):
+        data = "<a><b></b><c></c></a>"
+        soup = self.soup(data)
+        d_tag = soup.new_tag("d")
+        d_tag.string = "Text In D Tag"
+        e_tag = soup.new_tag("e")
+        f_tag = soup.new_tag("f")
+        a_string = "Random Text"
+        soup.c.replace_with(d_tag, e_tag, a_string, f_tag)
+        assert (
+            soup.decode()
+            == "<a><b></b><d>Text In D Tag</d><e></e>Random Text<f></f></a>"
+        )
+        assert soup.b.next_element == d_tag
+        assert d_tag.string.next_element == e_tag
+        assert e_tag.next_element == a_string
+        assert e_tag.next_element.next_element == f_tag
+
+    def test_replace_first_child(self):
+        data = "<a><b></b><c></c></a>"
+        soup = self.soup(data)
+        soup.b.replace_with(soup.c)
+        assert "<a><c></c></a>" == soup.decode()
+
+    def test_replace_last_child(self):
+        data = "<a><b></b><c></c></a>"
+        soup = self.soup(data)
+        soup.c.replace_with(soup.b)
+        assert "<a><b></b></a>" == soup.decode()
+
+    def test_nested_tag_replace_with(self):
+        soup = self.soup(
+            """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>"""
+        )
+
+        # Replace the entire <b> tag and its contents ("reserve the
+        # right") with the <f> tag ("refuse").
+        remove_tag = soup.b
+        move_tag = soup.f
+        remove_tag.replace_with(move_tag)
+
+        assert soup.decode() == self.document_for(
+            "<a>We<f>refuse</f></a><e>to<g>service</g></e>"
+        )
+
+        # The <b> tag is now an orphan.
+        assert remove_tag.parent is None
+        assert remove_tag.find(string="right").next_element is None
+        assert remove_tag.previous_element is None
+        assert remove_tag.next_sibling is None
+        assert remove_tag.previous_sibling is None
+
+        # The <f> tag is now connected to the <a> tag.
+        assert move_tag.parent == soup.a
+        assert move_tag.previous_element == "We"
+        assert move_tag.next_element.next_element == soup.e
+        assert move_tag.next_sibling is None
+
+        # The gap where the <f> tag used to be has been mended, and
+        # the word "to" is now connected to the <g> tag.
+        to_text = soup.find(string="to")
+        g_tag = soup.g
+        assert to_text.next_element == g_tag
+        assert to_text.next_sibling == g_tag
+        assert g_tag.previous_element == to_text
+        assert g_tag.previous_sibling == to_text
+
+    def test_unwrap(self):
+        tree = self.soup("""
+            <p>Unneeded <em>formatting</em> is unneeded</p>
+            """)
+        tree.em.unwrap()
+        assert tree.em is None
+        assert tree.p.text == "Unneeded formatting is unneeded"
+
+    def test_wrap(self):
+        soup = self.soup("I wish I was bold.")
+        value = soup.string.wrap(soup.new_tag("b"))
+        assert value.decode() == "<b>I wish I was bold.</b>"
+        assert soup.decode() == self.document_for("<b>I wish I was bold.</b>")
+
+    def test_wrap_extracts_tag_from_elsewhere(self):
+        soup = self.soup("<b></b>I wish I was bold.")
+        soup.b.next_sibling.wrap(soup.b)
+        assert soup.decode() == self.document_for("<b>I wish I was bold.</b>")
+
+    def test_wrap_puts_new_contents_at_the_end(self):
+        soup = self.soup("<b>I like being bold.</b>I wish I was bold.")
+        soup.b.next_sibling.wrap(soup.b)
+        assert 2 == len(soup.b.contents)
+        assert soup.decode() == self.document_for(
+            "<b>I like being bold.I wish I was bold.</b>"
+        )
+
+    def test_extract(self):
+        soup = self.soup(
+            '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>'
+        )
+
+        assert len(soup.body.contents) == 3
+        extracted = soup.find(id="nav").extract()
+
+        assert (
+            soup.decode() == "<html><body>Some content.  More content.</body></html>"
+        )
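+        # (The extracted <div> keeps its own subtree intact; only its
+        # links to the surrounding document are severed.)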
+        assert extracted.decode() == '<div id="nav">Nav crap</div>'
+
+        # The extracted tag is now an orphan.
+        assert len(soup.body.contents) == 2
+        assert extracted.parent is None
+        assert extracted.previous_element is None
+        assert extracted.next_element.next_element is None
+
+        # The gap where the extracted tag used to be has been mended.
+        content_1 = soup.find(string="Some content. ")
+        content_2 = soup.find(string=" More content.")
+        assert content_1.next_element == content_2
+        assert content_1.next_sibling == content_2
+        assert content_2.previous_element == content_1
+        assert content_2.previous_sibling == content_1
+
+    def test_extract_distinguishes_between_identical_strings(self):
+        soup = self.soup("<a>foo</a><b>bar</b>")
+        foo_1 = soup.a.string
+        foo_2 = soup.new_string("foo")
+        bar_2 = soup.new_string("bar")
+        soup.a.append(foo_2)
+        soup.b.append(bar_2)
+
+        # Now there are two identical strings in the <a> tag, and two
+        # in the <b> tag. Let's remove the first "foo" and the second
+        # "bar".
+        foo_1.extract()
+        bar_2.extract()
+        assert foo_2 == soup.a.string
+        assert bar_2 == soup.b.string
+
+    def test_extract_multiples_of_same_tag(self):
+        soup = self.soup("""
+<html>
+<head>
+<script>foo</script>
+</head>
+<body>
+ <script>bar</script>
+ <a></a>
+</body>
+<script>baz</script>
+</html>""")
+        for script in soup.find_all("script"):
+            script.extract()
+        assert "<body>\n\n<a></a>\n</body>" == str(soup.body)
+
+    def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
+        soup = self.soup("<html>\n" "<body>hi</body>\n" "</html>")
+        soup.find("body").extract()
+        assert None is soup.find("body")
+
+    def test_clear(self):
+        """Tag.clear()"""
+        soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
+        # clear using extract()
+        a = soup.a
+        soup.p.clear()
+        assert len(soup.p.contents) == 0
+        assert hasattr(a, "contents")
+
+        # clear using decompose()
+        em = a.em
+        a.clear(decompose=True)
+        assert 0 == len(em.contents)
+
+    @pytest.mark.parametrize(
+        "method_name,expected_result",
+        [
+            (
+                "descendants",
+                '<div><em>child1</em><p id="start"></p><p>child3</p></div>',
+            ),
+            (
+                "next_siblings",
+                '<div><em>child1</em><p id="start"><a>Second <em>child</em></a></p></div>',
+            ),
+            # Confused about why child3 is still here in this test?
+            # It's because removing the <p id="start"> tag from the
+            # tree removes all of its children from the tree as well.
+            # 'child'.next_element becomes None, because 'child' is no
+            # longer in the tree, and iteration stops there. Don't do
+            # this kind of thing, is what I'm saying.
+            (
+                "next_elements",
+                '<div><em>child1</em><p id="start"></p><p>child3</p></div>',
+            ),
+            ("children", '<div><em>child1</em><p id="start"></p><p>child3</p></div>'),
+            ("previous_elements", ""),
+            (
+                "previous_siblings",
+                '<div><p id="start"><a>Second <em>child</em></a></p><p>child3</p></div>',
+            ),
+            ("parents", ""),
+        ],
+    )
+    def test_extract_during_iteration(self, method_name, expected_result):
+        # The iterators should be able to proceed even if the element
+        # most recently yielded has been removed from the tree. This
+        # kind of code is a bad idea, but it should run without an
+        # exception.
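+        # (extract() clears the yielded element's navigation pointers,
+        # so each generator either continues from what is left of the
+        # tree or simply stops; the point is that it must not crash.)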
+        soup = self.soup(
+            "<div><em>child1</em><p id='start'><a>Second <em>child</em></a></p><p>child3</p></div>"
+        )
+        iterator = getattr(soup.p, method_name)
+        for i in iterator:
+            i.extract()
+        assert expected_result == soup.decode()
+
+    def test_decompose(self):
+        # Test PageElement.decompose() and PageElement.decomposed
+        soup = self.soup("<p><a>String <em>Italicized</em></a></p><p>Another para</p>")
+        p1, p2 = soup.find_all("p")
+        a = p1.a
+        text = p1.em.string
+        for i in [p1, p2, a, text]:
+            assert False is i.decomposed
+
+        # This sets p1 and everything beneath it to decomposed.
+        p1.decompose()
+        for i in [p1, a, text]:
+            assert True is i.decomposed
+        # p2 is unaffected.
+        assert False is p2.decomposed
+
+    def test_decompose_string(self):
+        soup = self.soup("<div><p>String 1</p><p>String 2</p></div>")
+        div = soup.div
+        text = div.p.string
+        assert False is text.decomposed
+        text.decompose()
+        assert True is text.decomposed
+        assert "<div><p></p><p>String 2</p></div>" == div.decode()
+
+    def test_string_set(self):
+        """Tag.string = 'string'"""
+        soup = self.soup("<a></a> <b><c></c></b>")
+        soup.a.string = "foo"
+        assert soup.a.contents == ["foo"]
+        soup.b.string = "bar"
+        assert soup.b.contents == ["bar"]
+
+    def test_string_set_does_not_affect_original_string(self):
+        soup = self.soup("<a><b>foo</b><c>bar</c>")
+        soup.b.string = soup.c.string
+        assert soup.a.encode() == b"<a><b>bar</b><c>bar</c></a>"
+
+    def test_set_string_preserves_class_of_string(self):
+        soup = self.soup("<a></a>")
+        cdata = CData("foo")
+        soup.a.string = cdata
+        assert isinstance(soup.a.string, CData)
+
+
+all_find_type_methods = [
+    "find",
+    "find_all",
+    "find_parent",
+    "find_parents",
+    "find_next",
+    "find_all_next",
+    "find_previous",
+    "find_all_previous",
+    "find_next_sibling",
+    "find_next_siblings",
+    "find_previous_sibling",
+    "find_previous_siblings",
+]
+
+
+class TestDeprecatedArguments(SoupTest):
+    @pytest.mark.parametrize("method_name", all_find_type_methods)
+    def test_find_type_method_string(self, method_name):
+        soup = self.soup("<a>some</a><b>markup</b>")
+        method = getattr(soup.b, method_name)
+        with warnings.catch_warnings(record=True) as w:
+            method(text="markup")
+        [warning] = w
+        assert warning.filename == __file__
+        msg = str(warning.message)
+        assert (
+            msg
+            == "The 'text' argument to find()-type methods is deprecated. Use 'string' instead."
+        )
+
+
+class TestWarnings(SoupTest):
+    @pytest.mark.parametrize("method_name", all_find_type_methods)
+    def test_suspicious_syntax_warning(self, method_name):
+        soup = self.soup("<a>some</a><b>markup</b>")
+        method = getattr(soup.b, method_name)
+        with warnings.catch_warnings(record=True) as w:
+            method(_class="u")
+        [warning] = w
+        assert warning.filename == __file__
+        assert isinstance(warning.message, AttributeResemblesVariableWarning)
+        msg = str(warning.message)
+        assert (
+            "'_class' is an unusual attribute name and is a common misspelling for 'class_'"
+            in msg
+        )
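+
+
+# An illustrative sketch (an editorial addition, not part of the
+# upstream test suite): the same 'text' -> 'string' deprecation that
+# TestDeprecatedArguments asserts on, demonstrated standalone. It
+# assumes only that bs4 is importable and uses the stdlib
+# "html.parser" builder; run this module directly to see the warning.
+if __name__ == "__main__":
+    demo = BeautifulSoup("<a>some</a><b>markup</b>", "html.parser")
+    with warnings.catch_warnings(record=True) as caught:
+        warnings.simplefilter("always")
+        demo.find_all(text="markup")  # deprecated; spell it string="markup"
+    print(str(caught[0].message))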