diff options
author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
---|---|---|
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/bs4/tests/test_htmlparser.py | |
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
download | gn-ai-master.tar.gz |
Diffstat (limited to '.venv/lib/python3.12/site-packages/bs4/tests/test_htmlparser.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/bs4/tests/test_htmlparser.py | 161 |
1 files changed, 161 insertions, 0 deletions
"""Tests to ensure that the html.parser tree builder generates good
trees."""

import pickle
import pytest
from bs4.builder._htmlparser import (
    _DuplicateAttributeHandler,
    BeautifulSoupHTMLParser,
    HTMLParserTreeBuilder,
)
from bs4.exceptions import ParserRejectedMarkup
from typing import Any
from . import HTMLTreeBuilderSmokeTest


class TestHTMLParserTreeBuilder(HTMLTreeBuilderSmokeTest):
    """Tests inherited from HTMLTreeBuilderSmokeTest, plus tests that are
    specific to HTMLParserTreeBuilder.
    """

    default_builder = HTMLParserTreeBuilder

    def test_rejected_input(self):
        """Markup that html.parser rejects surfaces as ParserRejectedMarkup."""
        # Python's html.parser will occasionally reject markup,
        # especially when there is a problem with the initial DOCTYPE
        # declaration. Different versions of Python sound the alarm in
        # different ways, but Beautiful Soup consistently raises
        # errors as ParserRejectedMarkup exceptions.
        bad_markup = [
            # https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
            # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
            # https://github.com/python/cpython/issues/81928
            b"\n<![\xff\xfe\xfe\xcd\x00",
            # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
            # https://github.com/python/cpython/issues/78661
            b"<![n\x00",
            b"<![UNKNOWN[]]>",
        ]
        for markup in bad_markup:
            with pytest.raises(ParserRejectedMarkup):
                self.soup(markup)

    def test_namespaced_system_doctype(self):
        # html.parser can't handle namespaced doctypes, so skip this one.
        pass

    def test_namespaced_public_doctype(self):
        # html.parser can't handle namespaced doctypes, so skip this one.
        pass

    def test_builder_is_pickled(self):
        """Unlike most tree builders, HTMLParserTreeBuilder can be
        pickled and will be restored after pickling.
        """
        tree = self.soup("<a><b>foo</a>")
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        assert isinstance(loaded.builder, type(tree.builder))

    def test_redundant_empty_element_closing_tags(self):
        """Closing tags for void elements such as <br> are dropped."""
        self.assert_soup("<br></br><br></br><br></br>", "<br/><br/><br/>")
        self.assert_soup("</br></br></br>", "")

    def test_empty_element(self):
        # This verifies that any buffered data present when the parser
        # finishes working is handled. The dangling "&#" is not a
        # character reference, so its ampersand is escaped on output.
        self.assert_soup("foo &# bar", "foo &amp;# bar")

    def test_tracking_line_numbers(self):
        # The html.parser TreeBuilder keeps track of line number and
        # position of each element.
        markup = "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
        soup = self.soup(markup)
        assert 2 == soup.p.sourceline
        assert 3 == soup.p.sourcepos
        # A tag that happens to be named "sourceline" is still reachable
        # through find(), even though the attribute of that name holds a
        # line number.
        assert "sourceline" == soup.p.find("sourceline").name

        # You can deactivate this behavior.
        soup = self.soup(markup, store_line_numbers=False)
        assert soup.p.sourceline is None
        assert soup.p.sourcepos is None

    def test_on_duplicate_attribute(self):
        # The html.parser tree builder has a variety of ways of
        # handling a tag that contains the same attribute multiple times.

        markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'

        # If you don't provide any particular value for
        # on_duplicate_attribute, later values replace earlier values.
        soup = self.soup(markup)
        assert "url3" == soup.a["href"]
        assert ["cls"] == soup.a["class"]
        assert "id" == soup.a["id"]

        # You can also get this behavior explicitly.
        def assert_attribute(
            on_duplicate_attribute: _DuplicateAttributeHandler, expected: Any
        ) -> None:
            """Parse `markup` with the given handler and check the href."""
            soup = self.soup(markup, on_duplicate_attribute=on_duplicate_attribute)
            assert soup.a is not None
            assert expected == soup.a["href"]

            # Verify that non-duplicate attributes are treated normally.
            assert ["cls"] == soup.a["class"]
            assert "id" == soup.a["id"]

        assert_attribute(None, "url3")
        assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")

        # You can ignore subsequent values in favor of the first.
        assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")

        # And you can pass in a callable that does whatever you want.
        def accumulate(attrs, key, value):
            """Collect every duplicate value for `key` into a list."""
            if not isinstance(attrs[key], list):
                attrs[key] = [attrs[key]]
            attrs[key].append(value)

        assert_attribute(accumulate, ["url1", "url2", "url3"])

    def test_html5_attributes(self):
        # The html.parser TreeBuilder can convert any entity named in
        # the HTML5 spec to a sequence of Unicode characters, and
        # convert those Unicode characters to a (potentially
        # different) named entity on the way out.
        #
        # Each row is (entity in the markup, the Unicode text it parses
        # to, the bytes emitted by formatter="html").
        for input_element, output_unicode, output_element in (
            ("&RightArrowLeftArrow;", "\u21c4", b"&rlarr;"),
            ("&models;", "\u22a7", b"&models;"),
            ("&Nfr;", "\U0001d511", b"&Nfr;"),
            ("&ngeqq;", "\u2267\u0338", b"&ngeqq;"),
            ("&not;", "\xac", b"&not;"),
            ("&Not;", "\u2aec", b"&Not;"),
            ("&quot;", '"', b'"'),
            ("&there4;", "\u2234", b"&there4;"),
            ("&Therefore;", "\u2234", b"&there4;"),
            ("&therefore;", "\u2234", b"&there4;"),
            ("&fjlig;", "fj", b"fj"),
            ("&sqcup;", "\u2294", b"&sqcup;"),
            ("&sqcups;", "\u2294\ufe00", b"&sqcups;"),
            ("&apos;", "'", b"'"),
            ("&verbar;", "|", b"|"),
        ):
            markup = "<div>%s</div>" % input_element
            div = self.soup(markup).div

            # Default output: the raw Unicode characters, UTF-8 encoded.
            without_element = div.encode()
            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
            assert without_element == expect

            # formatter="html": characters are turned back into entities.
            with_element = div.encode(formatter="html")
            expect = b"<div>%s</div>" % output_element
            assert with_element == expect

    def test_invalid_html_entity(self):
        # The html.parser treebuilder can't distinguish between an invalid
        # HTML entity with a semicolon and an invalid HTML entity with no
        # semicolon. Either way the ampersand is escaped on output and
        # the semicolon, if any, is lost.
        markup = "<p>a &nosuchentity b</p>"
        soup = self.soup(markup)
        assert "<p>a &amp;nosuchentity b</p>" == soup.p.decode()

        markup = "<p>a &nosuchentity; b</p>"
        soup = self.soup(markup)
        assert "<p>a &amp;nosuchentity b</p>" == soup.p.decode()