about summary refs log tree commit diff
path: root/.venv/lib/python3.12/site-packages/bs4/tests/test_htmlparser.py
diff options
context:
space:
mode:
author    S. Solomon Darnell    2025-03-28 21:52:21 -0500
committer S. Solomon Darnell    2025-03-28 21:52:21 -0500
commit    4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree      ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/bs4/tests/test_htmlparser.py
parent    cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
downloadgn-ai-master.tar.gz
two version of R2R are here HEAD master
Diffstat (limited to '.venv/lib/python3.12/site-packages/bs4/tests/test_htmlparser.py')
-rw-r--r--.venv/lib/python3.12/site-packages/bs4/tests/test_htmlparser.py161
1 files changed, 161 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_htmlparser.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_htmlparser.py
new file mode 100644
index 00000000..b2bd07fc
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_htmlparser.py
@@ -0,0 +1,161 @@
+"""Tests to ensure that the html.parser tree builder generates good
+trees."""
+
+import pickle
+import pytest
+from bs4.builder._htmlparser import (
+    _DuplicateAttributeHandler,
+    BeautifulSoupHTMLParser,
+    HTMLParserTreeBuilder,
+)
+from bs4.exceptions import ParserRejectedMarkup
+from typing import Any
+from . import HTMLTreeBuilderSmokeTest
+
+
class TestHTMLParserTreeBuilder(HTMLTreeBuilderSmokeTest):
    """Shared smoke tests plus checks specific to the html.parser builder."""

    default_builder = HTMLParserTreeBuilder

    def test_rejected_input(self):
        """Markup that html.parser refuses must raise ParserRejectedMarkup.

        Different Python versions signal the failure in different ways,
        but Beautiful Soup normalizes every one of them to a single
        exception type.
        """
        rejected_examples = (
            # https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
            # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
            # https://github.com/python/cpython/issues/81928
            b"\n<![\xff\xfe\xfe\xcd\x00",
            # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
            # https://github.com/python/cpython/issues/78661
            #
            b"<![n\x00",
            b"<![UNKNOWN[]]>",
        )
        for example in rejected_examples:
            with pytest.raises(ParserRejectedMarkup):
                self.soup(example)

    def test_namespaced_system_doctype(self):
        # Namespaced doctypes are beyond html.parser's abilities; skip
        # this inherited smoke test.
        pass

    def test_namespaced_public_doctype(self):
        # Namespaced doctypes are beyond html.parser's abilities; skip
        # this inherited smoke test.
        pass

    def test_builder_is_pickled(self):
        """Unlike most tree builders, an HTMLParserTreeBuilder survives a
        pickle round trip along with its tree.
        """
        original = self.soup("<a><b>foo</a>")
        clone = pickle.loads(pickle.dumps(original, 2))
        assert isinstance(clone.builder, type(original.builder))

    def test_redundant_empty_element_closing_tags(self):
        # Closing tags for void elements are dropped rather than kept.
        self.assert_soup("<br></br><br></br><br></br>", "<br/><br/><br/>")
        self.assert_soup("</br></br></br>", "")

    def test_empty_element(self):
        # Data still buffered when the parser finishes must not be lost.
        self.assert_soup("foo &# bar", "foo &amp;# bar")

    def test_tracking_line_numbers(self):
        # The html.parser builder records the line number and position
        # at which each element began.
        markup = "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
        with_numbers = self.soup(markup)
        assert 2 == with_numbers.p.sourceline
        assert 3 == with_numbers.p.sourcepos
        # Real tags named like the tracking attributes are unaffected.
        assert "sourceline" == with_numbers.p.find("sourceline").name

        # The tracking can be switched off.
        without_numbers = self.soup(markup, store_line_numbers=False)
        assert None is without_numbers.p.sourceline
        assert None is without_numbers.p.sourcepos

    def test_on_duplicate_attribute(self):
        # The html.parser tree builder offers several policies for a tag
        # that repeats the same attribute.
        markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'

        # With no explicit policy, later values replace earlier ones.
        default_soup = self.soup(markup)
        assert "url3" == default_soup.a["href"]
        assert ["cls"] == default_soup.a["class"]
        assert "id" == default_soup.a["id"]

        def check(handler: _DuplicateAttributeHandler, expected: Any) -> None:
            # Parse under the given policy, then verify the duplicated
            # attribute plus the untouched ones.
            parsed = self.soup(markup, on_duplicate_attribute=handler)
            assert parsed.a is not None
            assert expected == parsed.a["href"]

            # Non-duplicate attributes behave normally either way.
            assert ["cls"] == parsed.a["class"]
            assert "id" == parsed.a["id"]

        # "Last value wins" can also be requested explicitly.
        check(None, "url3")
        check(BeautifulSoupHTMLParser.REPLACE, "url3")

        # Or the first value can be kept and the rest ignored.
        check(BeautifulSoupHTMLParser.IGNORE, "url1")

        # Or an arbitrary callable can decide what happens.
        def gather(attrs, key, value):
            if not isinstance(attrs[key], list):
                attrs[key] = [attrs[key]]
            attrs[key].append(value)

        check(gather, ["url1", "url2", "url3"])

    def test_html5_attributes(self):
        # Any entity named in the HTML5 spec decodes to a sequence of
        # Unicode characters, and those characters re-encode to a
        # (possibly different) named entity on the way out.
        cases = [
            ("&RightArrowLeftArrow;", "\u21c4", b"&rlarr;"),
            ("&models;", "\u22a7", b"&models;"),
            ("&Nfr;", "\U0001d511", b"&Nfr;"),
            ("&ngeqq;", "\u2267\u0338", b"&ngeqq;"),
            ("&not;", "\xac", b"&not;"),
            ("&Not;", "\u2aec", b"&Not;"),
            ("&quot;", '"', b'"'),
            ("&there4;", "\u2234", b"&there4;"),
            ("&Therefore;", "\u2234", b"&there4;"),
            ("&therefore;", "\u2234", b"&there4;"),
            ("&fjlig;", "fj", b"fj"),
            ("&sqcup;", "\u2294", b"&sqcup;"),
            ("&sqcups;", "\u2294\ufe00", b"&sqcups;"),
            ("&apos;", "'", b"'"),
            ("&verbar;", "|", b"|"),
        ]
        for entity, unicode_text, reencoded in cases:
            div = self.soup("<div>%s</div>" % entity).div

            # Default encoding emits the raw Unicode characters.
            assert div.encode() == b"<div>%s</div>" % unicode_text.encode("utf8")

            # The "html" formatter emits a named entity instead.
            assert div.encode(formatter="html") == b"<div>%s</div>" % reencoded

    def test_invalid_html_entity(self):
        # html.parser cannot distinguish an invalid HTML entity followed
        # by a semicolon from one without, so both decode identically.
        markup = "<p>a &nosuchentity b</p>"
        soup = self.soup(markup)
        assert "<p>a &amp;nosuchentity b</p>" == soup.p.decode()

        markup = "<p>a &nosuchentity; b</p>"
        soup = self.soup(markup)
        assert "<p>a &amp;nosuchentity b</p>" == soup.p.decode()