two versions of R2R are here HEAD master

author: S. Solomon Darnell 2025-03-28 21:52:21 -0500
committer: S. Solomon Darnell 2025-03-28 21:52:21 -0500
commit: 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree: ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/bs4/tests/test_lxml.py
parent: cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download: gn-ai-master.tar.gz
1 files changed, 196 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_lxml.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_lxml.py
new file mode 100644
index 00000000..04a0ee88
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_lxml.py
@@ -0,0 +1,196 @@
+"""Tests to ensure that the lxml tree builder generates good trees."""
+
+import pickle
+import pytest
+import warnings
+from . import LXML_PRESENT, LXML_VERSION
+
+if LXML_PRESENT:
+    from bs4.builder._lxml import LXMLTreeBuilder, LXMLTreeBuilderForXML
+
+from bs4 import (
+    BeautifulStoneSoup,
+)
+from . import (
+    HTMLTreeBuilderSmokeTest,
+    XMLTreeBuilderSmokeTest,
+    SOUP_SIEVE_PRESENT,
+)
+
+
+@pytest.mark.skipif(
+    not LXML_PRESENT,
+    reason="lxml seems not to be present, not testing its tree builder.",
+)
+class TestLXMLTreeBuilder(HTMLTreeBuilderSmokeTest):
+    """See ``HTMLTreeBuilderSmokeTest``."""
+
+    @property
+    def default_builder(self):
+        return LXMLTreeBuilder
+
+    def test_out_of_range_entity(self):
+        self.assert_soup("<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
+        self.assert_soup("<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
+        self.assert_soup("<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
+
+    def test_entities_in_foreign_document_encoding(self):
+        # We can't implement this case correctly because by the time we
+        # hear about markup like "&#147;", it's been (incorrectly) converted into
+        # a string like u'\x93'
+        pass
+
+    # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
+    # test if an old version of lxml is installed.
+
+    @pytest.mark.skipif(
+        not LXML_PRESENT or LXML_VERSION < (2, 3, 5, 0),
+        reason="Skipping doctype test for old version of lxml to avoid segfault.",
+    )
+    def test_empty_doctype(self):
+        soup = self.soup("<!DOCTYPE>")
+        doctype = soup.contents[0]
+        assert "" == doctype.strip()
+
+    def test_beautifulstonesoup_is_xml_parser(self):
+        # Make sure that the deprecated BSS class uses an xml builder
+        # if one is installed.
+        with warnings.catch_warnings(record=True) as w:
+            soup = BeautifulStoneSoup("<b />")
+        assert "<b/>" == str(soup.b)
+        [warning] = w
+        assert warning.filename == __file__
+        assert "The BeautifulStoneSoup class was deprecated" in str(warning.message)
+
+    def test_tracking_line_numbers(self):
+        # The lxml TreeBuilder cannot keep track of line numbers from
+        # the original markup. Even if you ask for line numbers, we
+        # don't have 'em.
+        #
+        # However, for consistency with other parsers, Tag.sourceline
+        # and Tag.sourcepos are always set to None, rather than acting
+        # as find() aliases for the <sourceline>/<sourcepos> child tags.
+        soup = self.soup(
+            "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>",
+            store_line_numbers=True,
+        )
+        assert None is soup.p.sourceline
+        assert None is soup.p.sourcepos
+
+
+@pytest.mark.skipif(
+    not LXML_PRESENT,
+    reason="lxml seems not to be present, not testing its XML tree builder.",
+)
+class TestLXMLXMLTreeBuilder(XMLTreeBuilderSmokeTest):
+    """See ``XMLTreeBuilderSmokeTest``."""
+
+    @property
+    def default_builder(self):
+        return LXMLTreeBuilderForXML
+
+    def test_namespace_indexing(self):
+        soup = self.soup(
+            '<?xml version="1.1"?>\n'
+            "<root>"
+            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
+            '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</prefix:tag2>'
+            '<prefix2:tag3 xmlns:prefix2="http://another-namespace.com">'
+            '<subtag xmlns="http://another-unprefixed-namespace.com">'
+            '<subsubtag xmlns="http://yet-another-unprefixed-namespace.com">'
+            "</prefix2:tag3>"
+            "</root>"
+        )
+
+        # The BeautifulSoup object includes every namespace prefix
+        # defined in the entire document. This is the default set of
+        # namespaces used by soupsieve.
+        #
+        # Un-prefixed namespaces are not included, and if a given
+        # prefix is defined twice, only the first prefix encountered
+        # in the document shows up here.
+        assert soup._namespaces == {
+            "xml": "http://www.w3.org/XML/1998/namespace",
+            "prefix": "http://prefixed-namespace.com",
+            "prefix2": "http://another-namespace.com",
+        }
+
+        # A Tag object includes only the namespace prefixes
+        # that were in scope when it was parsed.
+
+        # We do not track un-prefixed namespaces as we can only hold
+        # one (the first one), and it will be recognized as the
+        # default namespace by soupsieve, even when operating from a
+        # tag with a different un-prefixed namespace.
+        assert soup.tag._namespaces == {
+            "xml": "http://www.w3.org/XML/1998/namespace",
+        }
+
+        assert soup.tag2._namespaces == {
+            "prefix": "http://prefixed-namespace.com",
+            "xml": "http://www.w3.org/XML/1998/namespace",
+        }
+
+        assert soup.subtag._namespaces == {
+            "prefix2": "http://another-namespace.com",
+            "xml": "http://www.w3.org/XML/1998/namespace",
+        }
+
+        assert soup.subsubtag._namespaces == {
+            "prefix2": "http://another-namespace.com",
+            "xml": "http://www.w3.org/XML/1998/namespace",
+        }
+
+    @pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed")
+    def test_namespace_interaction_with_select_and_find(self):
+        # Demonstrate how namespaces interact with select* and
+        # find* methods.
+
+        soup = self.soup(
+            '<?xml version="1.1"?>\n'
+            "<root>"
+            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
+            '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</tag>'
+            '<subtag xmlns:prefix="http://another-namespace-same-prefix.com">'
+            "<prefix:tag3>"
+            "</subtag>"
+            "</root>"
+        )
+
+        # soupsieve uses namespace URIs.
+        assert soup.select_one("tag").name == "tag"
+        assert soup.select_one("prefix|tag2").name == "tag2"
+
+        # If a prefix is declared more than once, only the first usage
+        # is registered with the BeautifulSoup object.
+        assert soup.select_one("prefix|tag3") is None
+
+        # But you can always explicitly specify a namespace dictionary.
+        assert (
+            soup.select_one("prefix|tag3", namespaces=soup.subtag._namespaces).name
+            == "tag3"
+        )
+
+        # And a Tag (as opposed to the BeautifulSoup object) will
+        # have a set of default namespaces scoped to that Tag.
+        assert soup.subtag.select_one("prefix|tag3").name == "tag3"
+
+        # the find() methods aren't fully namespace-aware; they just
+        # look at prefixes.
+        assert soup.find("tag").name == "tag"
+        assert soup.find("prefix:tag2").name == "tag2"
+        assert soup.find("prefix:tag3").name == "tag3"
+        assert soup.subtag.find("prefix:tag3").name == "tag3"
+
+    def test_pickle_restores_builder(self):
+        # The lxml TreeBuilder is not picklable, so when unpickling
+        # a document created with it, a new TreeBuilder of the
+        # appropriate class is created.
+        soup = self.soup("<a>some markup</a>")
+        assert isinstance(soup.builder, self.default_builder)
+        pickled = pickle.dumps(soup)
+        unpickled = pickle.loads(pickled)
+
+        assert "some markup" == unpickled.a.string
+        assert unpickled.builder != soup.builder
+        assert isinstance(unpickled.builder, self.default_builder)
author	S. Solomon Darnell	2025-03-28 21:52:21 -0500
committer	S. Solomon Darnell	2025-03-28 21:52:21 -0500
commit	4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree	ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/bs4/tests/test_lxml.py
parent	cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download	gn-ai-master.tar.gz