about | summary | refs | log | tree | commit | diff
path: root/.venv/lib/python3.12/site-packages/bs4/tests/test_lxml.py
diff options
context:
space:
mode:
Diffstat (limited to '.venv/lib/python3.12/site-packages/bs4/tests/test_lxml.py')
-rw-r--r-- .venv/lib/python3.12/site-packages/bs4/tests/test_lxml.py | 196
1 files changed, 196 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_lxml.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_lxml.py
new file mode 100644
index 00000000..04a0ee88
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_lxml.py
@@ -0,0 +1,196 @@
+"""Tests to ensure that the lxml tree builder generates good trees."""
+
+import pickle
+import pytest
+import warnings
+from . import LXML_PRESENT, LXML_VERSION
+
+if LXML_PRESENT:
+ from bs4.builder._lxml import LXMLTreeBuilder, LXMLTreeBuilderForXML
+
+from bs4 import (
+ BeautifulStoneSoup,
+)
+from . import (
+ HTMLTreeBuilderSmokeTest,
+ XMLTreeBuilderSmokeTest,
+ SOUP_SIEVE_PRESENT,
+)
+
+
+@pytest.mark.skipif(
+ not LXML_PRESENT,
+ reason="lxml seems not to be present, not testing its tree builder.",
+)
+class TestLXMLTreeBuilder(HTMLTreeBuilderSmokeTest):
+ """Run the ``HTMLTreeBuilderSmokeTest`` suite against ``LXMLTreeBuilder``.
+
+ The whole class is skipped when ``LXML_PRESENT`` is false. The methods
+ defined here are the lxml-specific cases.
+ """
+
+ @property
+ def default_builder(self):
+ # The tree builder class under test.
+ return LXMLTreeBuilder
+
+ def test_out_of_range_entity(self):
+ # Each call pairs markup containing a numeric character reference
+ # (decimal or hexadecimal) far above the Unicode maximum of 0x10FFFF
+ # with the expected output, in which the reference is gone and the
+ # surrounding text remains.
+ self.assert_soup("<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
+ self.assert_soup("<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
+ self.assert_soup("<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
+
+ def test_entities_in_foreign_document_encoding(self):
+ # We can't implement this case correctly because by the time we
+ # hear about markup like "&#147;", it's been (incorrectly) converted into
+ # a string like u'\x93'
+ #
+ # NOTE(review): presumably this no-op replaces an inherited smoke test
+ # of the same name -- confirm against HTMLTreeBuilderSmokeTest.
+ pass
+
+ # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
+ # test if an old version of lxml is installed.
+
+ @pytest.mark.skipif(
+ not LXML_PRESENT or LXML_VERSION < (2, 3, 5, 0),
+ reason="Skipping doctype test for old version of lxml to avoid segfault.",
+ )
+ def test_empty_doctype(self):
+ # The doctype is expected to be the first node of the parsed
+ # document and to contain nothing but whitespace.
+ soup = self.soup("<!DOCTYPE>")
+ doctype = soup.contents[0]
+ assert "" == doctype.strip()
+
+ def test_beautifulstonesoup_is_xml_parser(self):
+ # Make sure that the deprecated BSS class uses an xml builder
+ # if one is installed.
+ with warnings.catch_warnings(record=True) as w:
+ soup = BeautifulStoneSoup("<b />")
+ # The empty element is expected to serialize in self-closing form.
+ assert "<b/>" == str(soup.b)
+ # Exactly one warning must have been recorded; the unpacking fails
+ # otherwise.
+ [warning] = w
+ # The warning must be attributed to this file, which is the caller of
+ # BeautifulStoneSoup.
+ assert warning.filename == __file__
+ assert "The BeautifulStoneSoup class was deprecated" in str(warning.message)
+
+ def test_tracking_line_numbers(self):
+ # The lxml TreeBuilder cannot keep track of line numbers from
+ # the original markup. Even if you ask for line numbers, we
+ # don't have 'em.
+ #
+ # However, for consistency with other parsers, Tag.sourceline
+ # and Tag.sourcepos are always set to None, rather than being
+ # available as an alias for find(). The markup deliberately
+ # contains <sourceline> and <sourcepos> tags, which such an alias
+ # would find.
+ soup = self.soup(
+ "\n <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>",
+ store_line_numbers=True,
+ )
+ assert None is soup.p.sourceline
+ assert None is soup.p.sourcepos
+
+
+@pytest.mark.skipif(
+ not LXML_PRESENT,
+ reason="lxml seems not to be present, not testing its XML tree builder.",
+)
+class TestLXMLXMLTreeBuilder(XMLTreeBuilderSmokeTest):
+ """Run the ``XMLTreeBuilderSmokeTest`` suite against ``LXMLTreeBuilderForXML``.
+
+ The whole class is skipped when ``LXML_PRESENT`` is false. The methods
+ defined here cover namespace bookkeeping and pickling.
+ """
+
+ @property
+ def default_builder(self):
+ # The tree builder class under test.
+ return LXMLTreeBuilderForXML
+
+ def test_namespace_indexing(self):
+ # NOTE(review): <subtag> and <subsubtag> below are never closed, so
+ # this fixture is not well-formed XML; presumably the test relies on
+ # the parser recovering -- confirm before "fixing" the markup.
+ soup = self.soup(
+ '<?xml version="1.1"?>\n'
+ "<root>"
+ '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
+ '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</prefix:tag2>'
+ '<prefix2:tag3 xmlns:prefix2="http://another-namespace.com">'
+ '<subtag xmlns="http://another-unprefixed-namespace.com">'
+ '<subsubtag xmlns="http://yet-another-unprefixed-namespace.com">'
+ "</prefix2:tag3>"
+ "</root>"
+ )
+
+ # The BeautifulSoup object includes every namespace prefix
+ # defined in the entire document. This is the default set of
+ # namespaces used by soupsieve.
+ #
+ # Un-prefixed namespaces are not included, and if a given
+ # prefix is defined twice, only the first prefix encountered
+ # in the document shows up here.
+ assert soup._namespaces == {
+ "xml": "http://www.w3.org/XML/1998/namespace",
+ "prefix": "http://prefixed-namespace.com",
+ "prefix2": "http://another-namespace.com",
+ }
+
+ # A Tag object includes only the namespace prefixes
+ # that were in scope when it was parsed.
+
+ # We do not track un-prefixed namespaces as we can only hold
+ # one (the first one), and it will be recognized as the
+ # default namespace by soupsieve, even when operating from a
+ # tag with a different un-prefixed namespace.
+ assert soup.tag._namespaces == {
+ "xml": "http://www.w3.org/XML/1998/namespace",
+ }
+
+ assert soup.tag2._namespaces == {
+ "prefix": "http://prefixed-namespace.com",
+ "xml": "http://www.w3.org/XML/1998/namespace",
+ }
+
+ # <subtag> and <subsubtag> sit inside <prefix2:tag3>, so "prefix2" is
+ # in scope for both; their own un-prefixed namespaces are not listed.
+ assert soup.subtag._namespaces == {
+ "prefix2": "http://another-namespace.com",
+ "xml": "http://www.w3.org/XML/1998/namespace",
+ }
+
+ assert soup.subsubtag._namespaces == {
+ "prefix2": "http://another-namespace.com",
+ "xml": "http://www.w3.org/XML/1998/namespace",
+ }
+
+ @pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed")
+ def test_namespace_interaction_with_select_and_find(self):
+ # Demonstrate how namespaces interact with select* and
+ # find* methods.
+
+ # NOTE(review): this fixture is not well-formed XML: <prefix:tag2> is
+ # closed by </tag> and <prefix:tag3> is never closed. Presumably the
+ # test relies on the parser recovering -- confirm before "fixing" it.
+ soup = self.soup(
+ '<?xml version="1.1"?>\n'
+ "<root>"
+ '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
+ '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</tag>'
+ '<subtag xmlns:prefix="http://another-namespace-same-prefix.com">'
+ "<prefix:tag3>"
+ "</subtag>"
+ "</root>"
+ )
+
+ # The select* methods (Soup Sieve) resolve prefixes to namespace URIs.
+ assert soup.select_one("tag").name == "tag"
+ assert soup.select_one("prefix|tag2").name == "tag2"
+
+ # If a prefix is declared more than once, only the first usage
+ # is registered with the BeautifulSoup object.
+ assert soup.select_one("prefix|tag3") is None
+
+ # But you can always explicitly specify a namespace dictionary.
+ assert (
+ soup.select_one("prefix|tag3", namespaces=soup.subtag._namespaces).name
+ == "tag3"
+ )
+
+ # And a Tag (as opposed to the BeautifulSoup object) will
+ # have a set of default namespaces scoped to that Tag.
+ assert soup.subtag.select_one("prefix|tag3").name == "tag3"
+
+ # The find() methods aren't fully namespace-aware; they just
+ # look at prefixes.
+ assert soup.find("tag").name == "tag"
+ assert soup.find("prefix:tag2").name == "tag2"
+ assert soup.find("prefix:tag3").name == "tag3"
+ assert soup.subtag.find("prefix:tag3").name == "tag3"
+
+ def test_pickle_restores_builder(self):
+ # The lxml TreeBuilder is not picklable, so when unpickling
+ # a document created with it, a new TreeBuilder of the
+ # appropriate class is created.
+ soup = self.soup("<a>some markup</a>")
+ assert isinstance(soup.builder, self.default_builder)
+ pickled = pickle.dumps(soup)
+ unpickled = pickle.loads(pickled)
+
+ assert "some markup" == unpickled.a.string
+ # The restored document's builder does not compare equal to the
+ # original's, but it is an instance of the same builder class.
+ assert unpickled.builder != soup.builder
+ assert isinstance(unpickled.builder, self.default_builder)