two version of R2R are here HEAD master

author: S. Solomon Darnell 2025-03-28 21:52:21 -0500
committer: S. Solomon Darnell 2025-03-28 21:52:21 -0500
commit: 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree: ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/bs4/tests/test_soup.py
parent: cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download: gn-ai-master.tar.gz
1 files changed, 602 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_soup.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_soup.py
new file mode 100644
index 00000000..5f771a40
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_soup.py
@@ -0,0 +1,602 @@
+# -*- coding: utf-8 -*-
+"""Tests of Beautiful Soup as a whole."""
+
+import logging
+import pickle
+import pytest
+from typing import Iterable
+
+from bs4 import (
+    BeautifulSoup,
+    GuessedAtParserWarning,
+    dammit,
+)
+from bs4.builder import (
+    TreeBuilder,
+)
+from bs4.element import (
+    AttributeValueList,
+    XMLAttributeDict,
+    Comment,
+    PYTHON_SPECIFIC_ENCODINGS,
+    Tag,
+    NavigableString,
+)
+from bs4.filter import SoupStrainer
+from bs4.exceptions import (
+    ParserRejectedMarkup,
+)
+from bs4._warnings import (
+    MarkupResemblesLocatorWarning,
+)
+
+
+from . import (
+    default_builder,
+    LXML_PRESENT,
+    SoupTest,
+)
+import warnings
+from typing import Type
+
+
+class TestConstructor(SoupTest):
+    def test_short_unicode_input(self):
+        data = "<h1>éé</h1>"
+        soup = self.soup(data)
+        assert "éé" == soup.h1.string
+
+    def test_embedded_null(self):
+        data = "<h1>foo\0bar</h1>"
+        soup = self.soup(data)
+        assert "foo\0bar" == soup.h1.string
+
+    def test_exclude_encodings(self):
+        utf8_data = "Räksmörgås".encode("utf-8")
+        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
+        assert "windows-1252" == soup.original_encoding
+
+    def test_custom_builder_class(self):
+        # Verify that you can pass in a custom Builder class and
+        # it'll be instantiated with the appropriate keyword arguments.
+        class Mock:
+            def __init__(self, **kwargs):
+                self.called_with = kwargs
+                self.is_xml = True
+                self.store_line_numbers = False
+                self.cdata_list_attributes = []
+                self.preserve_whitespace_tags = []
+                self.string_containers = {}
+                self.attribute_dict_class = XMLAttributeDict
+                self.attribute_value_list_class = AttributeValueList
+
+            def initialize_soup(self, soup):
+                pass
+
+            def feed(self, markup):
+                self.fed = markup
+
+            def reset(self):
+                pass
+
+            def ignore(self, ignore):
+                pass
+
+            set_up_substitutions = can_be_empty_element = ignore
+
+            def prepare_markup(self, *args, **kwargs):
+                yield (
+                    "prepared markup",
+                    "original encoding",
+                    "declared encoding",
+                    "contains replacement characters",
+                )
+
+        kwargs = dict(
+            var="value",
+            # This is a deprecated BS3-era keyword argument, which
+            # will be stripped out.
+            convertEntities=True,
+        )
+        with warnings.catch_warnings(record=True):
+            soup = BeautifulSoup("", builder=Mock, **kwargs)
+        assert isinstance(soup.builder, Mock)
+        assert dict(var="value") == soup.builder.called_with
+        assert "prepared markup" == soup.builder.fed
+
+        # You can also instantiate the TreeBuilder yourself. In this
+        # case, that specific object is used and any keyword arguments
+        # to the BeautifulSoup constructor are ignored.
+        builder = Mock(**kwargs)
+        with warnings.catch_warnings(record=True) as w:
+            soup = BeautifulSoup(
+                "",
+                builder=builder,
+                ignored_value=True,
+            )
+        msg = str(w[0].message)
+        assert msg.startswith(
+            "Keyword arguments to the BeautifulSoup constructor will be ignored."
+        )
+        assert builder == soup.builder
+        assert kwargs == builder.called_with
+
+    def test_parser_markup_rejection(self):
+        # If markup is completely rejected by the parser, an
+        # explanatory ParserRejectedMarkup exception is raised.
+        class Mock(TreeBuilder):
+            def feed(self, *args, **kwargs):
+                raise ParserRejectedMarkup("Nope.")
+
+            def prepare_markup(self, markup, *args, **kwargs):
+                # We're going to try two different ways of preparing this markup,
+                # but feed() will reject both of them.
+                yield markup, None, None, False
+                yield markup, None, None, False
+
+        # feed() rejects every prepared variant, so the constructor must raise.
+        with pytest.raises(ParserRejectedMarkup) as exc_info:
+            BeautifulSoup("", builder=Mock)
+        assert (
+            "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help."
+            in str(exc_info.value)
+        )
+
+    def test_cdata_list_attributes(self):
+        # Most attribute values are represented as scalars, but the
+        # HTML standard says that some attributes, like 'class' have
+        # space-separated lists as values.
+        markup = '<a id=" an id " class=" a class "></a>'
+        soup = self.soup(markup)
+
+        # Note that the spaces are stripped for 'class' but not for 'id'.
+        a = soup.a
+        assert " an id " == a["id"]
+        assert ["a", "class"] == a["class"]
+
+        # TreeBuilder takes an argument called 'multi_valued_attributes'  which lets
+        # you customize or disable this. As always, you can customize the TreeBuilder
+        # by passing in a keyword argument to the BeautifulSoup constructor.
+        soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
+        assert " a class " == soup.a["class"]
+
+        # Here are two ways of saying that `id` is a multi-valued
+        # attribute in this context, but 'class' is not.
+        for switcheroo in ({"*": "id"}, {"a": "id"}):
+            with warnings.catch_warnings(record=True):
+                # This will create a warning about not explicitly
+                # specifying a parser, but we'll ignore it.
+                soup = self.soup(
+                    markup, builder=None, multi_valued_attributes=switcheroo
+                )
+            a = soup.a
+            assert ["an", "id"] == a["id"]
+            assert " a class " == a["class"]
+
+    def test_replacement_classes(self):
+        # Test the ability to pass in replacements for element classes
+        # which will be used when building the tree.
+        class TagPlus(Tag):
+            pass
+
+        class StringPlus(NavigableString):
+            pass
+
+        class CommentPlus(Comment):
+            pass
+
+        soup = self.soup(
+            "<a><b>foo</b>bar</a><!--whee-->",
+            element_classes={
+                Tag: TagPlus,
+                NavigableString: StringPlus,
+                Comment: CommentPlus,
+            },
+        )
+
+        # The tree was built with TagPlus, StringPlus, and CommentPlus objects,
+        # rather than Tag, String, and Comment objects.
+        assert all(
+            isinstance(x, (TagPlus, StringPlus, CommentPlus)) for x in soup.descendants
+        )
+
+    def test_alternate_string_containers(self):
+        # Test the ability to customize the string containers for
+        # different types of tags.
+        class PString(NavigableString):
+            pass
+
+        class BString(NavigableString):
+            pass
+
+        soup = self.soup(
+            "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
+            string_containers={
+                "b": BString,
+                "p": PString,
+            },
+        )
+
+        # The string before the <p> tag is a regular NavigableString.
+        assert isinstance(soup.div.contents[0], NavigableString)
+
+        # The string inside the <p> tag, but not inside the <i> tag,
+        # is a PString.
+        assert isinstance(soup.p.contents[0], PString)
+
+        # Every string inside the <b> tag is a BString, even the one that
+        # was also inside an <i> tag.
+        for s in soup.b.strings:
+            assert isinstance(s, BString)
+
+        # Now that parsing was complete, the string_container_stack
+        # (where this information was kept) has been cleared out.
+        assert [] == soup.string_container_stack
+
+    @pytest.mark.parametrize("bad_markup", [1, False, lambda x: False])
+    def test_invalid_markup_type(self, bad_markup):
+        with pytest.raises(TypeError) as exc_info:
+            BeautifulSoup(bad_markup, "html.parser")
+        assert (
+            f"Incoming markup is of an invalid type: {bad_markup!r}. Markup must be a string, a bytestring, or an open filehandle."
+            in str(exc_info.value)
+        )
+
+
+class TestOutput(SoupTest):
+    @pytest.mark.parametrize(
+        "eventual_encoding,actual_encoding",
+        [
+            ("utf-8", "utf-8"),
+            ("utf-16", "utf-16"),
+        ],
+    )
+    def test_decode_xml_declaration(self, eventual_encoding, actual_encoding):
+        # Most of the time, calling decode() on an XML document will
+        # give you a document declaration that mentions the encoding
+        # you intend to use when encoding the document as a
+        # bytestring.
+        soup = self.soup("<tag></tag>")
+        soup.is_xml = True
+        assert (
+            f'<?xml version="1.0" encoding="{actual_encoding}"?>\n<tag></tag>'
+            == soup.decode(eventual_encoding=eventual_encoding)
+        )
+
+    @pytest.mark.parametrize(
+        "eventual_encoding", list(PYTHON_SPECIFIC_ENCODINGS) + [None]
+    )
+    def test_decode_xml_declaration_with_missing_or_python_internal_eventual_encoding(
+        self, eventual_encoding
+    ):
+        # But if you pass a Python internal encoding into decode(), or
+        # omit the eventual_encoding altogether, the document
+        # declaration won't mention any particular encoding.
+        soup = BeautifulSoup("<tag></tag>", "html.parser")
+        soup.is_xml = True
+        assert '<?xml version="1.0"?>\n<tag></tag>' == soup.decode(
+            eventual_encoding=eventual_encoding
+        )
+
+    def test(self):
+        # BeautifulSoup subclasses Tag and extends the decode() method.
+        # Make sure the other Tag methods which call decode() call
+        # it correctly.
+        soup = self.soup("<tag></tag>")
+        assert b"<tag></tag>" == soup.encode(encoding="utf-8")
+        assert b"<tag></tag>" == soup.encode_contents(encoding="utf-8")
+        assert "<tag></tag>" == soup.decode_contents()
+        assert "<tag>\n</tag>\n" == soup.prettify()
+
+
+class TestWarnings(SoupTest):
+    # Note that some of the tests in this class create BeautifulSoup objects
+    # directly rather than using self.soup(). That's because SoupTest.soup is
+    # defined in a different file, which will throw off the assertion in
+    # _assert_warning that the code that triggered the warning is in the
+    # same file as the test.
+
+    def _assert_warning(
+        self, warnings: Iterable[warnings.WarningMessage], cls: Type[Warning]
+    ) -> warnings.WarningMessage:
+        for w in warnings:
+            if isinstance(w.message, cls):
+                assert w.filename == __file__
+                return w
+        raise Exception("%s warning not found in %r" % (cls, warnings))
+
+    def _assert_no_parser_specified(self, w: Iterable[warnings.WarningMessage]) -> None:
+        warning = self._assert_warning(w, GuessedAtParserWarning)
+        message = str(warning.message)
+        assert message.startswith(GuessedAtParserWarning.MESSAGE[:60])
+
+    def test_warning_if_no_parser_specified(self):
+        with warnings.catch_warnings(record=True) as w:
+            BeautifulSoup("<a><b></b></a>")
+        self._assert_no_parser_specified(w)
+
+    def test_warning_if_parser_specified_too_vague(self):
+        with warnings.catch_warnings(record=True) as w:
+            BeautifulSoup("<a><b></b></a>", "html")
+        self._assert_no_parser_specified(w)
+
+    def test_no_warning_if_explicit_parser_specified(self):
+        with warnings.catch_warnings(record=True) as w:
+            self.soup("<a><b></b></a>")
+        assert [] == w
+
+    def test_warning_if_strainer_filters_everything(self):
+        strainer = SoupStrainer(name="a", string="b")
+        with warnings.catch_warnings(record=True) as w:
+            self.soup("<a><b></b></a>", parse_only=strainer)
+        warning = self._assert_warning(w, UserWarning)
+        msg = str(warning.message)
+        assert msg.startswith("The given value for parse_only will exclude everything:")
+
+    def test_parseOnlyThese_renamed_to_parse_only(self):
+        with warnings.catch_warnings(record=True) as w:
+            soup = BeautifulSoup(
+                "<a><b></b></a>",
+                "html.parser",
+                parseOnlyThese=SoupStrainer("b"),
+            )
+        warning = self._assert_warning(w, DeprecationWarning)
+        msg = str(warning.message)
+        assert "parseOnlyThese" in msg
+        assert "parse_only" in msg
+        assert b"<b></b>" == soup.encode()
+
+    def test_fromEncoding_renamed_to_from_encoding(self):
+        with warnings.catch_warnings(record=True) as w:
+            utf8 = b"\xc3\xa9"
+            soup = BeautifulSoup(utf8, "html.parser", fromEncoding="utf8")
+        warning = self._assert_warning(w, DeprecationWarning)
+        msg = str(warning.message)
+        assert "fromEncoding" in msg
+        assert "from_encoding" in msg
+        assert "utf8" == soup.original_encoding
+
+    def test_unrecognized_keyword_argument(self):
+        with pytest.raises(TypeError):
+            self.soup("<a>", no_such_argument=True)
+
+    @pytest.mark.parametrize(
+        "markup",
+        [
+            "markup.html",
+            "markup.htm",
+            "markup.HTML",
+            "markup.txt",
+            "markup.xhtml",
+            "markup.xml",
+            "/home/user/file.txt",
+            r"c:\user\file.html",
+            r"\\server\share\path\file.XhTml",
+        ],
+    )
+    def test_resembles_filename_warning(self, markup):
+        # A warning is issued if the "markup" looks like the name of
+        # an HTML or text file, or a full path to a file on disk.
+        with warnings.catch_warnings(record=True) as w:
+            BeautifulSoup(markup, "html.parser")
+            warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
+            assert "looks more like a filename" in str(warning.message)
+
+    @pytest.mark.parametrize(
+        "markup",
+        [
+            "filename",
+            "markuphtml",
+            "markup.com",
+            "",
+            # Excluded due to an irrelevant file extension.
+            "markup.js",
+            "markup.jpg",
+            "markup.markup",
+            # Excluded due to the lack of any file extension.
+            "/home/user/file",
+            # NOTE(review): the two literals below concatenate into ONE case.
+            r"c:\user\file.html" r"\\server\share\path\file",
+            # Excluded because of two consecutive slashes _and_ the colon.
+            "log message containing a url http://www.url.com/ right there.html",
+            # Excluded for containing various characters or combinations
+            # not usually found in filenames.
+            "two  consecutive  spaces.html",
+            "two//consecutive//slashes.html",
+            "looks/like/a/filename/but/oops/theres/a#comment.html",
+            "two\nlines.html",
+            "contains?.html",
+            "contains*.html",
+            "contains#.html",
+            "contains&.html",
+            "contains;.html",
+            "contains>.html",
+            "contains<.html",
+            "contains$.html",
+            "contains|.html",
+            "contains:.html",
+            ":-at-the-front.html",
+        ],
+    )
+    def test_resembles_filename_no_warning(self, markup):
+        # The 'looks more like a filename' warning is not issued if
+        # the markup looks like a bare string, a domain name, or a
+        # file that's not an HTML file.
+        with warnings.catch_warnings(record=True) as w:
+            self.soup(markup)
+        assert [] == w
+
+    def test_url_warning_with_bytes_url(self):
+        url = b"http://www.crummybytes.com/"
+        with warnings.catch_warnings(record=True) as warning_list:
+            BeautifulSoup(url, "html.parser")
+        warning = self._assert_warning(warning_list, MarkupResemblesLocatorWarning)
+        assert "looks more like a URL" in str(warning.message)
+        assert url not in str(warning.message).encode("utf8")
+
+    def test_url_warning_with_unicode_url(self):
+        url = "http://www.crummyunicode.com/"
+        with warnings.catch_warnings(record=True) as warning_list:
+            # note - this url must differ from the bytes one otherwise
+            # python's warnings system swallows the second warning
+            BeautifulSoup(url, "html.parser")
+        warning = self._assert_warning(warning_list, MarkupResemblesLocatorWarning)
+        assert "looks more like a URL" in str(warning.message)
+        assert url not in str(warning.message)
+
+    def test_url_warning_with_bytes_and_space(self):
+        # Here the markup contains something besides a URL, so no warning
+        # is issued.
+        with warnings.catch_warnings(record=True) as warning_list:
+            self.soup(b"http://www.crummybytes.com/ is great")
+        assert not any("looks more like a URL" in str(w.message) for w in warning_list)
+
+    def test_url_warning_with_unicode_and_space(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            self.soup("http://www.crummyunicode.com/ is great")
+        assert not any("looks more like a URL" in str(w.message) for w in warning_list)
+
+
+class TestSelectiveParsing(SoupTest):
+    def test_parse_with_soupstrainer(self):
+        # Only the <b> tags, along with their contents, end up in the output.
+        document = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
+        filtered = self.soup(document, parse_only=SoupStrainer("b"))
+        assert filtered.encode() == b"<b>Yes</b><b>Yes <c>Yes</c></b>"
+
+
+class TestNewTag(SoupTest):
+    """Test the BeautifulSoup.new_tag() method."""
+
+    def test_new_tag(self):
+        soup = self.soup("")
+        new_tag = soup.new_tag("foo", string="txt", bar="baz", attrs={"name": "a name"})
+        assert isinstance(new_tag, Tag)
+        assert "foo" == new_tag.name
+        assert new_tag.string == "txt"
+        assert dict(bar="baz", name="a name") == new_tag.attrs
+        assert None is new_tag.parent
+
+        # The string can be omitted or passed explicitly as None.
+        new_tag = soup.new_tag("foo")
+        assert None is new_tag.string
+        new_tag = soup.new_tag("foo", string=None)
+        assert None is new_tag.string
+
+        # Or it can be the empty string.
+        new_tag = soup.new_tag("foo", string="")
+        assert "" == new_tag.string
+
+    @pytest.mark.skipif(
+        not LXML_PRESENT, reason="lxml not installed, cannot parse XML document"
+    )
+    def test_xml_tag_inherits_self_closing_rules_from_builder(self):
+        xml_soup = BeautifulSoup("", "xml")
+        xml_br = xml_soup.new_tag("br")
+        xml_p = xml_soup.new_tag("p")
+
+        # Both the <br> and <p> tags are empty-element tags, just because
+        # they have no contents.
+        assert b"<br/>" == xml_br.encode()
+        assert b"<p/>" == xml_p.encode()
+
+    def test_tag_inherits_self_closing_rules_from_builder(self):
+        html_soup = BeautifulSoup("", "html.parser")
+        html_br = html_soup.new_tag("br")
+        html_p = html_soup.new_tag("p")
+
+        # The HTML builder uses HTML's rules about which tags are
+        # empty-element tags, and the new tags reflect these rules.
+        assert b"<br/>" == html_br.encode()
+        assert b"<p></p>" == html_p.encode()
+
+
+class TestNewString(SoupTest):
+    """Test the BeautifulSoup.new_string() method."""
+
+    def test_new_string_creates_navigablestring(self):
+        # Called with only the text, new_string() yields a NavigableString.
+        created = self.soup("").new_string("foo")
+        assert "foo" == created
+        assert isinstance(created, NavigableString)
+
+    def test_new_string_can_create_navigablestring_subclass(self):
+        # Passing Comment as the second argument yields a Comment instead.
+        created = self.soup("").new_string("foo", Comment)
+        assert "foo" == created
+        assert isinstance(created, Comment)
+
+
+class TestPickle(SoupTest):
+    """Test our ability to pickle the BeautifulSoup object itself."""
+
+    def test_normal_pickle(self):
+        soup = self.soup("<a>some markup</a>")
+        pickled = pickle.dumps(soup)
+        unpickled = pickle.loads(pickled)
+        assert "some markup" == unpickled.a.string
+
+    def test_pickle_with_no_builder(self):
+        # Regression test: we had a bug that prevented pickling from
+        # working if the builder wasn't set.
+        soup = self.soup("some markup")
+        soup.builder = None
+        pickled = pickle.dumps(soup)
+        unpickled = pickle.loads(pickled)
+        assert "some markup" == unpickled.string
+
+
+class TestEncodingConversion(SoupTest):
+    # Test Beautiful Soup's ability to decode and encode from various
+    # encodings.
+
+    def setup_method(self):
+        self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
+        self.utf8_data = self.unicode_data.encode("utf-8")
+        # Just so you know what it looks like.
+        assert (
+            self.utf8_data
+            == b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>'
+        )
+
+    def test_ascii_in_unicode_out(self):
+        # ASCII input is converted to Unicode. The original_encoding
+        # attribute is set to 'utf-8', a superset of ASCII.
+        chardet = dammit._chardet_dammit
+        logging.disable(logging.WARNING)
+        try:
+
+            def noop(markup):
+                return None
+
+            # Disable chardet, which will realize that the ASCII is ASCII.
+            dammit._chardet_dammit = noop
+            ascii_markup = b"<foo>a</foo>"
+            soup_from_ascii = self.soup(ascii_markup)
+            unicode_output = soup_from_ascii.decode()
+            assert isinstance(unicode_output, str)
+            assert unicode_output == self.document_for(ascii_markup.decode())
+            assert soup_from_ascii.original_encoding.lower() == "utf-8"
+        finally:
+            logging.disable(logging.NOTSET)
+            dammit._chardet_dammit = chardet
+
+    def test_unicode_in_unicode_out(self):
+        # Unicode input is left alone. The original_encoding attribute
+        # is not set.
+        soup_from_unicode = self.soup(self.unicode_data)
+        assert soup_from_unicode.decode() == self.unicode_data
+        assert soup_from_unicode.foo.string == "Sacr\xe9 bleu!"
+        assert soup_from_unicode.original_encoding is None
+
+    def test_utf8_in_unicode_out(self):
+        # UTF-8 input is converted to Unicode. The original_encoding
+        # attribute is set.
+        soup_from_utf8 = self.soup(self.utf8_data)
+        assert soup_from_utf8.decode() == self.unicode_data
+        assert soup_from_utf8.foo.string == "Sacr\xe9 bleu!"
+
+    def test_utf8_out(self):
+        # The internal data structures can be encoded as UTF-8.
+        soup_from_unicode = self.soup(self.unicode_data)
+        assert soup_from_unicode.encode("utf-8") == self.utf8_data
author	S. Solomon Darnell	2025-03-28 21:52:21 -0500
committer	S. Solomon Darnell	2025-03-28 21:52:21 -0500
commit	4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree	ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/bs4/tests/test_soup.py
parent	cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download	gn-ai-master.tar.gz