From 4a52a71956a8d46fcb7294ac71734504bb09bcc2 Mon Sep 17 00:00:00 2001 From: S. Solomon Darnell Date: Fri, 28 Mar 2025 21:52:21 -0500 Subject: two version of R2R are here --- .../site-packages/bs4/tests/test_pageelement.py | 437 +++++++++++++++++++++ 1 file changed, 437 insertions(+) create mode 100644 .venv/lib/python3.12/site-packages/bs4/tests/test_pageelement.py (limited to '.venv/lib/python3.12/site-packages/bs4/tests/test_pageelement.py') diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_pageelement.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_pageelement.py new file mode 100644 index 00000000..91d57792 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_pageelement.py @@ -0,0 +1,437 @@ +"""Tests of the bs4.element.PageElement class""" + +import copy +import pickle +import pytest +import sys +import warnings + +from bs4 import BeautifulSoup +from bs4.element import ( + AttributeValueList, + Comment, +) +from bs4.filter import SoupStrainer +from . import ( + SoupTest, +) + + +class TestEncoding(SoupTest): + """Test the ability to encode objects into strings.""" + + def test_unicode_string_can_be_encoded(self): + html = "\N{SNOWMAN}" + soup = self.soup(html) + assert soup.b.string.encode("utf-8") == "\N{SNOWMAN}".encode("utf-8") + + def test_tag_containing_unicode_string_can_be_encoded(self): + html = "\N{SNOWMAN}" + soup = self.soup(html) + assert soup.b.encode("utf-8") == html.encode("utf-8") + + def test_encoding_substitutes_unrecognized_characters_by_default(self): + html = "\N{SNOWMAN}" + soup = self.soup(html) + assert soup.b.encode("ascii") == b"" + + def test_encoding_can_be_made_strict(self): + html = "\N{SNOWMAN}" + soup = self.soup(html) + with pytest.raises(UnicodeEncodeError): + soup.encode("ascii", errors="strict") + + def test_decode_contents(self): + html = "\N{SNOWMAN}" + soup = self.soup(html) + assert "\N{SNOWMAN}" == soup.b.decode_contents() + + def test_encode_contents(self): + html = "\N{SNOWMAN}" + soup = self.soup(html) + assert "\N{SNOWMAN}".encode("utf8") == soup.b.encode_contents(encoding="utf8") + + def test_encode_deeply_nested_document(self): + # This test verifies that encoding a string doesn't involve + # any recursive function calls. If it did, this test would + # overflow the Python interpreter stack. + limit = sys.getrecursionlimit() + 1 + markup = "" * limit + soup = self.soup(markup) + encoded = soup.encode() + assert limit == encoded.count(b"") + + def test_deprecated_renderContents(self): + html = "\N{SNOWMAN}" + soup = self.soup(html) + with warnings.catch_warnings(record=True) as w: + soup.renderContents() + assert "\N{SNOWMAN}".encode("utf8") == soup.b.renderContents() + msgs = [str(warning.message) for warning in w] + assert all( + x + == "Call to deprecated method renderContents. (Replaced by encode_contents) -- Deprecated since version 4.0.0." + for x in msgs + ) + + def test_repr(self): + html = "\N{SNOWMAN}" + soup = self.soup(html) + assert html == repr(soup) + + +class TestFormatters(SoupTest): + """Test the formatting feature, used by methods like decode() and + prettify(), and the formatters themselves. + """ + + def test_default_formatter_is_minimal(self): + markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter="minimal") + # The < is converted back into < but the e-with-acute is left alone. + assert decoded == self.document_for( + "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + ) + + def test_formatter_html(self): + markup = ( + "
<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + ) + soup = self.soup(markup) + decoded = soup.decode(formatter="html") + assert decoded == self.document_for( + "
<<Sacré bleu!>>" + ) + + def test_formatter_html5(self): + markup = ( + "
<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + ) + soup = self.soup(markup) + decoded = soup.decode(formatter="html5") + assert decoded == self.document_for( + "
<<Sacré bleu!>>" + ) + + def test_formatter_minimal(self): + markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter="minimal") + # The < is converted back into < but the e-with-acute is left alone. + assert decoded == self.document_for( + "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + ) + + def test_formatter_null(self): + markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter=None) + # Neither the angle brackets nor the e-with-acute are converted. + # This is not valid HTML, but it's what the user wanted. + assert decoded == self.document_for( + "<>" + ) + + def test_formatter_custom(self): + markup = "<foo>bar
" + soup = self.soup(markup) + decoded = soup.decode(formatter=lambda x: x.upper()) + # Instead of normal entity conversion code, the custom + # callable is called on every string. + assert decoded == self.document_for("BAR
") + + def test_formatter_is_run_on_attribute_values(self): + markup = 'e' + soup = self.soup(markup) + a = soup.a + + expect_minimal = 'e' + + assert expect_minimal == a.decode() + assert expect_minimal == a.decode(formatter="minimal") + + expect_html = 'e' + assert expect_html == a.decode(formatter="html") + + assert markup == a.decode(formatter=None) + expect_upper = 'E' + assert expect_upper == a.decode(formatter=lambda x: x.upper()) + + def test_formatter_skips_script_tag_for_html_documents(self): + doc = """ + +""" + encoded = BeautifulSoup(doc, "html.parser").encode() + assert b"< < hey > >" in encoded + + def test_formatter_skips_style_tag_for_html_documents(self): + doc = """ + +""" + encoded = BeautifulSoup(doc, "html.parser").encode() + assert b"< < hey > >" in encoded + + def test_prettify_leaves_preformatted_text_alone(self): + soup = self.soup( + "
foo
  \tbar\n  \n  
baz
" + ) + # Everything outside the
 tag is reformatted, but everything
+        # inside is left alone.
+        assert (
+            "
\n foo\n
  \tbar\n  \n  
\n baz\n \n
\n" + == soup.div.prettify() + ) + + def test_prettify_handles_nested_string_literal_tags(self): + # Most of this markup is inside a
 tag, so prettify()
+        # only does three things to it:
+        # 1. Add a newline and a space between the 
and the
+        # 2. Add a newline after the 
+ # 3. Add a newline at the end. + # + # The contents of the
 tag are left completely alone.  In
+        # particular, we don't start adding whitespace again once we
+        # encounter the first 
tag, because we know it's not + # the one that put us into string literal mode. + markup = """
some
+ for you 
+
""" + + expect = """
+
some
+ for you 
+
+
+""" + soup = self.soup(markup) + assert expect == soup.div.prettify() + + def test_prettify_accepts_formatter_function(self): + soup = BeautifulSoup("foo", "html.parser") + pretty = soup.prettify(formatter=lambda x: x.upper()) + assert "FOO" in pretty + + def test_prettify_outputs_unicode_by_default(self): + soup = self.soup("") + assert str is type(soup.prettify()) + + def test_prettify_can_encode_data(self): + soup = self.soup("") + assert bytes is type(soup.prettify("utf-8")) + + def test_html_entity_substitution_off_by_default(self): + markup = "Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!" + soup = self.soup(markup) + encoded = soup.b.encode("utf-8") + assert encoded == markup.encode("utf-8") + + def test_encoding_substitution(self): + # Here's the tag saying that a document is + # encoded in Shift-JIS. + meta_tag = ( + '' + ) + soup = self.soup(meta_tag) + + # Parse the document, and the charset apprears unchanged. + assert soup.meta["content"] == "text/html; charset=x-sjis" + + # Encode the document into some encoding, and the encoding is + # substituted into the meta tag. + utf_8 = soup.encode("utf-8") + assert b"charset=utf-8" in utf_8 + + euc_jp = soup.encode("euc_jp") + assert b"charset=euc_jp" in euc_jp + + shift_jis = soup.encode("shift-jis") + assert b"charset=shift-jis" in shift_jis + + utf_16_u = soup.encode("utf-16").decode("utf-16") + assert "charset=utf-16" in utf_16_u + + def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self): + markup = ( + '
foo
' + ) + + # Beautiful Soup used to try to rewrite the meta tag even if the + # meta tag got filtered out by the strainer. This test makes + # sure that doesn't happen. + strainer = SoupStrainer("pre") + soup = self.soup(markup, parse_only=strainer) + assert soup.contents[0].name == "pre" + + +class TestPersistence(SoupTest): + "Testing features like pickle and deepcopy." + + def setup_method(self): + self.page = """ + + + +Beautiful Soup: We called him Tortoise because he taught us. + + + + + + +foo +bar + +""" + self.tree = self.soup(self.page) + + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + dumped = pickle.dumps(self.tree, 2) + loaded = pickle.loads(dumped) + assert loaded.__class__ == BeautifulSoup + assert loaded.decode() == self.tree.decode() + + def test_deepcopy_identity(self): + # Making a deepcopy of a tree yields an identical tree. + copied = copy.deepcopy(self.tree) + assert copied.decode() == self.tree.decode() + + def test_copy_deeply_nested_document(self): + # This test verifies that copy and deepcopy don't involve any + # recursive function calls. If they did, this test would + # overflow the Python interpreter stack. + limit = sys.getrecursionlimit() + 1 + markup = "" * limit + + soup = self.soup(markup) + + copy.copy(soup) + copy.deepcopy(soup) + + def test_copy_preserves_encoding(self): + soup = BeautifulSoup(b"

 

", "html.parser") + encoding = soup.original_encoding + copy = soup.__copy__() + assert "

 

" == str(copy) + assert encoding == copy.original_encoding + + def test_copy_preserves_builder_information(self): + tag = self.soup("

").p + + # Simulate a tag obtained from a source file. + tag.sourceline = 10 + tag.sourcepos = 33 + + copied = tag.__copy__() + + # The TreeBuilder object is no longer availble, but information + # obtained from it gets copied over to the new Tag object. + assert tag.sourceline == copied.sourceline + assert tag.sourcepos == copied.sourcepos + assert tag.can_be_empty_element == copied.can_be_empty_element + assert tag.cdata_list_attributes == copied.cdata_list_attributes + assert tag.preserve_whitespace_tags == copied.preserve_whitespace_tags + assert tag.interesting_string_types == copied.interesting_string_types + + def test_unicode_pickle(self): + # A tree containing Unicode characters can be pickled. + html = "\N{SNOWMAN}" + soup = self.soup(html) + dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) + loaded = pickle.loads(dumped) + assert loaded.decode() == soup.decode() + + def test_copy_navigablestring_is_not_attached_to_tree(self): + html = "FooBar" + soup = self.soup(html) + s1 = soup.find(string="Foo") + s2 = copy.copy(s1) + assert s1 == s2 + assert None is s2.parent + assert None is s2.next_element + assert None is not s1.next_sibling + assert None is s2.next_sibling + assert None is s2.previous_element + + def test_copy_navigablestring_subclass_has_same_type(self): + html = "" + soup = self.soup(html) + s1 = soup.string + s2 = copy.copy(s1) + assert s1 == s2 + assert isinstance(s2, Comment) + + def test_copy_entire_soup(self): + html = "
FooBar
end" + soup = self.soup(html) + soup_copy = copy.copy(soup) + assert soup == soup_copy + + def test_copy_tag_copies_contents(self): + html = "
FooBar
end" + soup = self.soup(html) + div = soup.div + div_copy = copy.copy(div) + + # The two tags look the same, and evaluate to equal. + assert str(div) == str(div_copy) + assert div == div_copy + + # But they're not the same object. + assert div is not div_copy + + # And they don't have the same relation to the parse tree. The + # copy is not associated with a parse tree at all. + assert None is div_copy.parent + assert None is div_copy.previous_element + assert None is div_copy.find(string="Bar").next_element + assert None is not div.find(string="Bar").next_element + + # Modifying one of the tag's multi-valued attributes + # doesn't modify the other. + assert div["class"] is not div_copy["class"] + div["class"].append("d") + assert "a b c d".split() == div["class"] + assert "a b c".split() == div_copy["class"] + assert isinstance(div_copy["class"], AttributeValueList) + + +class TestEquality(SoupTest): + + def test_comparison(self): + soup = self.soup("string string") + first_a, second_a = soup.find_all('a') + first_string, second_string = soup.find_all(string='string') + + # Tags with the same markup are equal. + assert first_a == second_a + + # NavigableStrings with the same content are equal, and also + # equal to a Python string with the same content... + assert first_string == second_string == "string" + + # ...but not equivalent to a bytestring with the same content. + assert first_string != b"string" + + def test_hash(self): + soup = self.soup("string string") + first_a, second_a = soup.find_all('a') + first_string, second_string = soup.find_all(string='string') + + # Tags with the same markup hash to the same value. + assert hash(first_a) == hash(second_a) + + # But they're not the same object. + assert id(first_a) != id(second_a) + + # NavigableStrings with the same contents hash to the value of + # the contents. + assert hash(first_string) == hash(second_string) == hash("string") -- cgit v1.2.3