From 4a52a71956a8d46fcb7294ac71734504bb09bcc2 Mon Sep 17 00:00:00 2001
From: S. Solomon Darnell
Date: Fri, 28 Mar 2025 21:52:21 -0500
Subject: two version of R2R are here

---
 .../site-packages/bs4/tests/test_tree.py | 1452 ++++++++++++++++++++
 1 file changed, 1452 insertions(+)
 create mode 100644 .venv/lib/python3.12/site-packages/bs4/tests/test_tree.py

diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_tree.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_tree.py
new file mode 100644
index 00000000..06d62981
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_tree.py
@@ -0,0 +1,1452 @@
+# -*- coding: utf-8 -*-
+"""Tests for Beautiful Soup's tree traversal methods.
+
+The tree traversal methods are the main advantage of using Beautiful
+Soup over just using a parser.
+
+Different parsers will build different Beautiful Soup trees given the
+same markup, but all Beautiful Soup trees can be traversed with the
+methods tested here.
+"""
+
+import pytest
+import re
+import warnings
+from bs4 import BeautifulSoup
+from bs4.builder import builder_registry
+from bs4.element import (
+    AttributeResemblesVariableWarning,
+    CData,
+    Comment,
+    NavigableString,
+    Tag,
+)
+from bs4.filter import SoupStrainer
+from . import (
+    SoupTest,
+)
+
+
+class TestFind(SoupTest):
+    """Basic tests of the find() method.
+    """
+
+    def test_find_tag(self):
+        soup = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>")
+        assert soup.find("b").string == "2"
+
+    def test_unicode_text_find(self):
+        soup = self.soup("<div></div><p>text</p>")
+        assert "div" == soup.find().name
+        assert "div" == soup.find("p").find_previous_sibling().name
+        assert "p" == soup.find("div").find_next_sibling().name
+
+
+class TestFindAll(SoupTest):
+    """Basic tests of the find_all() method."""
+
+    def test_find_all_with_no_arguments_only_finds_tags(self):
+        soup = self.soup("<div>text</div><p>text</p>")
") + assert 2 == len(soup.body.find_all()) + assert 1 == len(soup.find("p").find_previous_siblings()) + assert 1 == len(soup.find("div").find_next_siblings()) + + def test_find_all_text_nodes(self): + """You can search the tree for text nodes.""" + soup = self.soup("Foobar\xbb") + # Exact match. + assert soup.find_all(string="bar") == ["bar"] + + # Match any of a number of strings. + assert soup.find_all(string=["Foo", "bar"]) == ["Foo", "bar"] + # Match a regular expression. + assert soup.find_all(string=re.compile(".*")) == ["Foo", "bar", "\xbb"] + # Match anything. + assert soup.find_all(string=True) == ["Foo", "bar", "\xbb"] + + def test_find_all_limit(self): + """You can limit the number of items returned by find_all.""" + soup = self.soup("12345") + self.assert_selects(soup.find_all("a", limit=3), ["1", "2", "3"]) + self.assert_selects(soup.find_all("a", limit=1), ["1"]) + self.assert_selects(soup.find_all("a", limit=10), ["1", "2", "3", "4", "5"]) + + # A limit of 0 means no limit. + self.assert_selects(soup.find_all("a", limit=0), ["1", "2", "3", "4", "5"]) + + def test_calling_a_tag_is_calling_findall(self): + soup = self.soup("123") + self.assert_selects(soup("a", limit=1), ["1"]) + self.assert_selects(soup.b(id="foo"), ["3"]) + + def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion( + self, + ): + soup = self.soup("") + # Create a self-referential list. + selfref = [] + selfref.append(selfref) + + # Without special code in SoupStrainer, this would cause infinite + # recursion. + with warnings.catch_warnings(record=True) as w: + assert [] == soup.find_all(selfref) + [warning] = w + assert warning.filename == __file__ + msg = str(warning.message) + assert ( + msg + == "Ignoring nested list [[...]] to avoid the possibility of infinite recursion." + ) + + def test_find_all_resultset(self): + """All find_all calls return a ResultSet""" + soup = self.soup("") + result = soup.find_all("a") + assert hasattr(result, "source") + + result = soup.find_all(True) + assert hasattr(result, "source") + + result = soup.find_all(string="foo") + assert hasattr(result, "source") + + +class TestFindAllBasicNamespaces(SoupTest): + def test_find_by_namespaced_name(self): + soup = self.soup('Don't leave me here.
+Don\'t leave!
""" + soup = self.soup(doc) + second_para = soup.find(id="2") + bold = soup.b + + # Move the tag to the end of the second paragraph. + soup.find(id="2").append(soup.b) + + # The tag is now a child of the second paragraph. + assert bold.parent == second_para + + assert soup.decode() == self.document_for( + 'Don\'t leave me .
\n' 'Don\'t leave!here
' + ) + + def test_insertion_returns_inserted_things(self): + soup = self.soup("") + html = soup.find('html') + head = html.append(soup.new_tag('head')) + assert head.name == 'head' + + [title] = head.insert(0, soup.new_tag('title')) + assert title.name == 'title' + + text5 = title.append('5') + assert text5 == '5' + text34 = text5.insert_before('3', '4') + assert text34 == ['3', '4'] + text67 = text5.insert_after('6', '7') + assert text67 == ['6', '7'] + text89 = title.extend(['8', '9']) + assert text89 == ['8', '9'] + assert title.get_text() == '3456789' + + def test_replace_with_returns_thing_that_was_replaced(self): + text = "And now, a word:
And we're back.
") + p2, p3 = soup.insert(1, soup.new_tag("p", string="p2"), soup.new_tag("p", string="p3")) + assert "p2" == p2.string + assert "p3" == p3.string + + p1, p2, p3, p4 = list(soup.children) + assert "And now, a word:" == p1.string + assert "p2" == p2.string + assert "p3" == p3.string + assert "And we're back." == p4.string + + def test_insert_beautifulsoup_object_inserts_children(self): + """Inserting one BeautifulSoup object into another actually inserts all + of its children -- you'll never combine BeautifulSoup objects. + """ + soup = self.soup("And now, a word:
And we're back.
") + + text = "p2
p3
" + to_insert = self.soup(text) + p2, p3 = soup.insert(1, to_insert) + assert "p2" == p2.string + assert "p3" == p3.string + + for i in soup.descendants: + assert not isinstance(i, BeautifulSoup) + + p1, p2, p3, p4 = list(soup.children) + assert "And now, a word:" == p1.string + assert "p2" == p2.string + assert "p3" == p3.string + assert "And we're back." == p4.string + + def test_replace_with_maintains_next_element_throughout(self): + soup = self.soup("onethree
") + a = soup.a + # Make it so the tag has two text children. + a.insert(1, "two") + + # Now replace each one with the empty string. + left, right = a.contents + left.replace_with("") + right.replace_with("") + + # The tag is still connected to the tree. + assert "three" == soup.b.string + + def test_replace_final_node(self): + soup = self.soup("Argh!") + soup.find(string="Argh!").replace_with("Hooray!") + new_text = soup.find(string="Hooray!") + b = soup.b + assert new_text.previous_element == b + assert new_text.parent == b + assert new_text.previous_element.next_element == new_text + assert new_text.next_element is None + + def test_consecutive_text_nodes(self): + # A builder should never create two consecutive text nodes, + # but if you insert one next to another, Beautiful Soup will + # handle it correctly. + soup = self.soup("Argh!There's no business like show business
") + no, show = soup.find_all("b") + show.replace_with(no) + assert soup.decode() == self.document_for( + "There's business like no business
" + ) + + assert show.parent is None + assert no.parent == soup.p + assert no.next_element == "no" + assert no.next_sibling == " business" + + def test_replace_with_errors(self): + # Can't replace a tag that's not part of a tree. + a_tag = Tag(name="a") + with pytest.raises(ValueError): + a_tag.replace_with("won't work") + + # Can't replace a tag with its parent. + a_tag = self.soup("").a + with pytest.raises(ValueError): + a_tag.b.replace_with(a_tag) + + # Or with a list that includes its parent. + with pytest.raises(ValueError): + a_tag.b.replace_with("string1", a_tag, "string2") + + def test_replace_with_multiple(self): + data = "Unneeded formatting is unneeded
+ """) + tree.em.unwrap() + assert tree.em is None + assert tree.p.text == "Unneeded formatting is unneeded" + + def test_wrap(self): + soup = self.soup("I wish I was bold.") + value = soup.string.wrap(soup.new_tag("b")) + assert value.decode() == "I wish I was bold." + assert soup.decode() == self.document_for("I wish I was bold.") + + def test_wrap_extracts_tag_from_elsewhere(self): + soup = self.soup("I wish I was bold.") + soup.b.next_sibling.wrap(soup.b) + assert soup.decode() == self.document_for("I wish I was bold.") + + def test_wrap_puts_new_contents_at_the_end(self): + soup = self.soup("I like being bold.I wish I was bold.") + soup.b.next_sibling.wrap(soup.b) + assert 2 == len(soup.b.contents) + assert soup.decode() == self.document_for( + "I like being bold.I wish I was bold." + ) + + def test_extract(self): + soup = self.soup( + 'Some content. More content.' + ) + + assert len(soup.body.contents) == 3 + extracted = soup.find(id="nav").extract() + + assert soup.decode() == "Some content. More content." + assert extracted.decode() == ' ' + + # The extracted tag is now an orphan. + assert len(soup.body.contents) == 2 + assert extracted.parent is None + assert extracted.previous_element is None + assert extracted.next_element.next_element is None + + # The gap where the extracted tag used to be has been mended. + content_1 = soup.find(string="Some content. ") + content_2 = soup.find(string=" More content.") + assert content_1.next_element == content_2 + assert content_1.next_sibling == content_2 + assert content_2.previous_element == content_1 + assert content_2.previous_sibling == content_1 + + def test_extract_distinguishes_between_identical_strings(self): + soup = self.soup("foobar") + foo_1 = soup.a.string + foo_2 = soup.new_string("foo") + bar_2 = soup.new_string("bar") + soup.a.append(foo_2) + soup.b.append(bar_2) + + # Now there are two identical strings in the tag, and two + # in the tag. Let's remove the first "foo" and the second + # "bar". + foo_1.extract() + bar_2.extract() + assert foo_2 == soup.a.string + assert bar_2 == soup.b.string + + def test_extract_multiples_of_same_tag(self): + soup = self.soup(""" + + + + + + + + + +""") + [soup.script.extract() for i in soup.find_all("script")] + assert "\n\n\n" == str(soup.body) + + def test_extract_works_when_element_is_surrounded_by_identical_strings(self): + soup = self.soup("\n" "hi\n" "") + soup.find("body").extract() + assert None is soup.find("body") + + def test_clear(self): + """Tag.clear()""" + soup = self.soup("String Italicized and another
") + # clear using extract() + a = soup.a + soup.p.clear() + assert len(soup.p.contents) == 0 + assert hasattr(a, "contents") + + # clear using decompose() + em = a.em + a.clear(decompose=True) + assert 0 == len(em.contents) + + @pytest.mark.parametrize( + "method_name,expected_result", + [ + ( + "descendants", + 'child3
tag from the tree removes all of its children from the tree as well. 'child'.next_element becomes None, because 'child' is no longer in the tree, and iteration stops there. Don't do this kind of thing, is what I'm saying. + ( + "next_elements", + '
child3
child3
child3
child3
Another para
") + p1, p2 = soup.find_all("p") + a = p1.a + text = p1.em.string + for i in [p1, p2, a, text]: + assert False is i.decomposed + + # This sets p1 and everything beneath it to decomposed. + p1.decompose() + for i in [p1, a, text]: + assert True is i.decomposed + # p2 is unaffected. + assert False is p2.decomposed + + def test_decompose_string(self): + soup = self.soup("String 1
String 2
") + div = soup.div + text = div.p.string + assert False is text.decomposed + text.decompose() + assert True is text.decomposed + assert "String 2