From 4a52a71956a8d46fcb7294ac71734504bb09bcc2 Mon Sep 17 00:00:00 2001
From: S. Solomon Darnell
Date: Fri, 28 Mar 2025 21:52:21 -0500
Subject: two version of R2R are here

---
 .../site-packages/bs4/tests/test_tree.py | 1452 ++++++++++++++++++++
 1 file changed, 1452 insertions(+)
 create mode 100644 .venv/lib/python3.12/site-packages/bs4/tests/test_tree.py

diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_tree.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_tree.py
new file mode 100644
index 00000000..06d62981
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_tree.py
@@ -0,0 +1,1452 @@
+# -*- coding: utf-8 -*-
+"""Tests for Beautiful Soup's tree traversal methods.
+
+The tree traversal methods are the main advantage of using Beautiful
+Soup over just using a parser.
+
+Different parsers will build different Beautiful Soup trees given the
+same markup, but all Beautiful Soup trees can be traversed with the
+methods tested here.
+"""
+
+import pytest
+import re
+import warnings
+from bs4 import BeautifulSoup
+from bs4.builder import builder_registry
+from bs4.element import (
+    AttributeResemblesVariableWarning,
+    CData,
+    Comment,
+    NavigableString,
+    Tag,
+)
+from bs4.filter import SoupStrainer
+from . import (
+    SoupTest,
+)
+
+
+class TestFind(SoupTest):
+    """Basic tests of the find() method.
+    """
+
+    def test_find_tag(self):
+        soup = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>")
+        assert soup.find("b").string == "2"
+
+    def test_unicode_text_find(self):
+        soup = self.soup("<div></div><p>text</p>")
+        assert "div" == soup.find().name
+        assert "div" == soup.find("p").find_previous_sibling().name
+        assert "p" == soup.find("div").find_next_sibling().name
+
+
+class TestFindAll(SoupTest):
+    """Basic tests of the find_all() method."""
+
+    def test_find_all_with_no_arguments_only_finds_tags(self):
+        soup = self.soup("<div>text</div><p>text</p>")
") + assert 2 == len(soup.body.find_all()) + assert 1 == len(soup.find("p").find_previous_siblings()) + assert 1 == len(soup.find("div").find_next_siblings()) + + def test_find_all_text_nodes(self): + """You can search the tree for text nodes.""" + soup = self.soup("Foobar\xbb") + # Exact match. + assert soup.find_all(string="bar") == ["bar"] + + # Match any of a number of strings. + assert soup.find_all(string=["Foo", "bar"]) == ["Foo", "bar"] + # Match a regular expression. + assert soup.find_all(string=re.compile(".*")) == ["Foo", "bar", "\xbb"] + # Match anything. + assert soup.find_all(string=True) == ["Foo", "bar", "\xbb"] + + def test_find_all_limit(self): + """You can limit the number of items returned by find_all.""" + soup = self.soup("12345") + self.assert_selects(soup.find_all("a", limit=3), ["1", "2", "3"]) + self.assert_selects(soup.find_all("a", limit=1), ["1"]) + self.assert_selects(soup.find_all("a", limit=10), ["1", "2", "3", "4", "5"]) + + # A limit of 0 means no limit. + self.assert_selects(soup.find_all("a", limit=0), ["1", "2", "3", "4", "5"]) + + def test_calling_a_tag_is_calling_findall(self): + soup = self.soup("123") + self.assert_selects(soup("a", limit=1), ["1"]) + self.assert_selects(soup.b(id="foo"), ["3"]) + + def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion( + self, + ): + soup = self.soup("") + # Create a self-referential list. + selfref = [] + selfref.append(selfref) + + # Without special code in SoupStrainer, this would cause infinite + # recursion. + with warnings.catch_warnings(record=True) as w: + assert [] == soup.find_all(selfref) + [warning] = w + assert warning.filename == __file__ + msg = str(warning.message) + assert ( + msg + == "Ignoring nested list [[...]] to avoid the possibility of infinite recursion." + ) + + def test_find_all_resultset(self): + """All find_all calls return a ResultSet""" + soup = self.soup("") + result = soup.find_all("a") + assert hasattr(result, "source") + + result = soup.find_all(True) + assert hasattr(result, "source") + + result = soup.find_all(string="foo") + assert hasattr(result, "source") + + +class TestFindAllBasicNamespaces(SoupTest): + def test_find_by_namespaced_name(self): + soup = self.soup('Don't leave me here.
+Don\'t leave!
""" + soup = self.soup(doc) + second_para = soup.find(id="2") + bold = soup.b + + # Move the tag to the end of the second paragraph. + soup.find(id="2").append(soup.b) + + # The tag is now a child of the second paragraph. + assert bold.parent == second_para + + assert soup.decode() == self.document_for( + 'Don\'t leave me .
\n' 'Don\'t leave!here
' + ) + + def test_insertion_returns_inserted_things(self): + soup = self.soup("") + html = soup.find('html') + head = html.append(soup.new_tag('head')) + assert head.name == 'head' + + [title] = head.insert(0, soup.new_tag('title')) + assert title.name == 'title' + + text5 = title.append('5') + assert text5 == '5' + text34 = text5.insert_before('3', '4') + assert text34 == ['3', '4'] + text67 = text5.insert_after('6', '7') + assert text67 == ['6', '7'] + text89 = title.extend(['8', '9']) + assert text89 == ['8', '9'] + assert title.get_text() == '3456789' + + def test_replace_with_returns_thing_that_was_replaced(self): + text = "And now, a word:
And we're back.
") + p2, p3 = soup.insert(1, soup.new_tag("p", string="p2"), soup.new_tag("p", string="p3")) + assert "p2" == p2.string + assert "p3" == p3.string + + p1, p2, p3, p4 = list(soup.children) + assert "And now, a word:" == p1.string + assert "p2" == p2.string + assert "p3" == p3.string + assert "And we're back." == p4.string + + def test_insert_beautifulsoup_object_inserts_children(self): + """Inserting one BeautifulSoup object into another actually inserts all + of its children -- you'll never combine BeautifulSoup objects. + """ + soup = self.soup("And now, a word:
And we're back.
") + + text = "p2
p3
" + to_insert = self.soup(text) + p2, p3 = soup.insert(1, to_insert) + assert "p2" == p2.string + assert "p3" == p3.string + + for i in soup.descendants: + assert not isinstance(i, BeautifulSoup) + + p1, p2, p3, p4 = list(soup.children) + assert "And now, a word:" == p1.string + assert "p2" == p2.string + assert "p3" == p3.string + assert "And we're back." == p4.string + + def test_replace_with_maintains_next_element_throughout(self): + soup = self.soup("onethree
") + a = soup.a + # Make it so the tag has two text children. + a.insert(1, "two") + + # Now replace each one with the empty string. + left, right = a.contents + left.replace_with("") + right.replace_with("") + + # The tag is still connected to the tree. + assert "three" == soup.b.string + + def test_replace_final_node(self): + soup = self.soup("Argh!") + soup.find(string="Argh!").replace_with("Hooray!") + new_text = soup.find(string="Hooray!") + b = soup.b + assert new_text.previous_element == b + assert new_text.parent == b + assert new_text.previous_element.next_element == new_text + assert new_text.next_element is None + + def test_consecutive_text_nodes(self): + # A builder should never create two consecutive text nodes, + # but if you insert one next to another, Beautiful Soup will + # handle it correctly. + soup = self.soup("Argh!There's no business like show business
") + no, show = soup.find_all("b") + show.replace_with(no) + assert soup.decode() == self.document_for( + "There's business like no business
" + ) + + assert show.parent is None + assert no.parent == soup.p + assert no.next_element == "no" + assert no.next_sibling == " business" + + def test_replace_with_errors(self): + # Can't replace a tag that's not part of a tree. + a_tag = Tag(name="a") + with pytest.raises(ValueError): + a_tag.replace_with("won't work") + + # Can't replace a tag with its parent. + a_tag = self.soup("").a + with pytest.raises(ValueError): + a_tag.b.replace_with(a_tag) + + # Or with a list that includes its parent. + with pytest.raises(ValueError): + a_tag.b.replace_with("string1", a_tag, "string2") + + def test_replace_with_multiple(self): + data = "Unneeded formatting is unneeded
+ """) + tree.em.unwrap() + assert tree.em is None + assert tree.p.text == "Unneeded formatting is unneeded" + + def test_wrap(self): + soup = self.soup("I wish I was bold.") + value = soup.string.wrap(soup.new_tag("b")) + assert value.decode() == "I wish I was bold." + assert soup.decode() == self.document_for("I wish I was bold.") + + def test_wrap_extracts_tag_from_elsewhere(self): + soup = self.soup("I wish I was bold.") + soup.b.next_sibling.wrap(soup.b) + assert soup.decode() == self.document_for("I wish I was bold.") + + def test_wrap_puts_new_contents_at_the_end(self): + soup = self.soup("I like being bold.I wish I was bold.") + soup.b.next_sibling.wrap(soup.b) + assert 2 == len(soup.b.contents) + assert soup.decode() == self.document_for( + "I like being bold.I wish I was bold." + ) + + def test_extract(self): + soup = self.soup( + 'Some content. More content.' + ) + + assert len(soup.body.contents) == 3 + extracted = soup.find(id="nav").extract() + + assert soup.decode() == "Some content. More content." + assert extracted.decode() == ' ' + + # The extracted tag is now an orphan. + assert len(soup.body.contents) == 2 + assert extracted.parent is None + assert extracted.previous_element is None + assert extracted.next_element.next_element is None + + # The gap where the extracted tag used to be has been mended. + content_1 = soup.find(string="Some content. ") + content_2 = soup.find(string=" More content.") + assert content_1.next_element == content_2 + assert content_1.next_sibling == content_2 + assert content_2.previous_element == content_1 + assert content_2.previous_sibling == content_1 + + def test_extract_distinguishes_between_identical_strings(self): + soup = self.soup("foobar") + foo_1 = soup.a.string + foo_2 = soup.new_string("foo") + bar_2 = soup.new_string("bar") + soup.a.append(foo_2) + soup.b.append(bar_2) + + # Now there are two identical strings in the tag, and two + # in the tag. Let's remove the first "foo" and the second + # "bar". + foo_1.extract() + bar_2.extract() + assert foo_2 == soup.a.string + assert bar_2 == soup.b.string + + def test_extract_multiples_of_same_tag(self): + soup = self.soup(""" + + + + + + + + + +""") + [soup.script.extract() for i in soup.find_all("script")] + assert "\n\n\n" == str(soup.body) + + def test_extract_works_when_element_is_surrounded_by_identical_strings(self): + soup = self.soup("\n" "hi\n" "") + soup.find("body").extract() + assert None is soup.find("body") + + def test_clear(self): + """Tag.clear()""" + soup = self.soup("String Italicized and another
") + # clear using extract() + a = soup.a + soup.p.clear() + assert len(soup.p.contents) == 0 + assert hasattr(a, "contents") + + # clear using decompose() + em = a.em + a.clear(decompose=True) + assert 0 == len(em.contents) + + @pytest.mark.parametrize( + "method_name,expected_result", + [ + ( + "descendants", + 'child3
tag from the tree removes all of its children from the tree as well. 'child'.next_element becomes None, because 'child' is no longer in the tree, and iteration stops there. Don't do this kind of thing, is what I'm saying. + ( + "next_elements", + '
child3
child3
child3
child3
Another para
") + p1, p2 = soup.find_all("p") + a = p1.a + text = p1.em.string + for i in [p1, p2, a, text]: + assert False is i.decomposed + + # This sets p1 and everything beneath it to decomposed. + p1.decompose() + for i in [p1, a, text]: + assert True is i.decomposed + # p2 is unaffected. + assert False is p2.decomposed + + def test_decompose_string(self): + soup = self.soup("String 1
String 2
") + div = soup.div + text = div.p.string + assert False is text.decomposed + text.decompose() + assert True is text.decomposed + assert "String 2