\tbar\n \nbaz
tag is reformatted, but everything + # inside is left alone. + assert ( + "\n foo\n\n" + == soup.div.prettify() + ) + + def test_prettify_handles_nested_string_literal_tags(self): + # Most of this markup is inside a\tbar\n \n\n baz\n \ntag, so prettify() + # only does three things to it: + # 1. Add a newline and a space between theand the+ # 2. Add a newline after the+ # 3. Add a newline at the end. + # + # The contents of thetag are left completely alone. In + # particular, we don't start adding whitespace again once we + # encounter the firsttag, because we know it's not + # the one that put us into string literal mode. + markup = """""" + + expect = """some + for you +
++""" + soup = self.soup(markup) + assert expect == soup.div.prettify() + + def test_prettify_accepts_formatter_function(self): + soup = BeautifulSoup("foo", "html.parser") + pretty = soup.prettify(formatter=lambda x: x.upper()) + assert "FOO" in pretty + + def test_prettify_outputs_unicode_by_default(self): + soup = self.soup("") + assert str is type(soup.prettify()) + + def test_prettify_can_encode_data(self): + soup = self.soup("") + assert bytes is type(soup.prettify("utf-8")) + + def test_html_entity_substitution_off_by_default(self): + markup = "Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!" + soup = self.soup(markup) + encoded = soup.b.encode("utf-8") + assert encoded == markup.encode("utf-8") + + def test_encoding_substitution(self): + # Here's the tag saying that a document is + # encoded in Shift-JIS. + meta_tag = ( + '' + ) + soup = self.soup(meta_tag) + + # Parse the document, and the charset apprears unchanged. + assert soup.meta["content"] == "text/html; charset=x-sjis" + + # Encode the document into some encoding, and the encoding is + # substituted into the meta tag. + utf_8 = soup.encode("utf-8") + assert b"charset=utf-8" in utf_8 + + euc_jp = soup.encode("euc_jp") + assert b"charset=euc_jp" in euc_jp + + shift_jis = soup.encode("shift-jis") + assert b"charset=shift-jis" in shift_jis + + utf_16_u = soup.encode("utf-16").decode("utf-16") + assert "charset=utf-16" in utf_16_u + + def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self): + markup = ( + '+some + for you +
foo' + ) + + # Beautiful Soup used to try to rewrite the meta tag even if the + # meta tag got filtered out by the strainer. This test makes + # sure that doesn't happen. + strainer = SoupStrainer("pre") + soup = self.soup(markup, parse_only=strainer) + assert soup.contents[0].name == "pre" + + +class TestPersistence(SoupTest): + "Testing features like pickle and deepcopy." + + def setup_method(self): + self.page = """ + + + +Beautiful Soup: We called him Tortoise because he taught us. + + + + + + +foo +bar + +""" + self.tree = self.soup(self.page) + + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + dumped = pickle.dumps(self.tree, 2) + loaded = pickle.loads(dumped) + assert loaded.__class__ == BeautifulSoup + assert loaded.decode() == self.tree.decode() + + def test_deepcopy_identity(self): + # Making a deepcopy of a tree yields an identical tree. + copied = copy.deepcopy(self.tree) + assert copied.decode() == self.tree.decode() + + def test_copy_deeply_nested_document(self): + # This test verifies that copy and deepcopy don't involve any + # recursive function calls. If they did, this test would + # overflow the Python interpreter stack. + limit = sys.getrecursionlimit() + 1 + markup = "" * limit + + soup = self.soup(markup) + + copy.copy(soup) + copy.deepcopy(soup) + + def test_copy_preserves_encoding(self): + soup = BeautifulSoup(b"", "html.parser") + encoding = soup.original_encoding + copy = soup.__copy__() + assert "
" == str(copy) + assert encoding == copy.original_encoding + + def test_copy_preserves_builder_information(self): + tag = self.soup("").p + + # Simulate a tag obtained from a source file. + tag.sourceline = 10 + tag.sourcepos = 33 + + copied = tag.__copy__() + + # The TreeBuilder object is no longer availble, but information + # obtained from it gets copied over to the new Tag object. + assert tag.sourceline == copied.sourceline + assert tag.sourcepos == copied.sourcepos + assert tag.can_be_empty_element == copied.can_be_empty_element + assert tag.cdata_list_attributes == copied.cdata_list_attributes + assert tag.preserve_whitespace_tags == copied.preserve_whitespace_tags + assert tag.interesting_string_types == copied.interesting_string_types + + def test_unicode_pickle(self): + # A tree containing Unicode characters can be pickled. + html = "\N{SNOWMAN}" + soup = self.soup(html) + dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) + loaded = pickle.loads(dumped) + assert loaded.decode() == soup.decode() + + def test_copy_navigablestring_is_not_attached_to_tree(self): + html = "FooBar" + soup = self.soup(html) + s1 = soup.find(string="Foo") + s2 = copy.copy(s1) + assert s1 == s2 + assert None is s2.parent + assert None is s2.next_element + assert None is not s1.next_sibling + assert None is s2.next_sibling + assert None is s2.previous_element + + def test_copy_navigablestring_subclass_has_same_type(self): + html = "" + soup = self.soup(html) + s1 = soup.string + s2 = copy.copy(s1) + assert s1 == s2 + assert isinstance(s2, Comment) + + def test_copy_entire_soup(self): + html = "end" + soup = self.soup(html) + soup_copy = copy.copy(soup) + assert soup == soup_copy + + def test_copy_tag_copies_contents(self): + html = "end" + soup = self.soup(html) + div = soup.div + div_copy = copy.copy(div) + + # The two tags look the same, and evaluate to equal. + assert str(div) == str(div_copy) + assert div == div_copy + + # But they're not the same object. + assert div is not div_copy + + # And they don't have the same relation to the parse tree. The + # copy is not associated with a parse tree at all. + assert None is div_copy.parent + assert None is div_copy.previous_element + assert None is div_copy.find(string="Bar").next_element + assert None is not div.find(string="Bar").next_element + + # Modifying one of the tag's multi-valued attributes + # doesn't modify the other. + assert div["class"] is not div_copy["class"] + div["class"].append("d") + assert "a b c d".split() == div["class"] + assert "a b c".split() == div_copy["class"] + assert isinstance(div_copy["class"], AttributeValueList) + + +class TestEquality(SoupTest): + + def test_comparison(self): + soup = self.soup("string string") + first_a, second_a = soup.find_all('a') + first_string, second_string = soup.find_all(string='string') + + # Tags with the same markup are equal. + assert first_a == second_a + + # NavigableStrings with the same content are equal, and also + # equal to a Python string with the same content... + assert first_string == second_string == "string" + + # ...but not equivalent to a bytestring with the same content. + assert first_string != b"string" + + def test_hash(self): + soup = self.soup("string string") + first_a, second_a = soup.find_all('a') + first_string, second_string = soup.find_all(string='string') + + # Tags with the same markup hash to the same value. + assert hash(first_a) == hash(second_a) + + # But they're not the same object. + assert id(first_a) != id(second_a) + + # NavigableStrings with the same contents hash to the value of + # the contents. + assert hash(first_string) == hash(second_string) == hash("string") -- cgit v1.2.3