From 4a52a71956a8d46fcb7294ac71734504bb09bcc2 Mon Sep 17 00:00:00 2001 From: S. Solomon Darnell Date: Fri, 28 Mar 2025 21:52:21 -0500 Subject: two version of R2R are here --- .../site-packages/bs4/tests/test_html5lib.py | 264 +++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 .venv/lib/python3.12/site-packages/bs4/tests/test_html5lib.py (limited to '.venv/lib/python3.12/site-packages/bs4/tests/test_html5lib.py') diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_html5lib.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_html5lib.py new file mode 100644 index 00000000..593c12bd --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_html5lib.py @@ -0,0 +1,264 @@ +"""Tests to ensure that the html5lib tree builder generates good trees.""" + +import pytest +import warnings + +from bs4 import BeautifulSoup +from bs4.filter import SoupStrainer +from . import ( + HTML5LIB_PRESENT, + HTML5TreeBuilderSmokeTest, +) + + +@pytest.mark.skipif( + not HTML5LIB_PRESENT, + reason="html5lib seems not to be present, not testing its tree builder.", +) +class TestHTML5LibBuilder(HTML5TreeBuilderSmokeTest): + """See ``HTML5TreeBuilderSmokeTest``.""" + + @property + def default_builder(self): + from bs4.builder import HTML5TreeBuilder + + return HTML5TreeBuilder + + def test_soupstrainer(self): + # The html5lib tree builder does not support parse_only. + strainer = SoupStrainer("b") + markup = "

A bold statement.

" + with warnings.catch_warnings(record=True) as w: + soup = BeautifulSoup(markup, "html5lib", parse_only=strainer) + assert soup.decode() == self.document_for(markup) + + [warning] = w + assert warning.filename == __file__ + assert "the html5lib tree builder doesn't support parse_only" in str( + warning.message + ) + + def test_correctly_nested_tables(self): + """html5lib inserts tags where other parsers don't.""" + markup = ( + '' + "" + "" + ) + + self.assert_soup( + markup, + '

Here's another table:" + '' + "" + "

foo

Here\'s another table:' + '

foo

' + "

", + ) + + self.assert_soup( + "" + "" + "

Foo
Bar
Baz

" + ) + + def test_xml_declaration_followed_by_doctype(self): + markup = """ + + + + + +

foo

+ +""" + soup = self.soup(markup) + # Verify that we can reach the

tag; this means the tree is connected. + assert b"

foo

" == soup.p.encode() + + def test_reparented_markup(self): + markup = "

foo

bar

" + soup = self.soup(markup) + assert ( + "

foo

bar

" + == soup.body.decode() + ) + assert 2 == len(soup.find_all("p")) + + def test_reparented_markup_ends_with_whitespace(self): + markup = "

foo

bar

\n" + soup = self.soup(markup) + assert ( + "

foo

bar

\n" + == soup.body.decode() + ) + assert 2 == len(soup.find_all("p")) + + def test_reparented_markup_containing_identical_whitespace_nodes(self): + """Verify that we keep the two whitespace nodes in this + document distinct when reparenting the adjacent tags. + """ + markup = "

" + soup = self.soup(markup) + space1, space2 = soup.find_all(string=" ") + tbody1, tbody2 = soup.find_all("tbody") + assert space1.next_element is tbody1 + assert tbody2.next_element is space2 + + def test_reparented_markup_containing_children(self): + markup = ( + "

aftermath

" + ) + soup = self.soup(markup) + noscript = soup.noscript + assert "target" == noscript.next_element + target = soup.find(string="target") + + # The 'aftermath' string was duplicated; we want the second one. + final_aftermath = soup.find_all(string="aftermath")[-1] + + # The

tag, + # but the 'target' string within is still connected to the + # (second) 'aftermath' string. + assert final_aftermath == target.next_element + assert target == final_aftermath.previous_element + + def test_processing_instruction(self): + """Processing instructions become comments.""" + markup = b"""""" + soup = self.soup(markup) + assert str(soup).startswith("") + + def test_cloned_multivalue_node(self): + markup = b"""""" + soup = self.soup(markup) + a1, a2 = soup.find_all("a") + assert a1 == a2 + assert a1 is not a2 + + def test_foster_parenting(self): + markup = b"""

+hello + +\n\n\ntext%s%s%s