From 4a52a71956a8d46fcb7294ac71734504bb09bcc2 Mon Sep 17 00:00:00 2001
From: S. Solomon Darnell
Date: Fri, 28 Mar 2025 21:52:21 -0500
Subject: two versions of R2R are here
---
.../site-packages/bs4/tests/test_htmlparser.py | 161 +++++++++++++++++++++
1 file changed, 161 insertions(+)
create mode 100644 .venv/lib/python3.12/site-packages/bs4/tests/test_htmlparser.py
(limited to '.venv/lib/python3.12/site-packages/bs4/tests/test_htmlparser.py')
diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_htmlparser.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_htmlparser.py
new file mode 100644
index 00000000..b2bd07fc
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_htmlparser.py
@@ -0,0 +1,161 @@
+"""Tests to ensure that the html.parser tree builder generates good
+trees."""
+
+import pickle
+import pytest
+from bs4.builder._htmlparser import (
+ _DuplicateAttributeHandler,
+ BeautifulSoupHTMLParser,
+ HTMLParserTreeBuilder,
+)
+from bs4.exceptions import ParserRejectedMarkup
+from typing import Any
+from . import HTMLTreeBuilderSmokeTest
+
+
+class TestHTMLParserTreeBuilder(HTMLTreeBuilderSmokeTest):
+ default_builder = HTMLParserTreeBuilder
+
+ def test_rejected_input(self):
+ # Check that markup rejected by the parser surfaces as ParserRejectedMarkup.
+ # Python's html.parser will occasionally reject markup, especially when
+ # there is a problem with the initial DOCTYPE declaration. Different
+ # versions of Python sound the alarm in different ways, but Beautiful
+ # Soup consistently raises errors as ParserRejectedMarkup exceptions.
+ bad_markup = [
+ # Provenance of the sample below (fuzzer report, corpus entry, CPython issue):
+ # https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
+ # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
+ # https://github.com/python/cpython/issues/81928
+ b"\n",  # NOTE(review): a bare newline looks truncated by extraction -- confirm this literal against upstream
+ ]
+ for markup in bad_markup:  # every sample is checked on its own
+ with pytest.raises(ParserRejectedMarkup):
+ self.soup(markup)  # parsing must raise inside the pytest.raises block
+
+ def test_namespaced_system_doctype(self):
+ # Intentional no-op: html.parser can't handle namespaced doctypes, so this test is skipped (presumably it replaces a same-named HTMLTreeBuilderSmokeTest test -- confirm).
+ pass
+
+ def test_namespaced_public_doctype(self):
+ # Intentional no-op: html.parser can't handle namespaced doctypes, so this test is skipped (presumably it replaces a same-named HTMLTreeBuilderSmokeTest test -- confirm).
+ pass
+
+ def test_builder_is_pickled(self):
+ """Unlike most tree builders, HTMLParserTreeBuilder can be pickled and
+ will be restored after pickling.
+ """
+ tree = self.soup("foo")
+ dumped = pickle.dumps(tree, 2)  # serialize the whole tree with pickle protocol 2
+ loaded = pickle.loads(dumped)  # round-trip back into a new object
+ assert isinstance(loaded.builder, type(tree.builder))  # restored builder has the same type as the original
+
+ def test_redundant_empty_element_closing_tags(self):
+ self.assert_soup("
", "
")
+ self.assert_soup("", "")
+
+ def test_empty_element(self):
+ # This verifies that any buffered data present when the parser
+ # finishes working is handled. NOTE(review): the literals below look altered by markup/entity stripping during extraction -- confirm against upstream.
+ self.assert_soup("foo bar", "foo &# bar")
+
+ def test_tracking_line_numbers(self):
+ # The html.parser TreeBuilder keeps track of line number and
+ # position of each element.
+ markup = "\n
\n\n
a &nosuchentity b
" + soup = self.soup(markup) + assert "a &nosuchentity b
" == soup.p.decode() + + markup = "a &nosuchentity; b
" + soup = self.soup(markup) + assert "a &nosuchentity b
" == soup.p.decode() -- cgit v1.2.3