diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/bs4/tests/test_fuzz.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/bs4/tests/test_fuzz.py | 181 |
1 files changed, 181 insertions, 0 deletions
"""This file contains test cases reported by third parties using
fuzzing tools, primarily from Google's oss-fuzz project. Some of these
represent real problems with Beautiful Soup, but many are problems in
libraries that Beautiful Soup depends on, and many of the test cases
represent different ways of triggering the same problem.

Grouping these test cases together makes it easy to see which test
cases represent the same problem, and puts the test cases in close
proximity to code that can trigger the problems.
"""

import os
# Import the submodule explicitly: a bare ``import importlib`` does not
# guarantee that ``importlib.util`` has been loaded as an attribute.
import importlib.util
import pytest
from bs4 import (
    BeautifulSoup,
    ParserRejectedMarkup,
)

try:
    from soupsieve.util import SelectorSyntaxError

    has_lxml = importlib.util.find_spec("lxml")
    has_html5lib = importlib.util.find_spec("html5lib")
    # find_spec() returns None for a module that cannot be found.
    fully_fuzzable = has_lxml is not None and has_html5lib is not None
except ImportError:
    fully_fuzzable = False


@pytest.mark.skipif(
    not fully_fuzzable, reason="Prerequisites for fuzz tests are not installed."
)
class TestFuzz:
    """Regression tests built from markup files produced by fuzzers.

    Each test loads one or more files from the ``fuzz`` directory that
    sits next to this module and feeds them to Beautiful Soup.
    """

    # Test case markup files from fuzzers are given this extension so
    # they can be included in builds.
    TESTCASE_SUFFIX = ".testcase"

    # Copied 20230512 from
    # https://github.com/google/oss-fuzz/blob/4ac6a645a197a695fe76532251feb5067076b3f3/projects/bs4/bs4_fuzzer.py
    #
    # Copying the code lets us precisely duplicate the behavior of
    # oss-fuzz. The downside is that this code changes over time, so
    # multiple copies of the code must be kept around to run against
    # older tests. I'm not sure what to do about this, but I may
    # retire old tests after a time.
    def fuzz_test_with_css(self, filename: str) -> None:
        """Run one test case the way the copied oss-fuzz harness does.

        The first byte of the file selects the parser, the next nine
        bytes are used as a CSS selector, and the remainder is parsed as
        markup. The tree is then walked, queried with the selector and
        prettified. Markup rejected by the parser and selectors rejected
        by soupsieve end the test quietly.

        :param filename: Name of a file in the ``fuzz`` directory, with
            or without the ``.testcase`` suffix.
        """
        data = self.__markup(filename)
        parsers = ["lxml-xml", "html5lib", "html.parser", "lxml"]
        # NOTE(review): ``data`` is bytes, so ``data[0]`` is already an
        # int and int() cannot raise ValueError here; an empty file
        # would raise IndexError instead. Left exactly as copied so the
        # behavior matches the harness this was taken from.
        try:
            idx = int(data[0]) % len(parsers)
        except ValueError:
            return

        css_selector, data = data[1:10], data[10:]

        try:
            soup = BeautifulSoup(data[1:], features=parsers[idx])
        except ParserRejectedMarkup:
            return
        except ValueError:
            return

        list(soup.find_all(True))
        try:
            soup.css.select(css_selector.decode("utf-8", "replace"))
        except SelectorSyntaxError:
            return
        soup.prettify()

    # This class of error has been fixed by catching a less helpful
    # exception from html.parser and raising ParserRejectedMarkup
    # instead.
    @pytest.mark.parametrize(
        "filename",
        [
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5703933063462912",
            "crash-ffbdfa8a2b26f13537b68d3794b0478a4090ee4a",
        ],
    )
    def test_rejected_markup(self, filename):
        """html.parser must reject this markup with ParserRejectedMarkup."""
        markup = self.__markup(filename)
        with pytest.raises(ParserRejectedMarkup):
            BeautifulSoup(markup, "html.parser")

    # This class of error has to do with very deeply nested documents
    # which overflow the Python call stack when the tree is converted
    # to a string. This is an issue with Beautiful Soup which was fixed
    # as part of [bug=1471755].
    #
    # These test cases are in the older format that doesn't specify
    # which parser to use or give a CSS selector.
    @pytest.mark.parametrize(
        "filename",
        [
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5984173902397440",
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5167584867909632",
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6124268085182464",
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6450958476902400",
        ],
    )
    def test_deeply_nested_document_without_css(self, filename):
        """Parse a deeply nested document and encode it without error."""
        # Parsing the document and encoding it back to a string is
        # sufficient to demonstrate that the overflow problem has
        # been fixed.
        markup = self.__markup(filename)
        BeautifulSoup(markup, "html.parser").encode()

    # This class of error has to do with very deeply nested documents
    # which overflow the Python call stack when the tree is converted
    # to a string. This is an issue with Beautiful Soup which was fixed
    # as part of [bug=1471755].
    @pytest.mark.parametrize(
        "filename",
        [
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5000587759190016",
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5375146639360000",
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5492400320282624",
        ],
    )
    def test_deeply_nested_document(self, filename):
        """Run deeply nested test cases through the copied fuzz harness."""
        self.fuzz_test_with_css(filename)

    @pytest.mark.parametrize(
        "filename",
        [
            "clusterfuzz-testcase-minimized-bs4_fuzzer-4670634698080256",
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5270998950477824",
        ],
    )
    def test_soupsieve_errors(self, filename):
        """Run the soupsieve-related test cases through the fuzz harness."""
        self.fuzz_test_with_css(filename)

    # This class of error represents problems with html5lib's parser,
    # not Beautiful Soup. I use
    # https://github.com/html5lib/html5lib-python/issues/568 to notify
    # the html5lib developers of these issues.
    #
    # These test cases are in the older format that doesn't specify
    # which parser to use or give a CSS selector.
    @pytest.mark.skip(reason="html5lib-specific problems")
    @pytest.mark.parametrize(
        "filename",
        [
            # b"""ÿ<!DOCTyPEV PUBLIC'''Ð'"""
            "clusterfuzz-testcase-minimized-bs4_fuzzer-4818336571064320",
            # b')<a><math><TR><a><mI><a><p><a>'
            "clusterfuzz-testcase-minimized-bs4_fuzzer-4999465949331456",
            # b'-<math><sElect><mi><sElect><sElect>'
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5843991618256896",
            # b'ñ<table><svg><html>'
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6241471367348224",
            # <TABLE>, some ^@ characters, some <math> tags.
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6600557255327744",
            # Nested table
            "crash-0d306a50c8ed8bcd0785b67000fcd5dea1d33f08",
        ],
    )
    def test_html5lib_parse_errors_without_css(self, filename):
        """Parse with html5lib and print the encoded tree (currently skipped)."""
        markup = self.__markup(filename)
        print(BeautifulSoup(markup, "html5lib").encode())

    # This class of error represents problems with html5lib's parser,
    # not Beautiful Soup. I use
    # https://github.com/html5lib/html5lib-python/issues/568 to notify
    # the html5lib developers of these issues.
    @pytest.mark.skip(reason="html5lib-specific problems")
    @pytest.mark.parametrize(
        "filename",
        [
            # b'- \xff\xff <math>\x10<select><mi><select><select>t'
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6306874195312640",
        ],
    )
    def test_html5lib_parse_errors(self, filename):
        """Run the html5lib test case through the fuzz harness (currently skipped)."""
        self.fuzz_test_with_css(filename)

    def __markup(self, filename: str) -> bytes:
        """Return the raw bytes of a test case file.

        :param filename: Name of a file in the ``fuzz`` directory next to
            this module. ``TESTCASE_SUFFIX`` is appended when missing.
        :return: The complete contents of the file.
        """
        if not filename.endswith(self.TESTCASE_SUFFIX):
            filename += self.TESTCASE_SUFFIX
        this_dir = os.path.split(__file__)[0]
        path = os.path.join(this_dir, "fuzz", filename)
        # Use a context manager so the handle is closed promptly instead
        # of being left for the garbage collector.
        with open(path, "rb") as testcase:
            return testcase.read()