about summary refs log tree commit diff
path: root/.venv/lib/python3.12/site-packages/bs4/tests/test_fuzz.py
diff options
context:
space:
mode:
Diffstat (limited to '.venv/lib/python3.12/site-packages/bs4/tests/test_fuzz.py')
-rw-r--r--.venv/lib/python3.12/site-packages/bs4/tests/test_fuzz.py181
1 files changed, 181 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_fuzz.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_fuzz.py
new file mode 100644
index 00000000..f5b0990d
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_fuzz.py
@@ -0,0 +1,181 @@
+"""This file contains test cases reported by third parties using
+fuzzing tools, primarily from Google's oss-fuzz project. Some of these
+represent real problems with Beautiful Soup, but many are problems in
+libraries that Beautiful Soup depends on, and many of the test cases
+represent different ways of triggering the same problem.
+
+Grouping these test cases together makes it easy to see which test
+cases represent the same problem, and puts the test cases in close
+proximity to code that can trigger the problems.
+"""
+
+import os
+import importlib
+import pytest
+from bs4 import (
+    BeautifulSoup,
+    ParserRejectedMarkup,
+)
+
+try:
+    # Import the submodule explicitly: a bare ``import importlib`` does
+    # not guarantee that ``importlib.util`` is available as an
+    # attribute, and the resulting AttributeError would not be caught
+    # by the ``except ImportError`` clause below.
+    import importlib.util
+
+    from soupsieve.util import SelectorSyntaxError
+
+    # find_spec() returns None when the named module cannot be found.
+    has_lxml = importlib.util.find_spec("lxml")
+    has_html5lib = importlib.util.find_spec("html5lib")
+    # The fuzz tests run only when both optional parsers are present.
+    fully_fuzzable = has_lxml is not None and has_html5lib is not None
+except ImportError:
+    # An import above failed (soupsieve is the only third-party one),
+    # so the fuzz tests are disabled.
+    fully_fuzzable = False
+
+
+@pytest.mark.skipif(
+    not fully_fuzzable, reason="Prerequisites for fuzz tests are not installed."
+)
+class TestFuzz(object):
+    """Regression tests built from test cases reported by fuzzers.
+
+    Each test reads a markup file from the ``fuzz`` directory next to
+    this module and runs it through Beautiful Soup. The whole class is
+    skipped unless ``fully_fuzzable`` is true.
+    """
+
+    # Test case markup files from fuzzers are given this extension so
+    # they can be included in builds.
+    TESTCASE_SUFFIX = ".testcase"
+
+    # Copied 20230512 from
+    # https://github.com/google/oss-fuzz/blob/4ac6a645a197a695fe76532251feb5067076b3f3/projects/bs4/bs4_fuzzer.py
+    #
+    # Copying the code lets us precisely duplicate the behavior of
+    # oss-fuzz.  The downside is that this code changes over time, so
+    # multiple copies of the code must be kept around to run against
+    # older tests. I'm not sure what to do about this, but I may
+    # retire old tests after a time.
+    def fuzz_test_with_css(self, filename: str) -> None:
+        """Run one test case through the copied oss-fuzz harness.
+
+        The first byte of the file picks one of four parsers, bytes 1-9
+        are used as a CSS selector, and the rest of the file supplies
+        the markup. The method returns quietly if the parser rejects
+        the markup, if parsing raises ValueError, or if the selector is
+        not valid CSS; any other exception fails the calling test.
+
+        :param filename: Name of the test case file, with or without
+            ``TESTCASE_SUFFIX``.
+        """
+        data = self.__markup(filename)
+        parsers = ["lxml-xml", "html5lib", "html.parser", "lxml"]
+        try:
+            # NOTE(review): ``data`` is bytes, so ``data[0]`` is already
+            # an int and ValueError looks unreachable here; an empty
+            # file would raise IndexError instead. Left unchanged to
+            # mirror the upstream harness.
+            idx = int(data[0]) % len(parsers)
+        except ValueError:
+            return
+
+        css_selector, data = data[1:10], data[10:]
+
+        try:
+            # ``data[1:]`` drops one more byte from the remainder; this
+            # is kept as-is so the behavior matches the copied harness.
+            soup = BeautifulSoup(data[1:], features=parsers[idx])
+        except ParserRejectedMarkup:
+            return
+        except ValueError:
+            return
+
+        # Exercise tree traversal, CSS selection and output in turn.
+        list(soup.find_all(True))
+        try:
+            soup.css.select(css_selector.decode("utf-8", "replace"))
+        except SelectorSyntaxError:
+            return
+        soup.prettify()
+
+    # This class of error has been fixed by catching a less helpful
+    # exception from html.parser and raising ParserRejectedMarkup
+    # instead.
+    @pytest.mark.parametrize(
+        "filename",
+        [
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-5703933063462912",
+            "crash-ffbdfa8a2b26f13537b68d3794b0478a4090ee4a",
+        ],
+    )
+    def test_rejected_markup(self, filename):
+        """html.parser must reject this markup with ParserRejectedMarkup."""
+        markup = self.__markup(filename)
+        with pytest.raises(ParserRejectedMarkup):
+            BeautifulSoup(markup, "html.parser")
+
+    # This class of error has to do with very deeply nested documents
+    # which overflow the Python call stack when the tree is converted
+    # to a string. This is an issue with Beautiful Soup which was fixed
+    # as part of [bug=1471755].
+    #
+    # These test cases are in the older format that doesn't specify
+    # which parser to use or give a CSS selector.
+    @pytest.mark.parametrize(
+        "filename",
+        [
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-5984173902397440",
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-5167584867909632",
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-6124268085182464",
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-6450958476902400",
+        ],
+    )
+    def test_deeply_nested_document_without_css(self, filename):
+        """Parse an old-format test case with html.parser and encode it."""
+        # Parsing the document and encoding it back to a string is
+        # sufficient to demonstrate that the overflow problem has
+        # been fixed.
+        markup = self.__markup(filename)
+        BeautifulSoup(markup, "html.parser").encode()
+
+    # This class of error has to do with very deeply nested documents
+    # which overflow the Python call stack when the tree is converted
+    # to a string. This is an issue with Beautiful Soup which was fixed
+    # as part of [bug=1471755].
+    @pytest.mark.parametrize(
+        "filename",
+        [
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-5000587759190016",
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-5375146639360000",
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-5492400320282624",
+        ],
+    )
+    def test_deeply_nested_document(self, filename):
+        """Run a deeply nested test case through the fuzz harness."""
+        self.fuzz_test_with_css(filename)
+
+    @pytest.mark.parametrize(
+        "filename",
+        [
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-4670634698080256",
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-5270998950477824",
+        ],
+    )
+    def test_soupsieve_errors(self, filename):
+        """Run a soupsieve-related test case through the fuzz harness."""
+        self.fuzz_test_with_css(filename)
+
+    # This class of error represents problems with html5lib's parser,
+    # not Beautiful Soup. I use
+    # https://github.com/html5lib/html5lib-python/issues/568 to notify
+    # the html5lib developers of these issues.
+    #
+    # These test cases are in the older format that doesn't specify
+    # which parser to use or give a CSS selector.
+    @pytest.mark.skip(reason="html5lib-specific problems")
+    @pytest.mark.parametrize(
+        "filename",
+        [
+            # b"""ÿ<!DOCTyPEV PUBLIC'''Ð'"""
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-4818336571064320",
+            # b')<a><math><TR><a><mI><a><p><a>'
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-4999465949331456",
+            # b'-<math><sElect><mi><sElect><sElect>'
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-5843991618256896",
+            # b'ñ<table><svg><html>'
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-6241471367348224",
+            # <TABLE>, some ^@ characters, some <math> tags.
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-6600557255327744",
+            # Nested table
+            "crash-0d306a50c8ed8bcd0785b67000fcd5dea1d33f08",
+        ],
+    )
+    def test_html5lib_parse_errors_without_css(self, filename):
+        """Parse an old-format test case with html5lib and print it.
+
+        Currently skipped; see the comment above the decorators.
+        """
+        markup = self.__markup(filename)
+        print(BeautifulSoup(markup, "html5lib").encode())
+
+    # This class of error represents problems with html5lib's parser,
+    # not Beautiful Soup. I use
+    # https://github.com/html5lib/html5lib-python/issues/568 to notify
+    # the html5lib developers of these issues.
+    @pytest.mark.skip(reason="html5lib-specific problems")
+    @pytest.mark.parametrize(
+        "filename",
+        [
+            # b'-      \xff\xff  <math>\x10<select><mi><select><select>t'
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-6306874195312640",
+        ],
+    )
+    def test_html5lib_parse_errors(self, filename):
+        """Run an html5lib test case through the fuzz harness.
+
+        Currently skipped; see the comment above the decorators.
+        """
+        self.fuzz_test_with_css(filename)
+
+    def __markup(self, filename: str) -> bytes:
+        """Return the raw bytes of the named test case file.
+
+        :param filename: Name of a file in the ``fuzz`` directory beside
+            this module; ``TESTCASE_SUFFIX`` is appended if missing.
+        :return: The file's contents, read in binary mode.
+        """
+        if not filename.endswith(self.TESTCASE_SUFFIX):
+            filename += self.TESTCASE_SUFFIX
+        this_dir = os.path.split(__file__)[0]
+        path = os.path.join(this_dir, "fuzz", filename)
+        # Use a context manager so the file handle is closed promptly
+        # rather than being left open until garbage collection.
+        with open(path, "rb") as testcase:
+            return testcase.read()