From 4a52a71956a8d46fcb7294ac71734504bb09bcc2 Mon Sep 17 00:00:00 2001
From: S. Solomon Darnell
Date: Fri, 28 Mar 2025 21:52:21 -0500
Subject: two versions of R2R are here
---
.../site-packages/bs4/tests/test_soup.py | 602 +++++++++++++++++++++
1 file changed, 602 insertions(+)
create mode 100644 .venv/lib/python3.12/site-packages/bs4/tests/test_soup.py
(limited to '.venv/lib/python3.12/site-packages/bs4/tests/test_soup.py')
diff --git a/.venv/lib/python3.12/site-packages/bs4/tests/test_soup.py b/.venv/lib/python3.12/site-packages/bs4/tests/test_soup.py
new file mode 100644
index 00000000..5f771a40
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/tests/test_soup.py
@@ -0,0 +1,602 @@
+# -*- coding: utf-8 -*-
+"""Tests of Beautiful Soup as a whole."""
+
+import logging
+import pickle
+import pytest
+from typing import Iterable
+
+from bs4 import (
+ BeautifulSoup,
+ GuessedAtParserWarning,
+ dammit,
+)
+from bs4.builder import (
+ TreeBuilder,
+)
+from bs4.element import (
+ AttributeValueList,
+ XMLAttributeDict,
+ Comment,
+ PYTHON_SPECIFIC_ENCODINGS,
+ Tag,
+ NavigableString,
+)
+from bs4.filter import SoupStrainer
+from bs4.exceptions import (
+ ParserRejectedMarkup,
+)
+from bs4._warnings import (
+ MarkupResemblesLocatorWarning,
+)
+
+
+from . import (
+ default_builder,
+ LXML_PRESENT,
+ SoupTest,
+)
+import warnings
+from typing import Type
+
+
+class TestConstructor(SoupTest):
+ # Tests of BeautifulSoup() constructor argument handling.
+ # NOTE(review): this hunk is corrupted. Angle-bracket content (HTML tags
+ # inside string literals and comments) was stripped during extraction,
+ # and the methods between test_short_unicode_input and the orphaned
+ # string_containers= call below appear to be missing entirely. Restore
+ # this class from the upstream bs4 test suite before applying the patch.
+ def test_short_unicode_input(self):
+ data = "
Hello.
Here is some bolded text",
+ string_containers={
+ "b": BString,
+ "p": PString,
+ },
+ )
+
+ # The string before the
tag is a regular NavigableString.
+ assert isinstance(soup.div.contents[0], NavigableString)
+
+ # The string inside the
tag, but not inside the tag,
+ # is a PString.
+ assert isinstance(soup.p.contents[0], PString)
+
+ # Every string inside the tag is a BString, even the one that
+ # was also inside an tag.
+ for s in soup.b.strings:
+ assert isinstance(s, BString)
+
+ # Now that parsing was complete, the string_container_stack
+ # (where this information was kept) has been cleared out.
+ assert [] == soup.string_container_stack
+
+ # Markup must be str, bytes, or an open filehandle; anything else
+ # raises TypeError with a message naming the offending value.
+ @pytest.mark.parametrize("bad_markup", [1, False, lambda x: False])
+ def test_invalid_markup_type(self, bad_markup):
+ with pytest.raises(TypeError) as exc_info:
+ BeautifulSoup(bad_markup, "html.parser")
+ assert (
+ f"Incoming markup is of an invalid type: {bad_markup!r}. Markup must be a string, a bytestring, or an open filehandle."
+ in str(exc_info.value)
+ )
+
+
+class TestOutput(SoupTest):
+ # Tests of serializing a soup object back out (decode/encode/prettify).
+ # NOTE(review): the string literals below were emptied by extraction —
+ # presumably they held '<?xml ...?>' declarations and markup; verify
+ # against the upstream bs4 test suite before applying.
+ @pytest.mark.parametrize(
+ "eventual_encoding,actual_encoding",
+ [
+ ("utf-8", "utf-8"),
+ ("utf-16", "utf-16"),
+ ],
+ )
+ def test_decode_xml_declaration(self, eventual_encoding, actual_encoding):
+ # Most of the time, calling decode() on an XML document will
+ # give you a document declaration that mentions the encoding
+ # you intend to use when encoding the document as a
+ # bytestring.
+ soup = self.soup("")
+ soup.is_xml = True
+ assert (
+ f'\n'
+ == soup.decode(eventual_encoding=eventual_encoding)
+ )
+
+ @pytest.mark.parametrize(
+ "eventual_encoding", [x for x in PYTHON_SPECIFIC_ENCODINGS] + [None]
+ )
+ def test_decode_xml_declaration_with_missing_or_python_internal_eventual_encoding(
+ self, eventual_encoding
+ ):
+ # But if you pass a Python internal encoding into decode(), or
+ # omit the eventual_encoding altogether, the document
+ # declaration won't mention any particular encoding.
+ soup = BeautifulSoup("", "html.parser")
+ soup.is_xml = True
+ assert '\n' == soup.decode(
+ eventual_encoding=eventual_encoding
+ )
+
+ def test(self):
+ # BeautifulSoup subclasses Tag and extends the decode() method.
+ # Make sure the other Tag methods which call decode() call
+ # it correctly.
+ soup = self.soup("")
+ assert b"" == soup.encode(encoding="utf-8")
+ assert b"" == soup.encode_contents(encoding="utf-8")
+ assert "" == soup.decode_contents()
+ assert "\n\n" == soup.prettify()
+
+
+class TestWarnings(SoupTest):
+ # Note that some of the tests in this class create BeautifulSoup
+ # objects directly rather than using self.soup(). That's
+ # because SoupTest.soup is defined in a different file,
+ # which will throw off the assertion in _assert_warning
+ # that the code that triggered the warning is in the same
+ # file as the test.
+
+ # Return the first recorded warning whose message is an instance of
+ # `cls`, checking it was attributed to this file; raise if none found.
+ def _assert_warning(
+ self, warnings: Iterable[warnings.WarningMessage], cls: Type[Warning]
+ ) -> warnings.WarningMessage:
+ for w in warnings:
+ if isinstance(w.message, cls):
+ assert w.filename == __file__
+ return w
+ raise Exception("%s warning not found in %r" % (cls, warnings))
+
+ def _assert_no_parser_specified(self, w: Iterable[warnings.WarningMessage]) -> None:
+ warning = self._assert_warning(w, GuessedAtParserWarning)
+ message = str(warning.message)
+ assert message.startswith(GuessedAtParserWarning.MESSAGE[:60])
+
+ def test_warning_if_no_parser_specified(self):
+ with warnings.catch_warnings(record=True) as w:
+ BeautifulSoup("")
+ self._assert_no_parser_specified(w)
+
+ def test_warning_if_parser_specified_too_vague(self):
+ # "html" is not a concrete parser name, so the guessed-parser
+ # warning is still issued.
+ with warnings.catch_warnings(record=True) as w:
+ BeautifulSoup("", "html")
+ self._assert_no_parser_specified(w)
+
+ def test_no_warning_if_explicit_parser_specified(self):
+ with warnings.catch_warnings(record=True) as w:
+ self.soup("")
+ assert [] == w
+
+ def test_warning_if_strainer_filters_everything(self):
+ strainer = SoupStrainer(name="a", string="b")
+ with warnings.catch_warnings(record=True) as w:
+ self.soup("", parse_only=strainer)
+ warning = self._assert_warning(w, UserWarning)
+ msg = str(warning.message)
+ assert msg.startswith("The given value for parse_only will exclude everything:")
+
+ def test_parseOnlyThese_renamed_to_parse_only(self):
+ # Old camelCase keyword still works but emits DeprecationWarning.
+ with warnings.catch_warnings(record=True) as w:
+ soup = BeautifulSoup(
+ "",
+ "html.parser",
+ parseOnlyThese=SoupStrainer("b"),
+ )
+ warning = self._assert_warning(w, DeprecationWarning)
+ msg = str(warning.message)
+ assert "parseOnlyThese" in msg
+ assert "parse_only" in msg
+ assert b"" == soup.encode()
+
+ def test_fromEncoding_renamed_to_from_encoding(self):
+ with warnings.catch_warnings(record=True) as w:
+ utf8 = b"\xc3\xa9"
+ soup = BeautifulSoup(utf8, "html.parser", fromEncoding="utf8")
+ warning = self._assert_warning(w, DeprecationWarning)
+ msg = str(warning.message)
+ assert "fromEncoding" in msg
+ assert "from_encoding" in msg
+ assert "utf8" == soup.original_encoding
+
+ def test_unrecognized_keyword_argument(self):
+ with pytest.raises(TypeError):
+ self.soup("", no_such_argument=True)
+
+ @pytest.mark.parametrize(
+ "markup",
+ [
+ "markup.html",
+ "markup.htm",
+ "markup.HTML",
+ "markup.txt",
+ "markup.xhtml",
+ "markup.xml",
+ "/home/user/file.txt",
+ # NOTE(review): the two raw strings below are implicitly
+ # concatenated into ONE parameter — a comma is almost certainly
+ # missing after r"c:\user\file.html"; confirm against upstream.
+ r"c:\user\file.html" r"\\server\share\path\file.XhTml",
+ ],
+ )
+ def test_resembles_filename_warning(self, markup):
+ # A warning is issued if the "markup" looks like the name of
+ # an HTML or text file, or a full path to a file on disk.
+ with warnings.catch_warnings(record=True) as w:
+ BeautifulSoup(markup, "html.parser")
+ warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
+ assert "looks more like a filename" in str(warning.message)
+
+ @pytest.mark.parametrize(
+ "markup",
+ [
+ "filename",
+ "markuphtml",
+ "markup.com",
+ "",
+ # Excluded due to an irrelevant file extension.
+ "markup.js",
+ "markup.jpg",
+ "markup.markup",
+ # Excluded due to the lack of any file extension.
+ "/home/user/file",
+ # NOTE(review): implicit string concatenation again — a comma
+ # is likely missing between the two raw strings below.
+ r"c:\user\file.html" r"\\server\share\path\file",
+ # Excluded because of two consecutive slashes _and_ the
+ # colon.
+ "log message containing a url http://www.url.com/ right there.html",
+ # Excluded for containing various characters or combinations
+ # not usually found in filenames.
+ "two consecutive spaces.html",
+ "two//consecutive//slashes.html",
+ "looks/like/a/filename/but/oops/theres/a#comment.html",
+ "two\nlines.html",
+ "contains?.html",
+ "contains*.html",
+ "contains#.html",
+ "contains&.html",
+ "contains;.html",
+ "contains>.html",
+ "contains<.html",
+ "contains$.html",
+ "contains|.html",
+ "contains:.html",
+ ":-at-the-front.html",
+ ],
+ )
+ def test_resembles_filename_no_warning(self, markup):
+ # The 'looks more like a filename' warning is not issued if
+ # the markup looks like a bare string, a domain name, or a
+ # file that's not an HTML file.
+ with warnings.catch_warnings(record=True) as w:
+ self.soup(markup)
+ assert [] == w
+
+ def test_url_warning_with_bytes_url(self):
+ url = b"http://www.crummybytes.com/"
+ with warnings.catch_warnings(record=True) as warning_list:
+ BeautifulSoup(url, "html.parser")
+ warning = self._assert_warning(warning_list, MarkupResemblesLocatorWarning)
+ assert "looks more like a URL" in str(warning.message)
+ assert url not in str(warning.message).encode("utf8")
+
+ def test_url_warning_with_unicode_url(self):
+ url = "http://www.crummyunicode.com/"
+ with warnings.catch_warnings(record=True) as warning_list:
+ # note - this url must differ from the bytes one otherwise
+ # python's warnings system swallows the second warning
+ BeautifulSoup(url, "html.parser")
+ warning = self._assert_warning(warning_list, MarkupResemblesLocatorWarning)
+ assert "looks more like a URL" in str(warning.message)
+ assert url not in str(warning.message)
+
+ def test_url_warning_with_bytes_and_space(self):
+ # Here the markup contains something besides a URL, so no warning
+ # is issued.
+ with warnings.catch_warnings(record=True) as warning_list:
+ self.soup(b"http://www.crummybytes.com/ is great")
+ assert not any("looks more like a URL" in str(w.message) for w in warning_list)
+
+ def test_url_warning_with_unicode_and_space(self):
+ with warnings.catch_warnings(record=True) as warning_list:
+ self.soup("http://www.crummyunicode.com/ is great")
+ assert not any("looks more like a URL" in str(w.message) for w in warning_list)
+
+
+class TestSelectiveParsing(SoupTest):
+ # Tests of parsing only part of a document via a SoupStrainer.
+ # NOTE(review): the markup/expected literals below lost their HTML tags
+ # to extraction (encode() of a parsed document could not equal bare
+ # text); restore the tagged markup from upstream bs4 before applying.
+ def test_parse_with_soupstrainer(self):
+ markup = "NoYesNoYes Yes"
+ strainer = SoupStrainer("b")
+ soup = self.soup(markup, parse_only=strainer)
+ assert soup.encode() == b"YesYes Yes"
+
+
+class TestNewTag(SoupTest):
+ """Test the BeautifulSoup.new_tag() method."""
+
+ # NOTE(review): the byte-literal expectations in the two
+ # *_inherits_self_closing_rules_* tests below were split across lines
+ # and lost their tag content during extraction; restore from upstream.
+ def test_new_tag(self):
+ soup = self.soup("")
+ new_tag = soup.new_tag("foo", string="txt", bar="baz", attrs={"name": "a name"})
+ assert isinstance(new_tag, Tag)
+ assert "foo" == new_tag.name
+ assert new_tag.string == "txt"
+ assert dict(bar="baz", name="a name") == new_tag.attrs
+ # A freshly created tag is not attached to any tree.
+ assert None is new_tag.parent
+
+ # string can be null
+ new_tag = soup.new_tag("foo")
+ assert None is new_tag.string
+ new_tag = soup.new_tag("foo", string=None)
+ assert None is new_tag.string
+
+ # Or the empty string
+ new_tag = soup.new_tag("foo", string="")
+ assert "" == new_tag.string
+
+ @pytest.mark.skipif(
+ not LXML_PRESENT, reason="lxml not installed, cannot parse XML document"
+ )
+ def test_xml_tag_inherits_self_closing_rules_from_builder(self):
+ xml_soup = BeautifulSoup("", "xml")
+ xml_br = xml_soup.new_tag("br")
+ xml_p = xml_soup.new_tag("p")
+
+ # Both the
and tag are empty-element, just because
+ # they have no contents.
+ assert b"
" == xml_br.encode()
+ assert b"
" == xml_p.encode()
+
+ def test_tag_inherits_self_closing_rules_from_builder(self):
+ html_soup = BeautifulSoup("", "html.parser")
+ html_br = html_soup.new_tag("br")
+ html_p = html_soup.new_tag("p")
+
+ # The HTML builder uses HTML's rules about which tags are
+ # empty-element tags, and the new tags reflect these rules.
+ assert b"
" == html_br.encode()
+ assert b"" == html_p.encode()
+
+
+class TestNewString(SoupTest):
+ """Test the BeautifulSoup.new_string() method."""
+
+ def test_new_string_creates_navigablestring(self):
+ soup = self.soup("")
+ s = soup.new_string("foo")
+ # The new string compares equal to its text and is a NavigableString.
+ assert "foo" == s
+ assert isinstance(s, NavigableString)
+
+ def test_new_string_can_create_navigablestring_subclass(self):
+ soup = self.soup("")
+ # A NavigableString subclass (here, Comment) can be requested.
+ s = soup.new_string("foo", Comment)
+ assert "foo" == s
+ assert isinstance(s, Comment)
+
+
+class TestPickle(SoupTest):
+ # Test our ability to pickle the BeautifulSoup object itself.
+ # NOTE(review): `unpickled.a.string` implies the markup literal was
+ # originally wrapped in an <a> tag that extraction stripped; restore
+ # from upstream bs4 before applying.
+
+ def test_normal_pickle(self):
+ soup = self.soup("some markup")
+ pickled = pickle.dumps(soup)
+ unpickled = pickle.loads(pickled)
+ assert "some markup" == unpickled.a.string
+
+ def test_pickle_with_no_builder(self):
+ # We had a bug that prevented pickling from working if
+ # the builder wasn't set.
+ soup = self.soup("some markup")
+ soup.builder = None
+ pickled = pickle.dumps(soup)
+ unpickled = pickle.loads(pickled)
+ assert "some markup" == unpickled.string
+
+
+class TestEncodingConversion(SoupTest):
+ # Test Beautiful Soup's ability to decode and encode from various
+ # encodings.
+ # NOTE(review): the fixture literals below lost their tags to
+ # extraction — `soup.foo.string` implies the text was originally
+ # wrapped in a <foo> element; restore from upstream bs4.
+
+ def setup_method(self):
+ self.unicode_data = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!'
+ self.utf8_data = self.unicode_data.encode("utf-8")
+ # Just so you know what it looks like.
+ assert (
+ self.utf8_data
+ == b'Sacr\xc3\xa9 bleu!'
+ )
+
+ def test_ascii_in_unicode_out(self):
+ # ASCII input is converted to Unicode. The original_encoding
+ # attribute is set to 'utf-8', a superset of ASCII.
+ chardet = dammit._chardet_dammit
+ logging.disable(logging.WARNING)
+ try:
+
+ def noop(str):
+ return None
+
+ # Disable chardet, which will realize that the ASCII is ASCII.
+ dammit._chardet_dammit = noop
+ ascii = b"a"
+ soup_from_ascii = self.soup(ascii)
+ unicode_output = soup_from_ascii.decode()
+ assert isinstance(unicode_output, str)
+ assert unicode_output == self.document_for(ascii.decode())
+ assert soup_from_ascii.original_encoding.lower() == "utf-8"
+ finally:
+ # Always restore logging and the real chardet hook, even if
+ # an assertion above fails.
+ logging.disable(logging.NOTSET)
+ dammit._chardet_dammit = chardet
+
+ def test_unicode_in_unicode_out(self):
+ # Unicode input is left alone. The original_encoding attribute
+ # is not set.
+ soup_from_unicode = self.soup(self.unicode_data)
+ assert soup_from_unicode.decode() == self.unicode_data
+ assert soup_from_unicode.foo.string == "Sacr\xe9 bleu!"
+ assert soup_from_unicode.original_encoding is None
+
+ def test_utf8_in_unicode_out(self):
+ # UTF-8 input is converted to Unicode. The original_encoding
+ # attribute is set.
+ soup_from_utf8 = self.soup(self.utf8_data)
+ assert soup_from_utf8.decode() == self.unicode_data
+ assert soup_from_utf8.foo.string == "Sacr\xe9 bleu!"
+
+ def test_utf8_out(self):
+ # The internal data structures can be encoded as UTF-8.
+ soup_from_unicode = self.soup(self.unicode_data)
+ assert soup_from_unicode.encode("utf-8") == self.utf8_data
--
cgit v1.2.3