import pytest
import types
from bs4 import (
    BeautifulSoup,
    ResultSet,
)
from typing import (
    Any,
    List,
    Tuple,
    Type,
)
from packaging.version import Version
from . import (
    SoupTest,
    SOUP_SIEVE_PRESENT,
)

# Exception type test_unsupported_pseudoclass expects for an unknown
# pseudo-class. It is only assigned when Soup Sieve is installed; the
# test class below is skipped otherwise.
SOUPSIEVE_EXCEPTION_ON_UNSUPPORTED_PSEUDOCLASS: Type[Exception]
if SOUP_SIEVE_PRESENT:
    from soupsieve import __version__, SelectorSyntaxError

    # Some behavior changed in soupsieve 2.6, and the change affects
    # one of our tests. For the test to run under all versions of
    # Python supported by Beautiful Soup (which includes versions of
    # Python not supported by soupsieve 2.6) we need to check both
    # behaviors.
    SOUPSIEVE_EXCEPTION_ON_UNSUPPORTED_PSEUDOCLASS = SelectorSyntaxError
    if Version(__version__) < Version("2.6"):
        SOUPSIEVE_EXCEPTION_ON_UNSUPPORTED_PSEUDOCLASS = NotImplementedError


@pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed")
class TestCSSSelectors(SoupTest):
    """Test basic CSS selector functionality.

    This functionality is implemented in soupsieve, which has a much
    more comprehensive test suite, so this is basically an extra check
    that soupsieve works as expected.
    """

    # Fixture document shared by every test. It is written as adjacent
    # string literals (one element per line) that Python concatenates
    # into a single string in which the elements are separated by
    # single spaces.
    HTML = (
        ' <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ' "http://www.w3.org/TR/html4/strict.dtd">'
        ' <html>'
        ' <head>'
        ' <title>The title</title>'
        ' <link rel="stylesheet" href="blah.css" type="text/css" id="l1">'
        ' </head>'
        ' <body>'
        ' <custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>'
        ' <div id="main" class="fancy">'
        ' <div id="inner">'
        ' <h1 id="header1">An H1</h1>'
        ' <p>Some text</p>'
        ' <p class="onep" id="p1">Some more text</p>'
        ' <h2 id="header2">An H2</h2>'
        ' <p class="class1 class2 class3" id="pmulti">Another</p>'
        ' <a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>'
        ' <h2 id="header3">Another H2</h2>'
        ' <a id="me" href="http://simonwillison.net/" rel="me">me</a>'
        ' <span class="s1">'
        ' <a href="#" id="s1a1">span1a1</a>'
        ' <a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>'
        ' <span class="span2">'
        ' <a href="#" id="s2a1">span2a1</a>'
        ' </span>'
        ' <span class="span3"></span>'
        ' <custom-dashed-tag class="dashed" id="dash2"/>'
        ' <div data-tag="dashedvalue" id="data1"/>'
        ' </span>'
        ' </div>'
        ' <x id="xid">'
        ' <z id="zida"/>'
        ' <z id="zidab"/>'
        ' <z id="zidac"/>'
        ' </x>'
        ' <y id="yid">'
        ' <z id="zidb"/>'
        ' </y>'
        ' <p lang="en" id="lang-en">English</p>'
        ' <p lang="en-gb" id="lang-en-gb">English UK</p>'
        ' <p lang="en-us" id="lang-en-us">English US</p>'
        ' <p lang="fr" id="lang-fr">French</p>'
        ' </div>'
        ' <div id="footer">'
        ' </div> '
    )

    def setup_method(self):
        """Parse the shared HTML fixture with html.parser before each test."""
        self._soup = BeautifulSoup(self.HTML, "html.parser")

    def assert_css_selects(
        self, selector: str, expected_ids: List[str], **kwargs: Any
    ) -> None:
        """Assert that a selector matches exactly the tags with the given IDs.

        The comparison ignores order: both the IDs found and the IDs
        expected are sorted before being compared.

        :param selector: CSS selector to run against the fixture document.
        :param expected_ids: ``id`` attribute values of the tags that
            should be selected. The list is not modified.
        :param kwargs: Extra keyword arguments passed through to
            ``select()`` (for example ``limit``).
        """
        results = self._soup.select(selector, **kwargs)
        assert isinstance(results, ResultSet)
        # Compare sorted copies rather than sorting in place, so the
        # caller's list is left untouched.
        found = sorted(el["id"] for el in results)
        expected = sorted(expected_ids)
        assert expected == found, "Selector %s, expected [%s], got [%s]" % (
            selector,
            ", ".join(expected),
            ", ".join(found),
        )

    # Alias for assert_css_selects under a camel-case name.
    assertSelect = assert_css_selects

    def assert_css_select_multiple(self, *tests: Tuple[str, List[str]]) -> None:
        """Run assert_css_selects on each (selector, expected_ids) pair."""
        for selector, expected_ids in tests:
            self.assert_css_selects(selector, expected_ids)

    def test_precompiled(self):
        """A selector compiled via .css.compile() works with select()."""
        sel = self._soup.css.compile("div")

        els = self._soup.select(sel)
        assert len(els) == 4
        for div in els:
            assert div.name == "div"

        el = self._soup.select_one(sel)
        assert "main" == el["id"]

    def test_one_tag_one(self):
        """A tag-name selector that matches a single tag."""
        els = self._soup.select("title")
        assert len(els) == 1
        assert els[0].name == "title"
        assert els[0].contents == ["The title"]

    def test_one_tag_many(self):
        """A tag-name selector that matches several tags."""
        els = self._soup.select("div")
        assert len(els) == 4
        for div in els:
            assert div.name == "div"

        # select_one() returns the first match.
        el = self._soup.select_one("div")
        assert "main" == el["id"]

    def test_select_one_returns_none_if_no_match(self):
        """select_one() returns None when nothing matches."""
        match = self._soup.select_one("nonexistenttag")
        assert match is None

    def test_tag_in_tag_one(self):
        """The descendant combinator with the same tag name on both sides."""
        self.assert_css_selects("div div", ["inner", "data1"])

    def test_tag_in_tag_many(self):
        """Descendant selectors with different ancestors match the same tags."""
        for selector in ("html div", "html body div", "body div"):
            self.assert_css_selects(selector, ["data1", "main", "inner", "footer"])

    def test_limit(self):
        """The limit argument caps the number of results."""
        self.assert_css_selects("html div", ["main"], limit=1)
        self.assert_css_selects("html body div", ["inner", "main"], limit=2)
        # A limit larger than the number of matches returns every match.
        self.assert_css_selects(
            "body div", ["data1", "main", "inner", "footer"], limit=10
        )

    def test_tag_no_match(self):
        """A tag name that isn't in the document selects nothing."""
        assert len(self._soup.select("del")) == 0

    def test_invalid_tag(self):
        """A malformed selector raises SelectorSyntaxError."""
        with pytest.raises(SelectorSyntaxError):
            self._soup.select("tag%t")

    def test_select_dashed_tag_ids(self):
        """Tag names containing dashes can be selected."""
        self.assert_css_selects("custom-dashed-tag", ["dash1", "dash2"])

    def test_select_dashed_by_id(self):
        """A dashed tag name can be combined with an attribute selector."""
        dashed = self._soup.select('custom-dashed-tag[id="dash2"]')
        assert dashed[0].name == "custom-dashed-tag"
        assert dashed[0]["id"] == "dash2"

    def test_dashed_tag_text(self):
        """The text of a selected dashed tag is available."""
        assert self._soup.select("body > custom-dashed-tag")[0].text == "Hello there."

    def test_select_dashed_matches_find_all(self):
        """select() and find_all() agree on a dashed tag name."""
        assert self._soup.select("custom-dashed-tag") == self._soup.find_all(
            "custom-dashed-tag"
        )

    def test_header_tags(self):
        """Header tags are selected by tag name."""
        self.assert_css_select_multiple(
            ("h1", ["header1"]),
            ("h2", ["header2", "header3"]),
        )

    def test_class_one(self):
        """A class selector, alone or qualified, matches the one tag."""
        for selector in (".onep", "p.onep", "html p.onep"):
            els = self._soup.select(selector)
            assert len(els) == 1
            assert els[0].name == "p"
            assert els[0]["class"] == ["onep"]

    def test_class_mismatched_tag(self):
        """A class selector qualified with the wrong tag name matches nothing."""
        els = self._soup.select("div.onep")
        assert len(els) == 0

    def test_one_id(self):
        """An ID selector, alone or qualified, matches the one tag."""
        for selector in ("div#inner", "#inner", "div div#inner"):
            self.assert_css_selects(selector, ["inner"])

    def test_bad_id(self):
        """An ID that isn't in the document selects nothing."""
        els = self._soup.select("#doesnotexist")
        assert len(els) == 0

    def test_items_in_id(self):
        """Descendants of a tag selected by ID."""
        els = self._soup.select("div#inner p")
        assert len(els) == 3
        for el in els:
            assert el.name == "p"
        assert els[1]["class"] == ["onep"]
        assert not els[0].has_attr("class")

    def test_a_bunch_of_emptys(self):
        """Several selectors that match nothing in the document."""
        for selector in ("div#main del", "div#main div.oops", "div div#main"):
            assert len(self._soup.select(selector)) == 0

    def test_multi_class_support(self):
        """A tag with several classes is matched by any one of them."""
        for selector in (
            ".class1",
            "p.class1",
            ".class2",
            "p.class2",
            ".class3",
            "p.class3",
            "html p.class2",
            "div#inner .class2",
        ):
            self.assert_css_selects(selector, ["pmulti"])

    def test_multi_class_selection(self):
        """Chained class selectors match a tag that has all the classes."""
        for selector in (".class1.class3", ".class3.class2", ".class1.class2.class3"):
            self.assert_css_selects(selector, ["pmulti"])

    def test_child_selector(self):
        """The child combinator (>)."""
        self.assert_css_selects(".s1 > a", ["s1a1", "s1a2"])
        self.assert_css_selects(".s1 > a span", ["s1a2s1"])

    def test_child_selector_id(self):
        """The child combinator combined with an ID selector."""
        self.assert_css_selects(".s1 > a#s1a2 span", ["s1a2s1"])

    def test_attribute_equals(self):
        """The [attr="value"] exact-match attribute selector."""
        self.assert_css_select_multiple(
            ('p[class="onep"]', ["p1"]),
            ('p[id="p1"]', ["p1"]),
            ('[class="onep"]', ["p1"]),
            ('[id="p1"]', ["p1"]),
            ('link[rel="stylesheet"]', ["l1"]),
            ('link[type="text/css"]', ["l1"]),
            ('link[href="blah.css"]', ["l1"]),
            ('link[href="no-blah.css"]', []),
            ('[rel="stylesheet"]', ["l1"]),
            ('[type="text/css"]', ["l1"]),
            ('[href="blah.css"]', ["l1"]),
            ('[href="no-blah.css"]', []),
            ('p[href="no-blah.css"]', []),
            ('[href="no-blah.css"]', []),
        )

    def test_attribute_tilde(self):
        """The [attr~="value"] whitespace-separated-word attribute selector."""
        self.assert_css_select_multiple(
            ('p[class~="class1"]', ["pmulti"]),
            ('p[class~="class2"]', ["pmulti"]),
            ('p[class~="class3"]', ["pmulti"]),
            ('[class~="class1"]', ["pmulti"]),
            ('[class~="class2"]', ["pmulti"]),
            ('[class~="class3"]', ["pmulti"]),
            ('a[rel~="friend"]', ["bob"]),
            ('a[rel~="met"]', ["bob"]),
            ('[rel~="friend"]', ["bob"]),
            ('[rel~="met"]', ["bob"]),
        )

    def test_attribute_startswith(self):
        """The [attr^="value"] prefix attribute selector."""
        self.assert_css_select_multiple(
            ('[rel^="style"]', ["l1"]),
            ('link[rel^="style"]', ["l1"]),
            ('notlink[rel^="notstyle"]', []),
            ('[rel^="notstyle"]', []),
            ('link[rel^="notstyle"]', []),
            ('link[href^="bla"]', ["l1"]),
            ('a[href^="http://"]', ["bob", "me"]),
            ('[href^="http://"]', ["bob", "me"]),
            ('[id^="p"]', ["pmulti", "p1"]),
            ('[id^="m"]', ["me", "main"]),
            ('div[id^="m"]', ["main"]),
            ('a[id^="m"]', ["me"]),
            ('div[data-tag^="dashed"]', ["data1"]),
        )

    def test_attribute_endswith(self):
        """The [attr$="value"] suffix attribute selector."""
        self.assert_css_select_multiple(
            ('[href$=".css"]', ["l1"]),
            ('link[href$=".css"]', ["l1"]),
            ('link[id$="1"]', ["l1"]),
            (
                '[id$="1"]',
                ["data1", "l1", "p1", "header1", "s1a1", "s2a1", "s1a2s1", "dash1"],
            ),
            ('div[id$="1"]', ["data1"]),
            ('[id$="noending"]', []),
        )

    def test_attribute_contains(self):
        """The [attr*="value"] substring attribute selector."""
        self.assert_css_select_multiple(
            # From test_attribute_startswith
            ('[rel*="style"]', ["l1"]),
            ('link[rel*="style"]', ["l1"]),
            ('notlink[rel*="notstyle"]', []),
            ('[rel*="notstyle"]', []),
            ('link[rel*="notstyle"]', []),
            ('link[href*="bla"]', ["l1"]),
            ('[href*="http://"]', ["bob", "me"]),
            ('[id*="p"]', ["pmulti", "p1"]),
            ('div[id*="m"]', ["main"]),
            ('a[id*="m"]', ["me"]),
            # From test_attribute_endswith
            ('[href*=".css"]', ["l1"]),
            ('link[href*=".css"]', ["l1"]),
            ('link[id*="1"]', ["l1"]),
            (
                '[id*="1"]',
                [
                    "data1",
                    "l1",
                    "p1",
                    "header1",
                    "s1a1",
                    "s1a2",
                    "s2a1",
                    "s1a2s1",
                    "dash1",
                ],
            ),
            ('div[id*="1"]', ["data1"]),
            ('[id*="noending"]', []),
            # New for this test
            ('[href*="."]', ["bob", "me", "l1"]),
            ('a[href*="."]', ["bob", "me"]),
            ('link[href*="."]', ["l1"]),
            ('div[id*="n"]', ["main", "inner"]),
            ('div[id*="nn"]', ["inner"]),
            ('div[data-tag*="edval"]', ["data1"]),
        )

    def test_attribute_exact_or_hypen(self):
        """The [attr|="value"] selector: exact value, or value plus a hyphen."""
        self.assert_css_select_multiple(
            ('p[lang|="en"]', ["lang-en", "lang-en-gb", "lang-en-us"]),
            ('[lang|="en"]', ["lang-en", "lang-en-gb", "lang-en-us"]),
            ('p[lang|="fr"]', ["lang-fr"]),
            ('p[lang|="gb"]', []),
        )

    def test_attribute_exists(self):
        """The [attr] attribute-presence selector."""
        self.assert_css_select_multiple(
            ("[rel]", ["l1", "bob", "me"]),
            ("link[rel]", ["l1"]),
            ("a[rel]", ["bob", "me"]),
            ("[lang]", ["lang-en", "lang-en-gb", "lang-en-us", "lang-fr"]),
            ("p[class]", ["p1", "pmulti"]),
            ("[blah]", []),
            ("p[blah]", []),
            ("div[data-tag]", ["data1"]),
        )

    def test_quoted_space_in_selector_name(self):
        """A quoted attribute value may contain a space."""
        html = (
            '<div style="display: wrong">nope</div>'
            ' <div style="display: right">yes</div> '
        )
        soup = BeautifulSoup(html, "html.parser")
        [chosen] = soup.select('div[style="display: right"]')
        assert "yes" == chosen.string

    def test_unsupported_pseudoclass(self):
        """Unknown pseudo-classes and bad pseudo-class arguments raise."""
        # The exception type for an unknown pseudo-class depends on the
        # installed soupsieve version; see the module-level constant.
        with pytest.raises(SOUPSIEVE_EXCEPTION_ON_UNSUPPORTED_PSEUDOCLASS):
            self._soup.select("a:no-such-pseudoclass")

        with pytest.raises(SelectorSyntaxError):
            self._soup.select("a:nth-of-type(a)")

    def test_nth_of_type(self):
        """The :nth-of-type() pseudo-class."""
        # Try to select first paragraph
        els = self._soup.select("div#inner p:nth-of-type(1)")
        assert len(els) == 1
        assert els[0].string == "Some text"

        # Try to select third paragraph
        els = self._soup.select("div#inner p:nth-of-type(3)")
        assert len(els) == 1
        assert els[0].string == "Another"

        # Try to select (non-existent!) fourth paragraph
        els = self._soup.select("div#inner p:nth-of-type(4)")
        assert len(els) == 0

        # Zero will select no tags.
        els = self._soup.select("div p:nth-of-type(0)")
        assert len(els) == 0

    def test_nth_of_type_direct_descendant(self):
        """:nth-of-type() combined with the child combinator."""
        els = self._soup.select("div#inner > p:nth-of-type(1)")
        assert len(els) == 1
        assert els[0].string == "Some text"

    def test_id_child_selector_nth_of_type(self):
        """:nth-of-type() on a child of a tag selected by ID."""
        self.assert_css_selects("#inner > p:nth-of-type(2)", ["p1"])

    def test_select_on_element(self):
        """select() can be called on a tag, not just on the whole tree."""
        # Other tests operate on the tree; this operates on an element
        # within the tree.
        main = self._soup.find("div", id="main")
        selected = main.select("div")
        # The <div id="inner"> tag was selected. The <div id="footer">
        # tag was not.
        self.assert_selects_ids(selected, ["inner", "data1"])

    def test_overspecified_child_id(self):
        """An ID selector qualified by an ancestor's class."""
        self.assert_css_selects(".fancy #inner", ["inner"])
        self.assert_css_selects(".normal #inner", [])

    def test_adjacent_sibling_selector(self):
        """The adjacent sibling combinator (+)."""
        self.assert_css_selects("#p1 + h2", ["header2"])
        self.assert_css_selects("#p1 + h2 + p", ["pmulti"])
        self.assert_css_selects("#p1 + #header2 + .class1", ["pmulti"])
        assert [] == self._soup.select("#p1 + p")

    def test_general_sibling_selector(self):
        """The general sibling combinator (~)."""
        self.assert_css_selects("#p1 ~ h2", ["header2", "header3"])
        self.assert_css_selects("#p1 ~ #header2", ["header2"])
        self.assert_css_selects("#p1 ~ h2 + a", ["me"])
        self.assert_css_selects('#p1 ~ h2 + [rel="me"]', ["me"])
        assert [] == self._soup.select("#inner ~ h2")

    def test_dangling_combinator(self):
        """A combinator with nothing after it raises SelectorSyntaxError."""
        with pytest.raises(SelectorSyntaxError):
            self._soup.select("h1 >")

    def test_sibling_combinator_wont_select_same_tag_twice(self):
        """Each tag matched through ~ appears only once in the results."""
        self.assert_css_selects("p[lang] ~ p", ["lang-en-gb", "lang-en-us", "lang-fr"])

    # Test the selector grouping operator (the comma)
    def test_multiple_select(self):
        """A comma-separated selector list."""
        self.assert_css_selects("x, y", ["xid", "yid"])

    def test_multiple_select_with_no_space(self):
        """A selector list with no whitespace after the comma."""
        self.assert_css_selects("x,y", ["xid", "yid"])

    def test_multiple_select_with_more_space(self):
        """A selector list with extra whitespace after the comma."""
        self.assert_css_selects("x,    y", ["xid", "yid"])

    def test_multiple_select_duplicated(self):
        """Repeating a selector in the list doesn't duplicate results."""
        self.assert_css_selects("x, x", ["xid"])

    def test_multiple_select_sibling(self):
        """A selector list containing a sibling combinator."""
        self.assert_css_selects("x, y ~ p[lang=fr]", ["xid", "lang-fr"])

    def test_multiple_select_tag_and_direct_descendant(self):
        """A selector list containing a child combinator."""
        self.assert_css_selects("x, y > z", ["xid", "zidb"])

    def test_multiple_select_direct_descendant_and_tags(self):
        """A selector list that starts with a child combinator."""
        self.assert_css_selects(
            "div > x, y, z", ["xid", "yid", "zida", "zidb", "zidab", "zidac"]
        )

    def test_multiple_select_indirect_descendant(self):
        """A selector list that starts with a descendant combinator."""
        self.assert_css_selects(
            "div x,y, z", ["xid", "yid", "zida", "zidb", "zidab", "zidac"]
        )

    def test_invalid_multiple_select(self):
        """Empty members of a selector list raise SelectorSyntaxError."""
        with pytest.raises(SelectorSyntaxError):
            self._soup.select(",x, y")
        with pytest.raises(SelectorSyntaxError):
            self._soup.select("x,,y")

    def test_multiple_select_attrs(self):
        """A selector list of attribute selectors."""
        self.assert_css_selects("p[lang=en], p[lang=en-gb]", ["lang-en", "lang-en-gb"])

    def test_multiple_select_ids(self):
        """A selector list mixing combinators and attribute selectors."""
        self.assert_css_selects(
            "x, y > z[id=zida], z[id=zidab], z[id=zidb]", ["xid", "zidb", "zidab"]
        )

    def test_multiple_select_nested(self):
        """A selector list whose members each use child combinators."""
        self.assert_css_selects("body > div > x, y > z", ["xid", "zidb"])

    def test_select_duplicate_elements(self):
        """Identical-looking tags are all returned by a selector list."""
        # When markup contains duplicate elements, a multiple select
        # will find all of them.
        markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
        soup = BeautifulSoup(markup, "html.parser")
        selected = soup.select(".c1, .c2")
        assert 3 == len(selected)

        # Verify that find_all finds the same elements, though because
        # of an implementation detail it finds them in a different
        # order.
        for element in soup.find_all(class_=["c1", "c2"]):
            assert element in selected

    def test_closest(self):
        """.css.closest() finds the matching ancestor."""
        inner = self._soup.find("div", id="inner")
        closest = inner.css.closest("div[id=main]")
        assert closest == self._soup.find("div", id="main")

    def test_match(self):
        """.css.match() reports whether the tag itself matches."""
        inner = self._soup.find("div", id="inner")
        main = self._soup.find("div", id="main")
        assert inner.css.match("div[id=main]") is False
        assert main.css.match("div[id=main]") is True

    def test_iselect(self):
        """.css.iselect() returns a generator over the matches."""
        gen = self._soup.css.iselect("h2")
        assert isinstance(gen, types.GeneratorType)
        [header2, header3] = gen
        assert header2["id"] == "header2"
        assert header3["id"] == "header3"

    def test_filter(self):
        """.css.filter() returns a ResultSet of the matching tags."""
        inner = self._soup.find("div", id="inner")
        results = inner.css.filter("h2")
        assert len(results) == 2

        results = inner.css.filter("h2[id=header3]")
        assert isinstance(results, ResultSet)
        [result] = results
        assert result["id"] == "header3"

    def test_escape(self):
        """.css.escape() backslash-escapes CSS special characters."""
        m = self._soup.css.escape
        assert m(".foo#bar") == "\\.foo\\#bar"
        assert m("()[]{}") == "\\(\\)\\[\\]\\{\\}"
        assert m(".foo") == self._soup.css.escape(".foo")