1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
|
import pytest
from bs4.element import Tag
from bs4.formatter import (
Formatter,
HTMLFormatter,
XMLFormatter,
)
from . import SoupTest
class TestFormatter(SoupTest):
def test_default_attributes(self):
# Test the default behavior of Formatter.attributes().
formatter = Formatter()
tag = Tag(name="tag")
tag["b"] = "1"
tag["a"] = "2"
# Attributes come out sorted by name. In Python 3, attributes
# normally come out of a dictionary in the order they were
# added.
assert [("a", "2"), ("b", "1")] == formatter.attributes(tag)
# This works even if Tag.attrs is None, though this shouldn't
# normally happen.
tag.attrs = None
assert [] == formatter.attributes(tag)
assert " " == formatter.indent
def test_sort_attributes(self):
# Test the ability to override Formatter.attributes() to,
# e.g., disable the normal sorting of attributes.
class UnsortedFormatter(Formatter):
def attributes(self, tag):
self.called_with = tag
for k, v in sorted(tag.attrs.items()):
if k == "ignore":
continue
yield k, v
soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
formatter = UnsortedFormatter()
decoded = soup.decode(formatter=formatter)
# attributes() was called on the <p> tag. It filtered out one
# attribute and sorted the other two.
assert formatter.called_with == soup.p
assert '<p aval="2" cval="1"></p>' == decoded
def test_empty_attributes_are_booleans(self):
# Test the behavior of empty_attributes_are_booleans as well
# as which Formatters have it enabled.
for name in ("html", "minimal", None):
formatter = HTMLFormatter.REGISTRY[name]
assert False is formatter.empty_attributes_are_booleans
formatter = XMLFormatter.REGISTRY[None]
assert False is formatter.empty_attributes_are_booleans
formatter = HTMLFormatter.REGISTRY["html5"]
assert True is formatter.empty_attributes_are_booleans
# Verify that the constructor sets the value.
formatter = Formatter(empty_attributes_are_booleans=True)
assert True is formatter.empty_attributes_are_booleans
# Now demonstrate what it does to markup.
for markup in ("<option selected></option>", '<option selected=""></option>'):
soup = self.soup(markup)
for formatter in ("html", "minimal", "xml", None):
assert b'<option selected=""></option>' == soup.option.encode(
formatter="html"
)
assert b"<option selected></option>" == soup.option.encode(
formatter="html5"
)
@pytest.mark.parametrize(
"indent,expect",
[
(None, "<a>\n<b>\ntext\n</b>\n</a>\n"),
(-1, "<a>\n<b>\ntext\n</b>\n</a>\n"),
(0, "<a>\n<b>\ntext\n</b>\n</a>\n"),
("", "<a>\n<b>\ntext\n</b>\n</a>\n"),
(1, "<a>\n <b>\n text\n </b>\n</a>\n"),
(2, "<a>\n <b>\n text\n </b>\n</a>\n"),
("\t", "<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>\n"),
("abc", "<a>\nabc<b>\nabcabctext\nabc</b>\n</a>\n"),
# Some invalid inputs -- the default behavior is used.
(object(), "<a>\n <b>\n text\n </b>\n</a>\n"),
(b"bytes", "<a>\n <b>\n text\n </b>\n</a>\n"),
],
)
def test_indent(self, indent, expect):
# Pretty-print a tree with a Formatter set to
# indent in a certain way and verify the results.
soup = self.soup("<a><b>text</b></a>")
formatter = Formatter(indent=indent)
assert soup.prettify(formatter=formatter) == expect
# Pretty-printing only happens with prettify(), not
# encode().
assert soup.encode(formatter=formatter) != expect
def test_default_indent_value(self):
formatter = Formatter()
assert formatter.indent == " "
@pytest.mark.parametrize("formatter,expect",
[
(HTMLFormatter(indent=1), "<p>\n a\n</p>\n"),
(HTMLFormatter(indent=2), "<p>\n a\n</p>\n"),
(XMLFormatter(indent=1), "<p>\n a\n</p>\n"),
(XMLFormatter(indent="\t"), "<p>\n\ta\n</p>\n"),
] )
def test_indent_subclasses(self, formatter, expect):
soup = self.soup("<p>a</p>")
assert expect == soup.p.prettify(formatter=formatter)
@pytest.mark.parametrize(
"s,expect_html,expect_html5",
[
# The html5 formatter is much less aggressive about escaping ampersands
# than the html formatter.
("foo & bar", "foo & bar", "foo & bar"),
("foo&", "foo&", "foo&"),
("foo&&& bar", "foo&&& bar", "foo&&& bar"),
("x=1&y=2", "x=1&y=2", "x=1&y=2"),
("&123", "&123", "&123"),
("&abc", "&abc", "&abc"),
("foo &0 bar", "foo &0 bar", "foo &0 bar"),
("foo &lolwat bar", "foo &lolwat bar", "foo &lolwat bar"),
# But both formatters escape what the HTML5 spec considers ambiguous ampersands.
("&nosuchentity;", "&nosuchentity;", "&nosuchentity;"),
],
)
def test_entity_substitution(self, s, expect_html, expect_html5):
assert HTMLFormatter.REGISTRY["html"].substitute(s) == expect_html
assert HTMLFormatter.REGISTRY["html5"].substitute(s) == expect_html5
assert HTMLFormatter.REGISTRY["html5-4.12"].substitute(s) == expect_html
def test_entity_round_trip(self):
# This is more an explanatory test and a way to avoid regressions than a test of functionality.
markup = "<p>Some division signs: ÷ ÷ ÷ ÷. These are made with: ÷ &divide; &#247;</p>"
soup = self.soup(markup)
assert (
"Some division signs: ÷ ÷ ÷ ÷. These are made with: ÷ ÷ ÷"
== soup.p.string
)
# Oops, I forgot to mention the entity.
soup.p.string = soup.p.string + " ÷"
assert (
"Some division signs: ÷ ÷ ÷ ÷. These are made with: ÷ ÷ ÷ ÷"
== soup.p.string
)
expect = "<p>Some division signs: ÷ ÷ ÷ ÷. These are made with: ÷ &divide; &#247; &#xf7;</p>"
assert expect == soup.p.decode(formatter="html")
assert expect == soup.p.decode(formatter="html5")
markup = "<p>a & b</p>"
soup = self.soup(markup)
assert "<p>a & b</p>" == soup.p.decode(formatter="html")
assert "<p>a & b</p>" == soup.p.decode(formatter="html5")
|