1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
|
from __future__ import annotations
from typing import Callable, Dict, Iterable, Optional, Set, Tuple, TYPE_CHECKING, Union
from typing_extensions import TypeAlias
from bs4.dammit import EntitySubstitution
if TYPE_CHECKING:
from bs4._typing import _AttributeValue
class Formatter(EntitySubstitution):
    """Describes a strategy to use when outputting a parse tree to a string.

    Some parts of this strategy come from the distinction between
    HTML4, HTML5, and XML. Others are configurable by the user.

    Formatters are passed in as the `formatter` argument to methods
    like `bs4.element.Tag.encode`. Most people won't need to
    think about formatters, and most people who need to think about
    them can pass in one of these predefined strings as `formatter`
    rather than making a new Formatter object:

    For HTML documents:
     * 'html' - HTML entity substitution for generic HTML documents. (default)
     * 'html5' - HTML entity substitution for HTML5 documents, as
                 well as some optimizations in the way tags are rendered.
     * 'html5-4.12' - The version of the 'html5' formatter used prior to
                      Beautiful Soup 4.13.0.
     * 'minimal' - Only make the substitutions necessary to guarantee
                   valid HTML.
     * None - Do not perform any substitution. This will be faster
              but may result in invalid markup.

    For XML documents:
     * 'html' - Entity substitution for XHTML documents.
     * 'minimal' - Only make the substitutions necessary to guarantee
                   valid XML. (default)
     * None - Do not perform any substitution. This will be faster
              but may result in invalid markup.
    """

    #: Constant name denoting HTML markup
    HTML: str = "html"

    #: Constant name denoting XML markup
    XML: str = "xml"

    #: Default values for the various constructor options when the
    #: markup language is HTML.
    HTML_DEFAULTS: Dict[str, Set[str]] = {
        "cdata_containing_tags": {"script", "style"},
    }

    language: Optional[str]  #: :meta private:
    entity_substitution: Optional[_EntitySubstitutionFunction]  #: :meta private:
    void_element_close_prefix: str  #: :meta private:
    cdata_containing_tags: Set[str]  #: :meta private:
    indent: str  #: :meta private:

    #: If this is set to true by the constructor, then attributes whose
    #: values are set to the empty string will be treated as HTML
    #: boolean attributes. (Attributes whose value is None are always
    #: rendered this way.)
    empty_attributes_are_booleans: bool

    def _default(
        self, language: str, value: Optional[Set[str]], kwarg: str
    ) -> Set[str]:
        """Pick the effective value for a set-valued constructor option.

        :param language: The markup language in use (`Formatter.XML`
            or `Formatter.HTML`).
        :param value: The value the caller passed to the constructor,
            or None if the caller did not provide one.
        :param kwarg: The name of the constructor option, used as the
            key into `HTML_DEFAULTS`.
        :return: `value` itself if it is not None; otherwise an empty
            set for XML, or a fresh copy of the HTML default.
        """
        if value is not None:
            return value
        if language == self.XML:
            # When XML is the markup language in use, all of the
            # defaults are the empty set.
            return set()

        # Otherwise, it depends on what's in HTML_DEFAULTS. Hand back a
        # copy so that mutating one formatter's set cannot alter the
        # class-level defaults shared by every other formatter.
        return set(self.HTML_DEFAULTS[kwarg])

    def __init__(
        self,
        language: Optional[str] = None,
        entity_substitution: Optional[_EntitySubstitutionFunction] = None,
        void_element_close_prefix: str = "/",
        cdata_containing_tags: Optional[Set[str]] = None,
        empty_attributes_are_booleans: bool = False,
        indent: Union[int, str] = 1,
    ):
        r"""Constructor.

        :param language: This should be `Formatter.XML` if you are formatting
           XML markup and `Formatter.HTML` if you are formatting HTML markup.
           If omitted (or otherwise falsy), `Formatter.HTML` is used.

        :param entity_substitution: A function to call to replace special
           characters with XML/HTML entities. For examples, see
           bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
        :param void_element_close_prefix: By default, void elements
           are represented as <tag/> (XML rules) rather than <tag>
           (HTML rules). To get <tag>, pass in the empty string.
        :param cdata_containing_tags: The set of tags that are defined
           as containing CDATA in this dialect. For example, in HTML,
           <script> and <style> tags are defined as containing CDATA,
           and their contents should not be formatted.
        :param empty_attributes_are_booleans: If this is set to true,
          then attributes whose values are set to the empty string
          will be treated as `HTML boolean
          attributes <https://dev.w3.org/html5/spec-LC/common-microsyntaxes.html#boolean-attributes>`_.
          (Attributes whose value is None are always rendered this way.)
        :param indent: If indent is a non-negative integer or string,
            then the contents of elements will be indented
            appropriately when pretty-printing. An indent level of 0,
            negative, or "" will only insert newlines. Using a
            positive integer indent indents that many spaces per
            level. If indent is a string (such as "\t"), that string
            is used to indent each level. The default behavior is to
            indent one space per level. None is treated like 0; a
            value of any other type falls back to a single space.
        """
        self.language = language or self.HTML
        self.entity_substitution = entity_substitution
        self.void_element_close_prefix = void_element_close_prefix
        self.cdata_containing_tags = self._default(
            self.language, cdata_containing_tags, "cdata_containing_tags"
        )
        self.empty_attributes_are_booleans = empty_attributes_are_booleans

        # Normalize `indent` into the literal string emitted once per
        # nesting level.
        if indent is None:
            indent = 0
        indent_str: str
        if isinstance(indent, int):
            # Negative counts are clamped to zero, i.e. no indentation.
            indent_str = " " * max(indent, 0)
        elif isinstance(indent, str):
            indent_str = indent
        else:
            # Unsupported type: fall back to one space per level.
            indent_str = " "
        self.indent = indent_str

    def substitute(self, ns: str) -> str:
        """Process a string that needs to undergo entity substitution.
        This may be a string encountered in an attribute value or as
        text.

        :param ns: A string.
        :return: The same string but with certain characters replaced by named
           or numeric entities. The string is returned untouched when this
           formatter has no entity substitution function, or when it is a
           `NavigableString` whose parent tag is one of
           `cdata_containing_tags`.
        """
        if not self.entity_substitution:
            return ns
        # Imported here rather than at module level (presumably to avoid a
        # circular import between this module and bs4.element).
        from .element import NavigableString

        if (
            isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in self.cdata_containing_tags
        ):
            # Do nothing.
            return ns
        # Substitute.
        return self.entity_substitution(ns)

    def attribute_value(self, value: str) -> str:
        """Process the value of an attribute.

        :param value: A string.
        :return: A string with certain characters replaced by named
           or numeric entities.
        """
        return self.substitute(value)

    def attributes(
        self, tag: bs4.element.Tag
    ) -> Iterable[Tuple[str, Optional[_AttributeValue]]]:
        """Reorder a tag's attributes however you want.

        By default, attributes are sorted alphabetically. This makes
        behavior consistent between Python 2 and Python 3, and preserves
        backwards compatibility with older versions of Beautiful Soup.

        If `empty_attributes_are_booleans` is True, then
        attributes whose values are set to the empty string will be
        treated as boolean attributes.

        :param tag: The tag whose attributes are about to be output.
        :return: A sorted list of (name, value) pairs; an empty list if
            the tag's `attrs` is None. A value of None marks an
            attribute to be treated as a boolean attribute.
        """
        if tag.attrs is None:
            return []
        booleans = self.empty_attributes_are_booleans
        return sorted(
            (name, (None if booleans and value == "" else value))
            for name, value in tag.attrs.items()
        )
class HTMLFormatter(Formatter):
    """A `Formatter` whose markup language is fixed to HTML.

    The constructor takes the same options as `Formatter`, minus
    `language`, which is always `Formatter.HTML`.
    """

    #: Predefined HTML formatters, keyed by name (or None). Populated
    #: at module level once the formatter classes have been defined.
    REGISTRY: Dict[Optional[str], HTMLFormatter] = {}

    def __init__(
        self,
        entity_substitution: Optional[_EntitySubstitutionFunction] = None,
        void_element_close_prefix: str = "/",
        cdata_containing_tags: Optional[Set[str]] = None,
        empty_attributes_are_booleans: bool = False,
        indent: Union[int,str] = 1,
    ):
        """Constructor.

        Every argument is forwarded unchanged to `Formatter.__init__`,
        with `language` set to `Formatter.HTML`.
        """
        super().__init__(
            language=self.HTML,
            entity_substitution=entity_substitution,
            void_element_close_prefix=void_element_close_prefix,
            cdata_containing_tags=cdata_containing_tags,
            empty_attributes_are_booleans=empty_attributes_are_booleans,
            indent=indent,
        )
class XMLFormatter(Formatter):
    """A `Formatter` whose markup language is fixed to XML.

    The constructor takes the same options as `Formatter`, minus
    `language`, which is always `Formatter.XML`.
    """

    #: Predefined XML formatters, keyed by name (or None). Populated
    #: at module level once the formatter classes have been defined.
    REGISTRY: Dict[Optional[str], XMLFormatter] = {}

    def __init__(
        self,
        entity_substitution: Optional[_EntitySubstitutionFunction] = None,
        void_element_close_prefix: str = "/",
        cdata_containing_tags: Optional[Set[str]] = None,
        empty_attributes_are_booleans: bool = False,
        indent: Union[int,str] = 1,
    ):
        """Constructor.

        Every argument is forwarded unchanged to `Formatter.__init__`,
        with `language` set to `Formatter.XML`.
        """
        super().__init__(
            language=self.XML,
            entity_substitution=entity_substitution,
            void_element_close_prefix=void_element_close_prefix,
            cdata_containing_tags=cdata_containing_tags,
            empty_attributes_are_booleans=empty_attributes_are_booleans,
            indent=indent,
        )
# Register the predefined formatters under their string (or None) aliases.
HTMLFormatter.REGISTRY.update(
    {
        "html": HTMLFormatter(
            entity_substitution=EntitySubstitution.substitute_html,
        ),
        # HTML5 rendering: no "/" on void elements, and empty-string
        # attributes are output as boolean attributes.
        "html5": HTMLFormatter(
            entity_substitution=EntitySubstitution.substitute_html5,
            void_element_close_prefix="",
            empty_attributes_are_booleans=True,
        ),
        # Same rendering options as "html5", but with substitute_html
        # as the entity substitution function.
        "html5-4.12": HTMLFormatter(
            entity_substitution=EntitySubstitution.substitute_html,
            void_element_close_prefix="",
            empty_attributes_are_booleans=True,
        ),
        "minimal": HTMLFormatter(
            entity_substitution=EntitySubstitution.substitute_xml,
        ),
        # No entity substitution at all.
        None: HTMLFormatter(entity_substitution=None),
    }
)

XMLFormatter.REGISTRY.update(
    {
        "html": XMLFormatter(
            entity_substitution=EntitySubstitution.substitute_html,
        ),
        "minimal": XMLFormatter(
            entity_substitution=EntitySubstitution.substitute_xml,
        ),
        # No entity substitution at all.
        None: XMLFormatter(entity_substitution=None),
    }
)
# Define type aliases to improve readability.
#
# These live at the bottom of the module. The annotations above that
# mention _EntitySubstitutionFunction are not evaluated at definition
# time (see `from __future__ import annotations` at the top of the
# file), so the forward references are harmless. _FormatterOrName, on
# the other hand, evaluates `Formatter` at runtime and therefore must
# come after the class definition.
#
#: A function to call to replace special characters with XML or HTML
#: entities. It takes the original string and returns the substituted
#: string.
_EntitySubstitutionFunction: TypeAlias = Callable[[str], str]
# Many of the output-centered methods take an argument that can either
# be a Formatter object or the name of a Formatter to be looked up.
_FormatterOrName = Union[Formatter, str]
|