aboutsummaryrefslogtreecommitdiff
path: root/.venv/lib/python3.12/site-packages/bs4/formatter.py
blob: bfa08764956f76773317663f2bb6ed05057c36f8 (about) (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
from __future__ import annotations
from typing import Callable, Dict, Iterable, Optional, Set, Tuple, TYPE_CHECKING, Union
from typing_extensions import TypeAlias
from bs4.dammit import EntitySubstitution

if TYPE_CHECKING:
    from bs4._typing import _AttributeValue


class Formatter(EntitySubstitution):
    """Describes a strategy to use when outputting a parse tree to a string.

    Some parts of this strategy come from the distinction between
    HTML4, HTML5, and XML. Others are configurable by the user.

    Formatters are passed in as the `formatter` argument to methods
    like `bs4.element.Tag.encode`. Most people won't need to
    think about formatters, and most people who need to think about
    them can pass in one of these predefined strings as `formatter`
    rather than making a new Formatter object:

    For HTML documents:
     * 'html' - HTML entity substitution for generic HTML documents. (default)
     * 'html5' - HTML entity substitution for HTML5 documents, as
                 well as some optimizations in the way tags are rendered.
     * 'html5-4.12' - The version of the 'html5' formatter used prior to
                      Beautiful Soup 4.13.0.
     * 'minimal' - Only make the substitutions necessary to guarantee
                   valid HTML.
     * None - Do not perform any substitution. This will be faster
              but may result in invalid markup.

    For XML documents:
     * 'html' - Entity substitution for XHTML documents.
     * 'minimal' - Only make the substitutions necessary to guarantee
                   valid XML. (default)
     * None - Do not perform any substitution. This will be faster
              but may result in invalid markup.

    """

    #: Constant name denoting HTML markup
    HTML: str = "html"

    #: Constant name denoting XML markup
    XML: str = "xml"

    #: Default values for the various constructor options when the
    #: markup language is HTML.
    HTML_DEFAULTS: Dict[str, Set[str]] = {
        "cdata_containing_tags": {"script", "style"},
    }

    language: Optional[str]  #: :meta private:
    entity_substitution: Optional[_EntitySubstitutionFunction]  #: :meta private:
    void_element_close_prefix: str  #: :meta private:
    cdata_containing_tags: Set[str]  #: :meta private:
    indent: str  #: :meta private:

    #: If this is set to true by the constructor, then attributes whose
    #: values are set to the empty string will be treated as HTML
    #: boolean attributes. (Attributes whose value is None are always
    #: rendered this way.)
    empty_attributes_are_booleans: bool

    def _default(
        self, language: str, value: Optional[Set[str]], kwarg: str
    ) -> Set[str]:
        """Resolve the effective value of a set-valued constructor option.

        :param language: The markup language in use (`Formatter.XML` or
           `Formatter.HTML`).
        :param value: The value the caller passed in, or None if the
           caller did not specify one.
        :param kwarg: The name of the constructor option, used as the
           key into `HTML_DEFAULTS`.
        :return: `value` itself if it is not None; an empty set when
           the language is XML; otherwise a copy of the HTML default.
        :raise KeyError: If the language is not XML and `kwarg` has no
           entry in `HTML_DEFAULTS`.
        """
        if value is not None:
            return value
        if language == self.XML:
            # When XML is the markup language in use, all of the
            # defaults are the empty set.
            return set()

        # Otherwise, it depends on what's in HTML_DEFAULTS. Hand back a
        # copy so that mutating one formatter's set cannot leak into
        # the class-level default or into other formatters.
        return set(self.HTML_DEFAULTS[kwarg])

    def __init__(
        self,
        language: Optional[str] = None,
        entity_substitution: Optional[_EntitySubstitutionFunction] = None,
        void_element_close_prefix: str = "/",
        cdata_containing_tags: Optional[Set[str]] = None,
        empty_attributes_are_booleans: bool = False,
        indent: Optional[Union[int, str]] = 1,
    ):
        r"""Constructor.

        :param language: This should be `Formatter.XML` if you are formatting
           XML markup and `Formatter.HTML` if you are formatting HTML markup.
           A missing or empty value is treated as `Formatter.HTML`.

        :param entity_substitution: A function to call to replace special
           characters with XML/HTML entities. For examples, see
           bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
        :param void_element_close_prefix: By default, void elements
           are represented as <tag/> (XML rules) rather than <tag>
           (HTML rules). To get <tag>, pass in the empty string.
        :param cdata_containing_tags: The set of tags that are defined
           as containing CDATA in this dialect. For example, in HTML,
           <script> and <style> tags are defined as containing CDATA,
           and their contents should not be formatted.
        :param empty_attributes_are_booleans: If this is set to true,
          then attributes whose values are set to the empty string
          will be treated as `HTML boolean
          attributes <https://dev.w3.org/html5/spec-LC/common-microsyntaxes.html#boolean-attributes>`_. (Attributes
          whose value is None are always rendered this way.)
        :param indent: If indent is a non-negative integer or string,
            then the contents of elements will be indented
            appropriately when pretty-printing. An indent level of 0,
            negative, None, or "" will only insert newlines. Using a
            positive integer indent indents that many spaces per
            level. If indent is a string (such as "\t"), that string
            is used to indent each level. The default behavior is to
            indent one space per level.

        """
        self.language = language or self.HTML
        self.entity_substitution = entity_substitution
        self.void_element_close_prefix = void_element_close_prefix
        self.cdata_containing_tags = self._default(
            self.language, cdata_containing_tags, "cdata_containing_tags"
        )
        self.empty_attributes_are_booleans = empty_attributes_are_booleans

        # Normalize `indent` into the literal string emitted once per
        # nesting level.
        if indent is None:
            indent = 0
        indent_str: str
        if isinstance(indent, int):
            # Negative widths are clamped to zero, i.e. newlines only.
            indent_str = " " * max(indent, 0)
        elif isinstance(indent, str):
            indent_str = indent
        else:
            # Any other type falls back to one space per level.
            indent_str = " "
        self.indent = indent_str

    def substitute(self, ns: str) -> str:
        """Process a string that needs to undergo entity substitution.
        This may be a string encountered in an attribute value or as
        text.

        :param ns: A string.
        :return: The same string but with certain characters replaced by named
           or numeric entities. The string is returned unchanged if this
           formatter has no entity substitution function, or if it is a
           NavigableString whose parent tag is one of
           `cdata_containing_tags`.
        """
        if not self.entity_substitution:
            return ns
        # Imported here rather than at module level.
        # NOTE(review): presumably to avoid a circular import with
        # bs4.element -- confirm.
        from .element import NavigableString

        if (
            isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in self.cdata_containing_tags
        ):
            # Do nothing.
            return ns
        # Substitute.
        return self.entity_substitution(ns)

    def attribute_value(self, value: str) -> str:
        """Process the value of an attribute.

        :param value: A string.
        :return: A string with certain characters replaced by named
           or numeric entities.
        """
        return self.substitute(value)

    def attributes(
        self, tag: bs4.element.Tag
    ) -> Iterable[Tuple[str, Optional[_AttributeValue]]]:
        """Reorder a tag's attributes however you want.

        By default, attributes are sorted alphabetically. This makes
        behavior consistent between Python 2 and Python 3, and preserves
        backwards compatibility with older versions of Beautiful Soup.

        If `empty_attributes_are_booleans` is True, then
        attributes whose values are set to the empty string will be
        treated as boolean attributes.

        :param tag: The tag whose attributes are to be rendered.
        :return: A sorted list of (name, value) pairs; a value of None
           marks a boolean attribute.
        """
        if tag.attrs is None:
            return []

        booleans = self.empty_attributes_are_booleans
        # Attribute names are unique dictionary keys, so the tuple
        # comparison is always decided by the name and never reaches
        # the (possibly None) value.
        return sorted(
            (key, None if booleans and value == "" else value)
            for key, value in tag.attrs.items()
        )


class HTMLFormatter(Formatter):
    """A `Formatter` whose markup language is fixed to HTML."""

    #: Predefined HTML formatters, keyed by the name (or None) under
    #: which they were registered.
    REGISTRY: Dict[Optional[str], HTMLFormatter] = {}

    def __init__(
        self,
        entity_substitution: Optional[_EntitySubstitutionFunction] = None,
        void_element_close_prefix: str = "/",
        cdata_containing_tags: Optional[Set[str]] = None,
        empty_attributes_are_booleans: bool = False,
        indent: Union[int,str] = 1,
    ):
        """Constructor.

        Takes the same options as `Formatter`, minus `language`,
        which is always `Formatter.HTML`.
        """
        super().__init__(
            language=self.HTML,
            entity_substitution=entity_substitution,
            void_element_close_prefix=void_element_close_prefix,
            cdata_containing_tags=cdata_containing_tags,
            empty_attributes_are_booleans=empty_attributes_are_booleans,
            indent=indent,
        )


class XMLFormatter(Formatter):
    """A `Formatter` whose markup language is fixed to XML."""

    #: Predefined XML formatters, keyed by the name (or None) under
    #: which they were registered.
    REGISTRY: Dict[Optional[str], XMLFormatter] = {}

    def __init__(
        self,
        entity_substitution: Optional[_EntitySubstitutionFunction] = None,
        void_element_close_prefix: str = "/",
        cdata_containing_tags: Optional[Set[str]] = None,
        empty_attributes_are_booleans: bool = False,
        indent: Union[int,str] = 1,
    ):
        """Constructor.

        Takes the same options as `Formatter`, minus `language`,
        which is always `Formatter.XML`.
        """
        super().__init__(
            language=self.XML,
            entity_substitution=entity_substitution,
            void_element_close_prefix=void_element_close_prefix,
            cdata_containing_tags=cdata_containing_tags,
            empty_attributes_are_booleans=empty_attributes_are_booleans,
            indent=indent,
        )


# Register the predefined formatters under the names (or None) that
# can be used to look them up.
HTMLFormatter.REGISTRY.update(
    {
        "html": HTMLFormatter(
            entity_substitution=EntitySubstitution.substitute_html
        ),
        "html5": HTMLFormatter(
            entity_substitution=EntitySubstitution.substitute_html5,
            void_element_close_prefix="",
            empty_attributes_are_booleans=True,
        ),
        # Same rendering options as "html5", but with the generic HTML
        # entity substitution function.
        "html5-4.12": HTMLFormatter(
            entity_substitution=EntitySubstitution.substitute_html,
            void_element_close_prefix="",
            empty_attributes_are_booleans=True,
        ),
        "minimal": HTMLFormatter(
            entity_substitution=EntitySubstitution.substitute_xml
        ),
        # No entity substitution at all.
        None: HTMLFormatter(entity_substitution=None),
    }
)

XMLFormatter.REGISTRY.update(
    {
        "html": XMLFormatter(
            entity_substitution=EntitySubstitution.substitute_html
        ),
        "minimal": XMLFormatter(
            entity_substitution=EntitySubstitution.substitute_xml
        ),
        # No entity substitution at all.
        None: XMLFormatter(entity_substitution=None),
    }
)

# Define type aliases to improve readability.
#

#: A function to call to replace special characters with XML or HTML
#: entities.
_EntitySubstitutionFunction: TypeAlias = Callable[[str], str]

#: Many of the output-centered methods take an argument that can either
#: be a Formatter object or the name of a Formatter to be looked up.
#: Marked as an explicit TypeAlias for consistency with the alias above.
_FormatterOrName: TypeAlias = Union[Formatter, str]