aboutsummaryrefslogtreecommitdiff
path: root/.venv/lib/python3.12/site-packages/bs4/builder/__init__.py
blob: 5f2b38de2fcbfe43188595143925d102bbb7604a (about) (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
from __future__ import annotations

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

from collections import defaultdict
import re
from types import ModuleType
from typing import (
    Any,
    cast,
    Dict,
    Iterable,
    List,
    Optional,
    Pattern,
    Set,
    Tuple,
    Type,
    TYPE_CHECKING,
)
import warnings
import sys
from bs4.element import (
    AttributeDict,
    AttributeValueList,
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    RubyParenthesisString,
    RubyTextString,
    Stylesheet,
    Script,
    TemplateString,
    nonwhitespace_re,
)

# Exceptions were moved to their own module in 4.13. Import here for
# backwards compatibility.
from bs4.exceptions import ParserRejectedMarkup

from bs4._typing import (
    _AttributeValues,
    _RawAttributeValue,
)

from bs4._warnings import XMLParsedAsHTMLWarning

if TYPE_CHECKING:
    from bs4 import BeautifulSoup
    from bs4.element import (
        NavigableString,
        Tag,
    )
    from bs4._typing import (
        _AttributeValue,
        _Encoding,
        _Encodings,
        _RawOrProcessedAttributeValues,
        _RawMarkup,
    )

# Names of features a TreeBuilder can advertise in its ``features``
# attribute. `TreeBuilderRegistry.lookup` takes feature strings such
# as these.
FAST = "fast"
PERMISSIVE = "permissive"
STRICT = "strict"
XML = "xml"
HTML = "html"
HTML_5 = "html5"

# The public names of this module. `register_treebuilders_from`
# appends the name of every TreeBuilder subclass it registers.
#
# This used to be preceded by a second, shorter ``__all__`` assignment
# (which listed "SAXTreeBuilder"). That assignment was dead code: this
# one overwrote it before anything could read it, so it was removed.
# The list below is unchanged.
__all__ = [
    "TreeBuilderRegistry",
    "TreeBuilder",
    "HTMLTreeBuilder",
    "DetectsXMLParsedAsHTML",

    "ParserRejectedMarkup", # backwards compatibility only as of 4.13.0
]

class TreeBuilderRegistry(object):
    """An index of TreeBuilder subclasses, searchable by the features
    each one advertises.
    """

    #: Maps a feature name to the builders advertising it, most
    #: recently registered first.
    builders_for_feature: Dict[str, List[Type[TreeBuilder]]]

    #: Every registered builder, most recently registered first.
    builders: List[Type[TreeBuilder]]

    def __init__(self) -> None:
        self.builders_for_feature = defaultdict(list)
        self.builders = []

    def register(self, treebuilder_class: type[TreeBuilder]) -> None:
        """Add a treebuilder to the registry under each of its features.

        The new class is placed at the front of every list it joins,
        so later registrations take priority over earlier ones.

        :param treebuilder_class: A subclass of `TreeBuilder`. Its
           `TreeBuilder.features` attribute should list its features.
        """
        for feature_name in treebuilder_class.features:
            by_feature = self.builders_for_feature[feature_name]
            by_feature.insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features: str) -> Optional[Type[TreeBuilder]]:
        """Find a TreeBuilder subclass that has the desired features.

        A requested feature that no registered builder advertises is
        skipped rather than treated as a failure.

        :param features: The features to look for. If none are
            provided, the most recently registered TreeBuilder subclass
            is returned.
        :return: A TreeBuilder subclass, or None if nothing is
            registered, if none of the requested features is known,
            or if no single builder has all of the known ones.
        """
        if not self.builders:
            # Nothing has been registered at all.
            return None

        if not features:
            # No preference expressed: hand back the newest builder.
            return self.builders[0]

        # `ranked` is the builder list of the first recognized feature;
        # it fixes the priority order. `surviving` is narrowed by every
        # later recognized feature.
        ranked = None
        surviving = None
        for feature_name in features:
            # .get() rather than indexing, so that a lookup never adds
            # an empty entry to the defaultdict.
            matching = self.builders_for_feature.get(feature_name, [])
            if not matching:
                continue
            if ranked is None:
                ranked = matching
                surviving = set(matching)
            else:
                surviving = surviving.intersection(set(matching))

        if ranked is None or surviving is None:
            return None

        # The highest-priority builder that survived every filter wins.
        return next(
            (builder for builder in ranked if builder in surviving), None
        )


#: The `BeautifulSoup` constructor will take a list of features
#: and use it to look up `TreeBuilder` classes in this registry.
#: Builders are added to it by `register_treebuilders_from`.
builder_registry: TreeBuilderRegistry = TreeBuilderRegistry()


class TreeBuilder(object):
    """Turn a textual document into a Beautiful Soup object tree.

    This is an abstract superclass which smooths out the behavior of
    different parser libraries into a single, unified interface.

    :param multi_valued_attributes: If this is set to None, the
     TreeBuilder will not turn any values for attributes like
     'class' into lists. Setting this to a dictionary will
     customize this behavior; look at :py:attr:`bs4.builder.HTMLTreeBuilder.DEFAULT_CDATA_LIST_ATTRIBUTES`
     for an example.

     Internally, these are called "CDATA list attributes", but that
     probably doesn't make sense to an end-user, so the argument name
     is ``multi_valued_attributes``.

    :param preserve_whitespace_tags: A set of tags to treat
     the way <pre> tags are treated in HTML. Tags in this set
     are immune from pretty-printing; their contents will always be
     output as-is.

    :param string_containers: A dictionary mapping tag names to
     the classes that should be instantiated to contain the textual
     contents of those tags. The default is to use NavigableString
     for every tag, no matter what the name. You can override the
     default by changing :py:attr:`DEFAULT_STRING_CONTAINERS`.

    :param store_line_numbers: If the parser keeps track of the line
     numbers and positions of the original markup, that information
     will, by default, be stored in each corresponding
     :py:class:`bs4.element.Tag` object. You can turn this off by
     passing store_line_numbers=False; then Tag.sourcepos and
     Tag.sourceline will always be None. If the parser you're using
     doesn't keep track of this information, then store_line_numbers
     is irrelevant.

    :param empty_element_tags: The names of the tags that may be
     presented as empty-element tags. If this is None, every tag
     name is a candidate; see `can_be_empty_element`. The default
     is :py:attr:`DEFAULT_EMPTY_ELEMENT_TAGS`.

    :param attribute_dict_class: The dictionary class intended to
     hold the attributes of a tag. The default is Beautiful Soup's
     built-in `AttributeDict`. This class only records the value
     for other code to use.

    :param attribute_value_list_class: The value of a multi-valued
     attribute (such as HTML's 'class') will be stored in an
     instance of this class. The default is Beautiful Soup's
     built-in `AttributeValueList`, and you will probably never
     need to change it.
    """

    #: Sentinel meaning "the caller did not pass this argument".
    #: Always compare against it by identity.
    USE_DEFAULT: Any = object()  #: :meta private:

    def __init__(
        self,
        multi_valued_attributes: Dict[str, Set[str]] = USE_DEFAULT,
        preserve_whitespace_tags: Set[str] = USE_DEFAULT,
        store_line_numbers: bool = USE_DEFAULT,
        string_containers: Dict[str, Type[NavigableString]] = USE_DEFAULT,
        empty_element_tags: Set[str] = USE_DEFAULT,
        attribute_dict_class: Type[AttributeDict] = AttributeDict,
        attribute_value_list_class: Type[AttributeValueList] = AttributeValueList,
    ):
        self.soup = None

        # Every argument left at USE_DEFAULT falls back to the
        # corresponding class-level default, so that a subclass can
        # change the defaults just by overriding the class attribute.
        if multi_valued_attributes is self.USE_DEFAULT:
            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
        self.cdata_list_attributes = multi_valued_attributes

        if preserve_whitespace_tags is self.USE_DEFAULT:
            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
        self.preserve_whitespace_tags = preserve_whitespace_tags

        if empty_element_tags is self.USE_DEFAULT:
            self.empty_element_tags = self.DEFAULT_EMPTY_ELEMENT_TAGS
        else:
            self.empty_element_tags = empty_element_tags

        # TODO: store_line_numbers is probably irrelevant now that
        # the behavior of sourceline and sourcepos has been made consistent
        # everywhere.
        if store_line_numbers is self.USE_DEFAULT:
            store_line_numbers = self.TRACKS_LINE_NUMBERS
        self.store_line_numbers = store_line_numbers

        if string_containers is self.USE_DEFAULT:
            string_containers = self.DEFAULT_STRING_CONTAINERS
        self.string_containers = string_containers

        self.attribute_dict_class = attribute_dict_class
        self.attribute_value_list_class = attribute_value_list_class

    NAME: str = "[Unknown tree builder]"
    ALTERNATE_NAMES: Iterable[str] = []
    features: Iterable[str] = []

    is_xml: bool = False
    picklable: bool = False

    soup: Optional[BeautifulSoup]  #: :meta private:

    #: A tag will be considered an empty-element
    #: tag when and only when it has no contents.
    empty_element_tags: Optional[Set[str]] = None  #: :meta private:
    cdata_list_attributes: Dict[str, Set[str]]  #: :meta private:
    preserve_whitespace_tags: Set[str]  #: :meta private:
    string_containers: Dict[str, Type[NavigableString]]  #: :meta private:
    tracks_line_numbers: bool  #: :meta private:
    store_line_numbers: bool  #: :meta private:
    attribute_dict_class: Type[AttributeDict]  #: :meta private:
    attribute_value_list_class: Type[AttributeValueList]  #: :meta private:

    #: A value for these tag/attribute combinations is a space- or
    #: comma-separated list of CDATA, rather than a single CDATA.
    DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = defaultdict(set)

    #: Whitespace should be preserved inside these tags.
    DEFAULT_PRESERVE_WHITESPACE_TAGS: Set[str] = set()

    #: The textual contents of tags with these names should be
    #: instantiated with some class other than `bs4.element.NavigableString`.
    DEFAULT_STRING_CONTAINERS: Dict[str, Type[NavigableString]] = {}

    #: By default, tags are treated as empty-element tags if they have
    #: no contents--that is, using XML rules. HTMLTreeBuilder
    #: defines a different set of DEFAULT_EMPTY_ELEMENT_TAGS based on the
    #: HTML 4 and HTML5 standards.
    DEFAULT_EMPTY_ELEMENT_TAGS: Optional[Set[str]] = None

    #: Most parsers don't keep track of line numbers.
    TRACKS_LINE_NUMBERS: bool = False

    def initialize_soup(self, soup: BeautifulSoup) -> None:
        """The BeautifulSoup object has been initialized and is now
        being associated with the TreeBuilder.

        :param soup: A BeautifulSoup object.
        """
        self.soup = soup

    def reset(self) -> None:
        """Do any work necessary to reset the underlying parser
        for a new document.

        By default, this does nothing.
        """
        pass

    def can_be_empty_element(self, tag_name: str) -> bool:
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p/>" or "<p>".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no children.
        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
        be left alone.

        :param tag_name: The name of a markup tag.
        :return: True if `empty_element_tags` is None or contains
            ``tag_name``; False otherwise.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup: _RawMarkup) -> None:
        """Run incoming markup through some parsing process.

        :raise NotImplementedError: Always; subclasses must override
            this method.
        """
        raise NotImplementedError()

    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: The markup that's about to be parsed.
        :param user_specified_encoding: The user asked to try this encoding
           to convert the markup into a Unicode string.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding. NOTE: This argument is not used by the
            calling code and can probably be removed.
        :param exclude_encodings: The user asked *not* to try any of
            these encodings.

        :yield: A series of 4-tuples: (markup, encoding, declared encoding,
            has undergone character replacement)

            Each 4-tuple represents a strategy that the parser can try
            to convert the document to Unicode and parse it. Each
            strategy will be tried in turn.

         By default, the only strategy is to parse the markup
         as-is. See `LXMLTreeBuilderForXML` and
         `HTMLParserTreeBuilder` for implementations that take into
         account the quirks of particular parsers.

        :meta private:

        """
        yield markup, None, None, False

    def test_fragment_to_document(self, fragment: str) -> str:
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of unit tests.

        :param fragment: A fragment of HTML.
        :return: A full HTML document. This base implementation
            returns the fragment unchanged.
        :meta private:
        """
        return fragment

    def set_up_substitutions(self, tag: Tag) -> bool:
        """Set up any substitutions that will need to be performed on
        a `Tag` when it's output as a string.

        By default, this does nothing. See `HTMLTreeBuilder` for a
        case where this is used.

        :return: Whether or not a substitution was performed.
        :meta private:
        """
        return False

    def _replace_cdata_list_attribute_values(
        self, tag_name: str, attrs: _RawOrProcessedAttributeValues
    ) -> _AttributeValues:
        """When an attribute value is associated with a tag that can
        have multiple values for that attribute, convert the string
        value to a list of strings.

        Basically, replaces class="foo bar" with class=["foo", "bar"]

        NOTE: This method modifies its input in place.

        :param tag_name: The name of a tag.
        :param attrs: A dictionary containing the tag's attributes.
           Any appropriate attribute values will be modified in place.
        :return: The modified dictionary that was originally passed in.
        """

        # First, cast the attrs dict to _AttributeValues. This might
        # not be accurate yet, but it will be by the time this method
        # returns.
        modified_attrs = cast(_AttributeValues, attrs)
        if not modified_attrs or not self.cdata_list_attributes:
            # Nothing to do.
            return modified_attrs

        # There is at least a possibility that we need to modify one of
        # the attribute values.
        #
        # .get() is used for both lookups so that a defaultdict (the
        # base class default) is never grown as a side effect.
        universal: Set[str] = self.cdata_list_attributes.get("*", set())
        tag_specific = self.cdata_list_attributes.get(tag_name.lower(), None)

        # Iterate over a snapshot of the keys, since values are
        # reassigned while looping.
        for attr in list(modified_attrs.keys()):
            modified_value: _AttributeValue
            if attr in universal or (tag_specific and attr in tag_specific):
                # We have a "class"-type attribute whose string
                # value is a whitespace-separated list of
                # values. Split it into a list.
                original_value: _AttributeValue = modified_attrs[attr]
                if isinstance(original_value, _RawAttributeValue):
                    # This is a _RawAttributeValue (a string) that
                    # needs to be split and converted to a
                    # AttributeValueList so it can be an
                    # _AttributeValue.
                    modified_value = self.attribute_value_list_class(
                        nonwhitespace_re.findall(original_value)
                    )
                else:
                    # html5lib calls setAttributes twice for the
                    # same tag when rearranging the parse tree. On
                    # the second call the attribute value here is
                    # already a list. This can also happen when a
                    # Tag object is cloned. If this happens, leave
                    # the value alone rather than trying to split
                    # it again.
                    modified_value = original_value
                modified_attrs[attr] = modified_value
        return modified_attrs


class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder driven by SAX events.

    Deprecated: nothing uses this class and it is slated for removal.
    It was never properly integrated into the rest of Beautiful Soup,
    so there have been long stretches where it hasn't worked properly.
    Instantiating it issues a DeprecationWarning.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        # stacklevel=2 attributes the warning to the code that
        # instantiated the class rather than to this method.
        warnings.warn(
            "The SAXTreeBuilder class was deprecated in 4.13.0 and will be removed soon thereafter. It is completely untested and probably doesn't work; do not use it.",
            DeprecationWarning,
            stacklevel=2,
        )
        super().__init__(*args, **kwargs)

    def feed(self, markup: _RawMarkup) -> None:
        """Not implemented for this builder."""
        raise NotImplementedError()

    def close(self) -> None:
        """Do nothing."""

    def startElement(self, name: str, attrs: Dict[str, str]) -> None:
        """Forward the opening of an element to the soup object.

        Each attribute key is indexed with ``[1]`` to obtain the
        attribute name -- presumably the keys are (namespace, name)
        pairs despite the annotation; TODO confirm.
        """
        converted = AttributeDict(
            (qualified[1], value) for qualified, value in attrs.items()
        )
        assert self.soup is not None
        self.soup.handle_starttag(name, None, None, converted)

    def endElement(self, name: str) -> None:
        """Forward the closing of an element to the soup object."""
        assert self.soup is not None
        self.soup.handle_endtag(name)

    def startElementNS(
        self, nsTuple: Tuple[str, str], nodeName: str, attrs: Dict[str, str]
    ) -> None:
        """Handle a namespaced start tag; the namespace is discarded
        for now.
        """
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple: Tuple[str, str], nodeName: str) -> None:
        """Handle a namespaced end tag; the namespace is discarded
        for now.
        """
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix: str, nodeValue: str) -> None:
        """Do nothing; prefix mappings are ignored for now."""

    def endPrefixMapping(self, prefix: str) -> None:
        """Do nothing; prefix mappings are ignored for now."""

    def characters(self, content: str) -> None:
        """Forward character data to the soup object."""
        assert self.soup is not None
        self.soup.handle_data(content)

    def startDocument(self) -> None:
        """Do nothing."""

    def endDocument(self) -> None:
        """Do nothing."""


class HTMLTreeBuilder(TreeBuilder):
    """A TreeBuilder that knows facts about HTML, such as which tags
    the HTML standard treats specially.
    """

    #: Some HTML tags are defined as having no contents. Beautiful Soup
    #: treats these specially.
    DEFAULT_EMPTY_ELEMENT_TAGS: Set[str] = {
        # These are from HTML5.
        "area",
        "base",
        "br",
        "col",
        "embed",
        "hr",
        "img",
        "input",
        "keygen",
        "link",
        "menuitem",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
        # These are from earlier versions of HTML and are removed in HTML5.
        "basefont",
        "bgsound",
        "command",
        "frame",
        "image",
        "isindex",
        "nextid",
        "spacer",
    }

    #: The HTML standard defines these tags as block-level elements. Beautiful
    #: Soup does not treat these elements differently from other elements,
    #: but it may do so eventually, and this information is available if
    #: you need to use it.
    DEFAULT_BLOCK_ELEMENTS: Set[str] = {
        "address",
        "article",
        "aside",
        "blockquote",
        "canvas",
        "dd",
        "div",
        "dl",
        "dt",
        "fieldset",
        "figcaption",
        "figure",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "hr",
        "li",
        "main",
        "nav",
        "noscript",
        "ol",
        "output",
        "p",
        "pre",
        "section",
        "table",
        "tfoot",
        "ul",
        "video",
    }

    #: These HTML tags need special treatment so they can be
    #: represented by a string class other than `bs4.element.NavigableString`.
    #:
    #: For some of these tags, it's because the HTML standard defines
    #: an unusual content model for them. I made this list by going
    #: through the HTML spec
    #: (https://html.spec.whatwg.org/#metadata-content) and looking for
    #: "metadata content" elements that can contain strings.
    #:
    #: The Ruby tags (<rt> and <rp>) are here despite being normal
    #: "phrasing content" tags, because the content they contain is
    #: qualitatively different from other text in the document, and it
    #: can be useful to be able to distinguish it.
    #:
    #: TODO: Arguably <noscript> could go here but it seems
    #: qualitatively different from the other tags.
    DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = {
        "rt": RubyTextString,
        "rp": RubyParenthesisString,
        "style": Stylesheet,
        "script": Script,
        "template": TemplateString,
    }

    #: The HTML standard defines these attributes as containing a
    #: space-separated list of values, not a single value. That is,
    #: class="foo bar" means that the 'class' attribute has two values,
    #: 'foo' and 'bar', not the single value 'foo bar'.  When we
    #: encounter one of these attributes, we will parse its value into
    #: a list of values if possible. Upon output, the list will be
    #: converted back into a string.
    #:
    #: The "*" key applies to every tag; the other keys are tag names.
    DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = {
        "*": {"class", "accesskey", "dropzone"},
        "a": {"rel", "rev"},
        "link": {"rel", "rev"},
        "td": {"headers"},
        "th": {"headers"},
        "form": {"accept-charset"},
        "object": {"archive"},
        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area": {"rel"},
        "icon": {"sizes"},
        "iframe": {"sandbox"},
        "output": {"for"},
    }

    #: By default, whitespace inside these HTML tags will be
    #: preserved rather than being collapsed.
    DEFAULT_PRESERVE_WHITESPACE_TAGS: set[str] = {"pre", "textarea"}

    def set_up_substitutions(self, tag: Tag) -> bool:
        """Swap the encoding declared by a <meta> tag for a placeholder
        that is filled in when the tag is output as a string.

        A document may arrive in one encoding and leave in another, so
        the declaration in its <meta> tag has to be able to follow.

        Two forms of declaration are recognized:

        * HTML 5 style: ``<meta charset="utf8">``
        * HTML 4 style:
          ``<meta http-equiv="content-type" content="text/html; charset=utf8">``

        When a tag carries both, only the ``charset`` attribute is
        replaced.

        :return: Whether or not a substitution was performed.

        :meta private:
        """
        if tag.name != "meta":
            # Only <meta> tags can declare an encoding.
            return False

        # TODO: These casts will fail in the (very unlikely) scenario
        # that the programmer who instantiates the TreeBuilder
        # specifies meta['content'] or meta['charset'] as
        # cdata_list_attributes.
        content: Optional[str] = cast(Optional[str], tag.get("content"))
        charset: Optional[str] = cast(Optional[str], tag.get("charset"))

        # meta['http-equiv'] being made a cdata_list_attribute (again,
        # very unlikely) is accommodated by always asking for a list.
        http_equiv: List[str] = tag.get_attribute_list("http-equiv")

        if charset is not None:
            # HTML 5 style. The stand-in object can take on any encoding.
            tag["charset"] = CharsetMetaAttributeValue(charset)
            return True

        if content is None:
            return False

        for directive in http_equiv:
            if directive.lower() == "content-type":
                # HTML 4 style.
                tag["content"] = ContentMetaAttributeValue(content)
                return True

        return False


class DetectsXMLParsedAsHTML(object):
    """A mixin class for any class (a TreeBuilder, or some class used by a
    TreeBuilder) that's in a position to detect whether an XML
    document is being incorrectly parsed as HTML, and issue an
    appropriate warning.

    This requires being able to observe an incoming processing
    instruction that might be an XML declaration, and also able to
    observe tags as they're opened. If you can't do that for a given
    `TreeBuilder`, there's a less reliable implementation based on
    examining the raw markup.
    """

    # The character class in the two patterns below is optional. The
    # patterns used to require exactly one character between "<" and
    # "html", which meant they could never match an actual "<html"
    # tag, so XHTML documents were reported as non-XHTML XML. Making
    # the class optional matches everything the old patterns matched,
    # plus "<html" itself.

    #: Regular expression for seeing if string markup has an <html> tag.
    LOOKS_LIKE_HTML: Pattern[str] = re.compile("<[^ +]?html", re.I)

    #: Regular expression for seeing if byte markup has an <html> tag.
    LOOKS_LIKE_HTML_B: Pattern[bytes] = re.compile(b"<[^ +]?html", re.I)

    #: The start of an XML document string.
    XML_PREFIX: str = "<?xml"

    #: The start of an XML document bytestring.
    XML_PREFIX_B: bytes = b"<?xml"

    # This is typed as str, not `ProcessingInstruction`, because this
    # check may be run before any Beautiful Soup objects are created.
    _first_processing_instruction: Optional[str]  #: :meta private:
    _root_tag_name: Optional[str]  #: :meta private:

    @classmethod
    def warn_if_markup_looks_like_xml(
        cls, markup: Optional[_RawMarkup], stacklevel: int = 3
    ) -> bool:
        """Perform a check on some markup to see if it looks like XML
        that's not XHTML. If so, issue a warning.

        Markup "looks like XML that's not XHTML" when it starts with
        an XML declaration and no <html> tag is found. Only the first
        500 characters (or bytes) are examined.

        This is much less reliable than doing the check while parsing,
        but some of the tree builders can't do that.

        :param markup: The markup to check, as a string or a
         bytestring. None is accepted and never triggers a warning.

        :param stacklevel: The stacklevel of the code calling this
         function.

        :return: True if the markup looks like non-XHTML XML, False
         otherwise.
        """
        if markup is None:
            return False

        # The XML declaration and the root tag are expected near the
        # top of the document, so there's no need to scan all of it.
        markup = markup[:500]
        if isinstance(markup, bytes):
            markup_b: bytes = markup
            looks_like_xml = markup_b.startswith(
                cls.XML_PREFIX_B
            ) and not cls.LOOKS_LIKE_HTML_B.search(markup)
        else:
            markup_s: str = markup
            looks_like_xml = markup_s.startswith(
                cls.XML_PREFIX
            ) and not cls.LOOKS_LIKE_HTML.search(markup)

        if looks_like_xml:
            # Two more frames (this method and _warn) sit between the
            # caller and warnings.warn.
            cls._warn(stacklevel=stacklevel + 2)
            return True
        return False

    @classmethod
    def _warn(cls, stacklevel: int = 5) -> None:
        """Issue a warning about XML being parsed as HTML.

        :param stacklevel: Passed through to `warnings.warn`.
        """
        warnings.warn(
            XMLParsedAsHTMLWarning.MESSAGE,
            XMLParsedAsHTMLWarning,
            stacklevel=stacklevel,
        )

    def _initialize_xml_detector(self) -> None:
        """Call this method before parsing a document."""
        self._first_processing_instruction = None
        self._root_tag_name = None

    def _document_might_be_xml(self, processing_instruction: str) -> None:
        """Call this method when encountering an XML declaration, or a
        "processing instruction" that might be an XML declaration.

        This helps Beautiful Soup detect potential issues later, if
        the XML document turns out to be a non-XHTML document that's
        being parsed as HTML.

        Only the first processing instruction seen before the root tag
        is remembered; later calls are ignored.
        """
        if (
            self._first_processing_instruction is not None
            or self._root_tag_name is not None
        ):
            # The document has already started. Don't bother checking
            # anymore.
            return

        self._first_processing_instruction = processing_instruction

        # We won't know until we encounter the first tag whether or
        # not this is actually a problem.

    def _root_tag_encountered(self, name: str) -> None:
        """Call this when you encounter the document's root tag.

        This is where we actually check whether an XML document is
        being incorrectly parsed as HTML, and issue the warning.

        :param name: The name of the root tag.
        """
        if self._root_tag_name is not None:
            # This method was incorrectly called multiple times. Do
            # nothing.
            return

        self._root_tag_name = name

        if (
            name != "html"
            and self._first_processing_instruction is not None
            and self._first_processing_instruction.lower().startswith("xml ")
        ):
            # We encountered an XML declaration and then a tag other
            # than 'html'. This is a reliable indicator that a
            # non-XHTML document is being parsed as HTML.
            self._warn(stacklevel=10)


def register_treebuilders_from(module: ModuleType) -> None:
    """Copy TreeBuilders from the given module into this module.

    Every name in ``module.__all__`` that refers to a `TreeBuilder`
    subclass is set as an attribute of this module, appended to this
    module's ``__all__``, and registered with `builder_registry`.
    Names that refer to anything else are skipped.

    :param module: A module that defines ``__all__``.
    """
    this_module = sys.modules[__name__]
    for name in module.__all__:
        obj = getattr(module, name)

        # issubclass() raises TypeError when its first argument is not
        # a class, so rule out exported functions and constants first.
        if not (isinstance(obj, type) and issubclass(obj, TreeBuilder)):
            continue

        setattr(this_module, name, obj)
        this_module.__all__.append(name)
        # Register the builder while we're at it.
        this_module.builder_registry.register(obj)


# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
#
# The registry puts each new registration ahead of the earlier ones,
# which is why the last-resort builder is registered first.
#
# These imports are at the bottom of the module because
# register_treebuilders_from must be defined before it is called.
from . import _htmlparser # noqa: E402

register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib

    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed. Its builders are simply
    # not registered.
    pass
try:
    from . import _lxml

    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed. Its builders are simply
    # not registered.
    pass