diff options
author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
---|---|---|
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/bs4/dammit.py | |
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
download | gn-ai-master.tar.gz |
Diffstat (limited to '.venv/lib/python3.12/site-packages/bs4/dammit.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/bs4/dammit.py | 1408 |
1 file changed, 1408 insertions, 0 deletions
# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's `Universal
Feed Parser <https://pypi.org/project/feedparser/>`_, now maintained
by Kurt McKee. It does not rewrite the body of an XML or HTML document
to reflect a new encoding; that's the job of `TreeBuilder`.

"""

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

import codecs
import re
import warnings
from collections import defaultdict
from html.entities import codepoint2name, html5
from logging import Logger, getLogger
from types import ModuleType
from typing import (
    Dict,
    Iterator,
    List,
    Optional,
    Pattern,
    Set,
    Tuple,
    Type,
    Union,
    cast,
)

from typing_extensions import Literal

from bs4._typing import (
    _Encoding,
    _Encodings,
)

# Import a library to autodetect character encodings. We'll support
# any of a number of libraries that all support the same API:
#
# * cchardet
# * chardet
# * charset-normalizer
chardet_module: Optional[ModuleType] = None
try:
    # PyPI package: cchardet
    import cchardet

    chardet_module = cchardet
except ImportError:
    try:
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet

        chardet_module = chardet
    except ImportError:
        try:
            # PyPI package: charset-normalizer
            import charset_normalizer

            chardet_module = charset_normalizer
        except ImportError:
            # No chardet available; _chardet_dammit() will be a no-op.
            pass
pass


def _chardet_dammit(s: bytes) -> Optional[str]:
    """Try as hard as possible to detect the encoding of a bytestring.

    :param s: A bytestring. Unicode strings are rejected, since they
        have no encoding left to detect.
    :return: The name of the guessed encoding, or None if no
        chardet-compatible library is installed.
    """
    if chardet_module is None or isinstance(s, str):
        return None
    module = chardet_module
    return module.detect(s)["encoding"]


# Build bytestring and Unicode versions of regular expressions for finding
# a declared encoding inside an XML or HTML document.
xml_encoding: str = "^\\s*<\\?.*encoding=['\"](.*?)['\"].*\\?>"  #: :meta private:
html_meta: str = (
    "<\\s*meta[^>]+charset\\s*=\\s*[\"']?([^>]*?)[ /;'\">]"  #: :meta private:
)

# TODO-TYPING: The Pattern type here could use more refinement, but it's tricky.
encoding_res: Dict[Type, Dict[str, Pattern]] = dict()
encoding_res[bytes] = {
    "html": re.compile(html_meta.encode("ascii"), re.I),
    "xml": re.compile(xml_encoding.encode("ascii"), re.I),
}
encoding_res[str] = {
    "html": re.compile(html_meta, re.I),
    "xml": re.compile(xml_encoding, re.I),
}


class EntitySubstitution(object):
    """The ability to substitute XML or HTML entities for certain characters."""

    #: A map of named HTML entities to the corresponding Unicode string.
    #:
    #: :meta hide-value:
    HTML_ENTITY_TO_CHARACTER: Dict[str, str]

    #: A map of Unicode strings to the corresponding named HTML entities;
    #: the inverse of HTML_ENTITY_TO_CHARACTER.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY: Dict[str, str]

    #: A regular expression that matches any character (or, in rare
    #: cases, pair of characters) that can be replaced with a named
    #: HTML entity.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY_RE: Pattern[str]

    #: A very similar regular expression to
    #: CHARACTER_TO_HTML_ENTITY_RE, but which also matches unescaped
    #: ampersands. This is used by the 'html' formatter to provide
    #: backwards-compatibility, even though the HTML5 spec allows most
    #: ampersands to go unescaped.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: Pattern[str]

    @classmethod
    def _populate_class_variables(cls) -> None:
        """Initialize variables used by this class to manage the plethora of
        HTML5 named entities.

        This function sets the following class variables:

        CHARACTER_TO_HTML_ENTITY - A mapping of Unicode strings like "⦨" to
        entity names like "angmsdaa". When a single Unicode string has
        multiple entity names, we try to choose the most commonly-used
        name.

        HTML_ENTITY_TO_CHARACTER: A mapping of entity names like "angmsdaa" to
        Unicode strings like "⦨".

        CHARACTER_TO_HTML_ENTITY_RE: A regular expression matching (almost) any
        Unicode string that corresponds to an HTML5 named entity.

        CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: A very similar
        regular expression to CHARACTER_TO_HTML_ENTITY_RE, but which
        also matches unescaped ampersands. This is used by the 'html'
        formatter to provide backwards-compatibility, even though the HTML5
        spec allows most ampersands to go unescaped.
        """
        unicode_to_name = {}
        name_to_unicode = {}

        short_entities = set()
        long_entities_by_first_character = defaultdict(set)

        for name_with_semicolon, character in sorted(html5.items()):
            # "It is intentional, for legacy compatibility, that many
            # code points have multiple character reference names. For
            # example, some appear both with and without the trailing
            # semicolon, or with different capitalizations."
            # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
            #
            # The parsers are in charge of handling (or not) character
            # references with no trailing semicolon, so we remove the
            # semicolon whenever it appears.
            if name_with_semicolon.endswith(";"):
                name = name_with_semicolon[:-1]
            else:
                name = name_with_semicolon

            # When parsing HTML, we want to recognize any known named
            # entity and convert it to a sequence of Unicode
            # characters.
            if name not in name_to_unicode:
                name_to_unicode[name] = character

            # When _generating_ HTML, we want to recognize special
            # character sequences that _could_ be converted to named
            # entities.
            unicode_to_name[character] = name

            # We also need to build a regular expression that lets us
            # _find_ those characters in output strings so we can
            # replace them.
            #
            # This is tricky, for two reasons.

            if len(character) == 1 and ord(character) < 128 and character not in "<>":
                # First, it would be annoying to turn single ASCII
                # characters like "|" into named entities like
                # "&verbar;". The exceptions are <>, which we _must_
                # turn into named entities to produce valid HTML.
                continue

            if len(character) > 1 and all(ord(x) < 128 for x in character):
                # We also do not want to turn _combinations_ of ASCII
                # characters like 'fj' into named entities like
                # '&fjlig;', though that's more debateable.
                continue

            # Second, some named entities have a Unicode value that's
            # a subset of the Unicode value for some _other_ named
            # entity. As an example, '\u2267' is &GreaterFullEqual;,
            # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
            # expression needs to match the first two characters of
            # "\u2267\u0338foo", but only the first character of
            # "\u2267foo".
            #
            # In this step, we build two sets of characters that
            # _eventually_ need to go into the regular expression. But
            # we won't know exactly what the regular expression needs
            # to look like until we've gone through the entire list of
            # named entities.
            if len(character) == 1 and character != "&":
                short_entities.add(character)
            else:
                long_entities_by_first_character[character[0]].add(character)

        # Now that we've been through the entire list of entities, we
        # can create a regular expression that matches any of them.
        particles = set()
        for short in short_entities:
            long_versions = long_entities_by_first_character[short]
            if not long_versions:
                particles.add(short)
            else:
                ignore = "".join([x[1] for x in long_versions])
                # This finds, e.g. \u2267 but only if it is _not_
                # followed by \u0338.
                particles.add("%s(?![%s])" % (short, ignore))

        for long_entities in list(long_entities_by_first_character.values()):
            for long_entity in long_entities:
                particles.add(long_entity)

        re_definition = "(%s)" % "|".join(particles)

        particles.add("&")
        re_definition_with_ampersand = "(%s)" % "|".join(particles)

        # If an entity shows up in both html5 and codepoint2name, it's
        # likely that HTML5 gives it several different names, such as
        # 'rsquo' and 'rsquor'. When converting Unicode characters to
        # named entities, the codepoint2name name should take
        # precedence where possible, since that's the more easily
        # recognizable one.
        for codepoint, name in list(codepoint2name.items()):
            character = chr(codepoint)
            unicode_to_name[character] = name

        cls.CHARACTER_TO_HTML_ENTITY = unicode_to_name
        cls.HTML_ENTITY_TO_CHARACTER = name_to_unicode
        cls.CHARACTER_TO_HTML_ENTITY_RE = re.compile(re_definition)
        cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE = re.compile(
            re_definition_with_ampersand
        )

    #: A map of Unicode strings to the corresponding named XML entities.
    #:
    #: :meta hide-value:
    CHARACTER_TO_XML_ENTITY: Dict[str, str] = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches any named or numeric HTML entity.
    ANY_ENTITY_RE = re.compile("&(#\\d+|#x[0-9a-fA-F]+|\\w+);", re.I)

    #: A regular expression matching an angle bracket or an ampersand that
    #: is not part of an XML or HTML entity.
    #:
    #: :meta hide-value:
    BARE_AMPERSAND_OR_BRACKET: Pattern[str] = re.compile(
        "([<>]|" "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" ")"
    )

    #: A regular expression matching an angle bracket or an ampersand.
    #:
    #: :meta hide-value:
    AMPERSAND_OR_BRACKET: Pattern[str] = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj: re.Match) -> str:
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character string.

        :param matchobj: A match against a character (or, rarely, a pair
            of characters) that corresponds to a named entity.
        """
        original_entity = matchobj.group(0)
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(original_entity)
        if entity is None:
            # Defensive fallback: every character the substitution
            # regexes can match (including "&" -> "amp") has an entry in
            # CHARACTER_TO_HTML_ENTITY. Escape the ampersand if we ever
            # get here. [Restored "&amp;"; an HTML-unescaping mangle had
            # reduced this literal to "&", making both branches identical.]
            return "&amp;%s;" % original_entity
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj: re.Match) -> str:
        """Used with a regular expression to substitute the
        appropriate XML entity for a special character string."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def _escape_entity_name(cls, matchobj: re.Match) -> str:
        """Escape the ampersand of a matched entity reference, turning
        e.g. "&gt;" into "&amp;gt;".

        [Restored "&amp;"; the mangled source returned "&%s;", which
        would have made this substitution a no-op.]
        """
        return "&amp;%s;" % matchobj.group(1)

    @classmethod
    def _escape_unrecognized_entity_name(cls, matchobj: re.Match) -> str:
        """Escape the ampersand of something that looks like an entity
        reference, but only if it's not a recognized HTML5 entity name."""
        possible_entity = matchobj.group(1)
        if possible_entity in cls.HTML_ENTITY_TO_CHARACTER:
            # A genuine entity reference; leave it untouched.
            return "&%s;" % possible_entity
        # Not a real entity; escape the ampersand. [Restored "&amp;".]
        return "&amp;%s;" % possible_entity

    @classmethod
    def quoted_attribute_value(cls, value: str) -> str:
        """Make a value into a quoted XML attribute, possibly escaping it.

        Most strings will be quoted using double quotes.

        Bob's Bar -> "Bob's Bar"

        If a string contains double quotes, it will be quoted using
        single quotes.

        Welcome to "my bar" -> 'Welcome to "my bar"'

        If a string contains both single and double quotes, the
        double quotes will be escaped, and the string will be quoted
        using double quotes.

        Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The XML attribute value to quote
        :return: The quoted value
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes. Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML. If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                # [Restored "&quot;"; the mangled source had a bare
                # triple-quote here, which is a syntax error.]
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value: str, make_quoted_attribute: bool = False) -> str:
        """Replace special XML characters with named XML entities.

        The less-than sign will become &lt;, the greater-than sign
        will become &gt;, and any ampersands will become &amp;. If you
        want ampersands that seem to be part of an entity definition
        to be left alone, use `substitute_xml_containing_entities`
        instead.

        :param value: A string to be substituted.

        :param make_quoted_attribute: If True, then the string will be
            quoted, as befits an attribute value.

        :return: A version of ``value`` with special characters replaced
            with named entities.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value: str, make_quoted_attribute: bool = False
    ) -> str:
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
            become &lt;, the greater-than sign will become &gt;, and any
            ampersands that are not part of an entity definition will
            become &amp;.

        :param make_quoted_attribute: If True, then the string will be
            quoted, as befits an attribute value.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities.

        This differs from ``data.encode(encoding, 'xmlcharrefreplace')``
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
            HTML entities.
        """
        # Convert any appropriate characters to HTML entities.
        return cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE.sub(
            cls._substitute_html_entity, s
        )

    @classmethod
    def substitute_html5(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities
        using HTML5 rules.

        Specifically, this method is much less aggressive about
        escaping ampersands than substitute_html. Only ambiguous
        ampersands are escaped, per the HTML5 standard:

        "An ambiguous ampersand is a U+0026 AMPERSAND character (&)
        that is followed by one or more ASCII alphanumerics, followed
        by a U+003B SEMICOLON character (;), where these characters do
        not match any of the names given in the named character
        references section."

        Unlike substitute_html5_raw, this method assumes HTML entities
        were converted to Unicode characters on the way in, as
        Beautiful Soup does. By the time Beautiful Soup does its work,
        the only ambiguous ampersands that need to be escaped are the
        ones that were escaped in the original markup when mentioning
        HTML entities.

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
            HTML entities.
        """
        # First, escape any HTML entities found in the markup.
        s = cls.ANY_ENTITY_RE.sub(cls._escape_entity_name, s)

        # Next, convert any appropriate characters to unescaped HTML entities.
        s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)

        return s

    @classmethod
    def substitute_html5_raw(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities
        using HTML5 rules.

        substitute_html5_raw is similar to substitute_html5 but it is
        designed for standalone use (whereas substitute_html5 is
        designed for use with Beautiful Soup).

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
            HTML entities.
        """
        # First, escape the ampersand for anything that looks like an
        # entity but isn't in the list of recognized entities. All other
        # ampersands can be left alone.
        s = cls.ANY_ENTITY_RE.sub(cls._escape_unrecognized_entity_name, s)

        # Then, convert a range of Unicode characters to unescaped
        # HTML entities.
        s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)

        return s


EntitySubstitution._populate_class_variables()
class EncodingDetector:
    """This class is capable of guessing a number of possible encodings
    for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
       (the ``known_definite_encodings`` argument to the constructor).

    2. An encoding determined by sniffing the document's byte-order mark.

    3. Encodings you specifically tell EncodingDetector to try if
       byte-order mark sniffing fails (the ``user_encodings`` argument to
       the constructor).

    4. An encoding declared within the bytestring itself, either in an
       XML declaration (if the bytestring is to be interpreted as an XML
       document), or in a <meta> tag (if the bytestring is to be
       interpreted as an HTML document.)

    5. An encoding detected through textual analysis by chardet,
       cchardet, or a similar external library.

    6. UTF-8.

    7. Windows-1252.

    :param markup: Some markup in an unknown encoding.

    :param known_definite_encodings: When determining the encoding
        of ``markup``, these encodings will be tried first, in
        order. In HTML terms, this corresponds to the "known
        definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.

    :param user_encodings: These encodings will be tried after the
        ``known_definite_encodings`` have been tried and failed, and
        after an attempt to sniff the encoding by looking at a
        byte order mark has failed. In HTML terms, this
        corresponds to the step "user has explicitly instructed
        the user agent to override the document's character
        encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.

    :param override_encodings: A **deprecated** alias for
        ``known_definite_encodings``. Any encodings here will be tried
        immediately after the encodings in
        ``known_definite_encodings``.

    :param is_html: If True, this markup is considered to be
        HTML. Otherwise it's assumed to be XML.

    :param exclude_encodings: These encodings will not be tried,
        even if they otherwise would be.
    """

    def __init__(
        self,
        markup: bytes,
        known_definite_encodings: Optional[List[str]] = None,
        is_html: Optional[bool] = False,
        exclude_encodings: Optional[List[str]] = None,
        user_encodings: Optional[List[str]] = None,
        override_encodings: Optional[List[str]] = None,
    ):
        self.known_definite_encodings = list(known_definite_encodings or [])
        if override_encodings:
            # Deprecated alias; keep honoring it for backwards compatibility.
            warnings.warn(
                "The 'override_encodings' argument was deprecated in 4.10.0. Use 'known_definite_encodings' instead.",
                DeprecationWarning,
                stacklevel=3,
            )
            self.known_definite_encodings += override_encodings
        self.user_encodings = user_encodings or []
        exclude_encodings = exclude_encodings or []
        # Excluded encodings are compared case-insensitively.
        self.exclude_encodings = set([x.lower() for x in exclude_encodings])
        self.chardet_encoding = None
        self.is_html = False if is_html is None else is_html
        self.declared_encoding: Optional[str] = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    # Instance attributes set in __init__.
    known_definite_encodings: List[str]
    user_encodings: List[str]
    exclude_encodings: Set[str]
    chardet_encoding: Optional[str]
    is_html: bool
    declared_encoding: Optional[str]
    markup: bytes
    sniffed_encoding: Optional[str]

    def _usable(self, encoding: Optional[str], tried: Set[str]) -> bool:
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding.
        :param tried: Encodings that have already been tried. This
            will be modified as a side effect.
        """
        if encoding is None:
            return False
        encoding = encoding.lower()
        if encoding in self.exclude_encodings:
            return False
        if encoding not in tried:
            tried.add(encoding)
            return True
        return False

    @property
    def encodings(self) -> Iterator[str]:
        """Yield a number of encodings that might work for this markup.

        :yield: A sequence of strings. Each is the name of an encoding
            that *might* work to convert a bytestring into Unicode.
        """
        tried: Set[str] = set()

        # First, try the known definite encodings
        for e in self.known_definite_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self.sniffed_encoding is not None and self._usable(
            self.sniffed_encoding, tried
        ):
            yield self.sniffed_encoding

        # Sniffing the byte-order mark did nothing; try the user
        # encodings.
        for e in self.user_encodings:
            if self._usable(e, tried):
                yield e

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html
            )
        if self.declared_encoding is not None and self._usable(
            self.declared_encoding, tried
        ):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = _chardet_dammit(self.markup)
        if self.chardet_encoding is not None and self._usable(
            self.chardet_encoding, tried
        ):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ("utf-8", "windows-1252"):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data: bytes) -> Tuple[bytes, Optional[str]]:
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: A bytestring that may or may not begin with a
            byte-order mark.

        :return: A 2-tuple (data stripped of byte-order mark, encoding implied by byte-order mark)
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The UTF-16 checks also require data[2:4] != b"\x00\x00" so a
        # UTF-32 BOM is not misread as a UTF-16 BOM.
        if (
            (len(data) >= 4)
            and (data[:2] == b"\xfe\xff")
            and (data[2:4] != b"\x00\x00")
        ):
            encoding = "utf-16be"
            data = data[2:]
        elif (
            (len(data) >= 4)
            and (data[:2] == b"\xff\xfe")
            and (data[2:4] != b"\x00\x00")
        ):
            encoding = "utf-16le"
            data = data[2:]
        elif data[:3] == b"\xef\xbb\xbf":
            encoding = "utf-8"
            data = data[3:]
        elif data[:4] == b"\x00\x00\xfe\xff":
            encoding = "utf-32be"
            data = data[4:]
        elif data[:4] == b"\xff\xfe\x00\x00":
            encoding = "utf-32le"
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(
        cls,
        markup: Union[bytes, str],
        is_html: bool = False,
        search_entire_document: bool = False,
    ) -> Optional[str]:
        """Given a document, tries to find an encoding declared within the
        text of the document itself.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed
            to declared near the beginning of the document, most of
            the time it's only necessary to search a few kilobytes of
            data. Set this to True to force this method to search the
            entire document.
        :return: The declared encoding, if one is found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        xml_re = res["xml"]
        html_re = res["html"]
        declared_encoding: Optional[str] = None
        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode("ascii", "replace")
            return declared_encoding.lower()
        return None
class UnicodeDammit:
    """A class for detecting the encoding of a bytestring containing an
    HTML or XML document, and decoding it to Unicode. If the source
    encoding is windows-1252, `UnicodeDammit` can also replace
    Microsoft smart quotes with their HTML or XML equivalents.

    :param markup: HTML or XML markup in an unknown encoding.

    :param known_definite_encodings: When determining the encoding
        of ``markup``, these encodings will be tried first, in
        order. In HTML terms, this corresponds to the "known
        definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.

    :param user_encodings: These encodings will be tried after the
        ``known_definite_encodings`` have been tried and failed, and
        after an attempt to sniff the encoding by looking at a
        byte order mark has failed. In HTML terms, this
        corresponds to the step "user has explicitly instructed
        the user agent to override the document's character
        encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.

    :param override_encodings: A **deprecated** alias for
        ``known_definite_encodings``. Any encodings here will be tried
        immediately after the encodings in
        ``known_definite_encodings``.

    :param smart_quotes_to: By default, Microsoft smart quotes will,
        like all other characters, be converted to Unicode
        characters. Setting this to ``ascii`` will convert them to ASCII
        quotes instead. Setting it to ``xml`` will convert them to XML
        entity references, and setting it to ``html`` will convert them
        to HTML entity references.

    :param is_html: If True, ``markup`` is treated as an HTML
        document. Otherwise it's treated as an XML document.

    :param exclude_encodings: These encodings will not be considered,
        even if the sniffing code thinks they might make sense.
    """

    def __init__(
        self,
        markup: bytes,
        known_definite_encodings: Optional[List[str]] = None,
        smart_quotes_to: Optional[Literal["ascii", "xml", "html"]] = None,
        is_html: bool = False,
        exclude_encodings: Optional[List[str]] = None,
        user_encodings: Optional[List[str]] = None,
        override_encodings: Optional[List[str]] = None,
    ):
        # NOTE: the encoding-list defaults were changed from the mutable
        # default `[]` to `None`; behavior is identical since they are
        # only read (EncodingDetector treats None and [] the same way),
        # and this avoids the shared-mutable-default pitfall.
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html
        self.log = getLogger(__name__)
        self.detector = EncodingDetector(
            markup,
            known_definite_encodings,
            is_html,
            exclude_encodings,
            user_encodings,
            override_encodings,
        )

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, str) or markup == b"":
            self.markup = markup
            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            markup = self.detector.markup
            u = self._convert_from(encoding)
            if u is not None:
                break

        if not u:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.
            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    self.log.warning(
                        "Some characters could not be decoded, and were "
                        "replaced with REPLACEMENT CHARACTER."
                    )
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        #
        # Note that this is extremely unlikely, probably impossible,
        # because the "replace" strategy is so powerful. Even running
        # the Python binary through Unicode, Dammit gives you Unicode,
        # albeit Unicode riddled with REPLACEMENT CHARACTER.
        if u is None:
            self.original_encoding = None
            self.unicode_markup = None
        else:
            self.unicode_markup = u

    #: The original markup, before it was converted to Unicode.
    #: This is not necessarily the same as what was passed in to the
    #: constructor, since any byte-order mark will be stripped.
    markup: bytes

    #: The Unicode version of the markup, following conversion. This
    #: is set to None if there was simply no way to convert the
    #: bytestring to Unicode (as with binary data).
    unicode_markup: Optional[str]

    #: This is True if `UnicodeDammit.unicode_markup` contains
    #: U+FFFD REPLACEMENT_CHARACTER characters which were not present
    #: in `UnicodeDammit.markup`. These mark character sequences that
    #: could not be represented in Unicode.
    contains_replacement_characters: bool

    #: Unicode, Dammit's best guess as to the original character
    #: encoding of `UnicodeDammit.markup`.
    original_encoding: Optional[str]

    #: The strategy used to handle Microsoft smart quotes.
    smart_quotes_to: Optional[str]

    #: The (encoding, error handling strategy) 2-tuples that were used to
    #: try and convert the markup to Unicode.
    tried_encodings: List[Tuple[str, str]]

    log: Logger  #: :meta private:

    def _sub_ms_char(self, match: re.Match) -> bytes:
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        TODO: Since this is only used to convert smart quotes, it
        could be simplified, and MS_CHARS_TO_ASCII made much less
        parochial.
        """
        orig: bytes = match.group(1)
        sub: bytes
        if self.smart_quotes_to == "ascii":
            if orig in self.MS_CHARS_TO_ASCII:
                sub = self.MS_CHARS_TO_ASCII[orig].encode()
            else:
                # Shouldn't happen; substitute the character
                # with itself.
                sub = orig
        else:
            if orig in self.MS_CHARS:
                substitutions = self.MS_CHARS[orig]
                if type(substitutions) is tuple:
                    # (entity name, hex codepoint) pair.
                    if self.smart_quotes_to == "xml":
                        sub = b"&#x" + substitutions[1].encode() + b";"
                    else:
                        sub = b"&" + substitutions[0].encode() + b";"
                else:
                    substitutions = cast(str, substitutions)
                    sub = substitutions.encode()
            else:
                # Shouldn't happen; substitute the character
                # for itself.
                sub = orig
        return sub

    #: This dictionary maps commonly seen values for "charset" in HTML
    #: meta tags to the corresponding Python codec names. It only covers
    #: values that aren't in Python's aliases and can't be determined
    #: by the heuristics in `find_codec`.
    #:
    #: :meta hide-value:
    CHARSET_ALIASES: Dict[str, str] = {
        "macintosh": "mac-roman",
        "x-sjis": "shift-jis",
    }

    #: A list of encodings that tend to contain Microsoft smart quotes.
    #:
    #: :meta hide-value:
    ENCODINGS_WITH_SMART_QUOTES: List[str] = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
    ]

    def _convert_from(
        self, proposed: str, errors: str = "strict"
    ) -> Optional[str]:
        """Attempt to convert the markup to the proposed encoding.

        :param proposed: The name of a character encoding.
        :param errors: An error handling strategy, used when calling `str`.
        :return: The converted markup, or `None` if the proposed
            encoding/error handling strategy didn't work.
        """
        lookup_result = self.find_codec(proposed)
        if lookup_result is None or (lookup_result, errors) in self.tried_encodings:
            return None
        proposed = lookup_result
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (
            self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES
        ):
            smart_quotes_re = b"([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.unicode_markup = u
            self.original_encoding = proposed
        except Exception:
            # The decode failed; signal the caller to try another encoding.
            return None
        return self.unicode_markup

    def _to_unicode(
        self, data: bytes, encoding: str, errors: str = "strict"
    ) -> str:
        """Given a bytestring and its encoding, decodes the string into Unicode.

        :param encoding: The name of an encoding.
        :param errors: An error handling strategy, used when calling `str`.
        """
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self) -> Optional[str]:
        """If the markup is an HTML document, returns the encoding, if any,
        declared *inside* the document.
        """
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset: str) -> Optional[str]:
        """Look up the Python codec corresponding to a given character set.

        :param charset: The name of a character set.
        :return: The name of a Python codec.
        """
        value = (
            self._codec(self.CHARSET_ALIASES.get(charset, charset))
            or (charset and self._codec(charset.replace("-", "")))
            or (charset and self._codec(charset.replace("-", "_")))
            or (charset and charset.lower())
            or charset
        )
        if value:
            return value.lower()
        return None

    def _codec(self, charset: str) -> Optional[str]:
        """Return ``charset`` if Python has a codec for it, else None."""
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    #: A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    #:
    #: :meta hide-value:
    MS_CHARS: Dict[bytes, Union[str, Tuple[str, str]]] = {
        b"\x80": ("euro", "20AC"),
        b"\x81": " ",
        b"\x82": ("sbquo", "201A"),
        b"\x83": ("fnof", "192"),
        b"\x84": ("bdquo", "201E"),
        b"\x85": ("hellip", "2026"),
        b"\x86": ("dagger", "2020"),
        b"\x87": ("Dagger", "2021"),
        b"\x88": ("circ", "2C6"),
        b"\x89": ("permil", "2030"),
        b"\x8a": ("Scaron", "160"),
        b"\x8b": ("lsaquo", "2039"),
        b"\x8c": ("OElig", "152"),
        b"\x8d": "?",
        b"\x8e": ("#x17D", "17D"),
        b"\x8f": "?",
        b"\x90": "?",
        b"\x91": ("lsquo", "2018"),
        b"\x92": ("rsquo", "2019"),
        b"\x93": ("ldquo", "201C"),
        b"\x94": ("rdquo", "201D"),
        b"\x95": ("bull", "2022"),
        b"\x96": ("ndash", "2013"),
        b"\x97": ("mdash", "2014"),
        b"\x98": ("tilde", "2DC"),
        b"\x99": ("trade", "2122"),
        b"\x9a": ("scaron", "161"),
        b"\x9b": ("rsaquo", "203A"),
        b"\x9c": ("oelig", "153"),
        b"\x9d": "?",
        b"\x9e": ("#x17E", "17E"),
        b"\x9f": ("Yuml", ""),
    }
Contains + #: horrors like stripping diacritical marks to turn á into a, but also + #: contains non-horrors like turning “ into ". + #: + #: Seriously, don't use this for anything other than removing smart + #: quotes. + #: + #: :meta private: + MS_CHARS_TO_ASCII: Dict[bytes, str] = { + b"\x80": "EUR", + b"\x81": " ", + b"\x82": ",", + b"\x83": "f", + b"\x84": ",,", + b"\x85": "...", + b"\x86": "+", + b"\x87": "++", + b"\x88": "^", + b"\x89": "%", + b"\x8a": "S", + b"\x8b": "<", + b"\x8c": "OE", + b"\x8d": "?", + b"\x8e": "Z", + b"\x8f": "?", + b"\x90": "?", + b"\x91": "'", + b"\x92": "'", + b"\x93": '"', + b"\x94": '"', + b"\x95": "*", + b"\x96": "-", + b"\x97": "--", + b"\x98": "~", + b"\x99": "(TM)", + b"\x9a": "s", + b"\x9b": ">", + b"\x9c": "oe", + b"\x9d": "?", + b"\x9e": "z", + b"\x9f": "Y", + b"\xa0": " ", + b"\xa1": "!", + b"\xa2": "c", + b"\xa3": "GBP", + b"\xa4": "$", # This approximation is especially parochial--this is the + # generic currency symbol. + b"\xa5": "YEN", + b"\xa6": "|", + b"\xa7": "S", + b"\xa8": "..", + b"\xa9": "", + b"\xaa": "(th)", + b"\xab": "<<", + b"\xac": "!", + b"\xad": " ", + b"\xae": "(R)", + b"\xaf": "-", + b"\xb0": "o", + b"\xb1": "+-", + b"\xb2": "2", + b"\xb3": "3", + b"\xb4": "'", + b"\xb5": "u", + b"\xb6": "P", + b"\xb7": "*", + b"\xb8": ",", + b"\xb9": "1", + b"\xba": "(th)", + b"\xbb": ">>", + b"\xbc": "1/4", + b"\xbd": "1/2", + b"\xbe": "3/4", + b"\xbf": "?", + b"\xc0": "A", + b"\xc1": "A", + b"\xc2": "A", + b"\xc3": "A", + b"\xc4": "A", + b"\xc5": "A", + b"\xc6": "AE", + b"\xc7": "C", + b"\xc8": "E", + b"\xc9": "E", + b"\xca": "E", + b"\xcb": "E", + b"\xcc": "I", + b"\xcd": "I", + b"\xce": "I", + b"\xcf": "I", + b"\xd0": "D", + b"\xd1": "N", + b"\xd2": "O", + b"\xd3": "O", + b"\xd4": "O", + b"\xd5": "O", + b"\xd6": "O", + b"\xd7": "*", + b"\xd8": "O", + b"\xd9": "U", + b"\xda": "U", + b"\xdb": "U", + b"\xdc": "U", + b"\xdd": "Y", + b"\xde": "b", + b"\xdf": "B", + b"\xe0": "a", + b"\xe1": "a", + b"\xe2": "a", + 
b"\xe3": "a", + b"\xe4": "a", + b"\xe5": "a", + b"\xe6": "ae", + b"\xe7": "c", + b"\xe8": "e", + b"\xe9": "e", + b"\xea": "e", + b"\xeb": "e", + b"\xec": "i", + b"\xed": "i", + b"\xee": "i", + b"\xef": "i", + b"\xf0": "o", + b"\xf1": "n", + b"\xf2": "o", + b"\xf3": "o", + b"\xf4": "o", + b"\xf5": "o", + b"\xf6": "o", + b"\xf7": "/", + b"\xf8": "o", + b"\xf9": "u", + b"\xfa": "u", + b"\xfb": "u", + b"\xfc": "u", + b"\xfd": "y", + b"\xfe": "b", + b"\xff": "y", + } + + #: A map used when removing rogue Windows-1252/ISO-8859-1 + #: characters in otherwise UTF-8 documents. + #: + #: Note that \\x81, \\x8d, \\x8f, \\x90, and \\x9d are undefined in + #: Windows-1252. + #: + #: :meta hide-value: + WINDOWS_1252_TO_UTF8: Dict[int, bytes] = { + 0x80: b"\xe2\x82\xac", # € + 0x82: b"\xe2\x80\x9a", # ‚ + 0x83: b"\xc6\x92", # ƒ + 0x84: b"\xe2\x80\x9e", # „ + 0x85: b"\xe2\x80\xa6", # … + 0x86: b"\xe2\x80\xa0", # † + 0x87: b"\xe2\x80\xa1", # ‡ + 0x88: b"\xcb\x86", # ˆ + 0x89: b"\xe2\x80\xb0", # ‰ + 0x8A: b"\xc5\xa0", # Š + 0x8B: b"\xe2\x80\xb9", # ‹ + 0x8C: b"\xc5\x92", # Œ + 0x8E: b"\xc5\xbd", # Ž + 0x91: b"\xe2\x80\x98", # ‘ + 0x92: b"\xe2\x80\x99", # ’ + 0x93: b"\xe2\x80\x9c", # “ + 0x94: b"\xe2\x80\x9d", # ” + 0x95: b"\xe2\x80\xa2", # • + 0x96: b"\xe2\x80\x93", # – + 0x97: b"\xe2\x80\x94", # — + 0x98: b"\xcb\x9c", # ˜ + 0x99: b"\xe2\x84\xa2", # ™ + 0x9A: b"\xc5\xa1", # š + 0x9B: b"\xe2\x80\xba", # › + 0x9C: b"\xc5\x93", # œ + 0x9E: b"\xc5\xbe", # ž + 0x9F: b"\xc5\xb8", # Ÿ + 0xA0: b"\xc2\xa0", # + 0xA1: b"\xc2\xa1", # ¡ + 0xA2: b"\xc2\xa2", # ¢ + 0xA3: b"\xc2\xa3", # £ + 0xA4: b"\xc2\xa4", # ¤ + 0xA5: b"\xc2\xa5", # ¥ + 0xA6: b"\xc2\xa6", # ¦ + 0xA7: b"\xc2\xa7", # § + 0xA8: b"\xc2\xa8", # ¨ + 0xA9: b"\xc2\xa9", # © + 0xAA: b"\xc2\xaa", # ª + 0xAB: b"\xc2\xab", # « + 0xAC: b"\xc2\xac", # ¬ + 0xAD: b"\xc2\xad", # + 0xAE: b"\xc2\xae", # ® + 0xAF: b"\xc2\xaf", # ¯ + 0xB0: b"\xc2\xb0", # ° + 0xB1: b"\xc2\xb1", # ± + 0xB2: b"\xc2\xb2", # ² + 0xB3: b"\xc2\xb3", # ³ + 0xB4: 
b"\xc2\xb4", # ´ + 0xB5: b"\xc2\xb5", # µ + 0xB6: b"\xc2\xb6", # ¶ + 0xB7: b"\xc2\xb7", # · + 0xB8: b"\xc2\xb8", # ¸ + 0xB9: b"\xc2\xb9", # ¹ + 0xBA: b"\xc2\xba", # º + 0xBB: b"\xc2\xbb", # » + 0xBC: b"\xc2\xbc", # ¼ + 0xBD: b"\xc2\xbd", # ½ + 0xBE: b"\xc2\xbe", # ¾ + 0xBF: b"\xc2\xbf", # ¿ + 0xC0: b"\xc3\x80", # À + 0xC1: b"\xc3\x81", # Á + 0xC2: b"\xc3\x82", #  + 0xC3: b"\xc3\x83", # à + 0xC4: b"\xc3\x84", # Ä + 0xC5: b"\xc3\x85", # Å + 0xC6: b"\xc3\x86", # Æ + 0xC7: b"\xc3\x87", # Ç + 0xC8: b"\xc3\x88", # È + 0xC9: b"\xc3\x89", # É + 0xCA: b"\xc3\x8a", # Ê + 0xCB: b"\xc3\x8b", # Ë + 0xCC: b"\xc3\x8c", # Ì + 0xCD: b"\xc3\x8d", # Í + 0xCE: b"\xc3\x8e", # Î + 0xCF: b"\xc3\x8f", # Ï + 0xD0: b"\xc3\x90", # Ð + 0xD1: b"\xc3\x91", # Ñ + 0xD2: b"\xc3\x92", # Ò + 0xD3: b"\xc3\x93", # Ó + 0xD4: b"\xc3\x94", # Ô + 0xD5: b"\xc3\x95", # Õ + 0xD6: b"\xc3\x96", # Ö + 0xD7: b"\xc3\x97", # × + 0xD8: b"\xc3\x98", # Ø + 0xD9: b"\xc3\x99", # Ù + 0xDA: b"\xc3\x9a", # Ú + 0xDB: b"\xc3\x9b", # Û + 0xDC: b"\xc3\x9c", # Ü + 0xDD: b"\xc3\x9d", # Ý + 0xDE: b"\xc3\x9e", # Þ + 0xDF: b"\xc3\x9f", # ß + 0xE0: b"\xc3\xa0", # à + 0xE1: b"\xa1", # á + 0xE2: b"\xc3\xa2", # â + 0xE3: b"\xc3\xa3", # ã + 0xE4: b"\xc3\xa4", # ä + 0xE5: b"\xc3\xa5", # å + 0xE6: b"\xc3\xa6", # æ + 0xE7: b"\xc3\xa7", # ç + 0xE8: b"\xc3\xa8", # è + 0xE9: b"\xc3\xa9", # é + 0xEA: b"\xc3\xaa", # ê + 0xEB: b"\xc3\xab", # ë + 0xEC: b"\xc3\xac", # ì + 0xED: b"\xc3\xad", # í + 0xEE: b"\xc3\xae", # î + 0xEF: b"\xc3\xaf", # ï + 0xF0: b"\xc3\xb0", # ð + 0xF1: b"\xc3\xb1", # ñ + 0xF2: b"\xc3\xb2", # ò + 0xF3: b"\xc3\xb3", # ó + 0xF4: b"\xc3\xb4", # ô + 0xF5: b"\xc3\xb5", # õ + 0xF6: b"\xc3\xb6", # ö + 0xF7: b"\xc3\xb7", # ÷ + 0xF8: b"\xc3\xb8", # ø + 0xF9: b"\xc3\xb9", # ù + 0xFA: b"\xc3\xba", # ú + 0xFB: b"\xc3\xbb", # û + 0xFC: b"\xc3\xbc", # ü + 0xFD: b"\xc3\xbd", # ý + 0xFE: b"\xc3\xbe", # þ + } + + #: :meta private: + MULTIBYTE_MARKERS_AND_SIZES: List[Tuple[int, int, int]] = [ + (0xC2, 0xDF, 2), # 2-byte characters start with 
a byte C2-DF + (0xE0, 0xEF, 3), # 3-byte characters start with E0-EF + (0xF0, 0xF4, 4), # 4-byte characters start with F0-F4 + ] + + #: :meta private: + FIRST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[0][0] + + #: :meta private: + LAST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[-1][1] + + @classmethod + def detwingle( + cls, + in_bytes: bytes, + main_encoding: _Encoding = "utf8", + embedded_encoding: _Encoding = "windows-1252", + ) -> bytes: + """Fix characters from one encoding embedded in some other encoding. + + Currently the only situation supported is Windows-1252 (or its + subset ISO-8859-1), embedded in UTF-8. + + :param in_bytes: A bytestring that you suspect contains + characters from multiple encodings. Note that this *must* + be a bytestring. If you've already converted the document + to Unicode, you're too late. + :param main_encoding: The primary encoding of ``in_bytes``. + :param embedded_encoding: The encoding that was used to embed characters + in the main document. + :return: A bytestring similar to ``in_bytes``, in which + ``embedded_encoding`` characters have been converted to + their ``main_encoding`` equivalents. + """ + if embedded_encoding.replace("_", "-").lower() not in ( + "windows-1252", + "windows_1252", + ): + raise NotImplementedError( + "Windows-1252 and ISO-8859-1 are the only currently supported " + "embedded encodings." + ) + + if main_encoding.lower() not in ("utf8", "utf-8"): + raise NotImplementedError( + "UTF-8 is the only currently supported main encoding." + ) + + byte_chunks = [] + + chunk_start = 0 + pos = 0 + while pos < len(in_bytes): + byte = in_bytes[pos] + if byte >= cls.FIRST_MULTIBYTE_MARKER and byte <= cls.LAST_MULTIBYTE_MARKER: + # This is the start of a UTF-8 multibyte character. Skip + # to the end. 
+ for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES: + if byte >= start and byte <= end: + pos += size + break + elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8: + # We found a Windows-1252 character! + # Save the string up to this point as a chunk. + byte_chunks.append(in_bytes[chunk_start:pos]) + + # Now translate the Windows-1252 character into UTF-8 + # and add it as another, one-byte chunk. + byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte]) + pos += 1 + chunk_start = pos + else: + # Go on to the next character. + pos += 1 + if chunk_start == 0: + # The string is unchanged. + return in_bytes + else: + # Store the final chunk. + byte_chunks.append(in_bytes[chunk_start:]) + return b"".join(byte_chunks) |