diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/markdown/htmlparser.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/markdown/htmlparser.py | 347 |
1 files changed, 347 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/markdown/htmlparser.py b/.venv/lib/python3.12/site-packages/markdown/htmlparser.py new file mode 100644 index 00000000..33b918d5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/markdown/htmlparser.py @@ -0,0 +1,347 @@ +# Python Markdown + +# A Python implementation of John Gruber's Markdown. + +# Documentation: https://python-markdown.github.io/ +# GitHub: https://github.com/Python-Markdown/markdown/ +# PyPI: https://pypi.org/project/Markdown/ + +# Started by Manfred Stienstra (http://www.dwerg.net/). +# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). +# Currently maintained by Waylan Limberg (https://github.com/waylan), +# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). + +# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later) +# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) +# Copyright 2004 Manfred Stienstra (the original version) + +# License: BSD (see LICENSE.md for details). + +""" +This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches. +A copy is imported rather than the module being directly imported as this ensures that the user can import +and use the unmodified library for their own needs. +""" + +from __future__ import annotations + +import re +import importlib.util +import sys +from typing import TYPE_CHECKING, Sequence + +if TYPE_CHECKING: # pragma: no cover + from markdown import Markdown + + +# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it. +# Users can still do `from html import parser` and get the default behavior. +spec = importlib.util.find_spec('html.parser') +htmlparser = importlib.util.module_from_spec(spec) +spec.loader.exec_module(htmlparser) +sys.modules['htmlparser'] = htmlparser + +# Monkeypatch `HTMLParser` to only accept `?>` to close Processing Instructions. +htmlparser.piclose = re.compile(r'\?>') +# Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon. +htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);') +# Monkeypatch `HTMLParser` to no longer support partial entities. We are always feeding a complete block, +# so the 'incomplete' functionality is unnecessary. As the `entityref` regex is run right before incomplete, +# and the two regex are the same, then incomplete will simply never match and we avoid the logic within. +htmlparser.incomplete = htmlparser.entityref +# Monkeypatch `HTMLParser` to not accept a backtick in a tag name, attribute name, or bare value. +htmlparser.locatestarttagend_tolerant = re.compile(r""" + <[a-zA-Z][^`\t\n\r\f />\x00]* # tag name <= added backtick here + (?:[\s/]* # optional whitespace before attribute name + (?:(?<=['"\s/])[^`\s/>][^\s/=>]* # attribute name <= added backtick here + (?:\s*=+\s* # value indicator + (?:'[^']*' # LITA-enclosed value + |"[^"]*" # LIT-enclosed value + |(?!['"])[^`>\s]* # bare value <= added backtick here + ) + (?:\s*,)* # possibly followed by a comma + )?(?:\s|/(?!>))* + )* + )? + \s* # trailing whitespace +""", re.VERBOSE) + +# Match a blank line at the start of a block of text (two newlines). +# The newlines may be preceded by additional whitespace. +blank_line_re = re.compile(r'^([ ]*\n){2}') + + +class HTMLExtractor(htmlparser.HTMLParser): + """ + Extract raw HTML from text. + + The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the + [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text + is stored in `cleandoc` as a list of strings. + """ + + def __init__(self, md: Markdown, *args, **kwargs): + if 'convert_charrefs' not in kwargs: + kwargs['convert_charrefs'] = False + + # Block tags that should contain no content (self closing) + self.empty_tags = set(['hr']) + + self.lineno_start_cache = [0] + + # This calls self.reset + super().__init__(*args, **kwargs) + self.md = md + + def reset(self): + """Reset this instance. Loses all unprocessed data.""" + self.inraw = False + self.intail = False + self.stack: list[str] = [] # When `inraw==True`, stack contains a list of tags + self._cache: list[str] = [] + self.cleandoc: list[str] = [] + self.lineno_start_cache = [0] + + super().reset() + + def close(self): + """Handle any buffered data.""" + super().close() + if len(self.rawdata): + # Temp fix for https://bugs.python.org/issue41989 + # TODO: remove this when the bug is fixed in all supported Python versions. + if self.convert_charrefs and not self.cdata_elem: # pragma: no cover + self.handle_data(htmlparser.unescape(self.rawdata)) + else: + self.handle_data(self.rawdata) + # Handle any unclosed tags. + if len(self._cache): + self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) + self._cache = [] + + @property + def line_offset(self) -> int: + """Returns char index in `self.rawdata` for the start of the current line. """ + for ii in range(len(self.lineno_start_cache)-1, self.lineno-1): + last_line_start_pos = self.lineno_start_cache[ii] + lf_pos = self.rawdata.find('\n', last_line_start_pos) + if lf_pos == -1: + # No more newlines found. Use end of raw data as start of line beyond end. + lf_pos = len(self.rawdata) + self.lineno_start_cache.append(lf_pos+1) + + return self.lineno_start_cache[self.lineno-1] + + def at_line_start(self) -> bool: + """ + Returns True if current position is at start of line. + + Allows for up to three blank spaces at start of line. + """ + if self.offset == 0: + return True + if self.offset > 3: + return False + # Confirm up to first 3 chars are whitespace + return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == '' + + def get_endtag_text(self, tag: str) -> str: + """ + Returns the text of the end tag. + + If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`. + """ + # Attempt to extract actual tag from raw source text + start = self.line_offset + self.offset + m = htmlparser.endendtag.search(self.rawdata, start) + if m: + return self.rawdata[start:m.end()] + else: # pragma: no cover + # Failed to extract from raw data. Assume well formed and lowercase. + return '</{}>'.format(tag) + + def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]): + # Handle tags that should always be empty and do not specify a closing tag + if tag in self.empty_tags: + self.handle_startendtag(tag, attrs) + return + + if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)): + # Started a new raw block. Prepare stack. + self.inraw = True + self.cleandoc.append('\n') + + text = self.get_starttag_text() + if self.inraw: + self.stack.append(tag) + self._cache.append(text) + else: + self.cleandoc.append(text) + if tag in self.CDATA_CONTENT_ELEMENTS: + # This is presumably a standalone tag in a code span (see #1036). + self.clear_cdata_mode() + + def handle_endtag(self, tag: str): + text = self.get_endtag_text(tag) + + if self.inraw: + self._cache.append(text) + if tag in self.stack: + # Remove tag from stack + while self.stack: + if self.stack.pop() == tag: + break + if len(self.stack) == 0: + # End of raw block. + if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]): + # Preserve blank line and end of raw block. + self._cache.append('\n') + else: + # More content exists after `endtag`. + self.intail = True + # Reset stack. + self.inraw = False + self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) + # Insert blank line between this and next line. + self.cleandoc.append('\n\n') + self._cache = [] + else: + self.cleandoc.append(text) + + def handle_data(self, data: str): + if self.intail and '\n' in data: + self.intail = False + if self.inraw: + self._cache.append(data) + else: + self.cleandoc.append(data) + + def handle_empty_tag(self, data: str, is_block: bool): + """ Handle empty tags (`<data>`). """ + if self.inraw or self.intail: + # Append this to the existing raw block + self._cache.append(data) + elif self.at_line_start() and is_block: + # Handle this as a standalone raw block + if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]): + # Preserve blank line after tag in raw block. + data += '\n' + else: + # More content exists after tag. + self.intail = True + item = self.cleandoc[-1] if self.cleandoc else '' + # If we only have one newline before block element, add another + if not item.endswith('\n\n') and item.endswith('\n'): + self.cleandoc.append('\n') + self.cleandoc.append(self.md.htmlStash.store(data)) + # Insert blank line between this and next line. + self.cleandoc.append('\n\n') + else: + self.cleandoc.append(data) + + def handle_startendtag(self, tag: str, attrs): + self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag)) + + def handle_charref(self, name: str): + self.handle_empty_tag('&#{};'.format(name), is_block=False) + + def handle_entityref(self, name: str): + self.handle_empty_tag('&{};'.format(name), is_block=False) + + def handle_comment(self, data: str): + self.handle_empty_tag('<!--{}-->'.format(data), is_block=True) + + def handle_decl(self, data: str): + self.handle_empty_tag('<!{}>'.format(data), is_block=True) + + def handle_pi(self, data: str): + self.handle_empty_tag('<?{}?>'.format(data), is_block=True) + + def unknown_decl(self, data: str): + end = ']]>' if data.startswith('CDATA[') else ']>' + self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True) + + def parse_pi(self, i: int) -> int: + if self.at_line_start() or self.intail: + return super().parse_pi(i) + # This is not the beginning of a raw block so treat as plain data + # and avoid consuming any tags which may follow (see #1066). + self.handle_data('<?') + return i + 2 + + def parse_html_declaration(self, i: int) -> int: + if self.at_line_start() or self.intail: + return super().parse_html_declaration(i) + # This is not the beginning of a raw block so treat as plain data + # and avoid consuming any tags which may follow (see #1066). + self.handle_data('<!') + return i + 2 + + def parse_bogus_comment(self, i: int, report: int = 0) -> int: + # Override the default behavior so that bogus comments get passed + # through unaltered by setting `report` to `0` (see #1425). + pos = super().parse_bogus_comment(i, report) + if pos == -1: # pragma: no cover + return -1 + self.handle_empty_tag(self.rawdata[i:pos], is_block=False) + return pos + + # The rest has been copied from base class in standard lib to address #1036. + # As `__startag_text` is private, all references to it must be in this subclass. + # The last few lines of `parse_starttag` are reversed so that `handle_starttag` + # can override `cdata_mode` in certain situations (in a code span). + __starttag_text: str | None = None + + def get_starttag_text(self) -> str: + """Return full source of start tag: `<...>`.""" + return self.__starttag_text + + def parse_starttag(self, i: int) -> int: # pragma: no cover + self.__starttag_text = None + endpos = self.check_for_whole_start_tag(i) + if endpos < 0: + return endpos + rawdata = self.rawdata + self.__starttag_text = rawdata[i:endpos] + + # Now parse the data between `i+1` and `j` into a tag and `attrs` + attrs = [] + match = htmlparser.tagfind_tolerant.match(rawdata, i+1) + assert match, 'unexpected call to parse_starttag()' + k = match.end() + self.lasttag = tag = match.group(1).lower() + while k < endpos: + m = htmlparser.attrfind_tolerant.match(rawdata, k) + if not m: + break + attrname, rest, attrvalue = m.group(1, 2, 3) + if not rest: + attrvalue = None + elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ + attrvalue[:1] == '"' == attrvalue[-1:]: # noqa: E127 + attrvalue = attrvalue[1:-1] + if attrvalue: + attrvalue = htmlparser.unescape(attrvalue) + attrs.append((attrname.lower(), attrvalue)) + k = m.end() + + end = rawdata[k:endpos].strip() + if end not in (">", "/>"): + lineno, offset = self.getpos() + if "\n" in self.__starttag_text: + lineno = lineno + self.__starttag_text.count("\n") + offset = len(self.__starttag_text) \ + - self.__starttag_text.rfind("\n") # noqa: E127 + else: + offset = offset + len(self.__starttag_text) + self.handle_data(rawdata[i:endpos]) + return endpos + if end.endswith('/>'): + # XHTML-style empty tag: `<span attr="value" />` + self.handle_startendtag(tag, attrs) + else: + # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) *** + if tag in self.CDATA_CONTENT_ELEMENTS: + self.set_cdata_mode(tag) + self.handle_starttag(tag, attrs) + return endpos |