aboutsummaryrefslogtreecommitdiff
path: root/.venv/lib/python3.12/site-packages/lxml/html/html5parser.py
diff options
context:
space:
mode:
authorS. Solomon Darnell2025-03-28 21:52:21 -0500
committerS. Solomon Darnell2025-03-28 21:52:21 -0500
commit4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
treeee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/lxml/html/html5parser.py
parentcc961e04ba734dd72309fb548a2f97d67d578813 (diff)
downloadgn-ai-master.tar.gz
two version of R2R are hereHEADmaster
Diffstat (limited to '.venv/lib/python3.12/site-packages/lxml/html/html5parser.py')
-rw-r--r--.venv/lib/python3.12/site-packages/lxml/html/html5parser.py260
1 files changed, 260 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/lxml/html/html5parser.py b/.venv/lib/python3.12/site-packages/lxml/html/html5parser.py
new file mode 100644
index 00000000..2f7be156
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/lxml/html/html5parser.py
@@ -0,0 +1,260 @@
+"""
+An interface to html5lib that mimics the lxml.html interface.
+"""
+import sys
+import string
+
+from html5lib import HTMLParser as _HTMLParser
+from html5lib.treebuilders.etree_lxml import TreeBuilder
+from lxml import etree
+from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
+
+# python3 compatibility
+try:
+ _strings = basestring
+except NameError:
+ _strings = (bytes, str)
+try:
+ from urllib2 import urlopen
+except ImportError:
+ from urllib.request import urlopen
+try:
+ from urlparse import urlparse
+except ImportError:
+ from urllib.parse import urlparse
+
+
+class HTMLParser(_HTMLParser):
+ """An html5lib HTML parser with lxml as tree."""
+
+ def __init__(self, strict=False, **kwargs):
+ _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
+
+
+try:
+ from html5lib import XHTMLParser as _XHTMLParser
+except ImportError:
+ pass
+else:
+ class XHTMLParser(_XHTMLParser):
+ """An html5lib XHTML Parser with lxml as tree."""
+
+ def __init__(self, strict=False, **kwargs):
+ _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
+
+ xhtml_parser = XHTMLParser()
+
+
+def _find_tag(tree, tag):
+ elem = tree.find(tag)
+ if elem is not None:
+ return elem
+ return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
+
+
+def document_fromstring(html, guess_charset=None, parser=None):
+ """
+ Parse a whole document into a string.
+
+ If `guess_charset` is true, or if the input is not Unicode but a
+ byte string, the `chardet` library will perform charset guessing
+ on the string.
+ """
+ if not isinstance(html, _strings):
+ raise TypeError('string required')
+
+ if parser is None:
+ parser = html_parser
+
+ options = {}
+ if guess_charset is None and isinstance(html, bytes):
+ # html5lib does not accept useChardet as an argument, if it
+ # detected the html argument would produce unicode objects.
+ guess_charset = True
+ if guess_charset is not None:
+ options['useChardet'] = guess_charset
+ return parser.parse(html, **options).getroot()
+
+
+def fragments_fromstring(html, no_leading_text=False,
+ guess_charset=None, parser=None):
+ """Parses several HTML elements, returning a list of elements.
+
+ The first item in the list may be a string. If no_leading_text is true,
+ then it will be an error if there is leading text, and it will always be
+ a list of only elements.
+
+ If `guess_charset` is true, the `chardet` library will perform charset
+ guessing on the string.
+ """
+ if not isinstance(html, _strings):
+ raise TypeError('string required')
+
+ if parser is None:
+ parser = html_parser
+
+ options = {}
+ if guess_charset is None and isinstance(html, bytes):
+ # html5lib does not accept useChardet as an argument, if it
+ # detected the html argument would produce unicode objects.
+ guess_charset = False
+ if guess_charset is not None:
+ options['useChardet'] = guess_charset
+ children = parser.parseFragment(html, 'div', **options)
+ if children and isinstance(children[0], _strings):
+ if no_leading_text:
+ if children[0].strip():
+ raise etree.ParserError('There is leading text: %r' %
+ children[0])
+ del children[0]
+ return children
+
+
+def fragment_fromstring(html, create_parent=False,
+ guess_charset=None, parser=None):
+ """Parses a single HTML element; it is an error if there is more than
+ one element, or if anything but whitespace precedes or follows the
+ element.
+
+ If 'create_parent' is true (or is a tag name) then a parent node
+ will be created to encapsulate the HTML in a single element. In
+ this case, leading or trailing text is allowed.
+
+ If `guess_charset` is true, the `chardet` library will perform charset
+ guessing on the string.
+ """
+ if not isinstance(html, _strings):
+ raise TypeError('string required')
+
+ accept_leading_text = bool(create_parent)
+
+ elements = fragments_fromstring(
+ html, guess_charset=guess_charset, parser=parser,
+ no_leading_text=not accept_leading_text)
+
+ if create_parent:
+ if not isinstance(create_parent, _strings):
+ create_parent = 'div'
+ new_root = Element(create_parent)
+ if elements:
+ if isinstance(elements[0], _strings):
+ new_root.text = elements[0]
+ del elements[0]
+ new_root.extend(elements)
+ return new_root
+
+ if not elements:
+ raise etree.ParserError('No elements found')
+ if len(elements) > 1:
+ raise etree.ParserError('Multiple elements found')
+ result = elements[0]
+ if result.tail and result.tail.strip():
+ raise etree.ParserError('Element followed by text: %r' % result.tail)
+ result.tail = None
+ return result
+
+
+def fromstring(html, guess_charset=None, parser=None):
+ """Parse the html, returning a single element/document.
+
+ This tries to minimally parse the chunk of text, without knowing if it
+ is a fragment or a document.
+
+ 'base_url' will set the document's base_url attribute (and the tree's
+ docinfo.URL)
+
+ If `guess_charset` is true, or if the input is not Unicode but a
+ byte string, the `chardet` library will perform charset guessing
+ on the string.
+ """
+ if not isinstance(html, _strings):
+ raise TypeError('string required')
+ doc = document_fromstring(html, parser=parser,
+ guess_charset=guess_charset)
+
+ # document starts with doctype or <html>, full document!
+ start = html[:50]
+ if isinstance(start, bytes):
+ # Allow text comparison in python3.
+ # Decode as ascii, that also covers latin-1 and utf-8 for the
+ # characters we need.
+ start = start.decode('ascii', 'replace')
+
+ start = start.lstrip().lower()
+ if start.startswith('<html') or start.startswith('<!doctype'):
+ return doc
+
+ head = _find_tag(doc, 'head')
+
+ # if the head is not empty we have a full document
+ if len(head):
+ return doc
+
+ body = _find_tag(doc, 'body')
+
+ # The body has just one element, so it was probably a single
+ # element passed in
+ if (len(body) == 1 and (not body.text or not body.text.strip())
+ and (not body[-1].tail or not body[-1].tail.strip())):
+ return body[0]
+
+ # Now we have a body which represents a bunch of tags which have the
+ # content that was passed in. We will create a fake container, which
+ # is the body tag, except <body> implies too much structure.
+ if _contains_block_level_tag(body):
+ body.tag = 'div'
+ else:
+ body.tag = 'span'
+ return body
+
+
+def parse(filename_url_or_file, guess_charset=None, parser=None):
+ """Parse a filename, URL, or file-like object into an HTML document
+ tree. Note: this returns a tree, not an element. Use
+ ``parse(...).getroot()`` to get the document root.
+
+ If ``guess_charset`` is true, the ``useChardet`` option is passed into
+ html5lib to enable character detection. This option is on by default
+ when parsing from URLs, off by default when parsing from file(-like)
+ objects (which tend to return Unicode more often than not), and on by
+ default when parsing from a file path (which is read in binary mode).
+ """
+ if parser is None:
+ parser = html_parser
+ if not isinstance(filename_url_or_file, _strings):
+ fp = filename_url_or_file
+ if guess_charset is None:
+ # assume that file-like objects return Unicode more often than bytes
+ guess_charset = False
+ elif _looks_like_url(filename_url_or_file):
+ fp = urlopen(filename_url_or_file)
+ if guess_charset is None:
+ # assume that URLs return bytes
+ guess_charset = True
+ else:
+ fp = open(filename_url_or_file, 'rb')
+ if guess_charset is None:
+ guess_charset = True
+
+ options = {}
+ # html5lib does not accept useChardet as an argument, if it
+ # detected the html argument would produce unicode objects.
+ if guess_charset:
+ options['useChardet'] = guess_charset
+ return parser.parse(fp, **options)
+
+
+def _looks_like_url(str):
+ scheme = urlparse(str)[0]
+ if not scheme:
+ return False
+ elif (sys.platform == 'win32' and
+ scheme in string.ascii_letters
+ and len(scheme) == 1):
+ # looks like a 'normal' absolute path
+ return False
+ else:
+ return True
+
+
+html_parser = HTMLParser()