about summary refs log tree commit diff
path: root/.venv/lib/python3.12/site-packages/bs4/formatter.py
diff options
context:
space:
mode:
authorS. Solomon Darnell2025-03-28 21:52:21 -0500
committerS. Solomon Darnell2025-03-28 21:52:21 -0500
commit4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
treeee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/bs4/formatter.py
parentcc961e04ba734dd72309fb548a2f97d67d578813 (diff)
downloadgn-ai-master.tar.gz
two version of R2R are here HEAD master
Diffstat (limited to '.venv/lib/python3.12/site-packages/bs4/formatter.py')
-rw-r--r--.venv/lib/python3.12/site-packages/bs4/formatter.py276
1 files changed, 276 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/bs4/formatter.py b/.venv/lib/python3.12/site-packages/bs4/formatter.py
new file mode 100644
index 00000000..bfa08764
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/formatter.py
@@ -0,0 +1,276 @@
+from __future__ import annotations
+from typing import Callable, Dict, Iterable, Optional, Set, Tuple, TYPE_CHECKING, Union
+from typing_extensions import TypeAlias
+from bs4.dammit import EntitySubstitution
+
+if TYPE_CHECKING:
+    from bs4._typing import _AttributeValue
+
+
+class Formatter(EntitySubstitution):
+    """Describes a strategy to use when outputting a parse tree to a string.
+
+    Some parts of this strategy come from the distinction between
+    HTML4, HTML5, and XML. Others are configurable by the user.
+
+    Formatters are passed in as the `formatter` argument to methods
+    like `bs4.element.Tag.encode`. Most people won't need to
+    think about formatters, and most people who need to think about
+    them can pass in one of these predefined strings as `formatter`
+    rather than making a new Formatter object:
+
+    For HTML documents:
+     * 'html' - HTML entity substitution for generic HTML documents. (default)
+     * 'html5' - HTML entity substitution for HTML5 documents, as
+                 well as some optimizations in the way tags are rendered.
+     * 'html5-4.12' - The version of the 'html5' formatter used prior to
+                      Beautiful Soup 4.13.0.
+     * 'minimal' - Only make the substitutions necessary to guarantee
+                   valid HTML.
+     * None - Do not perform any substitution. This will be faster
+              but may result in invalid markup.
+
+    For XML documents:
+     * 'html' - Entity substitution for XHTML documents.
+     * 'minimal' - Only make the substitutions necessary to guarantee
+                   valid XML. (default)
+     * None - Do not perform any substitution. This will be faster
+              but may result in invalid markup.
+
+    """
+
+    #: Constant name denoting HTML markup
+    HTML: str = "html"
+
+    #: Constant name denoting XML markup
+    XML: str = "xml"
+
+    #: Default values for the various constructor options when the
+    #: markup language is HTML.
+    HTML_DEFAULTS: Dict[str, Set[str]] = dict(
+        cdata_containing_tags=set(["script", "style"]),
+    )
+
+    language: Optional[str]  #: :meta private:
+    entity_substitution: Optional[_EntitySubstitutionFunction]  #: :meta private:
+    void_element_close_prefix: str  #: :meta private:
+    cdata_containing_tags: Set[str]  #: :meta private:
+    indent: str  #: :meta private:
+
+    #: If this is set to true by the constructor, then attributes whose
+    #: values are set to the empty string will be treated as HTML
+    #: boolean attributes. (Attributes whose value is None are always
+    #: rendered this way.)
+    empty_attributes_are_booleans: bool
+
+    def _default(
+        self, language: str, value: Optional[Set[str]], kwarg: str
+    ) -> Set[str]:
+        if value is not None:  # An explicitly supplied value always wins.
+            return value
+        if language == self.XML:
+            # When XML is the markup language in use, all of the
+            # defaults are the empty set.
+            return set()
+
+        # Otherwise, use HTML_DEFAULTS (the shared class-level set, not a copy).
+        return self.HTML_DEFAULTS[kwarg]
+
+    def __init__(
+        self,
+        language: Optional[str] = None,
+        entity_substitution: Optional[_EntitySubstitutionFunction] = None,
+        void_element_close_prefix: str = "/",
+        cdata_containing_tags: Optional[Set[str]] = None,
+        empty_attributes_are_booleans: bool = False,
+        indent: Union[int,str] = 1,
+    ):
+        r"""Constructor.
+
+        :param language: This should be `Formatter.XML` if you are formatting
+           XML markup and `Formatter.HTML` if you are formatting HTML markup.
+
+        :param entity_substitution: A function to call to replace special
+           characters with XML/HTML entities. For examples, see
+           bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
+        :param void_element_close_prefix: By default, void elements
+           are represented as <tag/> (XML rules) rather than <tag>
+           (HTML rules). To get <tag>, pass in the empty string.
+        :param cdata_containing_tags: The set of tags that are defined
+           as containing CDATA in this dialect. For example, in HTML,
+           <script> and <style> tags are defined as containing CDATA,
+           and their contents should not be formatted.
+        :param empty_attributes_are_booleans: If this is set to true,
+          then attributes whose values are set to the empty string
+          will be treated as `HTML boolean
+          attributes <https://dev.w3.org/html5/spec-LC/common-microsyntaxes.html#boolean-attributes>`_. (Attributes
+          whose value is None are always rendered this way.)
+        :param indent: If indent is a non-negative integer or string,
+            then the contents of elements will be indented
+            appropriately when pretty-printing. An indent level of 0,
+            negative, or "" will only insert newlines. Using a
+            positive integer indent indents that many spaces per
+            level. If indent is a string (such as "\t"), that string
+            is used to indent each level. The default behavior is to
+            indent one space per level.
+
+        """
+        self.language = language or self.HTML
+        self.entity_substitution = entity_substitution
+        self.void_element_close_prefix = void_element_close_prefix
+        self.cdata_containing_tags = self._default(
+            self.language, cdata_containing_tags, "cdata_containing_tags"
+        )
+        self.empty_attributes_are_booleans = empty_attributes_are_booleans
+        if indent is None:  # Tolerated despite the annotation; same as 0.
+            indent = 0
+        indent_str: str
+        if isinstance(indent, int):
+            if indent < 0:  # Negative values are clamped to 0.
+                indent = 0
+            indent_str = " " * indent
+        elif isinstance(indent, str):
+            indent_str = indent
+        else:  # Neither int nor str: fall back to a single space.
+            indent_str = " "
+        self.indent = indent_str
+
+    def substitute(self, ns: str) -> str:
+        """Process a string that needs to undergo entity substitution.
+        This may be a string encountered in an attribute value or as
+        text.
+
+        :param ns: A string.
+        :return: The same string but with certain characters replaced by named
+           or numeric entities.
+        """
+        if not self.entity_substitution:
+            return ns
+        from .element import NavigableString  # local import (presumably avoids an import cycle)
+
+        if (
+            isinstance(ns, NavigableString)
+            and ns.parent is not None
+            and ns.parent.name in self.cdata_containing_tags
+        ):
+            # Text inside a CDATA-containing tag is returned as-is.
+            return ns
+        # Substitute.
+        return self.entity_substitution(ns)
+
+    def attribute_value(self, value: str) -> str:
+        """Process the value of an attribute.
+
+        :param value: A string.
+        :return: A string with certain characters replaced by named
+           or numeric entities.
+        """
+        return self.substitute(value)
+
+    def attributes(
+        self, tag: bs4.element.Tag
+    ) -> Iterable[Tuple[str, Optional[_AttributeValue]]]:
+        """Reorder a tag's attributes however you want.
+
+        By default, attributes are sorted alphabetically. This makes
+        behavior consistent between Python 2 and Python 3, and preserves
+        backwards compatibility with older versions of Beautiful Soup.
+
+        If `empty_attributes_are_booleans` is True, then
+        attributes whose values are set to the empty string will be
+        treated as boolean attributes.
+        """
+        if tag.attrs is None:
+            return []
+
+        items: Iterable[Tuple[str, _AttributeValue]] = list(tag.attrs.items())
+        return sorted(
+            (k, (None if self.empty_attributes_are_booleans and v == "" else v))
+            for k, v in items
+        )
+
+
+class HTMLFormatter(Formatter):
+    """A generic Formatter for HTML; the language is always `Formatter.HTML`."""
+
+    REGISTRY: Dict[Optional[str], HTMLFormatter] = {}  # formatter name (or None) -> instance
+
+    def __init__(
+        self,
+        entity_substitution: Optional[_EntitySubstitutionFunction] = None,
+        void_element_close_prefix: str = "/",
+        cdata_containing_tags: Optional[Set[str]] = None,
+        empty_attributes_are_booleans: bool = False,
+        indent: Union[int,str] = 1,
+    ):
+        super(HTMLFormatter, self).__init__(
+            self.HTML,  # language is fixed; the rest is passed through unchanged
+            entity_substitution,
+            void_element_close_prefix,
+            cdata_containing_tags,
+            empty_attributes_are_booleans,
+            indent=indent
+        )
+
+
+class XMLFormatter(Formatter):
+    """A generic Formatter for XML; the language is always `Formatter.XML`."""
+
+    REGISTRY: Dict[Optional[str], XMLFormatter] = {}  # formatter name (or None) -> instance
+
+    def __init__(
+        self,
+        entity_substitution: Optional[_EntitySubstitutionFunction] = None,
+        void_element_close_prefix: str = "/",
+        cdata_containing_tags: Optional[Set[str]] = None,
+        empty_attributes_are_booleans: bool = False,
+        indent: Union[int,str] = 1,
+    ):
+        super(XMLFormatter, self).__init__(
+            self.XML,  # language is fixed; the rest is passed through unchanged
+            entity_substitution,
+            void_element_close_prefix,
+            cdata_containing_tags,
+            empty_attributes_are_booleans,
+            indent=indent,
+        )
+
+
+# Set up aliases for the default formatters, keyed by name in each REGISTRY.
+HTMLFormatter.REGISTRY["html"] = HTMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_html
+)
+
+HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_html5,
+    void_element_close_prefix="",  # render void elements as <tag>, not <tag/>
+    empty_attributes_are_booleans=True,
+)
+HTMLFormatter.REGISTRY["html5-4.12"] = HTMLFormatter(  # same as "html5" but with substitute_html
+    entity_substitution=EntitySubstitution.substitute_html,
+    void_element_close_prefix="",
+    empty_attributes_are_booleans=True,
+)
+HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_xml
+)
+HTMLFormatter.REGISTRY[None] = HTMLFormatter(entity_substitution=None)  # no substitution at all
+XMLFormatter.REGISTRY["html"] = XMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_html
+)
+XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_xml
+)
+
+XMLFormatter.REGISTRY[None] = XMLFormatter(entity_substitution=None)  # no substitution at all
+
+# Define type aliases to improve readability.
+# Defined last: this module uses them only in annotations, which stay unevaluated (__future__ import).
+
+#: A function to call to replace special characters with XML or HTML
+#: entities.
+_EntitySubstitutionFunction: TypeAlias = Callable[[str], str]
+
+# Many of the output-centered methods take an argument that can either
+# be a Formatter object or the name of a Formatter to be looked up.
+_FormatterOrName = Union[Formatter, str]