about summary refs log tree commit diff
path: root/.venv/lib/python3.12/site-packages/docx/oxml/text/pagebreak.py
diff options
context:
space:
mode:
authorS. Solomon Darnell2025-03-28 21:52:21 -0500
committerS. Solomon Darnell2025-03-28 21:52:21 -0500
commit4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
treeee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/docx/oxml/text/pagebreak.py
parentcc961e04ba734dd72309fb548a2f97d67d578813 (diff)
downloadgn-ai-master.tar.gz
two version of R2R are here HEAD master
Diffstat (limited to '.venv/lib/python3.12/site-packages/docx/oxml/text/pagebreak.py')
-rw-r--r--.venv/lib/python3.12/site-packages/docx/oxml/text/pagebreak.py284
1 files changed, 284 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/docx/oxml/text/pagebreak.py b/.venv/lib/python3.12/site-packages/docx/oxml/text/pagebreak.py
new file mode 100644
index 00000000..943f9b6c
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/docx/oxml/text/pagebreak.py
@@ -0,0 +1,284 @@
+"""Custom element class for rendered page-break (CT_LastRenderedPageBreak)."""
+
+from __future__ import annotations
+
+import copy
+from typing import TYPE_CHECKING
+
+from docx.oxml.xmlchemy import BaseOxmlElement
+from docx.shared import lazyproperty
+
+if TYPE_CHECKING:
+    from docx.oxml.text.hyperlink import CT_Hyperlink
+    from docx.oxml.text.paragraph import CT_P
+
+
+class CT_LastRenderedPageBreak(BaseOxmlElement):
+    """`<w:lastRenderedPageBreak>` element, indicating page break inserted by renderer.
+
+    A rendered page-break is one inserted by the renderer when it runs out of room on a
+    page. It is an empty element (no attrs or children) and is a child of CT_R, peer to
+    CT_Text.
+
+    NOTE: this complex-type name does not exist in the schema, where
+    `w:lastRenderedPageBreak` maps to `CT_Empty`. This name was added to give it
+    distinguished behavior. CT_Empty is used for many elements.
+    """
+
+    @property
+    def following_fragment_p(self) -> CT_P:
+        """A "loose" `CT_P` containing only the paragraph content after this break.
+
+        Raises `ValueError` if this `w:lastRenderedPageBreak` is not the first rendered
+        page-break in its paragraph.
+
+        The returned `CT_P` is a "clone" (deepcopy) of the `w:p` ancestor of this
+        page-break with this `w:lastRenderedPageBreak` element and all content preceding
+        it removed.
+
+        NOTE: this `w:p` can itself contain one or more `w:lastRenderedPageBreak`
+        elements (when the paragraph contained more than one). While this is rare, the
+        caller should treat this paragraph the same as other paragraphs and split it if
+        necessary in a following step or recursion.
+        """
+        if not self == self._first_lrpb_in_p(self._enclosing_p):
+            raise ValueError("only defined on first rendered page-break in paragraph")
+
+        # -- splitting approach is different when break is inside a hyperlink --
+        return (
+            self._following_frag_in_hlink
+            if self._is_in_hyperlink
+            else self._following_frag_in_run
+        )
+
+    @property
+    def follows_all_content(self) -> bool:
+        """True when this page-break element is the last "content" in the paragraph.
+
+        This is a very uncommon case and may only occur in contrived cases or where the
+        XML is edited by hand, but it is not precluded by the spec.
+        """
+        # -- a page-break inside a hyperlink never meets these criteria (for our
+        # -- purposes at least) because it is considered "atomic" and always associated
+        # -- with the page it starts on.
+        if self._is_in_hyperlink:
+            return False
+
+        return bool(
+            # -- XPath will match zero-or-one w:lastRenderedPageBreak element --
+            self._enclosing_p.xpath(
+                # -- in last run of paragraph --
+                f"(./w:r)[last()]"
+                # -- any page-break (match is not restricted to this element) --
+                f"/w:lastRenderedPageBreak"
+                # -- that is not followed by any content-bearing elements --
+                f"[not(following-sibling::*[{self._run_inner_content_xpath}])]"
+            )
+        )
+
+    @property
+    def precedes_all_content(self) -> bool:
+        """True when a `w:lastRenderedPageBreak` precedes all paragraph content.
+
+        This is a common case; it occurs whenever the page breaks on an even paragraph
+        boundary.
+        """
+        # -- a page-break inside a hyperlink never meets these criteria because there
+        # -- is always part of the hyperlink text before the page-break.
+        if self._is_in_hyperlink:
+            return False
+
+        return bool(
+            # -- XPath will match zero-or-one w:lastRenderedPageBreak element --
+            self._enclosing_p.xpath(
+                # -- in first run of paragraph --
+                f"./w:r[1]"
+                # -- any page-break (match is not restricted to this element) --
+                f"/w:lastRenderedPageBreak"
+                # -- that is not preceded by any content-bearing elements --
+                f"[not(preceding-sibling::*[{self._run_inner_content_xpath}])]"
+            )
+        )
+
+    @property
+    def preceding_fragment_p(self) -> CT_P:
+        """A "loose" `CT_P` containing only the paragraph content before this break.
+
+        Raises `ValueError` if this `w:lastRenderedPageBreak` is not the first rendered
+        page-break in its paragraph.
+
+        The returned `CT_P` is a "clone" (deepcopy) of the `w:p` ancestor of this
+        page-break with this `w:lastRenderedPageBreak` element and all its following
+        siblings removed.
+        """
+        if not self == self._first_lrpb_in_p(self._enclosing_p):
+            raise ValueError("only defined on first rendered page-break in paragraph")
+
+        # -- splitting approach is different when break is inside a hyperlink --
+        return (
+            self._preceding_frag_in_hlink
+            if self._is_in_hyperlink
+            else self._preceding_frag_in_run
+        )
+
+    def _enclosing_hyperlink(self, lrpb: CT_LastRenderedPageBreak) -> CT_Hyperlink:
+        """The `w:hyperlink` grandparent of `lrpb` (which need not be `self`).
+
+        Raises `IndexError` when `lrpb` has a `w:p` grandparent, so only call when
+        `lrpb._is_in_hyperlink` is True.
+        """
+        return lrpb.xpath("./parent::w:r/parent::w:hyperlink")[0]
+
+    @property
+    def _enclosing_p(self) -> CT_P:
+        """The nearest `w:p` ancestor of this `w:lastRenderedPageBreak` element."""
+        return self.xpath("./ancestor::w:p[1]")[0]
+
+    def _first_lrpb_in_p(self, p: CT_P) -> CT_LastRenderedPageBreak:
+        """The first `w:lastRenderedPageBreak` in a run or hyperlink run of `p`.
+
+        Raises `ValueError` if there are no rendered page-breaks in `p`.
+        """
+        lrpbs = p.xpath(
+            "./w:r/w:lastRenderedPageBreak | ./w:hyperlink/w:r/w:lastRenderedPageBreak"
+        )
+        if not lrpbs:
+            raise ValueError("no rendered page-breaks in paragraph element")
+        return lrpbs[0]
+
+    @lazyproperty
+    def _following_frag_in_hlink(self) -> CT_P:
+        """Following CT_P fragment when break occurs within a hyperlink.
+
+        Note this is a *partial-function* and raises `ValueError` when this page-break
+        is not inside a hyperlink.
+        """
+        if not self._is_in_hyperlink:
+            raise ValueError("only defined on a rendered page-break in a hyperlink")
+
+        # -- work on a clone `w:p` so our mutations don't persist --
+        p = copy.deepcopy(self._enclosing_p)
+
+        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
+        lrpb = self._first_lrpb_in_p(p)
+
+        # -- locate `w:hyperlink` in which this `w:lastRenderedPageBreak` is found --
+        hyperlink = lrpb._enclosing_hyperlink(lrpb)
+
+        # -- delete all w:p inner-content preceding the hyperlink (but not w:pPr) --
+        for e in hyperlink.xpath("./preceding-sibling::*[not(self::w:pPr)]"):
+            p.remove(e)
+
+        # -- remove the whole hyperlink, it belongs to the preceding-fragment-p --
+        hyperlink.getparent().remove(hyperlink)
+
+        # -- that's it, return the remaining fragment of `w:p` clone --
+        return p
+
+    @lazyproperty
+    def _following_frag_in_run(self) -> CT_P:
+        """Following CT_P fragment when break does not occur in a hyperlink.
+
+        Note this is a *partial-function*; raises `ValueError` when inside a hyperlink.
+        """
+        if self._is_in_hyperlink:
+            raise ValueError("only defined on a rendered page-break not in a hyperlink")
+
+        # -- work on a clone `w:p` so our mutations don't persist --
+        p = copy.deepcopy(self._enclosing_p)
+
+        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
+        lrpb = self._first_lrpb_in_p(p)
+
+        # -- locate `w:r` in which this `w:lastRenderedPageBreak` is found --
+        enclosing_r = lrpb.xpath("./parent::w:r")[0]
+
+        # -- delete all w:p inner-content preceding that run (but not w:pPr) --
+        for e in enclosing_r.xpath("./preceding-sibling::*[not(self::w:pPr)]"):
+            p.remove(e)
+
+        # -- then remove all run inner-content preceding this lrpb in its run (but not
+        # -- the `w:rPr`) and also remove the page-break itself
+        for e in lrpb.xpath("./preceding-sibling::*[not(self::w:rPr)]"):
+            enclosing_r.remove(e)
+        enclosing_r.remove(lrpb)
+
+        return p
+
+    @lazyproperty
+    def _is_in_hyperlink(self) -> bool:
+        """True when this page-break is in a run that is a child of `w:hyperlink`."""
+        return bool(self.xpath("./parent::w:r/parent::w:hyperlink"))
+
+    @lazyproperty
+    def _preceding_frag_in_hlink(self) -> CT_P:
+        """Preceding CT_P fragment when break occurs within a hyperlink.
+
+        Note this is a *partial-function* and raises `ValueError` when this page-break
+        is not inside a hyperlink.
+        """
+        if not self._is_in_hyperlink:
+            raise ValueError("only defined on a rendered page-break in a hyperlink")
+
+        # -- work on a clone `w:p` so our mutations don't persist --
+        p = copy.deepcopy(self._enclosing_p)
+
+        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
+        lrpb = self._first_lrpb_in_p(p)
+
+        # -- locate `w:hyperlink` in which this `w:lastRenderedPageBreak` is found --
+        hyperlink = lrpb._enclosing_hyperlink(lrpb)
+
+        # -- delete all w:p inner-content following the hyperlink --
+        for e in hyperlink.xpath("./following-sibling::*"):
+            p.remove(e)
+
+        # -- remove this page-break from inside the hyperlink --
+        lrpb.getparent().remove(lrpb)
+
+        # -- that's it, the entire hyperlink goes into the preceding fragment so
+        # -- the hyperlink is not "split".
+        return p
+
+    @lazyproperty
+    def _preceding_frag_in_run(self) -> CT_P:
+        """Preceding CT_P fragment when break does not occur in a hyperlink.
+
+        Note this is a *partial-function*; raises `ValueError` when inside a hyperlink.
+        """
+        if self._is_in_hyperlink:
+            raise ValueError("only defined on a rendered page-break not in a hyperlink")
+
+        # -- work on a clone `w:p` so our mutations don't persist --
+        p = copy.deepcopy(self._enclosing_p)
+
+        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
+        lrpb = self._first_lrpb_in_p(p)
+
+        # -- locate `w:r` in which this `w:lastRenderedPageBreak` is found --
+        enclosing_r = lrpb.xpath("./parent::w:r")[0]
+
+        # -- delete all `w:p` inner-content following that run --
+        for e in enclosing_r.xpath("./following-sibling::*"):
+            p.remove(e)
+
+        # -- then delete all `w:r` inner-content following this lrpb in its run and
+        # -- also remove the page-break itself
+        for e in lrpb.xpath("./following-sibling::*"):
+            enclosing_r.remove(e)
+        enclosing_r.remove(lrpb)
+
+        return p
+
+    @lazyproperty
+    def _run_inner_content_xpath(self) -> str:
+        """XPath predicate expression matching any run inner-content element."""
+        return (
+            "self::w:br"
+            " | self::w:cr"
+            " | self::w:drawing"
+            " | self::w:noBreakHyphen"
+            " | self::w:ptab"
+            " | self::w:t"
+            " | self::w:tab"
+        )