diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/docx/text/pagebreak.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/docx/text/pagebreak.py | 104 |
1 files changed, 104 insertions, 0 deletions
"""Proxy objects related to rendered page-breaks."""

from __future__ import annotations

from typing import TYPE_CHECKING

from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak
from docx.shared import Parented

if TYPE_CHECKING:
    import docx.types as t
    from docx.text.paragraph import Paragraph


class RenderedPageBreak(Parented):
    """Proxy for a page-break that Word inserted while laying out pages.

    A rendered page-break is recorded by Word for print or display purposes. It
    usually does not represent a "hard" page-break added by the document author; it
    only marks where Word ran out of room on one page and started the next. Its
    position depends on things like printer, page-size and margins, and it moves in
    response to edits, but only once Word has loaded and saved the document again.

    `python-docx` never inserts these itself because it has no rendering function.
    They are mainly useful for text-extraction from existing documents, where
    `python-docx` is used purely as a document "reader".

    NOTE: a rendered page-break can fall inside a hyperlink. Picture a multi-word
    hyperlink such as "excellent Wikipedia article on LLMs" sitting near the end of
    the last line on a page, with the page breaking between "Wikipedia" and
    "article". In that "page-break-in-hyperlink" case THE PROPERTIES BELOW "MOVE"
    THE PAGE-BREAK to just after the hyperlink, so the whole hyperlink appears in
    the paragraph returned by `.preceding_paragraph_fragment`. This puts the "tail"
    text of the hyperlink on the "wrong" page, but avoids producing two hyperlinks
    that each hold a fragment of the text and point to the same address.
    """

    def __init__(
        self,
        lastRenderedPageBreak: CT_LastRenderedPageBreak,
        parent: t.ProvidesStoryPart,
    ):
        super().__init__(parent)
        # Both attribute names refer to the same underlying element object.
        self._element = self._lastRenderedPageBreak = lastRenderedPageBreak

    @property
    def preceding_paragraph_fragment(self) -> Paragraph | None:
        """A "loose" paragraph holding the content that comes before this page-break.

        Intended to be used together with `.following_paragraph_fragment`.

        The value is `None` when no content precedes this page-break. That case is
        common; it arises whenever a page breaks on an even paragraph boundary.
        Reporting `None` here avoids "inserting" a paragraph that does not exist into
        the content stream. Content is not limited to text; it can include DrawingML
        items such as images or charts.

        The returned paragraph *is divorced from the document body*, so changes made
        to it are not reflected in the document. It exists only to offer a familiar
        container (`Paragraph`) for inspecting the content that precedes this
        page-break in the paragraph in which it occurred.

        When this break occurs within a hyperlink, the returned paragraph contains
        the entire hyperlink.
        """
        page_break = self._lastRenderedPageBreak
        if page_break.precedes_all_content:
            return None

        # Imported here rather than at module level, where `Paragraph` is only
        # imported for type-checking -- presumably to avoid a circular import.
        from docx.text.paragraph import Paragraph

        leading_p = page_break.preceding_fragment_p
        return Paragraph(leading_p, self._parent)

    @property
    def following_paragraph_fragment(self) -> Paragraph | None:
        """A "loose" paragraph holding the content that comes after this page-break.

        HAS POTENTIALLY SURPRISING BEHAVIORS, so read this carefully to be sure it is
        what you want. It is aimed primarily at text-extraction use-cases where
        associating text precisely with the page it appears on matters.

        Intended to be used together with `.preceding_paragraph_fragment`.

        The value is `None` when no content follows this page-break. That case is
        unlikely in practice because Word places even-paragraph-boundary page-breaks
        on the paragraph *following* the page-break. It remains possible and must be
        checked for. Reporting `None` here avoids "inserting" an extra, non-existent
        paragraph into the content stream. Content is not limited to text; it can
        include DrawingML items such as images or charts.

        The returned paragraph *is divorced from the document body*, so changes made
        to it are not reflected in the document. It exists only to offer a container
        (`Paragraph`) with familiar properties and methods for characterizing the
        paragraph content that follows a mid-paragraph page-break.

        When this break occurs within a hyperlink, the returned paragraph contains no
        portion of that hyperlink.
        """
        page_break = self._lastRenderedPageBreak
        if page_break.follows_all_content:
            return None

        # Imported here rather than at module level, where `Paragraph` is only
        # imported for type-checking -- presumably to avoid a circular import.
        from docx.text.paragraph import Paragraph

        trailing_p = page_break.following_fragment_p
        return Paragraph(trailing_p, self._parent)