blob: 943f9b6c241dfd9d24e9eebefd5926d721c71c51 (
about) (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
|
"""Custom element class for rendered page-break (CT_LastRenderedPageBreak)."""
from __future__ import annotations
import copy
from typing import TYPE_CHECKING
from docx.oxml.xmlchemy import BaseOxmlElement
from docx.shared import lazyproperty
if TYPE_CHECKING:
from docx.oxml.text.hyperlink import CT_Hyperlink
from docx.oxml.text.paragraph import CT_P
class CT_LastRenderedPageBreak(BaseOxmlElement):
    """`<w:lastRenderedPageBreak>` element, indicating page break inserted by renderer.

    A rendered page-break is one inserted by the renderer when it runs out of room on a
    page. It is an empty element (no attrs or children) and is a child of CT_R, peer to
    CT_Text.

    NOTE: this complex-type name does not exist in the schema, where
    `w:lastRenderedPageBreak` maps to `CT_Empty`. This name was added to give it
    distinguished behavior. CT_Empty is used for many elements.
    """

    @property
    def following_fragment_p(self) -> CT_P:
        """A "loose" `CT_P` containing only the paragraph content after this break.

        Raises `ValueError` if this `w:lastRenderedPageBreak` is not the first rendered
        page-break in its paragraph.

        The returned `CT_P` is a "clone" (deepcopy) of the `w:p` ancestor of this
        page-break with this `w:lastRenderedPageBreak` element and all content preceding
        it removed. When the break occurs inside a hyperlink, the entire hyperlink is
        removed as well because it is assigned to the preceding fragment.

        NOTE: this `w:p` can itself contain one or more `w:lastRenderedPageBreak`
        elements (when the paragraph contained more than one). While this is rare, the
        caller should treat this paragraph the same as other paragraphs and split it if
        necessary in a following step or recursion.
        """
        if self != self._first_lrpb_in_p(self._enclosing_p):
            raise ValueError("only defined on first rendered page-break in paragraph")

        # -- splitting approach is different when break is inside a hyperlink --
        return (
            self._following_frag_in_hlink
            if self._is_in_hyperlink
            else self._following_frag_in_run
        )

    @property
    def follows_all_content(self) -> bool:
        """True when this page-break element is the last "content" in the paragraph.

        This is very uncommon case and may only occur in contrived or cases where the
        XML is edited by hand, but it is not precluded by the spec.
        """
        # -- a page-break inside a hyperlink never meets these criteria (for our
        # -- purposes at least) because it is considered "atomic" and always associated
        # -- with the page it starts on.
        if self._is_in_hyperlink:
            return False

        # -- XPath matches the trailing w:lastRenderedPageBreak element(s), if any --
        trailing_lrpbs = self._enclosing_p.xpath(
            # -- in last run of paragraph --
            "(./w:r)[last()]"
            # -- all page-breaks --
            "/w:lastRenderedPageBreak"
            # -- that are not followed by any content-bearing elements --
            f"[not(following-sibling::*[{self._run_inner_content_xpath}])]"
        )
        # -- a match only counts when it is *this* page-break; a paragraph can contain
        # -- more than one and an earlier break does not follow all content just
        # -- because a later one does.
        return self in trailing_lrpbs

    @property
    def precedes_all_content(self) -> bool:
        """True when a `w:lastRenderedPageBreak` precedes all paragraph content.

        This is a common case; it occurs whenever the page breaks on an even paragraph
        boundary.
        """
        # -- a page-break inside a hyperlink never meets these criteria because there
        # -- is always part of the hyperlink text before the page-break.
        if self._is_in_hyperlink:
            return False

        # -- XPath matches the leading w:lastRenderedPageBreak element(s), if any --
        leading_lrpbs = self._enclosing_p.xpath(
            # -- in first run of paragraph --
            "./w:r[1]"
            # -- all page-breaks --
            "/w:lastRenderedPageBreak"
            # -- that are not preceded by any content-bearing elements --
            f"[not(preceding-sibling::*[{self._run_inner_content_xpath}])]"
        )
        # -- a match only counts when it is *this* page-break; a later break in the
        # -- same paragraph does not precede all content just because the first does.
        return self in leading_lrpbs

    @property
    def preceding_fragment_p(self) -> CT_P:
        """A "loose" `CT_P` containing only the paragraph content before this break.

        Raises `ValueError` if this `w:lastRenderedPageBreak` is not the first rendered
        page-break in its paragraph.

        The returned `CT_P` is a "clone" (deepcopy) of the `w:p` ancestor of this
        page-break with this `w:lastRenderedPageBreak` element and all its following
        siblings removed. When the break occurs inside a hyperlink, the entire hyperlink
        is retained (only the page-break element is removed from it) so the hyperlink
        is not split.
        """
        if self != self._first_lrpb_in_p(self._enclosing_p):
            raise ValueError("only defined on first rendered page-break in paragraph")

        # -- splitting approach is different when break is inside a hyperlink --
        return (
            self._preceding_frag_in_hlink
            if self._is_in_hyperlink
            else self._preceding_frag_in_run
        )

    def _enclosing_hyperlink(self, lrpb: CT_LastRenderedPageBreak) -> CT_Hyperlink:
        """The `w:hyperlink` grandparent of the `lrpb` page-break element.

        Note the lookup is performed on the `lrpb` argument, not on `self`; callers
        pass the page-break located in a cloned `w:p`.

        Raises `IndexError` when `lrpb` has a `w:p` grandparent, so only call when the
        page-break is known to be inside a hyperlink (`._is_in_hyperlink` is True).
        """
        return lrpb.xpath("./parent::w:r/parent::w:hyperlink")[0]

    @property
    def _enclosing_p(self) -> CT_P:
        """The `w:p` element parent or grandparent of this `w:lastRenderedPageBreak`."""
        return self.xpath("./ancestor::w:p[1]")[0]

    def _first_lrpb_in_p(self, p: CT_P) -> CT_LastRenderedPageBreak:
        """The first `w:lastRenderedPageBreak` element in `p`.

        Both page-breaks in runs directly under `p` and those in runs inside a
        `w:hyperlink` child of `p` are considered.

        Raises `ValueError` if there are no rendered page-breaks in `p`.
        """
        lrpbs = p.xpath(
            "./w:r/w:lastRenderedPageBreak | ./w:hyperlink/w:r/w:lastRenderedPageBreak"
        )
        if not lrpbs:
            raise ValueError("no rendered page-breaks in paragraph element")
        return lrpbs[0]

    @lazyproperty
    def _following_frag_in_hlink(self) -> CT_P:
        """Following CT_P fragment when break occurs within a hyperlink.

        Note this is a *partial-function* and raises when `lrpb` is not inside a
        hyperlink.
        """
        if not self._is_in_hyperlink:
            raise ValueError("only defined on a rendered page-break in a hyperlink")

        # -- work on a clone `w:p` so our mutations don't persist --
        p = copy.deepcopy(self._enclosing_p)

        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
        lrpb = self._first_lrpb_in_p(p)

        # -- locate `w:hyperlink` in which this `w:lastRenderedPageBreak` is found --
        hyperlink = lrpb._enclosing_hyperlink(lrpb)

        # -- delete all w:p inner-content preceding the hyperlink (but not w:pPr) --
        for e in hyperlink.xpath("./preceding-sibling::*[not(self::w:pPr)]"):
            p.remove(e)

        # -- remove the whole hyperlink, it belongs to the preceding-fragment-p --
        hyperlink.getparent().remove(hyperlink)

        # -- that's it, return the remaining fragment of `w:p` clone --
        return p

    @lazyproperty
    def _following_frag_in_run(self) -> CT_P:
        """Following CT_P fragment when break does not occur in a hyperlink.

        Note this is a *partial-function* and raises when `lrpb` is inside a hyperlink.
        """
        if self._is_in_hyperlink:
            raise ValueError("only defined on a rendered page-break not in a hyperlink")

        # -- work on a clone `w:p` so our mutations don't persist --
        p = copy.deepcopy(self._enclosing_p)

        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
        lrpb = self._first_lrpb_in_p(p)

        # -- locate `w:r` in which this `w:lastRenderedPageBreak` is found --
        enclosing_r = lrpb.xpath("./parent::w:r")[0]

        # -- delete all w:p inner-content preceding that run (but not w:pPr) --
        for e in enclosing_r.xpath("./preceding-sibling::*[not(self::w:pPr)]"):
            p.remove(e)

        # -- then remove all run inner-content preceding this lrpb in its run (but not
        # -- the `w:rPr`) and also remove the page-break itself
        for e in lrpb.xpath("./preceding-sibling::*[not(self::w:rPr)]"):
            enclosing_r.remove(e)
        enclosing_r.remove(lrpb)

        return p

    @lazyproperty
    def _is_in_hyperlink(self) -> bool:
        """True when this page-break is embedded in a hyperlink run."""
        return bool(self.xpath("./parent::w:r/parent::w:hyperlink"))

    @lazyproperty
    def _preceding_frag_in_hlink(self) -> CT_P:
        """Preceding CT_P fragment when break occurs within a hyperlink.

        Note this is a *partial-function* and raises when `lrpb` is not inside a
        hyperlink.
        """
        if not self._is_in_hyperlink:
            raise ValueError("only defined on a rendered page-break in a hyperlink")

        # -- work on a clone `w:p` so our mutations don't persist --
        p = copy.deepcopy(self._enclosing_p)

        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
        lrpb = self._first_lrpb_in_p(p)

        # -- locate `w:hyperlink` in which this `w:lastRenderedPageBreak` is found --
        hyperlink = lrpb._enclosing_hyperlink(lrpb)

        # -- delete all w:p inner-content following the hyperlink --
        for e in hyperlink.xpath("./following-sibling::*"):
            p.remove(e)

        # -- remove this page-break from inside the hyperlink --
        lrpb.getparent().remove(lrpb)

        # -- that's it, the entire hyperlink goes into the preceding fragment so
        # -- the hyperlink is not "split".
        return p

    @lazyproperty
    def _preceding_frag_in_run(self) -> CT_P:
        """Preceding CT_P fragment when break does not occur in a hyperlink.

        Note this is a *partial-function* and raises when `lrpb` is inside a hyperlink.
        """
        if self._is_in_hyperlink:
            raise ValueError("only defined on a rendered page-break not in a hyperlink")

        # -- work on a clone `w:p` so our mutations don't persist --
        p = copy.deepcopy(self._enclosing_p)

        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
        lrpb = self._first_lrpb_in_p(p)

        # -- locate `w:r` in which this `w:lastRenderedPageBreak` is found --
        enclosing_r = lrpb.xpath("./parent::w:r")[0]

        # -- delete all `w:p` inner-content following that run --
        for e in enclosing_r.xpath("./following-sibling::*"):
            p.remove(e)

        # -- then delete all `w:r` inner-content following this lrpb in its run and
        # -- also remove the page-break itself
        for e in lrpb.xpath("./following-sibling::*"):
            enclosing_r.remove(e)
        enclosing_r.remove(lrpb)

        return p

    @lazyproperty
    def _run_inner_content_xpath(self) -> str:
        """XPath fragment matching any run inner-content elements.

        Intended for use inside a predicate on a run child, e.g.
        `preceding-sibling::*[{fragment}]`.
        """
        return (
            "self::w:br"
            " | self::w:cr"
            " | self::w:drawing"
            " | self::w:noBreakHyphen"
            " | self::w:ptab"
            " | self::w:t"
            " | self::w:tab"
        )
|