1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
|
"""Custom element classes related to text runs (CT_R)."""
from __future__ import annotations
from typing import TYPE_CHECKING, Callable, Iterator, List
from docx.oxml.drawing import CT_Drawing
from docx.oxml.ns import qn
from docx.oxml.simpletypes import ST_BrClear, ST_BrType
from docx.oxml.text.font import CT_RPr
from docx.oxml.xmlchemy import BaseOxmlElement, OptionalAttribute, ZeroOrMore, ZeroOrOne
from docx.shared import TextAccumulator
if TYPE_CHECKING:
from docx.oxml.shape import CT_Anchor, CT_Inline
from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak
from docx.oxml.text.parfmt import CT_TabStop
# ------------------------------------------------------------------------------------
# Run-level elements
class CT_R(BaseOxmlElement):
    """`<w:r>` element, containing the properties and text for a run."""

    # -- Type declarations only; nothing is assigned to these names in this class body.
    # -- Presumably they are supplied by the child-element declarations below (TODO:
    # -- confirm against `docx.oxml.xmlchemy`).
    add_br: Callable[[], CT_Br]
    add_tab: Callable[[], CT_TabStop]
    get_or_add_rPr: Callable[[], CT_RPr]
    _add_drawing: Callable[[], CT_Drawing]
    _add_t: Callable[..., CT_Text]

    rPr: CT_RPr | None = ZeroOrOne("w:rPr")  # pyright: ignore[reportAssignmentType]
    br = ZeroOrMore("w:br")
    cr = ZeroOrMore("w:cr")
    drawing = ZeroOrMore("w:drawing")
    t = ZeroOrMore("w:t")
    tab = ZeroOrMore("w:tab")

    def add_t(self, text: str) -> CT_Text:
        """Return a newly added `<w:t>` element containing `text`.

        The new element is given an `xml:space="preserve"` attribute when `text` has
        leading or trailing whitespace.
        """
        t = self._add_t(text=text)
        # -- `.strip()` only shortens `text` when it starts or ends with whitespace --
        if len(text.strip()) < len(text):
            t.set(qn("xml:space"), "preserve")
        return t

    def add_drawing(self, inline_or_anchor: CT_Inline | CT_Anchor) -> CT_Drawing:
        """Return newly appended `CT_Drawing` (`w:drawing`) child element.

        The `w:drawing` element has `inline_or_anchor` as its child.
        """
        drawing = self._add_drawing()
        drawing.append(inline_or_anchor)
        return drawing

    def clear_content(self) -> None:
        """Remove all child elements except a `w:rPr` element if present."""
        # -- the XPath selects every direct child that is not a `w:rPr` --
        for e in self.xpath("./*[not(self::w:rPr)]"):
            self.remove(e)

    @property
    def inner_content_items(self) -> List[str | CT_Drawing | CT_LastRenderedPageBreak]:
        """Text of this run, punctuated by its drawing and rendered-page-break items.

        Each `w:drawing` and `w:lastRenderedPageBreak` child appears in the list as its
        element object. The text equivalents (`str(e)`) of the `w:br`, `w:cr`,
        `w:noBreakHyphen`, `w:ptab`, `w:t` and `w:tab` children are pushed to a
        `TextAccumulator` and whatever it has accumulated is emitted before each
        element item and once more at the end.
        """
        # -- imported here because the module-level import is made only under
        # -- `TYPE_CHECKING` and the `isinstance()` call below needs the class at
        # -- run-time --
        from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak

        accum = TextAccumulator()

        def iter_items() -> Iterator[str | CT_Drawing | CT_LastRenderedPageBreak]:
            for e in self.xpath(
                "w:br"
                " | w:cr"
                " | w:drawing"
                " | w:lastRenderedPageBreak"
                " | w:noBreakHyphen"
                " | w:ptab"
                " | w:t"
                " | w:tab"
            ):
                if isinstance(e, (CT_Drawing, CT_LastRenderedPageBreak)):
                    # -- emit any text gathered so far ahead of the element itself --
                    yield from accum.pop()
                    yield e
                else:
                    accum.push(str(e))

            # -- don't forget the "tail" string --
            yield from accum.pop()

        return list(iter_items())

    @property
    def lastRenderedPageBreaks(self) -> List[CT_LastRenderedPageBreak]:
        """All `w:lastRenderedPageBreak` child elements of this run."""
        return self.xpath("./w:lastRenderedPageBreak")

    @property
    def style(self) -> str | None:
        """String contained in `w:val` attribute of `w:rStyle` grandchild.

        |None| if that element is not present, which includes the case where this run
        has no `w:rPr` child.
        """
        rPr = self.rPr
        if rPr is None:
            return None
        return rPr.style

    @style.setter
    def style(self, style: str | None):
        """Set character style of this `w:r` element to `style`.

        If `style` is None, remove the style element. The value is assigned to the
        `.style` attribute of the object returned by `get_or_add_rPr()`, which is
        called whether or not `style` is None.
        """
        rPr = self.get_or_add_rPr()
        rPr.style = style

    @property
    def text(self) -> str:
        """The textual content of this run.

        Inner-content child elements like `w:tab` are translated to their text
        equivalent. The result is the concatenation of `str(e)` for each `w:br`,
        `w:cr`, `w:noBreakHyphen`, `w:ptab`, `w:t` and `w:tab` child.
        """
        return "".join(
            str(e) for e in self.xpath("w:br | w:cr | w:noBreakHyphen | w:ptab | w:t | w:tab")
        )

    @text.setter
    def text(self, text: str):  # pyright: ignore[reportIncompatibleMethodOverride]
        """Replace the content of this run with elements generated from `text`.

        Every child except a `w:rPr` is removed first, then `_RunContentAppender`
        appends the new inner-content elements.
        """
        self.clear_content()
        _RunContentAppender.append_to_run_from_text(self, text)

    def _insert_rPr(self, rPr: CT_RPr) -> CT_RPr:
        """Insert `rPr` as the first child of this element and return it."""
        self.insert(0, rPr)
        return rPr
# ------------------------------------------------------------------------------------
# Run inner-content elements
class CT_Br(BaseOxmlElement):
    """`<w:br>` element, indicating a line, page, or column break in a run."""

    # -- `w:type` is optional; "textWrapping" is the declared default --
    type: str | None = OptionalAttribute(  # pyright: ignore[reportAssignmentType]
        "w:type", ST_BrType, default="textWrapping"
    )
    clear: str | None = OptionalAttribute("w:clear", ST_BrClear)  # pyright: ignore

    def __str__(self) -> str:
        """Text equivalent of this break, which depends on its type.

        A break whose type is "textWrapping" is rendered as a single newline. Any
        other break type is rendered as the empty string.

        Providing `__str__()` lets this element be converted to text the same way as
        the other run inner-content elements.
        """
        if self.type == "textWrapping":
            return "\n"
        return ""
class CT_Cr(BaseOxmlElement):
    """`<w:cr>` element, representing a carriage-return (0x0D) character within a run.

    In Word, this represents a "soft carriage-return" in the sense that it does not end
    the paragraph the way pressing Enter (aka. Return) on the keyboard does. Here the
    text equivalent is considered to be newline ("\n") since in plain-text that's the
    closest Python equivalent.

    NOTE: this complex-type name does not exist in the schema, where `w:cr` maps to
    `CT_Empty`. This name was added to give it distinguished behavior. CT_Empty is used
    for many elements.
    """

    def __str__(self) -> str:
        """Text equivalent of this element, a single newline ("\n")."""
        return "\n"
class CT_NoBreakHyphen(BaseOxmlElement):
    """`<w:noBreakHyphen>` element, a hyphen at which a line may not be wrapped.

    Its plain-text equivalent is an ordinary dash ("-").

    NOTE: the schema has no complex-type of this name; there `w:noBreakHyphen` maps to
    `CT_Empty`. The name was introduced so this element can have its own behavior,
    distinct from the many other elements the schema represents with CT_Empty.
    """

    def __str__(self) -> str:
        """Return the text equivalent of this element, one dash character ("-")."""
        dash = "-"
        return dash
class CT_PTab(BaseOxmlElement):
    """`<w:ptab>` element, an absolute-position tab character within a run.

    This character moves the rendering position to the specified location without
    regard to any tab-stops, perhaps for laying out a table-of-contents (TOC) or
    something similar.
    """

    def __str__(self) -> str:
        """Return the text equivalent of this element, one tab character.

        Providing `__str__()` lets this element be converted to text the same way as
        the other run inner-content elements.
        """
        tab_char = "\t"
        return tab_char
# -- CT_Tab functionality is provided by CT_TabStop which also uses `w:tab` tag. That
# -- element class provides the __str__() method for this empty element, unconditionally
# -- returning "\t".
class CT_Text(BaseOxmlElement):
    """`<w:t>` element, containing a sequence of characters within a run."""

    def __str__(self) -> str:
        """Return the text in this element, or the empty string when it has none.

        Lets this element be converted to text the same way as the other run
        inner-content elements. The result is always a `str`; a `.text` value of
        `None` is reported as "".
        """
        content = self.text
        if content is None:
            return ""
        return content
# ------------------------------------------------------------------------------------
# Utility
class _RunContentAppender:
    r"""Translates a Python string into run content elements appended in a `w:r` element.

    Contiguous sequences of regular characters are appended in a single `<w:t>` element.
    Each tab character ('\t') causes a `<w:tab/>` element to be appended. Likewise each
    newline or carriage return character ('\n', '\r') causes a `<w:br>` element to be
    appended (via the run's `add_br()` method). The two characters are handled
    independently, so a "\r\n" pair produces two breaks.
    """

    def __init__(self, r: CT_R):
        # -- the run element that receives the generated content --
        self._r = r
        # -- regular characters seen since the last flush, pending a `<w:t>` --
        self._bfr: List[str] = []

    @classmethod
    def append_to_run_from_text(cls, r: CT_R, text: str):
        """Append inner-content elements for `text` to `r` element."""
        appender = cls(r)
        appender.add_text(text)

    def add_text(self, text: str):
        """Append inner-content elements for `text` to the `w:r` element."""
        for char in text:
            self.add_char(char)
        # -- write out any regular characters that trail the last tab or break --
        self.flush()

    def add_char(self, char: str):
        """Process next character of input through finite state machine (FSM).

        There are two possible states, buffer pending and not pending, but those are
        hidden behind the `.flush()` method which must be called at the end of text to
        ensure any pending `<w:t>` element is written.
        """
        if char == "\t":
            self.flush()
            self._r.add_tab()
        # -- tuple membership, not a substring test, so an empty string is not
        # -- mistaken for a line break --
        elif char in ("\r", "\n"):
            self.flush()
            self._r.add_br()
        else:
            self._bfr.append(char)

    def flush(self):
        """Append buffered characters as one `<w:t>` element, then empty the buffer.

        Nothing is appended when the buffered text is empty.
        """
        text = "".join(self._bfr)
        if text:
            self._r.add_t(text)
        self._bfr.clear()
|