1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
|
"""Custom element classes for core properties-related XML elements."""
from __future__ import annotations
import datetime as dt
import re
from typing import TYPE_CHECKING, Any, Callable
from docx.oxml.ns import nsdecls, qn
from docx.oxml.parser import parse_xml
from docx.oxml.xmlchemy import BaseOxmlElement, ZeroOrOne
if TYPE_CHECKING:
from lxml.etree import _Element as etree_Element # pyright: ignore[reportPrivateUsage]
class CT_CoreProperties(BaseOxmlElement):
    """`<cp:coreProperties>` element, the root element of the Core Properties part.

    Stored as `/docProps/core.xml`. Implements many of the Dublin Core document metadata
    elements. String elements resolve to an empty string ("") if the element is not
    present in the XML. String elements are limited in length to 255 unicode characters.
    """

    get_or_add_revision: Callable[[], etree_Element]

    category = ZeroOrOne("cp:category", successors=())
    contentStatus = ZeroOrOne("cp:contentStatus", successors=())
    created = ZeroOrOne("dcterms:created", successors=())
    creator = ZeroOrOne("dc:creator", successors=())
    description = ZeroOrOne("dc:description", successors=())
    identifier = ZeroOrOne("dc:identifier", successors=())
    keywords = ZeroOrOne("cp:keywords", successors=())
    language = ZeroOrOne("dc:language", successors=())
    lastModifiedBy = ZeroOrOne("cp:lastModifiedBy", successors=())
    lastPrinted = ZeroOrOne("cp:lastPrinted", successors=())
    modified = ZeroOrOne("dcterms:modified", successors=())
    revision: etree_Element | None = ZeroOrOne(  # pyright: ignore[reportAssignmentType]
        "cp:revision", successors=()
    )
    subject = ZeroOrOne("dc:subject", successors=())
    title = ZeroOrOne("dc:title", successors=())
    version = ZeroOrOne("cp:version", successors=())

    _coreProperties_tmpl = "<cp:coreProperties %s/>\n" % nsdecls("cp", "dc", "dcterms")

    @classmethod
    def new(cls):
        """Return a new `<cp:coreProperties>` element."""
        xml = cls._coreProperties_tmpl
        coreProperties = parse_xml(xml)
        return coreProperties

    @property
    def author_text(self) -> str:
        """The text in the `dc:creator` child element."""
        return self._text_of_element("creator")

    @author_text.setter
    def author_text(self, value: str):
        self._set_element_text("creator", value)

    @property
    def category_text(self) -> str:
        """The text in the `cp:category` child element."""
        return self._text_of_element("category")

    @category_text.setter
    def category_text(self, value: str):
        self._set_element_text("category", value)

    @property
    def comments_text(self) -> str:
        """The text in the `dc:description` child element."""
        return self._text_of_element("description")

    @comments_text.setter
    def comments_text(self, value: str):
        self._set_element_text("description", value)

    @property
    def contentStatus_text(self) -> str:
        """The text in the `cp:contentStatus` child element."""
        return self._text_of_element("contentStatus")

    @contentStatus_text.setter
    def contentStatus_text(self, value: str):
        self._set_element_text("contentStatus", value)

    @property
    def created_datetime(self) -> dt.datetime | None:
        """Date/time in the `dcterms:created` child, `None` if absent or unparseable."""
        return self._datetime_of_element("created")

    @created_datetime.setter
    def created_datetime(self, value: dt.datetime):
        self._set_element_datetime("created", value)

    @property
    def identifier_text(self) -> str:
        """The text in the `dc:identifier` child element."""
        return self._text_of_element("identifier")

    @identifier_text.setter
    def identifier_text(self, value: str):
        self._set_element_text("identifier", value)

    @property
    def keywords_text(self) -> str:
        """The text in the `cp:keywords` child element."""
        return self._text_of_element("keywords")

    @keywords_text.setter
    def keywords_text(self, value: str):
        self._set_element_text("keywords", value)

    @property
    def language_text(self) -> str:
        """The text in the `dc:language` child element."""
        return self._text_of_element("language")

    @language_text.setter
    def language_text(self, value: str):
        self._set_element_text("language", value)

    @property
    def lastModifiedBy_text(self) -> str:
        """The text in the `cp:lastModifiedBy` child element."""
        return self._text_of_element("lastModifiedBy")

    @lastModifiedBy_text.setter
    def lastModifiedBy_text(self, value: str):
        self._set_element_text("lastModifiedBy", value)

    @property
    def lastPrinted_datetime(self) -> dt.datetime | None:
        """Date/time in the `cp:lastPrinted` child, `None` if absent or unparseable."""
        return self._datetime_of_element("lastPrinted")

    @lastPrinted_datetime.setter
    def lastPrinted_datetime(self, value: dt.datetime):
        self._set_element_datetime("lastPrinted", value)

    @property
    def modified_datetime(self) -> dt.datetime | None:
        """Date/time in the `dcterms:modified` child, `None` if absent or unparseable."""
        return self._datetime_of_element("modified")

    @modified_datetime.setter
    def modified_datetime(self, value: dt.datetime):
        self._set_element_datetime("modified", value)

    @property
    def revision_number(self) -> int:
        """Integer value of revision property.

        Resolves to 0 when the `cp:revision` element is absent, when its text is not
        an integer, or when the integer is negative.
        """
        revision = self.revision
        if revision is None:
            return 0
        try:
            number = int(str(revision.text))
        except ValueError:
            # non-integer revision strings also resolve to 0
            return 0
        # as do negative integers
        return max(number, 0)

    @revision_number.setter
    def revision_number(self, value: int):
        """Set revision property to string value of integer `value`.

        Raises `ValueError` when `value` is not an `int` or is less than 1.
        """
        if not isinstance(value, int) or value < 1:  # pyright: ignore[reportUnnecessaryIsInstance]
            tmpl = "revision property requires positive int, got '%s'"
            raise ValueError(tmpl % value)
        revision = self.get_or_add_revision()
        revision.text = str(value)

    @property
    def subject_text(self) -> str:
        """The text in the `dc:subject` child element."""
        return self._text_of_element("subject")

    @subject_text.setter
    def subject_text(self, value: str):
        self._set_element_text("subject", value)

    @property
    def title_text(self) -> str:
        """The text in the `dc:title` child element."""
        return self._text_of_element("title")

    @title_text.setter
    def title_text(self, value: str):
        self._set_element_text("title", value)

    @property
    def version_text(self) -> str:
        """The text in the `cp:version` child element."""
        return self._text_of_element("version")

    @version_text.setter
    def version_text(self, value: str):
        self._set_element_text("version", value)

    def _datetime_of_element(self, property_name: str) -> dt.datetime | None:
        """Return the date/time stored in the child element named `property_name`.

        `None` is returned when the element is absent, when it has no text, or when
        its text cannot be parsed as a W3CDTF date/time.
        """
        element = getattr(self, property_name)
        if element is None:
            return None
        datetime_str = element.text
        if not datetime_str:
            # An empty element has `None` text; slicing it in the parser would raise
            # TypeError, which the ValueError handler below would not catch.
            return None
        try:
            return self._parse_W3CDTF_to_datetime(datetime_str)
        except ValueError:
            # invalid datetime strings are ignored
            return None

    def _get_or_add(self, prop_name: str) -> BaseOxmlElement:
        """Return element returned by "get_or_add_" method for `prop_name`."""
        get_or_add_method_name = "get_or_add_%s" % prop_name
        get_or_add_method = getattr(self, get_or_add_method_name)
        element = get_or_add_method()
        return element

    @classmethod
    def _offset_dt(cls, dt_: dt.datetime, offset_str: str) -> dt.datetime:
        """A |datetime| instance offset from `dt_` by timezone offset in `offset_str`.

        `offset_str` is like `"-07:00"`. The offset is removed from `dt_`, so a "+"
        offset moves the result earlier and a "-" offset moves it later. Raises
        `ValueError` when `offset_str` does not match the expected pattern.
        """
        match = cls._offset_pattern.match(offset_str)
        if match is None:
            raise ValueError("'%s' is not a valid offset string" % offset_str)
        sign, hours_str, minutes_str = match.groups()
        # A "+hh:mm" local time is ahead of UTC, so the offset must be subtracted.
        sign_factor = -1 if sign == "+" else 1
        hours = int(hours_str) * sign_factor
        minutes = int(minutes_str) * sign_factor
        td = dt.timedelta(hours=hours, minutes=minutes)
        return dt_ + td

    _offset_pattern = re.compile(r"([+-])(\d\d):(\d\d)")

    @classmethod
    def _parse_W3CDTF_to_datetime(cls, w3cdtf_str: str) -> dt.datetime:
        """Return a UTC-tagged |datetime| parsed from `w3cdtf_str`.

        Raises `ValueError` when the string matches none of the supported forms or
        carries a six-character suffix that is not a valid numeric offset.
        """
        # valid W3CDTF date cases:
        # yyyy e.g. "2003"
        # yyyy-mm e.g. "2003-12"
        # yyyy-mm-dd e.g. "2003-12-31"
        # UTC timezone e.g. "2003-12-31T10:14:55Z"
        # numeric timezone e.g. "2003-12-31T10:14:55-08:00"
        templates = (
            "%Y-%m-%dT%H:%M:%S",
            "%Y-%m-%d",
            "%Y-%m",
            "%Y",
        )
        # strptime isn't smart enough to parse literal timezone offsets like
        # "-07:30", so we have to do it ourselves
        parseable_part = w3cdtf_str[:19]
        offset_str = w3cdtf_str[19:]
        dt_ = None
        for tmpl in templates:
            try:
                dt_ = dt.datetime.strptime(parseable_part, tmpl)
            except ValueError:
                continue
            # stop at the first template that parses; no need to try the rest
            break
        if dt_ is None:
            tmpl = "could not parse W3CDTF datetime string '%s'"
            raise ValueError(tmpl % w3cdtf_str)
        # Only a six-character suffix ("+hh:mm" / "-hh:mm") is treated as an offset;
        # any other suffix, including "Z", leaves the parsed value unshifted.
        if len(offset_str) == 6:
            dt_ = cls._offset_dt(dt_, offset_str)
        return dt_.replace(tzinfo=dt.timezone.utc)

    def _set_element_datetime(self, prop_name: str, value: dt.datetime):
        """Set date/time value of child element having `prop_name` to `value`.

        The value is written with a literal "Z" (UTC) suffix. A timezone-aware
        `value` is converted to UTC first; a naive `value` is written as-is.
        Raises `ValueError` when `value` is not a `datetime.datetime`.
        """
        if not isinstance(value, dt.datetime):  # pyright: ignore[reportUnnecessaryIsInstance]
            tmpl = "property requires <type 'datetime.datetime'> object, got %s"
            raise ValueError(tmpl % type(value))
        if value.utcoffset() is not None:
            # The serialized form claims UTC via the "Z" suffix, so an aware value in
            # another zone must be shifted or the stored instant would be wrong.
            value = value.astimezone(dt.timezone.utc)
        element = self._get_or_add(prop_name)
        dt_str = value.strftime("%Y-%m-%dT%H:%M:%SZ")
        element.text = dt_str
        if prop_name in ("created", "modified"):
            # These two require an explicit "xsi:type="dcterms:W3CDTF""
            # attribute. The first and last line are a hack required to add
            # the xsi namespace to the root element rather than each child
            # element in which it is referenced
            self.set(qn("xsi:foo"), "bar")
            element.set(qn("xsi:type"), "dcterms:W3CDTF")
            del self.attrib[qn("xsi:foo")]

    def _set_element_text(self, prop_name: str, value: Any) -> None:
        """Set string value of `prop_name` property to `value`.

        A non-`str` `value` is converted with `str()`. Raises `ValueError` when the
        resulting string is longer than 255 characters.
        """
        if not isinstance(value, str):
            value = str(value)
        if len(value) > 255:
            tmpl = "exceeded 255 char limit for property, got:\n\n'%s'"
            raise ValueError(tmpl % value)
        element = self._get_or_add(prop_name)
        element.text = value

    def _text_of_element(self, property_name: str) -> str:
        """The text in the element matching `property_name`.

        The empty string if the element is not present or contains no text.
        """
        element = getattr(self, property_name)
        if element is None:
            return ""
        if element.text is None:
            return ""
        return element.text
|