1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
|
# -*- coding: utf-8 -*-
"""Library to open and read files in the epub version 2."""
from __future__ import unicode_literals
__author__ = 'Florian Strzelecki <florian.strzelecki@gmail.com>'
__version__ = '0.5.2'
__all__ = ['opf', 'ncx', 'utils']
import os
import shutil
import tempfile
import uuid
import warnings
import zipfile
from xml.dom import minidom
from epub import ncx, opf, utils
MIMETYPE_EPUB = 'application/epub+zip'
MIMETYPE_OPF = 'application/oebps-package+xml'
MIMETYPE_NCX = 'application/x-dtbncx+xml'
DEFAULT_OPF_PATH = 'OEBPS/content.opf'
DEFAULT_NCX_PATH = 'toc.ncx'
def open(filename, mode=None):
"""Open an epub file and return an EpubFile object"""
warnings.warn('Function `epub.open` is deprecated since 0.5.0.',
DeprecationWarning)
return open_epub(filename, mode)
def open_epub(filename, mode=None):
return EpubFile(filename, mode)
class BadEpubFile(zipfile.BadZipfile):
pass
class EpubFile(zipfile.ZipFile):
"""Represent an epub zip file, as described in version 2.0.1 of epub spec.
This class allow an access throught a low-level API to the epub real file.
It extends zipfile.ZipFile class and modify only a little some of its
behavior.
See http://idpf.org/epub/201 for more information about Epub 2.0.1.
"""
@property
def content_path(self):
"""Return the content path, ie, the path relative to OPF file.
If OPF file is located in `OEBPS/content.opf`, then `content_path` is
equal to `OEBPS`.
"""
return os.path.dirname(self.opf_path).replace('\\', '/')
def __init__(self, filename, mode=None):
"""Open the Epub zip file with mode read "r", write "w" or append "a".
"""
mode = mode or 'r'
zipfile.ZipFile.__init__(self, filename, mode)
self.uid = None
self.opf_path = None
self.opf = None
self.toc = None
if self.mode == 'r':
self._init_read()
elif self.mode == 'w':
self._init_new()
elif self.mode == 'a':
if len(self.namelist()) == 0:
self._init_new()
else:
self._init_read()
def _init_new(self):
"""Build an empty epub archive."""
# Write mimetype file: 'application/epub+zip'
self.writestr('mimetype', MIMETYPE_EPUB)
# Default path for opf
self.opf_path = DEFAULT_OPF_PATH
# Uid & Uid's id
uid_id = 'BookId'
self.uid = '%s' % uuid.uuid4()
# Create metadata, manifest, and spine, as minimalist as possible
metadata = opf.Metadata()
metadata.add_identifier(self.uid, uid_id, 'uid')
manifest = opf.Manifest()
manifest.add_item('ncx', 'toc.ncx', MIMETYPE_NCX)
spine = opf.Spine('ncx')
# Create Opf object
self.opf = opf.Opf(uid_id=uid_id,
metadata=metadata, manifest=manifest, spine=spine)
# Create Ncx object
self.toc = ncx.Ncx()
self.toc.uid = self.uid
def _init_read(self):
"""Get content from existing epub file"""
# Read container.xml to get OPF xml file path
xmlstring = self.read('META-INF/container.xml')
container_xml = minidom.parseString(xmlstring).documentElement
for element in container_xml.getElementsByTagName('rootfile'):
if element.getAttribute('media-type') == MIMETYPE_OPF:
# Only take the first full-path available
self.opf_path = element.getAttribute('full-path')
break
# Read OPF xml file
xml_string = self.read(self.opf_path)
self.opf = opf.parse_opf(xml_string)
uids = [x for x in self.opf.metadata.identifiers
if x[1] == self.opf.uid_id]
if uids:
self.uid = uids[0]
else:
self.uid = None
warnings.warn('The ePub does not define any uid', SyntaxWarning)
item_toc = self.get_item(self.opf.spine.toc)
# Inspect NCX toc file
self.toc = None
if item_toc is not None:
self.toc = ncx.parse_toc(self.read_item(item_toc))
else:
warnings.warn('The ePub does not define any NCX file',
SyntaxWarning)
self.toc = ncx.Ncx()
self.toc.uid = self.uid
def close(self):
if self.fp is None:
return
if self.mode in ('w', 'a'):
self._write_close()
zipfile.ZipFile.close(self)
def remove_paths(self, paths):
"""Remove files from the archive
Warning: This will be slow, it needs to recreate from scratch the
complete archive.
This method (well, the whole behavior of "write epub file") needs
a rework in a future version.
"""
with tempfile.NamedTemporaryFile('rb', delete=False) as temp:
with zipfile.ZipFile(temp.name, 'w') as new_zip:
for item in self.infolist():
if item.filename not in paths:
new_zip.writestr(item, self.read(item.filename))
zipfile.ZipFile.close(self)
shutil.move(temp.name, self.filename)
zipfile.ZipFile.__init__(self, self.filename, self.mode)
def _write_close(self):
"""Handle writes when closing epub.
Both new file mode (w) and append file mode (a), some files must be
generated: container, OPF, and NCX.
"""
item_toc = self.get_item(self.opf.spine.toc)
# Remove the old files
to_remove = ['META-INF/container.xml', self.opf_path]
if item_toc:
to_remove.append(
# Replace \ by /, no matter what OS's separator could be
os.path.join(self.content_path,
item_toc.href).replace('\\', '/')
)
self.remove_paths(to_remove)
# Write META-INF/container.xml
self.writestr('META-INF/container.xml',
self._build_container().encode('utf-8'))
# Write OPF File
self.writestr(self.opf_path,
self.opf.as_xml_document().toxml().encode('utf-8'))
# Write NCX File if exist
if item_toc:
toc_path = os.path.join(
self.content_path, item_toc.href
).replace('\\', '/')
toc_content = self.toc.as_xml_document().toxml().encode('utf-8')
self.writestr(toc_path, toc_content)
def _build_container(self):
"""Build a simple XML container as in epub 2.0.1 specification."""
template = """<?xml version="1.0" encoding="UTF-8"?>
<container version="1.0"
xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="%s"
media-type="application/oebps-package+xml"/>
</rootfiles>
</container>"""
return template % self.opf_path
def add_item(self, filename, manifest_item,
append_to_spine=False, is_linear=True):
"""Add a file to epub.
A manifest item must be provide to describe it.
This function will raise a RuntimeError if epub is already closed. It
will raise an IOError if epub is open in read-only (`r` mode).
Optional: you can use `append_to_spine` flag (default=False) to append
item to spine, and use `is_linear` (default=True) to specify if it is
linear or not.
"""
self.check_mode_write()
self.opf.manifest.append(manifest_item)
write_path = os.path.join(
self.content_path, manifest_item.href
).replace('\\', '/')
self.write(filename, write_path)
if append_to_spine:
self.opf.spine.add_itemref(manifest_item.identifier, is_linear)
def check_mode_write(self):
"""Raise error if epub file is not writable.
Raise RuntimeError if file is already closed.
Raise IOError if file is opened read-only.
"""
if not self.fp:
raise RuntimeError(
'Attempt to write to EPUB file that was already closed')
if self.mode == 'r':
raise IOError(
'Attempt to write to EPUB file that was open as read-only.')
# extract method is zipfile.ZipFile.extract(member[, path[, pwd]])
def extract_item(self, item, to_path=None):
"""Extract an item from its href in epub to `to_path` location.
"""
path = item if not hasattr(item, 'href') else item.href
member_path = os.path.join(self.content_path, path).replace('\\', '/')
return self.extract(member=member_path, path=to_path)
def get_item(self, identifier):
"""Get an item from manifest through its "id" attribute.
Return an EpubManifestItem if found, else None.
"""
return self.opf.manifest.get(identifier, None)
def get_item_by_href(self, href):
"""Get an item from manifest through its "href" attribute.
Return an EpubManifestItem if found, else None.
"""
found = [x for x in self.opf.manifest.values() if x.href == href]
size = len(found)
if size == 1:
return found[0]
elif size > 1:
raise LookupError('Multiple items are found with this href.')
else:
return None
# read method is zipfile.ZipFile.read(path)
def read_item(self, item):
"""Read a file from the epub zipfile container.
"item" parameter can be the relative path to the opf file or an
EpubManifestItem object.
Html fragments are not acceptable : the path must be exactly the same
as indicated in the opf file.
"""
path = item
if hasattr(item, 'href'):
path = item.href
return self.read(
# Replace \ by /, as ZipFile always uses / as path separator.
os.path.join(self.content_path, path).replace('\\', '/')
)
class Book(object):
"""This class is an attempt to expose a simpler object model than EpubFile.
WARNING: Work in progress. Use with caution.
"""
def __init__(self, epub_file):
self.epub_file = epub_file
@property
def creators(self):
return self.epub_file.opf.metadata.creators
@property
def description(self):
return self.epub_file.opf.metadata.description
@property
def isbn(self):
return self.epub_file.opf.metadata.get_isbn()
@property
def publisher(self):
return self.epub_file.opf.metadata.publisher
@property
def contributors(self):
return self.epub_file.opf.metadata.contributors
@property
def dates(self):
return self.epub_file.opf.metadata.dates
@property
def dc_type(self):
return self.epub_file.opf.metadata.dc_type
@property
def dc_format(self):
return self.epub_file.opf.metadata.format
@property
def identifiers(self):
return self.epub_file.opf.metadata.identifiers
@property
def source(self):
return self.epub_file.opf.metadata.source
@property
def languages(self):
return self.epub_file.opf.metadata.languages
@property
def relation(self):
return self.epub_file.opf.metadata.relation
@property
def coverage(self):
return self.epub_file.opf.metadata.coverage
@property
def right(self):
return self.epub_file.opf.metadata.right
@property
def metas(self):
return self.epub_file.opf.metadata.metas
@property
def subjects(self):
return self.epub_file.opf.metadata.subjects
@property
def titles(self):
return self.epub_file.opf.metadata.titles
@property
def chapters(self):
"""
Return a list of linear chapter from spine.
"""
return [BookChapter(self, identifier)
for identifier, linear in self.epub_file.opf.spine.itemrefs
if linear]
@property
def extra_chapters(self):
"""
Return a list of non-linear chapter from spine.
"""
return [BookChapter(self, identifier)
for identifier, linear in self.epub_file.opf.spine.itemrefs
if not linear]
class BookChapter(object):
@property
def identifier(self):
return self._manifest_item.identifier
def __init__(self, book, identifier, fragment=None):
self._book = book
self._manifest_item = self._book.epub_file.get_item(identifier)
self._fragment = fragment
def read(self):
return self._book.epub_file.read_item(self._manifest_item)
|