about summary refs log tree commit diff
path: root/.venv/lib/python3.12/site-packages/epub/__init__.py
blob: 7f6b76ae79ba9f0f95242d4e4b9c910da22e00b0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
# -*- coding: utf-8 -*-
"""Library to open and read files in the epub version 2."""
from __future__ import unicode_literals


__author__ = 'Florian Strzelecki <florian.strzelecki@gmail.com>'
__version__ = '0.5.2'
__all__ = ['opf', 'ncx', 'utils']


import os
import shutil
import tempfile
import uuid
import warnings
import zipfile


from xml.dom import minidom

from epub import ncx, opf, utils


MIMETYPE_EPUB = 'application/epub+zip'
MIMETYPE_OPF = 'application/oebps-package+xml'
MIMETYPE_NCX = 'application/x-dtbncx+xml'

DEFAULT_OPF_PATH = 'OEBPS/content.opf'
DEFAULT_NCX_PATH = 'toc.ncx'


def open(filename, mode=None):
    """Open an epub file and return an EpubFile object"""
    warnings.warn('Function `epub.open` is deprecated since 0.5.0.',
                  DeprecationWarning)
    return open_epub(filename, mode)


def open_epub(filename, mode=None):
    return EpubFile(filename, mode)


class BadEpubFile(zipfile.BadZipfile):
    pass


class EpubFile(zipfile.ZipFile):
    """Represent an epub zip file, as described in version 2.0.1 of epub spec.

    This class allow an access throught a low-level API to the epub real file.
    It extends zipfile.ZipFile class and modify only a little some of its
    behavior.

    See http://idpf.org/epub/201 for more information about Epub 2.0.1.

    """
    @property
    def content_path(self):
        """Return the content path, ie, the path relative to OPF file.

        If OPF file is located in `OEBPS/content.opf`, then `content_path` is
        equal to `OEBPS`.

        """
        return os.path.dirname(self.opf_path).replace('\\', '/')

    def __init__(self, filename, mode=None):
        """Open the Epub zip file with mode read "r", write "w" or append "a".
        """
        mode = mode or 'r'
        zipfile.ZipFile.__init__(self, filename, mode)
        self.uid = None
        self.opf_path = None
        self.opf = None
        self.toc = None

        if self.mode == 'r':
            self._init_read()
        elif self.mode == 'w':
            self._init_new()
        elif self.mode == 'a':
            if len(self.namelist()) == 0:
                self._init_new()
            else:
                self._init_read()

    def _init_new(self):
        """Build an empty epub archive."""
        # Write mimetype file: 'application/epub+zip'
        self.writestr('mimetype', MIMETYPE_EPUB)
        # Default path for opf
        self.opf_path = DEFAULT_OPF_PATH
        # Uid & Uid's id
        uid_id = 'BookId'
        self.uid = '%s' % uuid.uuid4()
        # Create metadata, manifest, and spine, as minimalist as possible
        metadata = opf.Metadata()
        metadata.add_identifier(self.uid, uid_id, 'uid')
        manifest = opf.Manifest()
        manifest.add_item('ncx', 'toc.ncx', MIMETYPE_NCX)
        spine = opf.Spine('ncx')
        # Create Opf object
        self.opf = opf.Opf(uid_id=uid_id,
                           metadata=metadata, manifest=manifest, spine=spine)
        # Create Ncx object
        self.toc = ncx.Ncx()
        self.toc.uid = self.uid

    def _init_read(self):
        """Get content from existing epub file"""
        # Read container.xml to get OPF xml file path
        xmlstring = self.read('META-INF/container.xml')
        container_xml = minidom.parseString(xmlstring).documentElement

        for element in container_xml.getElementsByTagName('rootfile'):
            if element.getAttribute('media-type') == MIMETYPE_OPF:
                # Only take the first full-path available
                self.opf_path = element.getAttribute('full-path')
                break

        # Read OPF xml file
        xml_string = self.read(self.opf_path)
        self.opf = opf.parse_opf(xml_string)
        uids = [x for x in self.opf.metadata.identifiers
                      if x[1] == self.opf.uid_id]
        if uids:
            self.uid = uids[0]
        else:
            self.uid = None
            warnings.warn('The ePub does not define any uid', SyntaxWarning)

        item_toc = self.get_item(self.opf.spine.toc)

        # Inspect NCX toc file
        self.toc = None
        if item_toc is not None:
            self.toc = ncx.parse_toc(self.read_item(item_toc))
        else:
            warnings.warn('The ePub does not define any NCX file',
                          SyntaxWarning)
            self.toc = ncx.Ncx()
            self.toc.uid = self.uid

    def close(self):
        if self.fp is None:
            return
        if self.mode in ('w', 'a'):
            self._write_close()
        zipfile.ZipFile.close(self)

    def remove_paths(self, paths):
        """Remove files from the archive

        Warning: This will be slow, it needs to recreate from scratch the
        complete archive.

        This method (well, the whole behavior of "write epub file") needs
        a rework in a future version.

        """
        with tempfile.NamedTemporaryFile('rb', delete=False) as temp:
            with zipfile.ZipFile(temp.name, 'w') as new_zip:
                for item in self.infolist():
                    if item.filename not in paths:
                        new_zip.writestr(item, self.read(item.filename))
            zipfile.ZipFile.close(self)
            shutil.move(temp.name, self.filename)
            zipfile.ZipFile.__init__(self, self.filename, self.mode)

    def _write_close(self):
        """Handle writes when closing epub.

        Both new file mode (w) and append file mode (a), some files must be
        generated: container, OPF, and NCX.

        """
        item_toc = self.get_item(self.opf.spine.toc)

        # Remove the old files
        to_remove = ['META-INF/container.xml', self.opf_path]
        if item_toc:
            to_remove.append(
                # Replace \ by /, no matter what OS's separator could be
                os.path.join(self.content_path,
                             item_toc.href).replace('\\', '/')
            )

        self.remove_paths(to_remove)

        # Write META-INF/container.xml
        self.writestr('META-INF/container.xml',
                      self._build_container().encode('utf-8'))
        # Write OPF File
        self.writestr(self.opf_path,
                      self.opf.as_xml_document().toxml().encode('utf-8'))
        # Write NCX File if exist
        if item_toc:
            toc_path = os.path.join(
                self.content_path, item_toc.href
            ).replace('\\', '/')
            toc_content = self.toc.as_xml_document().toxml().encode('utf-8')

            self.writestr(toc_path, toc_content)

    def _build_container(self):
        """Build a simple XML container as in epub 2.0.1 specification."""
        template = """<?xml version="1.0" encoding="UTF-8"?>
    <container version="1.0"
               xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
        <rootfiles>
             <rootfile full-path="%s"
                       media-type="application/oebps-package+xml"/>
        </rootfiles>
    </container>"""
        return template % self.opf_path

    def add_item(self, filename, manifest_item,
                 append_to_spine=False, is_linear=True):
        """Add a file to epub.

        A manifest item must be provide to describe it.

        This function will raise a RuntimeError if epub is already closed. It
        will raise an IOError if epub is open in read-only (`r` mode).

        Optional: you can use `append_to_spine` flag (default=False) to append
        item to spine, and use `is_linear` (default=True) to specify if it is
        linear or not.

        """
        self.check_mode_write()
        self.opf.manifest.append(manifest_item)

        write_path = os.path.join(
            self.content_path, manifest_item.href
        ).replace('\\', '/')

        self.write(filename, write_path)

        if append_to_spine:
            self.opf.spine.add_itemref(manifest_item.identifier, is_linear)

    def check_mode_write(self):
        """Raise error if epub file is not writable.

        Raise RuntimeError if file is already closed.

        Raise IOError if file is opened read-only.

        """
        if not self.fp:
            raise RuntimeError(
                  'Attempt to write to EPUB file that was already closed')

        if self.mode == 'r':
            raise IOError(
                  'Attempt to write to EPUB file that was open as read-only.')

    # extract method is  zipfile.ZipFile.extract(member[, path[, pwd]])

    def extract_item(self, item, to_path=None):
        """Extract an item from its href in epub to `to_path` location.
        """
        path = item if not hasattr(item, 'href') else item.href
        member_path = os.path.join(self.content_path, path).replace('\\', '/')

        return  self.extract(member=member_path, path=to_path)

    def get_item(self, identifier):
        """Get an item from manifest through its "id" attribute.

        Return an EpubManifestItem if found, else None.

        """
        return self.opf.manifest.get(identifier, None)

    def get_item_by_href(self, href):
        """Get an item from manifest through its "href" attribute.

        Return an EpubManifestItem if found, else None.

        """
        found = [x for x in self.opf.manifest.values() if x.href == href]
        size = len(found)
        if size == 1:
            return found[0]
        elif size > 1:
            raise LookupError('Multiple items are found with this href.')
        else:
            return None

    # read method is zipfile.ZipFile.read(path)

    def read_item(self, item):
        """Read a file from the epub zipfile container.

        "item" parameter can be the relative path to the opf file or an
        EpubManifestItem object.

        Html fragments are not acceptable : the path must be exactly the same
        as indicated in the opf file.

        """
        path = item
        if hasattr(item, 'href'):
            path = item.href

        return self.read(
            # Replace \ by /, as ZipFile always uses / as path separator.
            os.path.join(self.content_path, path).replace('\\', '/')
        )


class Book(object):
    """This class is an attempt to expose a simpler object model than EpubFile.

    WARNING: Work in progress. Use with caution.

    """

    def __init__(self, epub_file):
        self.epub_file = epub_file

    @property
    def creators(self):
        return self.epub_file.opf.metadata.creators

    @property
    def description(self):
        return self.epub_file.opf.metadata.description

    @property
    def isbn(self):
        return self.epub_file.opf.metadata.get_isbn()

    @property
    def publisher(self):
        return self.epub_file.opf.metadata.publisher

    @property
    def contributors(self):
        return self.epub_file.opf.metadata.contributors

    @property
    def dates(self):
        return self.epub_file.opf.metadata.dates

    @property
    def dc_type(self):
        return self.epub_file.opf.metadata.dc_type

    @property
    def dc_format(self):
        return self.epub_file.opf.metadata.format

    @property
    def identifiers(self):
        return self.epub_file.opf.metadata.identifiers

    @property
    def source(self):
        return self.epub_file.opf.metadata.source

    @property
    def languages(self):
        return self.epub_file.opf.metadata.languages

    @property
    def relation(self):
        return self.epub_file.opf.metadata.relation

    @property
    def coverage(self):
        return self.epub_file.opf.metadata.coverage

    @property
    def right(self):
        return self.epub_file.opf.metadata.right

    @property
    def metas(self):
        return self.epub_file.opf.metadata.metas

    @property
    def subjects(self):
        return self.epub_file.opf.metadata.subjects

    @property
    def titles(self):
        return self.epub_file.opf.metadata.titles

    @property
    def chapters(self):
        """
        Return a list of linear chapter from spine.
        """
        return [BookChapter(self, identifier)
                for identifier, linear in self.epub_file.opf.spine.itemrefs
                if linear]

    @property
    def extra_chapters(self):
        """
        Return a list of non-linear chapter from spine.
        """
        return [BookChapter(self, identifier)
                for identifier, linear in self.epub_file.opf.spine.itemrefs
                if not linear]


class BookChapter(object):

    @property
    def identifier(self):
        return self._manifest_item.identifier

    def __init__(self, book, identifier, fragment=None):
        self._book = book
        self._manifest_item = self._book.epub_file.get_item(identifier)
        self._fragment = fragment

    def read(self):
        return self._book.epub_file.read_item(self._manifest_item)