diff options
| author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
|---|---|---|
| committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
| commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
| tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/epub | |
| parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
| download | gn-ai-master.tar.gz | |
Diffstat (limited to '.venv/lib/python3.12/site-packages/epub')
| -rw-r--r-- | .venv/lib/python3.12/site-packages/epub/__init__.py | 423 | ||||
| -rw-r--r-- | .venv/lib/python3.12/site-packages/epub/ncx.py | 675 | ||||
| -rw-r--r-- | .venv/lib/python3.12/site-packages/epub/opf.py | 535 | ||||
| -rw-r--r-- | .venv/lib/python3.12/site-packages/epub/utils.py | 38 |
4 files changed, 1671 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/epub/__init__.py b/.venv/lib/python3.12/site-packages/epub/__init__.py new file mode 100644 index 00000000..7f6b76ae --- /dev/null +++ b/.venv/lib/python3.12/site-packages/epub/__init__.py @@ -0,0 +1,423 @@ +# -*- coding: utf-8 -*- +"""Library to open and read files in the epub version 2.""" +from __future__ import unicode_literals + + +__author__ = 'Florian Strzelecki <florian.strzelecki@gmail.com>' +__version__ = '0.5.2' +__all__ = ['opf', 'ncx', 'utils'] + + +import os +import shutil +import tempfile +import uuid +import warnings +import zipfile + + +from xml.dom import minidom + +from epub import ncx, opf, utils + + +MIMETYPE_EPUB = 'application/epub+zip' +MIMETYPE_OPF = 'application/oebps-package+xml' +MIMETYPE_NCX = 'application/x-dtbncx+xml' + +DEFAULT_OPF_PATH = 'OEBPS/content.opf' +DEFAULT_NCX_PATH = 'toc.ncx' + + +def open(filename, mode=None): + """Open an epub file and return an EpubFile object""" + warnings.warn('Function `epub.open` is deprecated since 0.5.0.', + DeprecationWarning) + return open_epub(filename, mode) + + +def open_epub(filename, mode=None): + return EpubFile(filename, mode) + + +class BadEpubFile(zipfile.BadZipfile): + pass + + +class EpubFile(zipfile.ZipFile): + """Represent an epub zip file, as described in version 2.0.1 of epub spec. + + This class allow an access throught a low-level API to the epub real file. + It extends zipfile.ZipFile class and modify only a little some of its + behavior. + + See http://idpf.org/epub/201 for more information about Epub 2.0.1. + + """ + @property + def content_path(self): + """Return the content path, ie, the path relative to OPF file. + + If OPF file is located in `OEBPS/content.opf`, then `content_path` is + equal to `OEBPS`. + + """ + return os.path.dirname(self.opf_path).replace('\\', '/') + + def __init__(self, filename, mode=None): + """Open the Epub zip file with mode read "r", write "w" or append "a". 
+ """ + mode = mode or 'r' + zipfile.ZipFile.__init__(self, filename, mode) + self.uid = None + self.opf_path = None + self.opf = None + self.toc = None + + if self.mode == 'r': + self._init_read() + elif self.mode == 'w': + self._init_new() + elif self.mode == 'a': + if len(self.namelist()) == 0: + self._init_new() + else: + self._init_read() + + def _init_new(self): + """Build an empty epub archive.""" + # Write mimetype file: 'application/epub+zip' + self.writestr('mimetype', MIMETYPE_EPUB) + # Default path for opf + self.opf_path = DEFAULT_OPF_PATH + # Uid & Uid's id + uid_id = 'BookId' + self.uid = '%s' % uuid.uuid4() + # Create metadata, manifest, and spine, as minimalist as possible + metadata = opf.Metadata() + metadata.add_identifier(self.uid, uid_id, 'uid') + manifest = opf.Manifest() + manifest.add_item('ncx', 'toc.ncx', MIMETYPE_NCX) + spine = opf.Spine('ncx') + # Create Opf object + self.opf = opf.Opf(uid_id=uid_id, + metadata=metadata, manifest=manifest, spine=spine) + # Create Ncx object + self.toc = ncx.Ncx() + self.toc.uid = self.uid + + def _init_read(self): + """Get content from existing epub file""" + # Read container.xml to get OPF xml file path + xmlstring = self.read('META-INF/container.xml') + container_xml = minidom.parseString(xmlstring).documentElement + + for element in container_xml.getElementsByTagName('rootfile'): + if element.getAttribute('media-type') == MIMETYPE_OPF: + # Only take the first full-path available + self.opf_path = element.getAttribute('full-path') + break + + # Read OPF xml file + xml_string = self.read(self.opf_path) + self.opf = opf.parse_opf(xml_string) + uids = [x for x in self.opf.metadata.identifiers + if x[1] == self.opf.uid_id] + if uids: + self.uid = uids[0] + else: + self.uid = None + warnings.warn('The ePub does not define any uid', SyntaxWarning) + + item_toc = self.get_item(self.opf.spine.toc) + + # Inspect NCX toc file + self.toc = None + if item_toc is not None: + self.toc = 
ncx.parse_toc(self.read_item(item_toc)) + else: + warnings.warn('The ePub does not define any NCX file', + SyntaxWarning) + self.toc = ncx.Ncx() + self.toc.uid = self.uid + + def close(self): + if self.fp is None: + return + if self.mode in ('w', 'a'): + self._write_close() + zipfile.ZipFile.close(self) + + def remove_paths(self, paths): + """Remove files from the archive + + Warning: This will be slow, it needs to recreate from scratch the + complete archive. + + This method (well, the whole behavior of "write epub file") needs + a rework in a future version. + + """ + with tempfile.NamedTemporaryFile('rb', delete=False) as temp: + with zipfile.ZipFile(temp.name, 'w') as new_zip: + for item in self.infolist(): + if item.filename not in paths: + new_zip.writestr(item, self.read(item.filename)) + zipfile.ZipFile.close(self) + shutil.move(temp.name, self.filename) + zipfile.ZipFile.__init__(self, self.filename, self.mode) + + def _write_close(self): + """Handle writes when closing epub. + + Both new file mode (w) and append file mode (a), some files must be + generated: container, OPF, and NCX. 
+ + """ + item_toc = self.get_item(self.opf.spine.toc) + + # Remove the old files + to_remove = ['META-INF/container.xml', self.opf_path] + if item_toc: + to_remove.append( + # Replace \ by /, no matter what OS's separator could be + os.path.join(self.content_path, + item_toc.href).replace('\\', '/') + ) + + self.remove_paths(to_remove) + + # Write META-INF/container.xml + self.writestr('META-INF/container.xml', + self._build_container().encode('utf-8')) + # Write OPF File + self.writestr(self.opf_path, + self.opf.as_xml_document().toxml().encode('utf-8')) + # Write NCX File if exist + if item_toc: + toc_path = os.path.join( + self.content_path, item_toc.href + ).replace('\\', '/') + toc_content = self.toc.as_xml_document().toxml().encode('utf-8') + + self.writestr(toc_path, toc_content) + + def _build_container(self): + """Build a simple XML container as in epub 2.0.1 specification.""" + template = """<?xml version="1.0" encoding="UTF-8"?> + <container version="1.0" + xmlns="urn:oasis:names:tc:opendocument:xmlns:container"> + <rootfiles> + <rootfile full-path="%s" + media-type="application/oebps-package+xml"/> + </rootfiles> + </container>""" + return template % self.opf_path + + def add_item(self, filename, manifest_item, + append_to_spine=False, is_linear=True): + """Add a file to epub. + + A manifest item must be provide to describe it. + + This function will raise a RuntimeError if epub is already closed. It + will raise an IOError if epub is open in read-only (`r` mode). + + Optional: you can use `append_to_spine` flag (default=False) to append + item to spine, and use `is_linear` (default=True) to specify if it is + linear or not. 
+ + """ + self.check_mode_write() + self.opf.manifest.append(manifest_item) + + write_path = os.path.join( + self.content_path, manifest_item.href + ).replace('\\', '/') + + self.write(filename, write_path) + + if append_to_spine: + self.opf.spine.add_itemref(manifest_item.identifier, is_linear) + + def check_mode_write(self): + """Raise error if epub file is not writable. + + Raise RuntimeError if file is already closed. + + Raise IOError if file is opened read-only. + + """ + if not self.fp: + raise RuntimeError( + 'Attempt to write to EPUB file that was already closed') + + if self.mode == 'r': + raise IOError( + 'Attempt to write to EPUB file that was open as read-only.') + + # extract method is zipfile.ZipFile.extract(member[, path[, pwd]]) + + def extract_item(self, item, to_path=None): + """Extract an item from its href in epub to `to_path` location. + """ + path = item if not hasattr(item, 'href') else item.href + member_path = os.path.join(self.content_path, path).replace('\\', '/') + + return self.extract(member=member_path, path=to_path) + + def get_item(self, identifier): + """Get an item from manifest through its "id" attribute. + + Return an EpubManifestItem if found, else None. + + """ + return self.opf.manifest.get(identifier, None) + + def get_item_by_href(self, href): + """Get an item from manifest through its "href" attribute. + + Return an EpubManifestItem if found, else None. + + """ + found = [x for x in self.opf.manifest.values() if x.href == href] + size = len(found) + if size == 1: + return found[0] + elif size > 1: + raise LookupError('Multiple items are found with this href.') + else: + return None + + # read method is zipfile.ZipFile.read(path) + + def read_item(self, item): + """Read a file from the epub zipfile container. + + "item" parameter can be the relative path to the opf file or an + EpubManifestItem object. + + Html fragments are not acceptable : the path must be exactly the same + as indicated in the opf file. 
+ + """ + path = item + if hasattr(item, 'href'): + path = item.href + + return self.read( + # Replace \ by /, as ZipFile always uses / as path separator. + os.path.join(self.content_path, path).replace('\\', '/') + ) + + +class Book(object): + """This class is an attempt to expose a simpler object model than EpubFile. + + WARNING: Work in progress. Use with caution. + + """ + + def __init__(self, epub_file): + self.epub_file = epub_file + + @property + def creators(self): + return self.epub_file.opf.metadata.creators + + @property + def description(self): + return self.epub_file.opf.metadata.description + + @property + def isbn(self): + return self.epub_file.opf.metadata.get_isbn() + + @property + def publisher(self): + return self.epub_file.opf.metadata.publisher + + @property + def contributors(self): + return self.epub_file.opf.metadata.contributors + + @property + def dates(self): + return self.epub_file.opf.metadata.dates + + @property + def dc_type(self): + return self.epub_file.opf.metadata.dc_type + + @property + def dc_format(self): + return self.epub_file.opf.metadata.format + + @property + def identifiers(self): + return self.epub_file.opf.metadata.identifiers + + @property + def source(self): + return self.epub_file.opf.metadata.source + + @property + def languages(self): + return self.epub_file.opf.metadata.languages + + @property + def relation(self): + return self.epub_file.opf.metadata.relation + + @property + def coverage(self): + return self.epub_file.opf.metadata.coverage + + @property + def right(self): + return self.epub_file.opf.metadata.right + + @property + def metas(self): + return self.epub_file.opf.metadata.metas + + @property + def subjects(self): + return self.epub_file.opf.metadata.subjects + + @property + def titles(self): + return self.epub_file.opf.metadata.titles + + @property + def chapters(self): + """ + Return a list of linear chapter from spine. 
+ """ + return [BookChapter(self, identifier) + for identifier, linear in self.epub_file.opf.spine.itemrefs + if linear] + + @property + def extra_chapters(self): + """ + Return a list of non-linear chapter from spine. + """ + return [BookChapter(self, identifier) + for identifier, linear in self.epub_file.opf.spine.itemrefs + if not linear] + + +class BookChapter(object): + + @property + def identifier(self): + return self._manifest_item.identifier + + def __init__(self, book, identifier, fragment=None): + self._book = book + self._manifest_item = self._book.epub_file.get_item(identifier) + self._fragment = fragment + + def read(self): + return self._book.epub_file.read_item(self._manifest_item) diff --git a/.venv/lib/python3.12/site-packages/epub/ncx.py b/.venv/lib/python3.12/site-packages/epub/ncx.py new file mode 100644 index 00000000..1e20f012 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/epub/ncx.py @@ -0,0 +1,675 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + + +""" +Python lib for reading NCX formated file for epub. 
+ +There is some difference between NCX original format and one for Epub; see +officiel documention for more information.1111 + +NCX doc: http://www.niso.org/workrooms/daisy/Z39-86-2005.html#NCX +NCX Epub spec: http://idpf.org/epub/20/spec/OPF_2.0.1_draft.htm#Section2.4.1 +""" + + +from xml.dom import minidom + + +def parse_toc(xmlstring): + """Inspect an NCX formated xml document.""" + toc = Ncx() + toc_xml = minidom.parseString(xmlstring).documentElement + + xmlns = toc_xml.getAttribute('xmlns') + if xmlns: + toc.xmlns = xmlns + + version = toc_xml.getAttribute('version') + if version: + toc.version = version + + lang = toc_xml.getAttribute('xml:lang') + if lang: + toc.lang = lang + + # Inspect head > meta; unknow meta are ignored + head = toc_xml.getElementsByTagName('head')[0] + metas = {'dtb:uid': '', + 'dtb:depth': '', + 'dtb:totalPageCount': '', + 'dtb:maxPageNumber': '', + 'dtb:generator': ''} + + for meta in head.getElementsByTagName('meta'): + metas[meta.getAttribute('name')] = meta.getAttribute('content') + + toc.uid = metas['dtb:uid'] + toc.depth = metas['dtb:depth'] + toc.total_page_count = metas['dtb:totalPageCount'] + toc.max_page_number = metas['dtb:maxPageNumber'] + toc.generator = metas['dtb:generator'] + + # Get title (one and only one <docTitle> tag is required) + doc_title_node = toc_xml.getElementsByTagName('docTitle')[0] + toc.title = _parse_for_text_tag(doc_title_node) + + # Get authors (<docAuthor> tags are optionnal) + for author in toc_xml.getElementsByTagName('docAuthor'): + toc.authors.append(_parse_for_text_tag(author)) + + # Inspect <navMap> (one is required) + nav_map_node = toc_xml.getElementsByTagName('navMap')[0] + toc.nav_map = _parse_xml_nav_map(nav_map_node) + + # Inspect <pageList> (optionnal, only one) + page_lists = toc_xml.getElementsByTagName('pageList') + if len(page_lists) > 0: + toc.page_list = _parse_xml_page_list(page_lists[0]) + + # Inspect <navList> (optionnal, many are possible) + for nav_list in 
toc_xml.getElementsByTagName('navList'): + toc.add_nav_list(_parse_xml_nav_list(nav_list)) + + return toc + + +def _parse_xml_nav_map(element): + """Inspect an xml.dom.Element <navMap> and return a NcxNavMap object.""" + nav_map = NavMap() + nav_map.identifier = element.getAttribute('id') + + children = [e for e in element.childNodes if e.nodeType == e.ELEMENT_NODE] + for node in children: + if node.tagName == 'navLabel': + nav_map.add_label(_parse_for_text_tag(node), + node.getAttribute('xml:lang'), + node.getAttribute('dir')) + elif node.tagName == 'navInfo': + nav_map.add_info(_parse_for_text_tag(node), + node.getAttribute('xml:lang'), + node.getAttribute('dir')) + elif node.tagName == 'navPoint': + nav_map.add_point(_parse_xml_nav_point(node)) + + return nav_map + + +def _parse_xml_nav_point(element): + """Inspect an xml.dom.Element <navPoint> and return a NcxNavPoint object. + """ + nav_point = NavPoint() + nav_point.identifier = element.getAttribute('id') + nav_point.class_name = element.getAttribute('class') + nav_point.play_order = element.getAttribute('playOrder') + + children = [e for e in element.childNodes if e.nodeType == e.ELEMENT_NODE] + for node in children: + if node.tagName == 'navLabel': + nav_point.add_label(_parse_for_text_tag(node), + node.getAttribute('xml:lang'), + node.getAttribute('dir')) + elif node.tagName == 'content': + nav_point.src = node.getAttribute('src') + elif node.tagName == 'navPoint': + nav_point.add_point(_parse_xml_nav_point(node)) + + return nav_point + + +def _parse_xml_page_list(element): + """Inspect an xml.dom.Element <pageList> and return a NcxPageList object. 
+ """ + page_list = PageList() + page_list.identifier = element.getAttribute('id') + page_list.class_name = element.getAttribute('class') + + children = [e for e in element.childNodes if e.nodeType == e.ELEMENT_NODE] + for node in children: + if node.tagName == 'navLabel': + page_list.add_label(_parse_for_text_tag(node), + node.getAttribute('xml:lang'), + node.getAttribute('dir')) + elif node.tagName == 'navInfo': + page_list.add_info(_parse_for_text_tag(node), + node.getAttribute('xml:lang'), + node.getAttribute('dir')) + elif node.tagName == 'pageTarget': + page_list.add_target(_parse_xml_page_target(node)) + + return page_list + + +def _parse_xml_page_target(element): + """Inspect an xml.dom.Element <pageTarget> and return a NcxPageTarget + object.""" + page_target = PageTarget() + page_target.identifier = element.getAttribute('id') + page_target.value = element.getAttribute('value') + page_target.target_type = element.getAttribute('type') + page_target.class_name = element.getAttribute('class') + page_target.play_order = element.getAttribute('playOrder') + + children = [e for e in element.childNodes if e.nodeType == e.ELEMENT_NODE] + for node in children: + if node.tagName == 'navLabel': + page_target.add_label(_parse_for_text_tag(node), + node.getAttribute('xml:lang'), + node.getAttribute('dir')) + elif node.tagName == 'content': + page_target.src = node.getAttribute('src') + + return page_target + + +def _parse_xml_nav_list(element): + """Inspect an xml.dom.Element <navList> and return a NcxNavList object.""" + nav_list = NavList() + nav_list.identifier = element.getAttribute('id') + nav_list.class_name = element.getAttribute('class') + + children = [e for e in element.childNodes if e.nodeType == e.ELEMENT_NODE] + for node in children: + if node.tagName == 'navLabel': + nav_list.add_label(_parse_for_text_tag(node), + node.getAttribute('xml:lang'), + node.getAttribute('dir')) + elif node.tagName == 'navInfo': + nav_list.add_info(_parse_for_text_tag(node), + 
node.getAttribute('xml:lang'), + node.getAttribute('dir')) + elif node.tagName == 'navTarget': + nav_list.add_target(_parse_xml_nav_target(node)) + + return nav_list + + +def _parse_xml_nav_target(element): + """Inspect an xml.dom.Element <navTarget> and return a NcxNavTarget + object.""" + nav_target = NavTarget() + nav_target.identifier = element.getAttribute('id') + nav_target.value = element.getAttribute('value') + nav_target.class_name = element.getAttribute('class') + nav_target.play_order = element.getAttribute('playOrder') + + children = [e for e in element.childNodes if e.nodeType == e.ELEMENT_NODE] + for node in children: + if node.tagName == 'navLabel': + nav_target.add_label(_parse_for_text_tag(node), + node.getAttribute('xml:lang'), + node.getAttribute('dir')) + elif node.tagName == 'content': + nav_target.src = node.getAttribute('src') + + return nav_target + + +def _parse_for_text_tag(xml_element, name=None): + """Inspect an xml.dom.Element with a child 'name' to get its text value. + + NCX file has many element with a child likes + "navLabel" > "text" > TEXT_NODE + and this function allow to avoid some boilerplate code. + + First parameter must be an xml.dom.Element, having one child named by the + second parameter (by default a "text" tag). + + If nothing is founded, an empty string '' is returned. + + Whitespaces and tabulations are stripped.""" + name = name or 'text' + tags = [e for e in xml_element.childNodes + if e.nodeType == e.ELEMENT_NODE and e.tagName == name] + text = '' + if len(tags) > 0: + tag = tags[0] + if tag.firstChild and tag.firstChild.data: + tag.normalize() + text = tag.firstChild.data.strip() + return text + + +def _create_xml_element_text(data, name=None): + """Create a <text> ... </text> Element node. + + You can use a different tag name with the name argument + (default is "text"). + + If data is None or empty, it will create an empty element tag, eg. 
: + <emptyTag/> instead of <emptyTag></emptyTag>""" + if name is None: + name = 'text' + doc = minidom.Document() + element = doc.createElement(name) + if data: + element.appendChild(doc.createTextNode(data)) + return element + + +class Ncx(object): + """Represent the structured content of a NCX file.""" + + def __init__(self, nav_map=None, page_list=None): + self.xmlns = 'http://www.daisy.org/z3986/2005/ncx/' + self.version = '2005-1' + self.lang = None + self.uid = None + self.depth = None + self.total_page_count = None + self.max_page_number = None + self.generator = None + self.title = None + self.authors = [] + if nav_map is None: + nav_map = NavMap() + self.nav_map = nav_map + if page_list is None: + page_list = PageList() + self.page_list = page_list + self.nav_lists = [] + + def add_nav_list(self, nav_list): + self.nav_lists.append(nav_list) + + def as_xml_document(self): + """Return an xml dom Document node.""" + doc = minidom.Document() + ncx = doc.createElement('ncx') + ncx.setAttribute('xmlns', self.xmlns) + ncx.setAttribute('version', self.version) + if self.lang: + ncx.setAttribute('xml:lang', self.lang) + + # head + ncx.appendChild(self._head_as_xml_element()) + + # title + title = doc.createElement('docTitle') + title.appendChild(_create_xml_element_text(self.title)) + ncx.appendChild(title) + + # authors + for text in self.authors: + author = doc.createElement('docAuthor') + author.appendChild(_create_xml_element_text(text)) + ncx.appendChild(author) + + # nav_map + ncx.appendChild(self.nav_map.as_xml_element()) + + # page_list + if self.page_list: + ncx.appendChild(self.page_list.as_xml_element()) + + # nav_lists + for nav_list in self.nav_lists: + ncx.appendChild(nav_list.as_xml_element()) + + doc.appendChild(ncx) + return doc + + def _head_as_xml_element(self): + """Create an xml Element node <head> with meta-data of Ncx item.""" + doc = minidom.Document() + head = doc.createElement('head') + if self.uid: + 
head.appendChild(self._meta_as_xml_element('dtb:uid', self.uid)) + if self.depth: + head.appendChild(self._meta_as_xml_element('dtb:depth', + self.depth)) + if self.total_page_count: + head.appendChild(self._meta_as_xml_element('dtb:totalPageCount', + self.total_page_count)) + if self.max_page_number: + head.appendChild(self._meta_as_xml_element('dtb:maxPageNumber', + self.max_page_number)) + if self.generator: + head.appendChild(self._meta_as_xml_element('dtb:generator', + self.generator)) + return head + + def _meta_as_xml_element(self, name, content): + """Create an xml Element node <meta> with attributes name & content.""" + doc = minidom.Document() + meta = doc.createElement('meta') + meta.setAttribute('name', name) + meta.setAttribute('content', content) + return meta + + +class NavMap(object): + """Represente navMap tag of an NCX file.""" + + def __init__(self): + self.identifier = None + self.labels = [] + self.infos = [] + self.nav_point = [] + + def add_label(self, label, lang=None, direction=None): + lang = lang or '' + direction = direction or '' + self.labels.append((label, lang, direction)) + + def add_info(self, label, lang=None, direction=None): + lang = lang or '' + direction = direction or '' + self.infos.append((label, lang, direction)) + + def add_point(self, point): + self.nav_point.append(point) + + def as_xml_element(self): + """Return an xml dom Element node.""" + doc = minidom.Document() + nav_map = doc.createElement('navMap') + + if self.identifier: + nav_map.setAttribute('id', self.identifier) + + for text, lang, direction in self.labels: + label = doc.createElement('navLabel') + label.appendChild(_create_xml_element_text(text)) + if lang: + label.setAttribute('xml:lang', lang) + if direction: + label.setAttribute('dir', direction) + nav_map.appendChild(label) + + for text, lang, direction in self.infos: + info = doc.createElement('navInfo') + info.appendChild(_create_xml_element_text(text)) + if lang: + info.setAttribute('xml:lang', 
lang) + if direction: + info.setAttribute('dir', direction) + nav_map.appendChild(info) + + for nav_point in self.nav_point: + nav_map.appendChild(nav_point.as_xml_element()) + + return nav_map + + +class NavPoint(object): + + def __init__(self): + self.identifier = None + self.class_name = None + self.play_order = None + self.labels = [] + self.src = None + self.nav_point = [] + + def add_label(self, label, lang=None, direction=None): + lang = lang or '' + direction = direction or '' + self.labels.append((label, lang, direction)) + + def add_point(self, nav_point): + self.nav_point.append(nav_point) + + def as_xml_element(self): + """Return an xml dom Element node.""" + doc = minidom.Document() + nav_point = doc.createElement('navPoint') + + # Attributes + if self.identifier: + nav_point.setAttribute('id', self.identifier) + + if self.class_name: + nav_point.setAttribute('class', self.class_name) + + if self.play_order: + nav_point.setAttribute('playOrder', self.play_order) + + # navLabel + for text, lang, direction in self.labels: + label = doc.createElement('navLabel') + label.appendChild(_create_xml_element_text(text)) + if lang: + label.setAttribute('xml:lang', lang) + if direction: + label.setAttribute('dir', direction) + nav_point.appendChild(label) + + # content + content = doc.createElement('content') + content.setAttribute('src', self.src) + nav_point.appendChild(content) + + # navPoint + for child in self.nav_point: + nav_point.appendChild(child.as_xml_element()) + + return nav_point + + +class PageList(object): + + def __init__(self): + self.identifier = None + self.class_name = None + self.page_target = [] + self.labels = [] + self.infos = [] + + def add_label(self, label, lang=None, direction=None): + lang = lang or '' + direction = direction or '' + self.labels.append((label, lang, direction)) + + def add_info(self, label, lang=None, direction=None): + lang = lang or '' + direction = direction or '' + self.infos.append((label, lang, direction)) + + 
def add_target(self, page_target): + self.page_target.append(page_target) + + def as_xml_element(self): + """Return an xml dom Element node.""" + doc = minidom.Document() + page_list = doc.createElement('pageList') + + # attributes + if self.identifier: + page_list.setAttribute('id', self.identifier) + + if self.class_name: + page_list.setAttribute('class', self.class_name) + + # navLabel + for text, lang, direction in self.labels: + label = doc.createElement('navLabel') + label.appendChild(_create_xml_element_text(text)) + if lang: + label.setAttribute('xml:lang', lang) + if direction: + label.setAttribute('dir', direction) + page_list.appendChild(label) + + # navInfo + for text, lang, direction in self.infos: + info = doc.createElement('navInfo') + info.appendChild(_create_xml_element_text(text)) + if lang: + info.setAttribute('xml:lang', lang) + if direction: + info.setAttribute('dir', direction) + page_list.appendChild(info) + + # pageTarget + for child in self.page_target: + page_list.appendChild(child.as_xml_element()) + + return page_list + + +class PageTarget(object): + + def __init__(self): + self.identifier = None + self.value = None + self.target_type = None + self.class_name = None + self.play_order = None + self.src = None + self.labels = [] + + def add_label(self, label, lang=None, direction=None): + lang = lang or '' + direction = direction or '' + self.labels.append((label, lang, direction)) + + def as_xml_element(self): + """Return an xml dom Element node.""" + doc = minidom.Document() + page_target = doc.createElement('pageTarget') + + # attributes + if self.identifier: + page_target.setAttribute('id', self.identifier) + + if self.value: + page_target.setAttribute('value', self.value) + + if self.target_type: + page_target.setAttribute('type', self.target_type) + + if self.class_name: + page_target.setAttribute('class', self.class_name) + + if self.play_order: + page_target.setAttribute('playOrder', self.play_order) + + # navLabel + for text, 
lang, direction in self.labels: + label = doc.createElement('navLabel') + label.appendChild(_create_xml_element_text(text)) + if lang: + label.setAttribute('xml:lang', lang) + if direction: + label.setAttribute('dir', direction) + page_target.appendChild(label) + + # content + content = doc.createElement('content') + content.setAttribute('src', self.src) + page_target.appendChild(content) + + return page_target + + +class NavList(object): + + def __init__(self): + self.identifier = None + self.class_name = None + self.nav_target = [] + self.labels = [] + self.infos = [] + + def add_label(self, label, lang=None, direction=None): + lang = lang or '' + direction = direction or '' + self.labels.append((label, lang, direction)) + + def add_info(self, label, lang=None, direction=None): + lang = lang or '' + direction = direction or '' + self.infos.append((label, lang, direction)) + + def add_target(self, nav_target): + self.nav_target.append(nav_target) + + def as_xml_element(self): + """Return an xml dom Element node.""" + doc = minidom.Document() + nav_list = doc.createElement('navList') + + # attributes + if self.identifier: + nav_list.setAttribute('id', self.identifier) + + if self.class_name: + nav_list.setAttribute('class', self.class_name) + + # navLabel + for text, lang, direction in self.labels: + label = doc.createElement('navLabel') + label.appendChild(_create_xml_element_text(text)) + if lang: + label.setAttribute('xml:lang', lang) + if direction: + label.setAttribute('dir', direction) + nav_list.appendChild(label) + + # navInfo + for text, lang, direction in self.infos: + info = doc.createElement('navInfo') + info.appendChild(_create_xml_element_text(text)) + if lang: + info.setAttribute('xml:lang', lang) + if direction: + info.setAttribute('dir', direction) + nav_list.appendChild(info) + + # navTarget + for nav_target in self.nav_target: + nav_list.appendChild(nav_target.as_xml_element()) + + return nav_list + + +class NavTarget(object): + + def 
__init__(self): + self.identifier = None + self.class_name = None + self.value = None + self.play_order = None + self.labels = [] + self.src = None + + def add_label(self, label, lang=None, direction=None): + lang = lang or '' + direction = direction or '' + self.labels.append((label, lang, direction)) + + def as_xml_element(self): + """Return an xml dom Element node.""" + doc = minidom.Document() + nav_target = doc.createElement('navTarget') + + # attributes + if self.identifier: + nav_target.setAttribute('id', self.identifier) + + if self.class_name: + nav_target.setAttribute('class', self.class_name) + + if self.value: + nav_target.setAttribute('value', self.value) + + if self.play_order: + nav_target.setAttribute('playOrder', self.play_order) + + # navLabel + for text, lang, direction in self.labels: + label = doc.createElement('navLabel') + label.appendChild(_create_xml_element_text(text)) + if lang: + label.setAttribute('xml:lang', lang) + if direction: + label.setAttribute('dir', direction) + nav_target.appendChild(label) + + # content + content = doc.createElement('content') + content.setAttribute('src', self.src) + nav_target.appendChild(content) + + return nav_target diff --git a/.venv/lib/python3.12/site-packages/epub/opf.py b/.venv/lib/python3.12/site-packages/epub/opf.py new file mode 100644 index 00000000..54471815 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/epub/opf.py @@ -0,0 +1,535 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + + +""" +Python lib for reading OPF formated file for epub. + +Since the "Tour" element is deprecated in Epub 2, it is not supported by this +library. 
+ +OPF epub : http://idpf.org/epub/20/spec/OPF_2.0.1_draft.htm +""" + + +from xml.dom import minidom + + +try: + # Only for Python 2.7+ + from collections import OrderedDict +except ImportError: + try: + # For Python 2.6 + from ordereddict import OrderedDict + except ImportError: + raise ImportError( + 'You should use Python 2.7 or install `ordereddict` from pypi.') + + +from epub.utils import get_node_text + + +XMLNS_DC = 'http://purl.org/dc/elements/1.1/' +XMLNS_OPF = 'http://www.idpf.org/2007/opf' + + +def parse_opf(xml_string): + package = minidom.parseString(xml_string).documentElement + + # Get Uid + uid_id = package.getAttribute('unique-identifier') + + # Store each child nodes into a dict (metadata, manifest, spine, guide) + data = {'metadata': None, + 'manifest': None, + 'spine': None, + 'guide': None} + elements = [e for e in package.childNodes if e.nodeType == e.ELEMENT_NODE] + for node in elements: + tag = node.tagName.lower() + if tag.startswith('opf:'): + tag = tag[4:] + data[tag] = node + + # Inspect metadata + metadata = _parse_xml_metadata(data['metadata']) + + # Inspect manifest + manifest = _parse_xml_manifest(data['manifest']) + + # Inspect spine + spine = _parse_xml_spine(data['spine']) + + # Inspect guide if exist + if data['guide'] is None: + guide = None + else: + guide = _parse_xml_guide(data['guide']) + + opf = Opf(uid_id=uid_id, + metadata=metadata, + manifest=manifest, + spine=spine, + guide=guide) + return opf + + +def _parse_xml_metadata(element): + """Extract metadata from an xml.dom.Element object (ELEMENT_NODE) + + The "<metadata>" tag has a lot of metadatas about the epub this method + inspect and store into object attributes (like "title" or "creator"). 
+ """ + metadata = Metadata() + + for node in element.getElementsByTagName('dc:title'): + metadata.add_title(get_node_text(node), + node.getAttribute('xml:lang')) + + for node in element.getElementsByTagName('dc:creator'): + metadata.add_creator(get_node_text(node), + node.getAttribute('opf:role'), + node.getAttribute('opf:file-as')) + + for node in element.getElementsByTagName('dc:subject'): + metadata.add_subject(get_node_text(node)) + + for node in element.getElementsByTagName('dc:description'): + metadata.description = get_node_text(node) + + for node in element.getElementsByTagName('dc:publisher'): + metadata.publisher = get_node_text(node) + + for node in element.getElementsByTagName('dc:contributor'): + metadata.add_contributor(get_node_text(node), + node.getAttribute('opf:role'), + node.getAttribute('opf:file-as')) + + for node in element.getElementsByTagName('dc:date'): + metadata.add_date(get_node_text(node), + node.getAttribute('opf:event')) + + for node in element.getElementsByTagName('dc:type'): + metadata.dc_type = get_node_text(node) + + for node in element.getElementsByTagName('dc:format'): + metadata.format = get_node_text(node) + + for node in element.getElementsByTagName('dc:identifier'): + metadata.add_identifier(get_node_text(node), + node.getAttribute('id'), + node.getAttribute('opf:scheme')) + + for node in element.getElementsByTagName('dc:source'): + metadata.source = get_node_text(node) + + for node in element.getElementsByTagName('dc:language'): + metadata.add_language(get_node_text(node)) + + for node in element.getElementsByTagName('dc:relation'): + metadata.relation = get_node_text(node) + + for node in element.getElementsByTagName('dc:coverage'): + metadata.coverage = get_node_text(node) + + for node in element.getElementsByTagName('dc:rights'): + metadata.right = get_node_text(node) + + for node in element.getElementsByTagName('meta'): + metadata.add_meta(node.getAttribute('name'), + node.getAttribute('content')) + + return metadata + 
def _parse_xml_manifest(element):
    """Build a ``Manifest`` from an xml.dom.Element ``<manifest>``.

    Every ``<item>`` descendant is registered through ``Manifest.add_item``
    with its ``id``, ``href``, ``media-type``, ``fallback``,
    ``required-namespace``, ``required-modules`` and ``fallback-style``
    attributes. Returns the populated ``Manifest``.
    """
    manifest = Manifest()
    for e in element.getElementsByTagName('item'):
        manifest.add_item(e.getAttribute('id'),
                          e.getAttribute('href'),
                          e.getAttribute('media-type'),
                          e.getAttribute('fallback'),
                          e.getAttribute('required-namespace'),
                          e.getAttribute('required-modules'),
                          e.getAttribute('fallback-style'))
    return manifest


def _parse_xml_spine(element):
    """Inspect an xml.dom.Element <spine> and return a ``Spine`` object.

    An ``itemref`` is linear unless its ``linear`` attribute is ``no``
    (compared case-insensitively).
    """
    spine = Spine()
    spine.toc = element.getAttribute('toc')
    for e in element.getElementsByTagName('itemref'):
        spine.add_itemref(e.getAttribute('idref'),
                          e.getAttribute('linear').lower() != 'no')
    return spine


def _parse_xml_guide(element):
    """Inspect an xml.dom.Element <guide> and return a ``Guide`` object.

    Each ``<reference>`` descendant is added with its ``href``, ``type`` and
    ``title`` attributes.
    """
    guide = Guide()
    for e in element.getElementsByTagName('reference'):
        guide.add_reference(e.getAttribute('href'),
                            e.getAttribute('type'),
                            e.getAttribute('title'))
    return guide


class Opf(object):
    """Represent an OPF formatted file.

    OPF is an xml formatted file, used in the epub spec. ``version``
    defaults to ``'2.0'`` and ``xmlns`` to ``XMLNS_OPF``; any of metadata,
    manifest, spine or guide left as ``None`` is replaced by an empty
    instance of the matching class.
    """

    def __init__(self, uid_id=None, version=None, xmlns=None,
                 metadata=None, manifest=None, spine=None, guide=None):
        self.uid_id = uid_id
        self.version = version if version else '2.0'
        self.xmlns = xmlns if xmlns else XMLNS_OPF

        if metadata is None:
            self.metadata = Metadata()
        else:
            self.metadata = metadata
        if manifest is None:
            self.manifest = Manifest()
        else:
            self.manifest = manifest
        if spine is None:
            self.spine = Spine()
        else:
            self.spine = spine
        if guide is None:
            self.guide = Guide()
        else:
            self.guide = guide

    def as_xml_document(self):
        """Return an xml dom Document whose root is the <package> element.

        The package carries the version, unique-identifier and xmlns
        attributes, followed by the metadata, manifest, spine and guide
        elements in that order.
        """
        doc = minidom.Document()
        package = doc.createElement('package')
        package.setAttribute('version', self.version)
        package.setAttribute('unique-identifier', self.uid_id)
        package.setAttribute('xmlns', self.xmlns)
        package.appendChild(self.metadata.as_xml_element())
        package.appendChild(self.manifest.as_xml_element())
        package.appendChild(self.spine.as_xml_element())
        package.appendChild(self.guide.as_xml_element())
        doc.appendChild(package)
        return doc


class Metadata(object):
    """Represent an epub's metadata set.

    See http://idpf.org/epub/20/spec/OPF_2.0.1_draft.htm#Section2.2

    Repeatable entries are kept in lists of tuples (``titles``,
    ``creators``, ``contributors``, ``dates``, ``identifiers``, ``metas``)
    or plain lists (``subjects``, ``languages``); the other fields hold a
    single value or ``None``.
    """

    def __init__(self):
        self.titles = []
        self.creators = []
        self.subjects = []
        self.description = None
        self.publisher = None
        self.contributors = []
        self.dates = []
        self.dc_type = None
        self.format = None
        self.identifiers = []
        self.source = None
        self.languages = []
        self.relation = None
        self.coverage = None
        self.right = None
        self.metas = []

    def add_title(self, title, lang=None):
        """Append ``(title, lang)``; a missing lang is stored as ''."""
        lang = lang or ''
        self.titles.append((title, lang))

    def add_creator(self, name, role=None, file_as=None):
        """Append ``(name, role, file_as)``; missing values become ''."""
        role = role or ''
        file_as = file_as or ''
        self.creators.append((name, role, file_as))

    def add_subject(self, subject):
        """Append a subject."""
        self.subjects.append(subject)

    def add_contributor(self, name, role=None, file_as=None):
        """Append ``(name, role, file_as)``; missing values become ''."""
        role = role or ''
        file_as = file_as or ''
        self.contributors.append((name, role, file_as))

    def add_date(self, date, event=None):
        """Append ``(date, event)``; a missing event is stored as ''."""
        event = event or ''
        self.dates.append((date, event))

    def add_identifier(self, content, identifier=None, scheme=None):
        """Append ``(content, identifier, scheme)``; missing values become ''."""
        identifier = identifier or ''
        scheme = scheme or ''
        self.identifiers.append((content, identifier, scheme))

    def add_language(self, lang):
        """Append a language."""
        self.languages.append(lang)

    def add_meta(self, name, content):
        """Append a ``(name, content)`` pair for a <meta> element."""
        self.metas.append((name, content))

    def get_isbn(self):
        """Return the first identifier whose scheme is 'isbn', else None.

        The scheme comparison is case-insensitive.
        """
        l = [x[0] for x in self.identifiers if x[2].lower() == 'isbn']
        isbn = None
        if l:
            isbn = l[0]
        return isbn

    def as_xml_element(self):
        """Return an xml dom Element node."""
        doc = minidom.Document()
        metadata = doc.createElement('metadata')
        metadata.setAttribute('xmlns:dc', XMLNS_DC)
        metadata.setAttribute('xmlns:opf', XMLNS_OPF)

        for text, lang in self.titles:
            title = doc.createElement('dc:title')
            if lang:
                title.setAttribute('xml:lang', lang)
            title.appendChild(doc.createTextNode(text))
            metadata.appendChild(title)

        for name, role, file_as in self.creators:
            creator = doc.createElement('dc:creator')
            if role:
                creator.setAttribute('opf:role', role)
            if file_as:
                creator.setAttribute('opf:file-as', file_as)
            creator.appendChild(doc.createTextNode(name))
            metadata.appendChild(creator)

        for text in self.subjects:
            subject = doc.createElement('dc:subject')
            subject.appendChild(doc.createTextNode(text))
            metadata.appendChild(subject)

        if self.description:
            description = doc.createElement('dc:description')
            description.appendChild(doc.createTextNode(self.description))
            metadata.appendChild(description)

        if self.publisher:
            publisher = doc.createElement('dc:publisher')
            publisher.appendChild(doc.createTextNode(self.publisher))
            metadata.appendChild(publisher)

        for name, role, file_as in self.contributors:
            contributor = doc.createElement('dc:contributor')
            if role:
                contributor.setAttribute('opf:role', role)
            if file_as:
                contributor.setAttribute('opf:file-as', file_as)
            contributor.appendChild(doc.createTextNode(name))
            metadata.appendChild(contributor)

        for text, event in self.dates:
            date = doc.createElement('dc:date')
            if event:
                date.setAttribute('opf:event', event)
            date.appendChild(doc.createTextNode(text))
            metadata.appendChild(date)

        if self.dc_type:
            dc_type = doc.createElement('dc:type')
            dc_type.appendChild(doc.createTextNode(self.dc_type))
            metadata.appendChild(dc_type)

        if self.format:
            dc_format = doc.createElement('dc:format')
            dc_format.appendChild(doc.createTextNode(self.format))
            metadata.appendChild(dc_format)

        for text, identifier, scheme in self.identifiers:
            dc_identifier = doc.createElement('dc:identifier')
            if identifier:
                dc_identifier.setAttribute('id', identifier)
            if scheme:
                dc_identifier.setAttribute('opf:scheme', scheme)
            dc_identifier.appendChild(doc.createTextNode(text))
            metadata.appendChild(dc_identifier)

        if self.source:
            source = doc.createElement('dc:source')
            source.appendChild(doc.createTextNode(self.source))
            metadata.appendChild(source)

        for text in self.languages:
            language = doc.createElement('dc:language')
            language.appendChild(doc.createTextNode(text))
            metadata.appendChild(language)

        if self.relation:
            relation = doc.createElement('dc:relation')
            relation.appendChild(doc.createTextNode(self.relation))
            metadata.appendChild(relation)

        if self.coverage:
            coverage = doc.createElement('dc:coverage')
            coverage.appendChild(doc.createTextNode(self.coverage))
            metadata.appendChild(coverage)

        if self.right:
            right = doc.createElement('dc:rights')
            right.appendChild(doc.createTextNode(self.right))
            metadata.appendChild(right)

        for name, content in self.metas:
            meta = doc.createElement('meta')
            meta.setAttribute('name', name)
            meta.setAttribute('content', content)
            metadata.appendChild(meta)

        return metadata


class Manifest(OrderedDict):
    """Ordered mapping of manifest items keyed by their identifier.

    Values must expose ``identifier`` and ``href`` attributes, and the key
    must equal the value's ``identifier``.
    """

    def __contains__(self, item):
        """Accept either a key or an object exposing ``identifier``."""
        if hasattr(item, 'identifier'):
            return super(Manifest, self).__contains__(item.identifier)
        else:
            return super(Manifest, self).__contains__(item)

    def __setitem__(self, key, value):
        """Store ``value`` under ``key``.

        Raises ValueError when ``value`` lacks ``identifier``/``href`` or
        when its identifier differs from ``key``.
        """
        if hasattr(value, 'identifier') and hasattr(value, 'href'):
            if value.identifier == key:
                super(Manifest, self).__setitem__(key, value)
            else:
                raise ValueError('Value\'s id is different from insert key.')
        else:
            requirements = 'id and href attributes'
            msg = 'Value does not fit the requirement (%s).' % requirements
            raise ValueError(msg)

    def add_item(self, identifier, href, media_type=None, fallback=None,
                 required_namespace=None, required_modules=None,
                 fallback_style=None):
        """Create a ``ManifestItem`` from the arguments and append it."""
        item = ManifestItem(identifier, href, media_type,
                            fallback, required_namespace, required_modules,
                            fallback_style)
        self.append(item)

    def append(self, item):
        """Store ``item`` under its own identifier.

        Raises ValueError when ``item`` lacks ``identifier``, ``href`` or
        ``as_xml_element``.
        """
        if hasattr(item, 'identifier') and \
           hasattr(item, 'href') and \
           hasattr(item, 'as_xml_element'):
            self.__setitem__(item.identifier, item)
        else:
            raise ValueError('Manifest item must have [identifier, href, ' + \
                             'as_xml_element()] attributes and method.')

    def as_xml_element(self):
        """Return an xml dom Element node."""
        doc = minidom.Document()
        manifest = doc.createElement('manifest')

        for item in self.values():
            manifest.appendChild(item.as_xml_element())

        return manifest


class ManifestItem(object):
    """
    Represent an item from the epub's manifest.

    """

    def __init__(self, identifier, href, media_type=None, fallback=None,
                 required_namespace=None, required_modules=None,
                 fallback_style=None):
        self.identifier = identifier
        self.href = href
        self.media_type = media_type
        self.fallback = fallback
        self.required_namespace = required_namespace
        self.required_modules = required_modules
        self.fallback_style = fallback_style

    def as_xml_element(self):
        """Return an xml dom Element node.

        ``id`` and ``href`` are always written; the other attributes only
        when they hold a truthy value.
        """

        item = minidom.Document().createElement("item")

        item.setAttribute('id', self.identifier)
        item.setAttribute('href', self.href)
        if self.media_type:
            item.setAttribute('media-type', self.media_type)
        if self.fallback:
            item.setAttribute('fallback', self.fallback)
        if self.required_namespace:
            item.setAttribute('required-namespace', self.required_namespace)
        if self.required_modules:
            item.setAttribute('required-modules', self.required_modules)
        if self.fallback_style:
            item.setAttribute('fallback-style', self.fallback_style)

        return item


class Spine(object):
    """Represent the <spine>: a ``toc`` id and ``(idref, linear)`` pairs."""

    def __init__(self, toc=None, itemrefs=None):
        self.toc = toc
        if itemrefs is None:
            self.itemrefs = []
        else:
            self.itemrefs = itemrefs

    def add_itemref(self, idref, linear=True):
        """Append an ``(idref, linear)`` pair."""
        self.append((idref, linear))

    def append(self, itemref):
        """Append an already-built ``(idref, linear)`` pair."""
        self.itemrefs.append(itemref)

    def as_xml_element(self):
        """Return an xml dom Element node.

        ``linear="no"`` is written only for non-linear itemrefs.
        """
        doc = minidom.Document()
        spine = doc.createElement('spine')
        spine.setAttribute('toc', self.toc)

        for idref, linear in self.itemrefs:
            itemref = doc.createElement('itemref')
            itemref.setAttribute('idref', idref)
            if not linear:
                itemref.setAttribute('linear', 'no')
            spine.appendChild(itemref)

        return spine


class Guide(object):
    """Represent the <guide>: a list of ``(href, ref_type, title)`` tuples."""

    def __init__(self):
        self.references = []

    def add_reference(self, href, ref_type=None, title=None):
        """Append a ``(href, ref_type, title)`` tuple."""
        self.append((href, ref_type, title))

    def append(self, reference):
        """Append an already-built ``(href, ref_type, title)`` tuple."""
        self.references.append(reference)

    def as_xml_element(self):
        """Return an xml dom Element node.

        Each of ``type``, ``title`` and ``href`` is written only when the
        corresponding value is truthy.
        """
        doc = minidom.Document()
        guide = doc.createElement('guide')

        for href, ref_type, title in self.references:
            reference = doc.createElement('reference')
            # Test the reference's own value. The previous ``if type:``
            # tested the builtin, which is always truthy, so the attribute
            # was written even for an empty/None ref_type.
            if ref_type:
                reference.setAttribute('type', ref_type)
            if title:
                reference.setAttribute('title', title)
            if href:
                reference.setAttribute('href', href)
            guide.appendChild(reference)

        return guide


# diff --git a/.venv/lib/python3.12/site-packages/epub/utils.py b/.venv/lib/python3.12/site-packages/epub/utils.py
# new file mode 100644
# index 00000000..dc3a73f3
# --- /dev/null
# +++ b/.venv/lib/python3.12/site-packages/epub/utils.py
# @@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
# from __future__ import unicode_literals
# (first statement of utils.py; shown as a comment because a __future__
# import is only valid at the top of a module and this listing continues
# directly from opf.py)


def get_node_text(node):
    """
    Return the text content of an xml.dom Element Node.

    The node is normalized, then the stripped ``data`` of its first child is
    returned. If the node has no child, or its first child carries no
    ``data`` (for instance a nested element), an empty string is returned.
    """
    node.normalize()
    # getattr() guards against a first child without ``data`` (an Element),
    # which previously raised AttributeError.
    data = getattr(node.firstChild, 'data', None)
    return data.strip() if data else ''


def get_urlpath_part(urlpath):
    """
    Split a path into ``(href, fragment)`` at the first ``#``.

    This allows paths taken from references and NCX files to be looked up
    in the Manifest by href without losing the fragment part.

    ``fragment`` does not include the ``#`` separator; it is ``None`` when
    the path contains no ``#``. Any further ``#`` stays in the fragment.

    eg.:

        url = 'text/chapter1.xhtml#part2'
        href, fragment = get_urlpath_part(url)
        print(href)      # 'text/chapter1.xhtml'
        print(fragment)  # 'part2'
    """
    # partition() splits on the first '#' only; the former split('#') raised
    # ValueError on unpacking when the path held more than one '#'.
    href, separator, fragment = urlpath.partition('#')
    if not separator:
        fragment = None
    return (href, fragment)
