diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/openpyxl/reader/excel.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/openpyxl/reader/excel.py | 349 |
1 files changed, 349 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/openpyxl/reader/excel.py b/.venv/lib/python3.12/site-packages/openpyxl/reader/excel.py new file mode 100644 index 00000000..dfd8eeac --- /dev/null +++ b/.venv/lib/python3.12/site-packages/openpyxl/reader/excel.py @@ -0,0 +1,349 @@ +# Copyright (c) 2010-2024 openpyxl + + +"""Read an xlsx file into Python""" + +# Python stdlib imports +from zipfile import ZipFile, ZIP_DEFLATED +from io import BytesIO +import os.path +import warnings + +from openpyxl.pivot.table import TableDefinition + +# Allow blanket setting of KEEP_VBA for testing +try: + from ..tests import KEEP_VBA +except ImportError: + KEEP_VBA = False + +# package imports +from openpyxl.utils.exceptions import InvalidFileException +from openpyxl.xml.constants import ( + ARC_CORE, + ARC_CUSTOM, + ARC_CONTENT_TYPES, + ARC_WORKBOOK, + ARC_THEME, + COMMENTS_NS, + SHARED_STRINGS, + XLTM, + XLTX, + XLSM, + XLSX, +) +from openpyxl.cell import MergedCell +from openpyxl.comments.comment_sheet import CommentSheet + +from .strings import read_string_table, read_rich_text +from .workbook import WorkbookParser +from openpyxl.styles.stylesheet import apply_stylesheet + +from openpyxl.packaging.core import DocumentProperties +from openpyxl.packaging.custom import CustomPropertyList +from openpyxl.packaging.manifest import Manifest, Override + +from openpyxl.packaging.relationship import ( + RelationshipList, + get_dependents, + get_rels_path, +) + +from openpyxl.worksheet._read_only import ReadOnlyWorksheet +from openpyxl.worksheet._reader import WorksheetReader +from openpyxl.chartsheet import Chartsheet +from openpyxl.worksheet.table import Table +from openpyxl.drawing.spreadsheet_drawing import SpreadsheetDrawing + +from openpyxl.xml.functions import fromstring + +from .drawings import find_images + + +SUPPORTED_FORMATS = ('.xlsx', '.xlsm', '.xltx', '.xltm') + + +def _validate_archive(filename): + """ + Does a first check whether filename is a string or a file-like + object. If it is a string representing a filename, a check is done + for supported formats by checking the given file-extension. If the + file-extension is not in SUPPORTED_FORMATS an InvalidFileException + will raised. Otherwise the filename (resp. file-like object) will + forwarded to zipfile.ZipFile returning a ZipFile-Instance. + """ + is_file_like = hasattr(filename, 'read') + if not is_file_like: + file_format = os.path.splitext(filename)[-1].lower() + if file_format not in SUPPORTED_FORMATS: + if file_format == '.xls': + msg = ('openpyxl does not support the old .xls file format, ' + 'please use xlrd to read this file, or convert it to ' + 'the more recent .xlsx file format.') + elif file_format == '.xlsb': + msg = ('openpyxl does not support binary format .xlsb, ' + 'please convert this file to .xlsx format if you want ' + 'to open it with openpyxl') + else: + msg = ('openpyxl does not support %s file format, ' + 'please check you can open ' + 'it with Excel first. ' + 'Supported formats are: %s') % (file_format, + ','.join(SUPPORTED_FORMATS)) + raise InvalidFileException(msg) + + archive = ZipFile(filename, 'r') + return archive + + +def _find_workbook_part(package): + workbook_types = [XLTM, XLTX, XLSM, XLSX] + for ct in workbook_types: + part = package.find(ct) + if part: + return part + + # some applications reassign the default for application/xml + defaults = {p.ContentType for p in package.Default} + workbook_type = defaults & set(workbook_types) + if workbook_type: + return Override("/" + ARC_WORKBOOK, workbook_type.pop()) + + raise IOError("File contains no valid workbook part") + + +class ExcelReader: + + """ + Read an Excel package and dispatch the contents to the relevant modules + """ + + def __init__(self, fn, read_only=False, keep_vba=KEEP_VBA, + data_only=False, keep_links=True, rich_text=False): + self.archive = _validate_archive(fn) + self.valid_files = self.archive.namelist() + self.read_only = read_only + self.keep_vba = keep_vba + self.data_only = data_only + self.keep_links = keep_links + self.rich_text = rich_text + self.shared_strings = [] + + + def read_manifest(self): + src = self.archive.read(ARC_CONTENT_TYPES) + root = fromstring(src) + self.package = Manifest.from_tree(root) + + + def read_strings(self): + ct = self.package.find(SHARED_STRINGS) + reader = read_string_table + if self.rich_text: + reader = read_rich_text + if ct is not None: + strings_path = ct.PartName[1:] + with self.archive.open(strings_path,) as src: + self.shared_strings = reader(src) + + + def read_workbook(self): + wb_part = _find_workbook_part(self.package) + self.parser = WorkbookParser(self.archive, wb_part.PartName[1:], keep_links=self.keep_links) + self.parser.parse() + wb = self.parser.wb + wb._sheets = [] + wb._data_only = self.data_only + wb._read_only = self.read_only + wb.template = wb_part.ContentType in (XLTX, XLTM) + + # If are going to preserve the vba then attach a copy of the archive to the + # workbook so that is available for the save. + if self.keep_vba: + wb.vba_archive = ZipFile(BytesIO(), 'a', ZIP_DEFLATED) + for name in self.valid_files: + wb.vba_archive.writestr(name, self.archive.read(name)) + + if self.read_only: + wb._archive = self.archive + + self.wb = wb + + + def read_properties(self): + if ARC_CORE in self.valid_files: + src = fromstring(self.archive.read(ARC_CORE)) + self.wb.properties = DocumentProperties.from_tree(src) + + + def read_custom(self): + if ARC_CUSTOM in self.valid_files: + src = fromstring(self.archive.read(ARC_CUSTOM)) + self.wb.custom_doc_props = CustomPropertyList.from_tree(src) + + + def read_theme(self): + if ARC_THEME in self.valid_files: + self.wb.loaded_theme = self.archive.read(ARC_THEME) + + + def read_chartsheet(self, sheet, rel): + sheet_path = rel.target + rels_path = get_rels_path(sheet_path) + rels = [] + if rels_path in self.valid_files: + rels = get_dependents(self.archive, rels_path) + + with self.archive.open(sheet_path, "r") as src: + xml = src.read() + node = fromstring(xml) + cs = Chartsheet.from_tree(node) + cs._parent = self.wb + cs.title = sheet.name + self.wb._add_sheet(cs) + + drawings = rels.find(SpreadsheetDrawing._rel_type) + for rel in drawings: + charts, images = find_images(self.archive, rel.target) + for c in charts: + cs.add_chart(c) + + + def read_worksheets(self): + comment_warning = """Cell '{0}':{1} is part of a merged range but has a comment which will be removed because merged cells cannot contain any data.""" + for sheet, rel in self.parser.find_sheets(): + if rel.target not in self.valid_files: + continue + + if "chartsheet" in rel.Type: + self.read_chartsheet(sheet, rel) + continue + + rels_path = get_rels_path(rel.target) + rels = RelationshipList() + if rels_path in self.valid_files: + rels = get_dependents(self.archive, rels_path) + + if self.read_only: + ws = ReadOnlyWorksheet(self.wb, sheet.name, rel.target, self.shared_strings) + ws.sheet_state = sheet.state + self.wb._sheets.append(ws) + continue + else: + fh = self.archive.open(rel.target) + ws = self.wb.create_sheet(sheet.name) + ws._rels = rels + ws_parser = WorksheetReader(ws, fh, self.shared_strings, self.data_only, self.rich_text) + ws_parser.bind_all() + fh.close() + + # assign any comments to cells + for r in rels.find(COMMENTS_NS): + src = self.archive.read(r.target) + comment_sheet = CommentSheet.from_tree(fromstring(src)) + for ref, comment in comment_sheet.comments: + try: + ws[ref].comment = comment + except AttributeError: + c = ws[ref] + if isinstance(c, MergedCell): + warnings.warn(comment_warning.format(ws.title, c.coordinate)) + continue + + # preserve link to VML file if VBA + if self.wb.vba_archive and ws.legacy_drawing: + ws.legacy_drawing = rels.get(ws.legacy_drawing).target + else: + ws.legacy_drawing = None + + for t in ws_parser.tables: + src = self.archive.read(t) + xml = fromstring(src) + table = Table.from_tree(xml) + ws.add_table(table) + + drawings = rels.find(SpreadsheetDrawing._rel_type) + for rel in drawings: + charts, images = find_images(self.archive, rel.target) + for c in charts: + ws.add_chart(c, c.anchor) + for im in images: + ws.add_image(im, im.anchor) + + pivot_rel = rels.find(TableDefinition.rel_type) + pivot_caches = self.parser.pivot_caches + for r in pivot_rel: + pivot_path = r.Target + src = self.archive.read(pivot_path) + tree = fromstring(src) + pivot = TableDefinition.from_tree(tree) + pivot.cache = pivot_caches[pivot.cacheId] + ws.add_pivot(pivot) + + ws.sheet_state = sheet.state + + + def read(self): + action = "read manifest" + try: + self.read_manifest() + action = "read strings" + self.read_strings() + action = "read workbook" + self.read_workbook() + action = "read properties" + self.read_properties() + action = "read custom properties" + self.read_custom() + action = "read theme" + self.read_theme() + action = "read stylesheet" + apply_stylesheet(self.archive, self.wb) + action = "read worksheets" + self.read_worksheets() + action = "assign names" + self.parser.assign_names() + if not self.read_only: + self.archive.close() + except ValueError as e: + raise ValueError( + f"Unable to read workbook: could not {action} from {self.archive.filename}.\n" + "This is most probably because the workbook source files contain some invalid XML.\n" + "Please see the exception for more details." + ) from e + + +def load_workbook(filename, read_only=False, keep_vba=KEEP_VBA, + data_only=False, keep_links=True, rich_text=False): + """Open the given filename and return the workbook + + :param filename: the path to open or a file-like object + :type filename: string or a file-like object open in binary mode c.f., :class:`zipfile.ZipFile` + + :param read_only: optimised for reading, content cannot be edited + :type read_only: bool + + :param keep_vba: preserve vba content (this does NOT mean you can use it) + :type keep_vba: bool + + :param data_only: controls whether cells with formulae have either the formula (default) or the value stored the last time Excel read the sheet + :type data_only: bool + + :param keep_links: whether links to external workbooks should be preserved. The default is True + :type keep_links: bool + + :param rich_text: if set to True openpyxl will preserve any rich text formatting in cells. The default is False + :type rich_text: bool + + :rtype: :class:`openpyxl.workbook.Workbook` + + .. note:: + + When using lazy load, all worksheets will be :class:`openpyxl.worksheet.iter_worksheet.IterableWorksheet` + and the returned workbook will be read-only. + + """ + reader = ExcelReader(filename, read_only, keep_vba, + data_only, keep_links, rich_text) + reader.read() + return reader.wb |