diff options
author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
---|---|---|
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/pypdf/_xobj_image_helpers.py | |
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
download | gn-ai-4a52a71956a8d46fcb7294ac71734504bb09bcc2.tar.gz |
Diffstat (limited to '.venv/lib/python3.12/site-packages/pypdf/_xobj_image_helpers.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/pypdf/_xobj_image_helpers.py | 307 |
1 files changed, 307 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_xobj_image_helpers.py b/.venv/lib/python3.12/site-packages/pypdf/_xobj_image_helpers.py new file mode 100644 index 00000000..45b0c145 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pypdf/_xobj_image_helpers.py @@ -0,0 +1,307 @@ +"""Code in here is only used by pypdf.filters._xobj_to_image""" + +import sys +from io import BytesIO +from typing import Any, List, Tuple, Union, cast + +from ._utils import check_if_whitespace_only, logger_warning +from .constants import ColorSpaces +from .errors import PdfReadError +from .generic import ( + ArrayObject, + DecodedStreamObject, + EncodedStreamObject, + IndirectObject, + NullObject, +) + +if sys.version_info[:2] >= (3, 8): + from typing import Literal +else: + # PEP 586 introduced typing.Literal with Python 3.8 + # For older Python versions, the backport typing_extensions is necessary: + from typing_extensions import Literal + +if sys.version_info[:2] >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + + +try: + from PIL import Image, UnidentifiedImageError # noqa: F401 +except ImportError: + raise ImportError( + "pillow is required to do image extraction. " + "It can be installed via 'pip install pypdf[image]'" + ) + +mode_str_type: TypeAlias = Literal[ + "", "1", "RGB", "2bits", "4bits", "P", "L", "RGBA", "CMYK" +] + +MAX_IMAGE_MODE_NESTING_DEPTH: int = 10 + + +def _get_imagemode( + color_space: Union[str, List[Any], Any], + color_components: int, + prev_mode: mode_str_type, + depth: int = 0, +) -> Tuple[mode_str_type, bool]: + """ + Returns + Image mode not taking into account mask(transparency) + ColorInversion is required (like for some DeviceCMYK) + """ + if depth > MAX_IMAGE_MODE_NESTING_DEPTH: + raise PdfReadError( + "Color spaces nested too deep. If required, consider increasing MAX_IMAGE_MODE_NESTING_DEPTH." + ) + if isinstance(color_space, NullObject): + return "", False + if isinstance(color_space, str): + pass + elif not isinstance(color_space, list): + raise PdfReadError( + "Cannot interpret colorspace", color_space + ) # pragma: no cover + elif color_space[0].startswith("/Cal"): # /CalRGB and /CalGray + color_space = "/Device" + color_space[0][4:] + elif color_space[0] == "/ICCBased": + icc_profile = color_space[1].get_object() + color_components = cast(int, icc_profile["/N"]) + color_space = icc_profile.get("/Alternate", "") + elif color_space[0] == "/Indexed": + color_space = color_space[1].get_object() + mode2, invert_color = _get_imagemode( + color_space, color_components, prev_mode, depth + 1 + ) + if mode2 in ("RGB", "CMYK"): + mode2 = "P" + return mode2, invert_color + elif color_space[0] == "/Separation": + color_space = color_space[2] + if isinstance(color_space, IndirectObject): + color_space = color_space.get_object() + mode2, invert_color = _get_imagemode( + color_space, color_components, prev_mode, depth + 1 + ) + return mode2, True + elif color_space[0] == "/DeviceN": + original_color_space = color_space + color_components = len(color_space[1]) + color_space = color_space[2] + if isinstance(color_space, IndirectObject): # pragma: no cover + color_space = color_space.get_object() + if color_space == "/DeviceCMYK" and color_components == 1: + if original_color_space[1][0] != "/Black": + logger_warning( + f"Color {original_color_space[1][0]} converted to Gray. Please share PDF with pypdf dev team", + __name__, + ) + return "L", True + mode2, invert_color = _get_imagemode( + color_space, color_components, prev_mode, depth + 1 + ) + return mode2, invert_color + + mode_map = { + "1bit": "1", # pos [0] will be used for 1 bit + "/DeviceGray": "L", # must be in pos [1] + "palette": "P", # must be in pos [2] for color_components align. + "/DeviceRGB": "RGB", # must be in pos [3] + "/DeviceCMYK": "CMYK", # must be in pos [4] + "2bit": "2bits", # 2 bits images + "4bit": "4bits", # 4 bits + } + mode: mode_str_type = ( + mode_map.get(color_space) # type: ignore + or list(mode_map.values())[color_components] + or prev_mode + ) + return mode, mode == "CMYK" + + +def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: + mask = (1 << bits) - 1 + nbuff = bytearray(size[0] * size[1]) + by = 0 + bit = 8 - bits + for y in range(size[1]): + if (bit != 0) and (bit != 8 - bits): + by += 1 + bit = 8 - bits + for x in range(size[0]): + nbuff[y * size[0] + x] = (data[by] >> bit) & mask + bit -= bits + if bit < 0: + by += 1 + bit = 8 - bits + return bytes(nbuff) + + +def _extended_image_frombytes( + mode: str, size: Tuple[int, int], data: bytes +) -> Image.Image: + try: + img = Image.frombytes(mode, size, data) + except ValueError as exc: + nb_pix = size[0] * size[1] + if len(data) % nb_pix != 0: + raise exc + k = nb_pix * len(mode) / len(data) + data = b"".join([bytes((x,) * int(k)) for x in data]) + img = Image.frombytes(mode, size, data) + return img + + +def _handle_flate( + size: Tuple[int, int], + data: bytes, + mode: mode_str_type, + color_space: str, + colors: int, + obj_as_text: str, +) -> Tuple[Image.Image, str, str, bool]: + """ + Process image encoded in flateEncode + Returns img, image_format, extension, color inversion + """ + extension = ".png" # mime_type = "image/png" + image_format = "PNG" + lookup: Any + base: Any + hival: Any + if isinstance(color_space, ArrayObject) and color_space[0] == "/Indexed": + color_space, base, hival, lookup = (value.get_object() for value in color_space) + if mode == "2bits": + mode = "P" + data = bits2byte(data, size, 2) + elif mode == "4bits": + mode = "P" + data = bits2byte(data, size, 4) + img = _extended_image_frombytes(mode, size, data) + if color_space == "/Indexed": + from .generic import TextStringObject + + if isinstance(lookup, (EncodedStreamObject, DecodedStreamObject)): + lookup = lookup.get_data() + if isinstance(lookup, TextStringObject): + lookup = lookup.original_bytes + if isinstance(lookup, str): + lookup = lookup.encode() + try: + nb, conv, mode = { # type: ignore + "1": (0, "", ""), + "L": (1, "P", "L"), + "P": (0, "", ""), + "RGB": (3, "P", "RGB"), + "CMYK": (4, "P", "CMYK"), + }[_get_imagemode(base, 0, "")[0]] + except KeyError: # pragma: no cover + logger_warning( + f"Base {base} not coded please share the pdf file with pypdf dev team", + __name__, + ) + lookup = None + else: + if img.mode == "1": + # Two values ("high" and "low"). + expected_count = 2 * nb + if len(lookup) != expected_count: + if len(lookup) < expected_count: + raise PdfReadError( + f"Not enough lookup values: Expected {expected_count}, got {len(lookup)}." + ) + if not check_if_whitespace_only(lookup[expected_count:]): + raise PdfReadError( + f"Too many lookup values: Expected {expected_count}, got {len(lookup)}." + ) + lookup = lookup[:expected_count] + colors_arr = [lookup[:nb], lookup[nb:]] + arr = b"".join( + [ + b"".join( + [ + colors_arr[1 if img.getpixel((x, y)) > 127 else 0] + for x in range(img.size[0]) + ] + ) + for y in range(img.size[1]) + ] + ) + img = Image.frombytes(mode, img.size, arr) + else: + img = img.convert(conv) + if len(lookup) != (hival + 1) * nb: + logger_warning(f"Invalid Lookup Table in {obj_as_text}", __name__) + lookup = None + elif mode == "L": + # gray lookup does not work : it is converted to a similar RGB lookup + lookup = b"".join([bytes([b, b, b]) for b in lookup]) + mode = "RGB" + # TODO : cf https://github.com/py-pdf/pypdf/pull/2039 + # this is a work around until PIL is able to process CMYK images + elif mode == "CMYK": + _rgb = [] + for _c, _m, _y, _k in ( + lookup[n : n + 4] for n in range(0, 4 * (len(lookup) // 4), 4) + ): + _r = int(255 * (1 - _c / 255) * (1 - _k / 255)) + _g = int(255 * (1 - _m / 255) * (1 - _k / 255)) + _b = int(255 * (1 - _y / 255) * (1 - _k / 255)) + _rgb.append(bytes((_r, _g, _b))) + lookup = b"".join(_rgb) + mode = "RGB" + if lookup is not None: + img.putpalette(lookup, rawmode=mode) + img = img.convert("L" if base == ColorSpaces.DEVICE_GRAY else "RGB") + elif not isinstance(color_space, NullObject) and color_space[0] == "/ICCBased": + # see Table 66 - Additional Entries Specific to an ICC Profile + # Stream Dictionary + mode2 = _get_imagemode(color_space, colors, mode)[0] + if mode != mode2: + img = Image.frombytes(mode2, size, data) # reloaded as mode may have change + if mode == "CMYK": + extension = ".tif" + image_format = "TIFF" + return img, image_format, extension, False + + +def _handle_jpx( + size: Tuple[int, int], + data: bytes, + mode: mode_str_type, + color_space: str, + colors: int, +) -> Tuple[Image.Image, str, str, bool]: + """ + Process image encoded in flateEncode + Returns img, image_format, extension, inversion + """ + extension = ".jp2" # mime_type = "image/x-jp2" + img1 = Image.open(BytesIO(data), formats=("JPEG2000",)) + mode, invert_color = _get_imagemode(color_space, colors, mode) + if mode == "": + mode = cast(mode_str_type, img1.mode) + invert_color = mode in ("CMYK",) + if img1.mode == "RGBA" and mode == "RGB": + mode = "RGBA" + # we need to convert to the good mode + if img1.mode == mode or {img1.mode, mode} == {"L", "P"}: # compare (unordered) sets + # L,P are indexed modes which should not be changed. + img = img1 + elif {img1.mode, mode} == {"RGBA", "CMYK"}: + # RGBA / CMYK are 4bytes encoding where + # the encoding should be corrected + img = Image.frombytes(mode, img1.size, img1.tobytes()) + else: # pragma: no cover + img = img1.convert(mode) + # for CMYK conversion : + # https://stcom/questions/38855022/conversion-from-cmyk-to-rgb-with-pillow-is-different-from-that-of-photoshop + # not implemented for the moment as I need to get properly the ICC + if img.mode == "CMYK": + img = img.convert("RGB") + image_format = "JPEG2000" + return img, image_format, extension, invert_color |