aboutsummaryrefslogtreecommitdiff
path: root/.venv/lib/python3.12/site-packages/pypdf/_xobj_image_helpers.py
diff options
context:
space:
mode:
authorS. Solomon Darnell2025-03-28 21:52:21 -0500
committerS. Solomon Darnell2025-03-28 21:52:21 -0500
commit4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
treeee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/pypdf/_xobj_image_helpers.py
parentcc961e04ba734dd72309fb548a2f97d67d578813 (diff)
downloadgn-ai-4a52a71956a8d46fcb7294ac71734504bb09bcc2.tar.gz
two version of R2R are hereHEADmaster
Diffstat (limited to '.venv/lib/python3.12/site-packages/pypdf/_xobj_image_helpers.py')
-rw-r--r--.venv/lib/python3.12/site-packages/pypdf/_xobj_image_helpers.py307
1 files changed, 307 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_xobj_image_helpers.py b/.venv/lib/python3.12/site-packages/pypdf/_xobj_image_helpers.py
new file mode 100644
index 00000000..45b0c145
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_xobj_image_helpers.py
@@ -0,0 +1,307 @@
+"""Code in here is only used by pypdf.filters._xobj_to_image"""
+
+import sys
+from io import BytesIO
+from typing import Any, List, Tuple, Union, cast
+
+from ._utils import check_if_whitespace_only, logger_warning
+from .constants import ColorSpaces
+from .errors import PdfReadError
+from .generic import (
+ ArrayObject,
+ DecodedStreamObject,
+ EncodedStreamObject,
+ IndirectObject,
+ NullObject,
+)
+
+if sys.version_info[:2] >= (3, 8):
+ from typing import Literal
+else:
+ # PEP 586 introduced typing.Literal with Python 3.8
+ # For older Python versions, the backport typing_extensions is necessary:
+ from typing_extensions import Literal
+
+if sys.version_info[:2] >= (3, 10):
+ from typing import TypeAlias
+else:
+ from typing_extensions import TypeAlias
+
+
+try:
+ from PIL import Image, UnidentifiedImageError # noqa: F401
+except ImportError:
+ raise ImportError(
+ "pillow is required to do image extraction. "
+ "It can be installed via 'pip install pypdf[image]'"
+ )
+
+mode_str_type: TypeAlias = Literal[
+ "", "1", "RGB", "2bits", "4bits", "P", "L", "RGBA", "CMYK"
+]
+
+MAX_IMAGE_MODE_NESTING_DEPTH: int = 10
+
+
def _get_imagemode(
    color_space: Union[str, List[Any], Any],
    color_components: int,
    prev_mode: mode_str_type,
    depth: int = 0,
) -> Tuple[mode_str_type, bool]:
    """
    Translate a PDF color space into an image mode.

    Array color spaces (/Cal*, /ICCBased, /Indexed, /Separation, /DeviceN)
    are unwrapped, recursing where the array refers to another color space,
    until a plain name is reached.

    Args:
        color_space: Color space name, color space array or NullObject.
        color_components: Number of color components; used as an index into
            the mode table when the color space name is not a known key.
        prev_mode: Mode returned when neither the color space name nor
            ``color_components`` selects an entry of the mode table.
        depth: Current recursion depth (callers normally omit it).

    Returns:
        Image mode not taking into account mask(transparency)
        ColorInversion is required (like for some DeviceCMYK)

    Raises:
        PdfReadError: If the color spaces are nested deeper than
            MAX_IMAGE_MODE_NESTING_DEPTH, or if ``color_space`` is neither
            a string, a list nor a NullObject.
    """
    if depth > MAX_IMAGE_MODE_NESTING_DEPTH:
        raise PdfReadError(
            "Color spaces nested too deep. If required, consider increasing MAX_IMAGE_MODE_NESTING_DEPTH."
        )
    if isinstance(color_space, NullObject):
        return "", False
    if isinstance(color_space, str):
        pass
    elif not isinstance(color_space, list):
        raise PdfReadError(
            "Cannot interpret colorspace", color_space
        )  # pragma: no cover
    elif color_space[0].startswith("/Cal"):  # /CalRGB and /CalGray
        # Treated like the matching device space: /CalRGB -> /DeviceRGB.
        color_space = "/Device" + color_space[0][4:]
    elif color_space[0] == "/ICCBased":
        icc_profile = color_space[1].get_object()
        color_components = cast(int, icc_profile["/N"])
        # Without /Alternate the mode is selected from /N further below.
        color_space = icc_profile.get("/Alternate", "")
    elif color_space[0] == "/Indexed":
        color_space = color_space[1].get_object()
        mode2, invert_color = _get_imagemode(
            color_space, color_components, prev_mode, depth + 1
        )
        if mode2 in ("RGB", "CMYK"):
            mode2 = "P"
        return mode2, invert_color
    elif color_space[0] == "/Separation":
        color_space = color_space[2]
        if isinstance(color_space, IndirectObject):
            color_space = color_space.get_object()
        # Only the mode of the alternate space is used; inversion is
        # always reported for /Separation.
        mode2, _ = _get_imagemode(
            color_space, color_components, prev_mode, depth + 1
        )
        return mode2, True
    elif color_space[0] == "/DeviceN":
        original_color_space = color_space
        color_components = len(color_space[1])
        color_space = color_space[2]
        if isinstance(color_space, IndirectObject):  # pragma: no cover
            color_space = color_space.get_object()
        if color_space == "/DeviceCMYK" and color_components == 1:
            if original_color_space[1][0] != "/Black":
                logger_warning(
                    f"Color {original_color_space[1][0]} converted to Gray. Please share PDF with pypdf dev team",
                    __name__,
                )
            return "L", True
        mode2, invert_color = _get_imagemode(
            color_space, color_components, prev_mode, depth + 1
        )
        return mode2, invert_color

    mode_map = {
        "1bit": "1",  # pos [0] will be used for 1 bit
        "/DeviceGray": "L",  # must be in pos [1]
        "palette": "P",  # must be in pos [2] for color_components align.
        "/DeviceRGB": "RGB",  # must be in pos [3]
        "/DeviceCMYK": "CMYK",  # must be in pos [4]
        "2bit": "2bits",  # 2 bits images
        "4bit": "4bits",  # 4 bits
    }
    # Only strings can match a key. Skipping the lookup for anything else
    # (e.g. an array-valued /Alternate) avoids hashing a list and lets the
    # component count decide instead.
    named_mode = mode_map.get(color_space) if isinstance(color_space, str) else None
    mode: mode_str_type
    if named_mode:
        mode = cast(mode_str_type, named_mode)
    else:
        try:
            mode = cast(mode_str_type, list(mode_map.values())[color_components])
        except IndexError:
            # More components than the table has entries: use the caller's
            # fallback instead of failing.
            mode = prev_mode
    return mode, mode == "CMYK"
+
+
def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
    """
    Unpack ``bits``-wide samples into one byte per sample.

    Samples are read from the most significant bits of each byte first.
    Every image row starts on a fresh byte, so bits left over at the end
    of a row are padding and are skipped.

    Args:
        data: Packed sample data.
        size: Image size as (width, height).
        bits: Number of bits per sample.

    Returns:
        ``width * height`` bytes, one sample value per byte, row by row.

    Raises:
        IndexError: If ``data`` holds fewer bytes than the image requires.
    """
    width, height = size[0], size[1]
    mask = (1 << bits) - 1
    first_shift = 8 - bits  # shift selecting the first sample of a byte
    unpacked = bytearray(width * height)
    byte_index = 0
    shift = first_shift
    out_index = 0
    for _ in range(height):
        # A partially consumed byte at the end of the previous row only
        # holds padding, however many sample slots were left in it.
        if shift != first_shift:
            byte_index += 1
            shift = first_shift
        for _ in range(width):
            unpacked[out_index] = (data[byte_index] >> shift) & mask
            out_index += 1
            shift -= bits
            if shift < 0:
                byte_index += 1
                shift = first_shift
    return bytes(unpacked)
+
+
def _extended_image_frombytes(
    mode: str, size: Tuple[int, int], data: bytes
) -> Image.Image:
    """
    Build an image from raw bytes, stretching short data if possible.

    If ``Image.frombytes`` rejects the data with a ValueError and the data
    length is a multiple of the pixel count, every input byte is repeated
    so that each pixel gets ``len(mode)`` bytes, and the image is built
    again from the expanded data.

    Args:
        mode: Image mode passed to ``Image.frombytes``.
        size: Image size as (width, height).
        data: Raw image data.

    Returns:
        The image created from ``data``.

    Raises:
        ValueError: If the data cannot be loaded as is and cannot be
            expanded either (empty data, empty image, or a length that is
            not a multiple of the pixel count).
    """
    try:
        return Image.frombytes(mode, size, data)
    except ValueError:
        pixel_count = size[0] * size[1]
        # Without pixels or without data there is nothing to expand;
        # report the original error rather than dividing by zero below.
        if pixel_count == 0 or not data or len(data) % pixel_count != 0:
            raise
        repeat = int(pixel_count * len(mode) / len(data))
        data = b"".join([bytes((value,) * repeat) for value in data])
        return Image.frombytes(mode, size, data)
+
+
def _handle_flate(
    size: Tuple[int, int],
    data: bytes,
    mode: mode_str_type,
    color_space: str,
    colors: int,
    obj_as_text: str,
) -> Tuple[Image.Image, str, str, bool]:
    """
    Build an image from FlateDecode image data.

    Args:
        size: Image size as (width, height).
        data: Decoded sample data.
        mode: Image mode determined for the data.
        color_space: Color space of the image (name or array).
        colors: Number of color components.
        obj_as_text: Description of the object, used in warnings.

    Returns:
        img, image_format, extension, color inversion (always False)
    """
    image_format, extension = "PNG", ".png"  # mime_type = "image/png"
    lookup: Any
    base: Any
    hival: Any
    if isinstance(color_space, ArrayObject) and color_space[0] == "/Indexed":
        # [/Indexed base hival lookup]
        color_space, base, hival, lookup = (
            entry.get_object() for entry in color_space
        )
    # Packed 2/4 bit samples are widened to one palette index per byte.
    if mode in ("2bits", "4bits"):
        data = bits2byte(data, size, 2 if mode == "2bits" else 4)
        mode = "P"
    img = _extended_image_frombytes(mode, size, data)
    if color_space == "/Indexed":
        from .generic import TextStringObject

        # Bring the lookup table into plain bytes.
        if isinstance(lookup, (EncodedStreamObject, DecodedStreamObject)):
            lookup = lookup.get_data()
        if isinstance(lookup, TextStringObject):
            lookup = lookup.original_bytes
        if isinstance(lookup, str):
            lookup = lookup.encode()
        # mode of the base color space -> (bytes per entry, mode to
        # convert the image to, raw mode of the lookup table)
        base_mode_settings = {
            "1": (0, "", ""),
            "L": (1, "P", "L"),
            "P": (0, "", ""),
            "RGB": (3, "P", "RGB"),
            "CMYK": (4, "P", "CMYK"),
        }
        try:
            nb, conv, mode = base_mode_settings[  # type: ignore
                _get_imagemode(base, 0, "")[0]
            ]
        except KeyError:  # pragma: no cover
            logger_warning(
                f"Base {base} not coded please share the pdf file with pypdf dev team",
                __name__,
            )
            lookup = None
        else:
            if img.mode == "1":
                # Two values ("high" and "low").
                expected_count = 2 * nb
                if len(lookup) != expected_count:
                    if len(lookup) < expected_count:
                        raise PdfReadError(
                            f"Not enough lookup values: Expected {expected_count}, got {len(lookup)}."
                        )
                    if not check_if_whitespace_only(lookup[expected_count:]):
                        raise PdfReadError(
                            f"Too many lookup values: Expected {expected_count}, got {len(lookup)}."
                        )
                    lookup = lookup[:expected_count]
                low_value, high_value = lookup[:nb], lookup[nb:]
                # Replace each pixel by its lookup entry, row by row.
                arr = b"".join(
                    high_value if img.getpixel((x, y)) > 127 else low_value
                    for y in range(img.size[1])
                    for x in range(img.size[0])
                )
                img = Image.frombytes(mode, img.size, arr)
            else:
                img = img.convert(conv)
                if len(lookup) != (hival + 1) * nb:
                    logger_warning(f"Invalid Lookup Table in {obj_as_text}", __name__)
                    lookup = None
                elif mode == "L":
                    # gray lookup does not work : it is converted to a similar RGB lookup
                    lookup = b"".join(bytes((gray, gray, gray)) for gray in lookup)
                    mode = "RGB"
                # TODO : cf https://github.com/py-pdf/pypdf/pull/2039
                # this is a work around until PIL is able to process CMYK images
                elif mode == "CMYK":
                    rgb_entries = []
                    for offset in range(0, 4 * (len(lookup) // 4), 4):
                        _c, _m, _y, _k = lookup[offset : offset + 4]
                        rgb_entries.append(
                            bytes(
                                (
                                    int(255 * (1 - _c / 255) * (1 - _k / 255)),
                                    int(255 * (1 - _m / 255) * (1 - _k / 255)),
                                    int(255 * (1 - _y / 255) * (1 - _k / 255)),
                                )
                            )
                        )
                    lookup = b"".join(rgb_entries)
                    mode = "RGB"
                if lookup is not None:
                    img.putpalette(lookup, rawmode=mode)
            img = img.convert("L" if base == ColorSpaces.DEVICE_GRAY else "RGB")
    elif not isinstance(color_space, NullObject) and color_space[0] == "/ICCBased":
        # see Table 66 - Additional Entries Specific to an ICC Profile
        # Stream Dictionary
        mode2 = _get_imagemode(color_space, colors, mode)[0]
        if mode != mode2:
            # reloaded as the mode may have changed
            img = Image.frombytes(mode2, size, data)
    if mode == "CMYK":
        extension = ".tif"
        image_format = "TIFF"
    return img, image_format, extension, False
+
+
def _handle_jpx(
    size: Tuple[int, int],
    data: bytes,
    mode: mode_str_type,
    color_space: str,
    colors: int,
) -> Tuple[Image.Image, str, str, bool]:
    """
    Process image encoded as JPEG 2000 (opened with the "JPEG2000" format)

    Args:
        size: Image size as (width, height); not used here, the size is
            taken from the decoded image.
        data: The encoded JPEG 2000 data.
        mode: Mode used as fallback when resolving the color space.
        color_space: Color space of the image.
        colors: Number of color components.

    Returns:
        img, image_format, extension, inversion
    """
    extension = ".jp2"  # mime_type = "image/x-jp2"
    image_format = "JPEG2000"
    decoded = Image.open(BytesIO(data), formats=("JPEG2000",))
    mode, invert_color = _get_imagemode(color_space, colors, mode)
    if mode == "":
        # No usable color space: rely on the mode reported by the decoder.
        mode = cast(mode_str_type, decoded.mode)
        invert_color = mode == "CMYK"
    if decoded.mode == "RGBA" and mode == "RGB":
        mode = "RGBA"
    # we need to convert to the good mode
    mode_pair = {decoded.mode, mode}  # unordered comparison
    if decoded.mode == mode or mode_pair == {"L", "P"}:
        # L,P are indexed modes which should not be changed.
        img = decoded
    elif mode_pair == {"RGBA", "CMYK"}:
        # RGBA / CMYK are 4bytes encoding where
        # the encoding should be corrected
        img = Image.frombytes(mode, decoded.size, decoded.tobytes())
    else:  # pragma: no cover
        img = decoded.convert(mode)
    # for CMYK conversion :
    # https://stackoverflow.com/questions/38855022/conversion-from-cmyk-to-rgb-with-pillow-is-different-from-that-of-photoshop
    # not implemented for the moment as I need to get properly the ICC
    if img.mode == "CMYK":
        img = img.convert("RGB")
    return img, image_format, extension, invert_color