Two versions of R2R are here. HEAD master

author: S. Solomon Darnell 2025-03-28 21:52:21 -0500
committer: S. Solomon Darnell 2025-03-28 21:52:21 -0500
commit: 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree: ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/pypdf/filters.py
parent: cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download: gn-ai-master.tar.gz
1 files changed, 910 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/pypdf/filters.py b/.venv/lib/python3.12/site-packages/pypdf/filters.py
new file mode 100644
index 00000000..5e6a10f7
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/filters.py
@@ -0,0 +1,910 @@
+# Copyright (c) 2006, Mathieu Fenniak
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+"""
+Implementation of stream filters for PDF.
+
+See TABLE H.1 Abbreviations for standard filter names
+"""
+__author__ = "Mathieu Fenniak"
+__author_email__ = "biziqe@mathieu.fenniak.net"
+
+import math
+import struct
+import zlib
+from base64 import a85decode
+from io import BytesIO
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
+
+from ._utils import (
+    WHITESPACES_AS_BYTES,
+    b_,
+    deprecate_with_replacement,
+    deprecation_no_replacement,
+    logger_warning,
+    ord_,
+)
+from .constants import CcittFaxDecodeParameters as CCITT
+from .constants import ColorSpaces
+from .constants import FilterTypeAbbreviations as FTA
+from .constants import FilterTypes as FT
+from .constants import ImageAttributes as IA
+from .constants import LzwFilterParameters as LZW
+from .constants import StreamAttributes as SA
+from .errors import DeprecationError, PdfReadError, PdfStreamError
+from .generic import (
+    ArrayObject,
+    DictionaryObject,
+    IndirectObject,
+    NullObject,
+)
+
+
+def decompress(data: bytes) -> bytes:
+    """
+    Decompress the given data using zlib.
+
+    ``zlib.decompress`` is tried first, then a streaming decompress object.
+    As a last resort the data is fed byte by byte to a decompressor with
+    zlib/gzip header auto-detection, skipping bytes that raise ``zlib.error``.
+
+    Args:
+        data: The input data to be decompressed.
+
+    Returns:
+        The decompressed data.
+    """
+    try:
+        return zlib.decompress(data)
+    except zlib.error:
+        try:
+            # For larger files, use Decompress object to enable buffered reading
+            return zlib.decompressobj().decompress(data)
+        except zlib.error:
+            # Last resort: ``MAX_WBITS | 32`` auto-detects a zlib or gzip header
+            d = zlib.decompressobj(zlib.MAX_WBITS | 32)
+            chunks = []  # joined once at the end; repeated ``bytes +=`` is O(n^2)
+            for i in range(len(data)):
+                try:
+                    chunks.append(d.decompress(data[i : i + 1]))
+                except zlib.error:
+                    pass  # best effort: drop the offending byte and carry on
+            return b"".join(chunks)
+
+
+class FlateDecode:
+    @staticmethod
+    def decode(
+        data: bytes,
+        decode_parms: Optional[DictionaryObject] = None,
+        **kwargs: Any,
+    ) -> bytes:
+        """
+        Decode data which is flate-encoded.
+
+        Args:
+          data: flate-encoded data.
+          decode_parms: a dictionary of values; "/Predictor":<int> selects the
+            predictor, and its columns/colors/bits-per-component entries are read
+
+        Returns:
+          The flate-decoded data.
+
+        Raises:
+          PdfReadError: On an unsupported predictor/PNG filter or ragged rows.
+        """
+        if "decodeParms" in kwargs:  # deprecated
+            deprecate_with_replacement("decodeParms", "parameters", "4.0.0")
+            decode_parms = kwargs["decodeParms"]
+        if isinstance(decode_parms, ArrayObject):
+            raise DeprecationError("decode_parms as ArrayObject is depreciated")
+
+        str_data = decompress(data)
+        predictor = 1
+
+        if decode_parms:
+            try:
+                predictor = decode_parms.get("/Predictor", 1)
+            except (AttributeError, TypeError):  # Type Error is NullObject
+                pass  # Usually an array with a null object was read
+        # predictor 1 == no predictor
+        if predictor != 1:
+            # /Columns, the number of samples in each row, has a default value of 1;
+            # §7.4.4.3, ISO 32000.
+            DEFAULT_BITS_PER_COMPONENT = 8
+            try:
+                columns = cast(int, decode_parms[LZW.COLUMNS].get_object())  # type: ignore
+            except (TypeError, KeyError):
+                columns = 1
+            try:
+                colors = cast(int, decode_parms[LZW.COLORS].get_object())  # type: ignore
+            except (TypeError, KeyError):
+                colors = 1
+            try:
+                bits_per_component = cast(
+                    int,
+                    decode_parms[LZW.BITS_PER_COMPONENT].get_object(),  # type: ignore
+                )
+            except (TypeError, KeyError):
+                bits_per_component = DEFAULT_BITS_PER_COMPONENT
+
+            # PNG predictor can vary by row and so is the lead byte on each row
+            rowlength = (
+                math.ceil(columns * colors * bits_per_component / 8) + 1
+            )  # number of bytes
+
+            # TIFF prediction:
+            if predictor == 2:
+                rowlength -= 1  # remove the predictor byte
+                bpp = rowlength // columns
+                str_data = bytearray(str_data)
+                for i in range(len(str_data)):
+                    if i % rowlength >= bpp:
+                        str_data[i] = (str_data[i] + str_data[i - bpp]) % 256
+                str_data = bytes(str_data)
+            # PNG prediction:
+            elif 10 <= predictor <= 15:
+                str_data = FlateDecode._decode_png_prediction(
+                    str_data, columns, rowlength
+                )
+            else:
+                # unsupported predictor
+                raise PdfReadError(f"Unsupported flatedecode predictor {predictor!r}")
+        return str_data
+
+    @staticmethod
+    def _decode_png_prediction(data: bytes, columns: int, rowlength: int) -> bytes:
+        # PNG prediction can vary from row to row
+        if len(data) % rowlength != 0:
+            raise PdfReadError("Image data is not rectangular")
+        output = []
+        prev_rowdata = (0,) * rowlength
+        bpp = (rowlength - 1) // columns  # recomputed locally to not change params
+        for row in range(0, len(data), rowlength):
+            rowdata: List[int] = list(data[row : row + rowlength])
+            filter_byte = rowdata[0]
+
+            if filter_byte == 0:
+                pass
+            elif filter_byte == 1:
+                for i in range(bpp + 1, rowlength):
+                    rowdata[i] = (rowdata[i] + rowdata[i - bpp]) % 256
+            elif filter_byte == 2:
+                for i in range(1, rowlength):
+                    rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
+            elif filter_byte == 3:
+                for i in range(1, bpp + 1):
+                    # left = 0
+                    floor = prev_rowdata[i] // 2
+                    rowdata[i] = (rowdata[i] + floor) % 256
+                for i in range(bpp + 1, rowlength):
+                    left = rowdata[i - bpp]
+                    floor = (left + prev_rowdata[i]) // 2
+                    rowdata[i] = (rowdata[i] + floor) % 256
+            elif filter_byte == 4:
+                for i in range(1, bpp + 1):
+                    # left = 0
+                    up = prev_rowdata[i]
+                    # up_left = 0
+                    paeth = up
+                    rowdata[i] = (rowdata[i] + paeth) % 256
+                for i in range(bpp + 1, rowlength):
+                    left = rowdata[i - bpp]
+                    up = prev_rowdata[i]
+                    up_left = prev_rowdata[i - bpp]
+
+                    p = left + up - up_left
+                    dist_left = abs(p - left)
+                    dist_up = abs(p - up)
+                    dist_up_left = abs(p - up_left)
+
+                    if dist_left <= dist_up and dist_left <= dist_up_left:
+                        paeth = left
+                    elif dist_up <= dist_up_left:
+                        paeth = up
+                    else:
+                        paeth = up_left
+
+                    rowdata[i] = (rowdata[i] + paeth) % 256
+            else:
+                # unsupported PNG filter
+                raise PdfReadError(
+                    f"Unsupported PNG filter {filter_byte!r}"
+                )  # pragma: no cover
+            prev_rowdata = tuple(rowdata)
+            output.extend(rowdata[1:])
+        return bytes(output)
+
+    @staticmethod
+    def encode(data: bytes, level: int = -1) -> bytes:
+        """
+        Compress the input data using zlib.
+
+        Args:
+            data: The data to be compressed.
+            level: See https://docs.python.org/3/library/zlib.html#zlib.compress
+
+        Returns:
+            The compressed data.
+        """
+        return zlib.compress(data, level)
+
+
+class ASCIIHexDecode:
+    """
+    The ASCIIHexDecode filter decodes data that has been encoded in ASCII
+    hexadecimal form back into the binary data it represents.
+    """
+
+    @staticmethod
+    def decode(
+        data: Union[str, bytes],
+        decode_parms: Optional[DictionaryObject] = None,
+        **kwargs: Any,
+    ) -> bytes:
+        """
+        Decode an ASCII-Hex encoded data stream.
+
+        Whitespace is skipped and decoding stops at the ``>`` EOD marker. If
+        an odd number of hex digits was read, the data is decoded as if a
+        ``0`` followed the last digit (§7.4.2, ISO 32000).
+
+        Args:
+          data: a str or bytes sequence of hexadecimal digits.
+          decode_parms: ignored.
+
+        Returns:
+          The decoded bytes.
+
+        Raises:
+          ValueError: If a non-hexadecimal character is encountered.
+        """
+        if isinstance(data, str):
+            data = data.encode()
+        retval = b""
+        hex_pair = b""
+        index = 0
+        while True:
+            if index >= len(data):
+                logger_warning(
+                    "missing EOD in ASCIIHexDecode, check if output is OK", __name__
+                )
+                break  # reach End Of String even if no EOD
+            char = data[index : index + 1]
+            if char == b">":
+                break
+            elif char.isspace():
+                index += 1
+                continue
+            hex_pair += char
+            if len(hex_pair) == 2:
+                retval += bytes((int(hex_pair, base=16),))
+                hex_pair = b""
+            index += 1
+        if hex_pair:  # odd digit count: pad with a trailing zero, never drop it
+            retval += bytes((int(hex_pair + b"0", base=16),))
+        return retval
+
+
+class RunLengthDecode:
+    """
+    The RunLengthDecode filter decodes data that has been encoded in a
+    simple byte-oriented format based on run length.
+    The encoded data is a sequence of runs, where each run consists of
+    a length byte followed by 1 to 128 bytes of data. If the length byte is
+    in the range 0 to 127,
+    the following length + 1 (1 to 128) bytes are copied literally during
+    decompression.
+    If length is in the range 129 to 255, the following single byte is to be
+    copied 257 − length (2 to 128) times during decompression. A length value
+    of 128 denotes EOD.
+    """
+
+    @staticmethod
+    def decode(
+        data: bytes,
+        decode_parms: Optional[DictionaryObject] = None,
+        **kwargs: Any,
+    ) -> bytes:
+        """
+        Decode a run length encoded data stream.
+
+        Args:
+          data: a bytes sequence of length/data
+          decode_parms: ignored.
+
+        Returns:
+          A bytes decompressed sequence.
+
+        Raises:
+          PdfStreamError: If data follows the EOD marker (length byte 128).
+        """
+        # decode_parms is unused here
+        lst = []
+        index = 0
+        while True:
+            if index >= len(data):
+                logger_warning(
+                    "missing EOD in RunLengthDecode, check if output is OK", __name__
+                )
+                break  # reach End Of String even if no EOD
+            length = data[index]
+            index += 1
+            if length == 128:
+                if index < len(data):
+                    raise PdfStreamError("early EOD in RunLengthDecode")
+                else:
+                    break
+            elif length < 128:
+                length += 1
+                lst.append(data[index : (index + length)])
+                index += length
+            else:  # >128
+                length = 257 - length
+                # slice, not index: a truncated run yields b"" instead of IndexError
+                lst.append(data[index : index + 1] * length)
+                index += 1
+        return b"".join(lst)
+
+
+class LZWDecode:
+    """
+    Taken from:
+
+    http://www.java2s.com/Open-Source/Java-Document/PDF/PDF-
+    Renderer/com/sun/pdfview/decode/LZWDecode.java.htm
+    """
+
+    class Decoder:
+        def __init__(self, data: bytes) -> None:
+            self.STOP = 257
+            self.CLEARDICT = 256
+            self.data = data
+            self.bytepos = 0
+            self.bitpos = 0
+            self.dict = [""] * 4096
+            for i in range(256):
+                self.dict[i] = chr(i)
+            self.reset_dict()
+
+        def reset_dict(self) -> None:
+            self.dictlen = 258
+            self.bitspercode = 9
+
+        def next_code(self) -> int:
+            fillbits = self.bitspercode
+            value = 0
+            while fillbits > 0:
+                if self.bytepos >= len(self.data):
+                    return -1
+                nextbits = ord_(self.data[self.bytepos])
+                bitsfromhere = 8 - self.bitpos
+                bitsfromhere = min(bitsfromhere, fillbits)
+                value |= (
+                    (nextbits >> (8 - self.bitpos - bitsfromhere))
+                    & (0xFF >> (8 - bitsfromhere))
+                ) << (fillbits - bitsfromhere)
+                fillbits -= bitsfromhere
+                self.bitpos += bitsfromhere
+                if self.bitpos >= 8:
+                    self.bitpos = 0
+                    self.bytepos = self.bytepos + 1
+            return value
+
+        def decode(self) -> str:
+            """
+            TIFF 6.0 specification explains in sufficient details the steps to
+            implement the LZW encode() and decode() algorithms.
+
+            algorithm derived from:
+            http://www.rasip.fer.hr/research/compress/algorithms/fund/lz/lzw.html
+            and the PDFReference
+
+            Raises:
+              PdfReadError: If the stop code is missing
+            """
+            cW = self.CLEARDICT
+            baos = ""
+            while True:
+                pW = cW
+                cW = self.next_code()
+                if cW == -1:
+                    raise PdfReadError("Missed the stop code in LZWDecode!")
+                if cW == self.STOP:
+                    break
+                elif cW == self.CLEARDICT:
+                    self.reset_dict()
+                elif pW == self.CLEARDICT:
+                    baos += self.dict[cW]
+                else:
+                    if cW < self.dictlen:
+                        baos += self.dict[cW]
+                        p = self.dict[pW] + self.dict[cW][0]
+                        self.dict[self.dictlen] = p
+                        self.dictlen += 1
+                    else:
+                        p = self.dict[pW] + self.dict[pW][0]
+                        baos += p
+                        self.dict[self.dictlen] = p
+                        self.dictlen += 1
+                    if (
+                        self.dictlen >= (1 << self.bitspercode) - 1
+                        and self.bitspercode < 12
+                    ):
+                        self.bitspercode += 1
+            return baos
+
+    @staticmethod
+    def decode(
+        data: bytes,
+        decode_parms: Optional[DictionaryObject] = None,
+        **kwargs: Any,
+    ) -> str:
+        """
+        Decode an LZW encoded data stream.
+
+        Args:
+          data: ``bytes`` or ``str`` text to decode.
+          decode_parms: a dictionary of parameter values.
+
+        Returns:
+          decoded data as a ``str``, one character per decoded byte.
+        """
+        # decode_parms is unused here
+
+        return LZWDecode.Decoder(data).decode()
+
+
+class ASCII85Decode:
+    """Decodes string ASCII85-encoded data into a byte format."""
+
+    @staticmethod
+    def decode(
+        data: Union[str, bytes],
+        decode_parms: Optional[DictionaryObject] = None,
+        **kwargs: Any,
+    ) -> bytes:
+        """
+        Decode an Ascii85 encoded data stream.
+
+        Args:
+          data: ``bytes`` or ``str`` text to decode.
+          decode_parms: a dictionary of parameter values.
+
+        Returns:
+          decoded data.
+        """
+        if isinstance(data, str):
+            data = data.encode()
+        data = data.strip(WHITESPACES_AS_BYTES)
+        return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES)
+
+
+class DCTDecode:
+    @staticmethod
+    def decode(
+        data: bytes,
+        decode_parms: Optional[DictionaryObject] = None,
+        **kwargs: Any,
+    ) -> bytes:
+        # decode_parms is unused here
+        return data
+
+
+class JPXDecode:
+    @staticmethod
+    def decode(
+        data: bytes,
+        decode_parms: Optional[DictionaryObject] = None,
+        **kwargs: Any,
+    ) -> bytes:
+        # decode_parms is unused here
+        return data
+
+
+class CCITParameters:
+    """§7.4.6, optional parameters for the CCITTFaxDecode filter."""
+
+    def __init__(self, K: int = 0, columns: int = 0, rows: int = 0) -> None:
+        self.K = K
+        self.EndOfBlock = None
+        self.EndOfLine = None
+        self.EncodedByteAlign = None
+        self.columns = columns  # width
+        self.rows = rows  # height
+        self.DamagedRowsBeforeError = None
+
+    @property
+    def group(self) -> int:
+        if self.K < 0:
+            CCITTgroup = 4
+        else:
+            # k == 0: Pure one-dimensional encoding (Group 3, 1-D)
+            # k > 0: Mixed one- and two-dimensional encoding (Group 3, 2-D)
+            CCITTgroup = 3
+        return CCITTgroup
+
+
+class CCITTFaxDecode:
+    """
+    §7.4.6, CCITTFaxDecode filter (ISO 32000).
+
+    Either Group 3 or Group 4 CCITT facsimile (fax) encoding.
+    CCITT encoding is bit-oriented, not byte-oriented.
+
+    §7.4.6, optional parameters for the CCITTFaxDecode filter.
+    """
+
+    @staticmethod
+    def _get_parameters(
+        parameters: Union[None, ArrayObject, DictionaryObject, IndirectObject],
+        rows: int,
+    ) -> CCITParameters:
+        # §7.4.6, optional parameters for the CCITTFaxDecode filter
+        k = 0
+        columns = 1728
+        if parameters:
+            parameters_unwrapped = cast(
+                Union[ArrayObject, DictionaryObject], parameters.get_object()
+            )
+            if isinstance(parameters_unwrapped, ArrayObject):
+                for decode_parm in parameters_unwrapped:
+                    if CCITT.COLUMNS in decode_parm:
+                        columns = decode_parm[CCITT.COLUMNS]
+                    if CCITT.K in decode_parm:
+                        k = decode_parm[CCITT.K]
+            else:
+                if CCITT.COLUMNS in parameters_unwrapped:
+                    columns = parameters_unwrapped[CCITT.COLUMNS]  # type: ignore
+                if CCITT.K in parameters_unwrapped:
+                    k = parameters_unwrapped[CCITT.K]  # type: ignore
+
+        return CCITParameters(k, columns, rows)
+
+    @staticmethod
+    def decode(
+        data: bytes,
+        decode_parms: Optional[DictionaryObject] = None,
+        height: int = 0,
+        **kwargs: Any,
+    ) -> bytes:
+        # decode_parms supplies the K and Columns entries via _get_parameters
+        if "decodeParms" in kwargs:  # deprecated
+            deprecate_with_replacement("decodeParms", "parameters", "4.0.0")
+            decode_parms = kwargs["decodeParms"]
+        if isinstance(decode_parms, ArrayObject):  # deprecated
+            deprecation_no_replacement(
+                "decode_parms being an ArrayObject", removed_in="3.15.5"
+            )
+        params = CCITTFaxDecode._get_parameters(decode_parms, height)
+
+        img_size = len(data)
+        tiff_header_struct = "<2shlh" + "hhll" * 8 + "h"
+        tiff_header = struct.pack(
+            tiff_header_struct,
+            b"II",  # Byte order indication: Little endian
+            42,  # Version number (always 42)
+            8,  # Offset to first IFD
+            8,  # Number of tags in IFD
+            256,
+            4,
+            1,
+            params.columns,  # ImageWidth, LONG, 1, width
+            257,
+            4,
+            1,
+            params.rows,  # ImageLength, LONG, 1, length
+            258,
+            3,
+            1,
+            1,  # BitsPerSample, SHORT, 1, 1
+            259,
+            3,
+            1,
+            params.group,  # Compression, SHORT, 1, 3 or 4 = CCITT Group 3/4 fax encoding
+            262,
+            3,
+            1,
+            0,  # PhotometricInterpretation, SHORT, 1, 0 = WhiteIsZero
+            273,
+            4,
+            1,
+            struct.calcsize(
+                tiff_header_struct
+            ),  # StripOffsets, LONG, 1, length of header
+            278,
+            4,
+            1,
+            params.rows,  # RowsPerStrip, LONG, 1, length
+            279,
+            4,
+            1,
+            img_size,  # StripByteCounts, LONG, 1, size of image
+            0,  # last IFD
+        )
+
+        return tiff_header + data
+
+
+def decode_stream_data(stream: Any) -> Union[bytes, str]:  # utils.StreamObject
+    """
+    Decode the stream data based on the specified filters.
+
+    This function decodes the stream data using the filters provided in the
+    stream. It supports various filter types, including FlateDecode,
+    ASCIIHexDecode, RunLengthDecode, LZWDecode, ASCII85Decode, DCTDecode, JPXDecode, and
+    CCITTFaxDecode.
+
+    Args:
+        stream: The input stream object containing the data and filters.
+
+    Returns:
+        The decoded stream data.
+
+    Raises:
+        NotImplementedError: If an unsupported filter type is encountered.
+    """
+    filters = stream.get(SA.FILTER, ())
+    if isinstance(filters, IndirectObject):
+        filters = cast(ArrayObject, filters.get_object())
+    if not isinstance(filters, ArrayObject):
+        # we have a single filter instance
+        filters = (filters,)
+    decodparms = stream.get(SA.DECODE_PARMS, ({},) * len(filters))
+    if not isinstance(decodparms, (list, tuple)):
+        decodparms = (decodparms,)
+    data: bytes = b_(stream._data)
+    # If there is no data, there is nothing to decode.
+    if data:
+        for filter_type, params in zip(filters, decodparms):
+            if isinstance(params, NullObject):
+                params = {}
+            if filter_type in (FT.FLATE_DECODE, FTA.FL):
+                data = FlateDecode.decode(data, params)
+            elif filter_type in (FT.ASCII_HEX_DECODE, FTA.AHx):
+                data = ASCIIHexDecode.decode(data)
+            elif filter_type in (FT.RUN_LENGTH_DECODE, FTA.RL):
+                data = RunLengthDecode.decode(data)
+            elif filter_type in (FT.LZW_DECODE, FTA.LZW):
+                data = LZWDecode.decode(data, params)  # type: ignore
+            elif filter_type in (FT.ASCII_85_DECODE, FTA.A85):
+                data = ASCII85Decode.decode(data)
+            elif filter_type == FT.DCT_DECODE:
+                data = DCTDecode.decode(data)
+            elif filter_type == FT.JPX_DECODE:
+                data = JPXDecode.decode(data)
+            elif filter_type == FT.CCITT_FAX_DECODE:
+                height = stream.get(IA.HEIGHT, ())
+                data = CCITTFaxDecode.decode(data, params, height)
+            elif filter_type == "/Crypt":
+                if "/Name" in params or "/Type" in params:
+                    raise NotImplementedError(
+                        "/Crypt filter with /Name or /Type not supported yet"
+                    )
+            else:
+                # Unsupported filter
+                raise NotImplementedError(f"unsupported filter {filter_type}")
+    return data
+
+
+def decodeStreamData(stream: Any) -> Union[str, bytes]:  # deprecated
+    """Deprecated. Use decode_stream_data."""
+    deprecate_with_replacement("decodeStreamData", "decode_stream_data", "4.0.0")
+    return decode_stream_data(stream)
+
+
+def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, Any]:
+    """
+    Users need to have the pillow package installed.
+
+    It's unclear if pypdf will keep this function here, hence it's private.
+    It might get removed at any point.
+
+    Args:
+      x_object_obj:
+
+    Returns:
+        Tuple[file extension, bytes, PIL.Image.Image]
+    """
+    from ._xobj_image_helpers import (
+        Image,
+        UnidentifiedImageError,
+        _extended_image_frombytes,
+        _get_imagemode,
+        _handle_flate,
+        _handle_jpx,
+        mode_str_type,
+    )
+
+    # for error reporting (NOTE(review): the test below is never true, repr(obj) is always used)
+    if (
+        hasattr(x_object_obj, "indirect_reference") and x_object_obj is None
+    ):  # pragma: no cover
+        obj_as_text = x_object_obj.indirect_reference.__repr__()
+    else:
+        obj_as_text = x_object_obj.__repr__()
+
+    size = (cast(int, x_object_obj[IA.WIDTH]), cast(int, x_object_obj[IA.HEIGHT]))
+    data = x_object_obj.get_data()  # type: ignore
+    if isinstance(data, str):  # pragma: no cover
+        data = data.encode()
+    if len(data) % (size[0] * size[1]) == 1 and data[-1] == 0x0A:  # ie. '\n'
+        data = data[:-1]
+    colors = x_object_obj.get("/Colors", 1)
+    color_space: Any = x_object_obj.get("/ColorSpace", NullObject()).get_object()
+    if isinstance(color_space, list) and len(color_space) == 1:
+        color_space = color_space[0].get_object()
+    if (
+        IA.COLOR_SPACE in x_object_obj
+        and x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB
+    ):
+        # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes
+        mode: mode_str_type = "RGB"
+    if x_object_obj.get("/BitsPerComponent", 8) < 8:
+        mode, invert_color = _get_imagemode(
+            f"{x_object_obj.get('/BitsPerComponent', 8)}bit", 0, ""
+        )
+    else:
+        mode, invert_color = _get_imagemode(
+            color_space,
+            2
+            if (
+                colors == 1
+                and (
+                    not isinstance(color_space, NullObject)
+                    and "Gray" not in color_space
+                )
+            )
+            else colors,
+            "",
+        )
+    extension = None
+    alpha = None
+    filters = x_object_obj.get(SA.FILTER, NullObject()).get_object()
+    lfilters = filters[-1] if isinstance(filters, list) else filters
+    if lfilters in (FT.FLATE_DECODE, FT.RUN_LENGTH_DECODE):
+        img, image_format, extension, _ = _handle_flate(
+            size,
+            data,
+            mode,
+            color_space,
+            colors,
+            obj_as_text,
+        )
+    elif lfilters in (FT.LZW_DECODE, FT.ASCII_85_DECODE, FT.CCITT_FAX_DECODE):
+        # I'm not sure if the following logic is correct.
+        # There might not be any relationship between the filters and the
+        # extension
+        if lfilters in (FT.LZW_DECODE, FT.CCITT_FAX_DECODE):
+            extension = ".tiff"  # mime_type = "image/tiff"
+            image_format = "TIFF"
+        else:
+            extension = ".png"  # mime_type = "image/png"
+            image_format = "PNG"
+        try:
+            img = Image.open(BytesIO(data), formats=("TIFF", "PNG"))
+        except UnidentifiedImageError:
+            img = _extended_image_frombytes(mode, size, data)
+    elif lfilters == FT.DCT_DECODE:
+        img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg"
+        # invert_color kept unchanged
+    elif lfilters == FT.JPX_DECODE:
+        img, image_format, extension, invert_color = _handle_jpx(
+            size, data, mode, color_space, colors
+        )
+    elif lfilters == FT.CCITT_FAX_DECODE:  # NOTE(review): unreachable, CCITT is matched above
+        img, image_format, extension, invert_color = (
+            Image.open(BytesIO(data), formats=("TIFF",)),
+            "TIFF",
+            ".tiff",
+            False,
+        )
+    elif mode == "CMYK":
+        img, image_format, extension, invert_color = (
+            _extended_image_frombytes(mode, size, data),
+            "TIFF",
+            ".tif",
+            False,
+        )
+    elif mode == "":
+        raise PdfReadError(f"ColorSpace field not found in {x_object_obj}")
+    else:
+        img, image_format, extension, invert_color = (
+            _extended_image_frombytes(mode, size, data),
+            "PNG",
+            ".png",
+            False,
+        )
+    # CMYK image and other colorspaces without decode
+    # requires reverting scale (cf p243,2§ last sentence)
+    decode = x_object_obj.get(
+        IA.DECODE,
+        ([1.0, 0.0] * len(img.getbands()))
+        if (
+            (img.mode == "CMYK" and lfilters in (FT.DCT_DECODE, FT.JPX_DECODE))
+            or (invert_color and img.mode == "L")
+        )
+        else None,
+    )
+    if (
+        isinstance(color_space, ArrayObject)
+        and color_space[0].get_object() == "/Indexed"
+    ):
+        decode = None  # decode is meaningless for Indexed color spaces
+    if (
+        isinstance(color_space, ArrayObject)
+        and color_space[0].get_object() == "/Separation"
+    ):
+        decode = [1.0, 0.0] * len(img.getbands())
+    if decode is not None and not all(decode[i] == i % 2 for i in range(len(decode))):
+        lut: List[int] = []
+        for i in range(0, len(decode), 2):
+            dmin = decode[i]
+            dmax = decode[i + 1]
+            lut.extend(
+                round(255.0 * (j / 255.0 * (dmax - dmin) + dmin)) for j in range(256)
+            )
+        img = img.point(lut)
+
+    if IA.S_MASK in x_object_obj:  # add alpha channel
+        alpha = _xobj_to_image(x_object_obj[IA.S_MASK])[2]
+        if img.size != alpha.size:
+            logger_warning(f"image and mask size not matching: {obj_as_text}", __name__)
+        else:
+            # TODO : implement mask
+            if alpha.mode != "L":
+                alpha = alpha.convert("L")
+            if img.mode == "P":
+                img = img.convert("RGB")
+            elif img.mode == "1":
+                img = img.convert("L")
+            img.putalpha(alpha)
+        if "JPEG" in image_format:
+            extension = ".jp2"
+            image_format = "JPEG2000"
+        else:
+            extension = ".png"
+            image_format = "PNG"
+
+    img_byte_arr = BytesIO()
+    try:
+        img.save(img_byte_arr, format=image_format)
+    except OSError:  # pragma: no cover  # covered with pillow 10.3
+        # saving failed: convert to RGBA and save as PNG instead
+        img1 = img.convert("RGBA")
+        image_format = "PNG"
+        extension = ".png"
+        img_byte_arr = BytesIO()
+        img1.save(img_byte_arr, format=image_format)
+    data = img_byte_arr.getvalue()
+
+    try:  # temporary try/except until other fixes of images
+        img = Image.open(BytesIO(data))
+    except Exception:
+        img = None  # type: ignore
+    return extension, data, img
author	S. Solomon Darnell	2025-03-28 21:52:21 -0500
committer	S. Solomon Darnell	2025-03-28 21:52:21 -0500
commit	4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree	ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/pypdf/filters.py
parent	cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download	gn-ai-master.tar.gz