about summary refs log tree commit diff
path: root/.venv/lib/python3.12/site-packages/pypdf/generic/_image_inline.py
diff options
context:
space:
mode:
authorS. Solomon Darnell2025-03-28 21:52:21 -0500
committerS. Solomon Darnell2025-03-28 21:52:21 -0500
commit4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
treeee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/pypdf/generic/_image_inline.py
parentcc961e04ba734dd72309fb548a2f97d67d578813 (diff)
downloadgn-ai-master.tar.gz
two version of R2R are here HEAD master
Diffstat (limited to '.venv/lib/python3.12/site-packages/pypdf/generic/_image_inline.py')
-rw-r--r--.venv/lib/python3.12/site-packages/pypdf/generic/_image_inline.py235
1 files changed, 235 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/pypdf/generic/_image_inline.py b/.venv/lib/python3.12/site-packages/pypdf/generic/_image_inline.py
new file mode 100644
index 00000000..41826ac3
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/generic/_image_inline.py
@@ -0,0 +1,235 @@
+# Copyright (c) 2024, pypdf contributors
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+import logging
+from io import BytesIO
+
+from .._utils import (
+    WHITESPACES,
+    StreamType,
+    read_non_whitespace,
+)
+from ..errors import PdfReadError
+
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+
+BUFFER_SIZE = 8192  # number of bytes requested per stream.read() call by the buffered extraction loops
+
+
+def extract_inline_AHx(stream: StreamType) -> bytes:
+    """
+    Extract HexEncoded Stream from Inline Image.
+    the stream will be moved onto the EI
+    """
+    data_out: bytes = b""
+    # Read data until delimiter > and EI as backup
+    # ignoring backup.
+    while True:
+        data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE)
+        if not data_buffered:
+            raise PdfReadError("Unexpected end of stream")
+        pos_tok = data_buffered.find(b">")
+        if pos_tok >= 0:  # found >
+            data_out += data_buffered[: (pos_tok + 1)]
+            stream.seek(-len(data_buffered) + pos_tok + 1, 1)
+            break
+        pos_ei = data_buffered.find(b"EI")
+        if pos_ei >= 0:  # found EI
+            stream.seek(-len(data_buffered) + pos_ei - 1, 1)
+            c = stream.read(1)
+            while c in WHITESPACES:
+                stream.seek(-2, 1)
+                c = stream.read(1)
+                pos_ei -= 1
+            data_out += data_buffered[:pos_ei]
+            break
+        elif len(data_buffered) == 2:
+            data_out += data_buffered
+            raise PdfReadError("Unexpected end of stream")
+        else:  # > nor EI found
+            data_out += data_buffered[:-2]
+            stream.seek(-2, 1)
+
+    ei_tok = read_non_whitespace(stream)
+    ei_tok += stream.read(2)
+    stream.seek(-3, 1)
+    if ei_tok[0:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES):
+        raise PdfReadError("EI stream not found")
+    return data_out
+
+
+def extract_inline_A85(stream: StreamType) -> bytes:
+    """
+    Extract A85 Stream from Inline Image.
+    the stream will be moved onto the EI
+    """
+    data_out: bytes = b""
+    # Read data up to delimiter ~>
+    # see §3.3.2 from PDF ref 1.7
+    while True:
+        data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE)
+        if not data_buffered:
+            raise PdfReadError("Unexpected end of stream")
+        pos_tok = data_buffered.find(b"~>")
+        if pos_tok >= 0:  # found!
+            data_out += data_buffered[: pos_tok + 2]
+            stream.seek(-len(data_buffered) + pos_tok + 2, 1)
+            break
+        elif len(data_buffered) == 2:  # end of buffer
+            data_out += data_buffered
+            raise PdfReadError("Unexpected end of stream")
+        data_out += data_buffered[
+            :-2
+        ]  # back by one char in case of in the middle of ~>
+        stream.seek(-2, 1)
+
+    ei_tok = read_non_whitespace(stream)
+    ei_tok += stream.read(2)
+    stream.seek(-3, 1)
+    if ei_tok[0:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES):
+        raise PdfReadError("EI stream not found")
+    return data_out
+
+
+def extract_inline_RL(stream: StreamType) -> bytes:
+    """
+    Extract RL Stream from Inline Image.
+    the stream will be moved onto the EI
+    """
+    data_out: bytes = b""
+    # Read data up to delimiter ~>
+    # see §3.3.4 from PDF ref 1.7
+    while True:
+        data_buffered = stream.read(BUFFER_SIZE)
+        if not data_buffered:
+            raise PdfReadError("Unexpected end of stream")
+        pos_tok = data_buffered.find(b"\x80")
+        if pos_tok >= 0:  # found
+            data_out += data_buffered[: pos_tok + 1]
+            stream.seek(-len(data_buffered) + pos_tok + 1, 1)
+            break
+        data_out += data_buffered
+
+    ei_tok = read_non_whitespace(stream)
+    ei_tok += stream.read(2)
+    stream.seek(-3, 1)
+    if ei_tok[0:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES):
+        raise PdfReadError("EI stream not found")
+    return data_out
+
+
+def extract_inline_DCT(stream: StreamType) -> bytes:
+    """
+    Extract DCT (JPEG) Stream from Inline Image.
+    the stream will be moved onto the EI
+    """
+    data_out: bytes = b""
+    # Read Blocks of data (ID/Size/data) up to ID=FF/D9
+    # see https://www.digicamsoft.com/itu/itu-t81-36.html
+    notfirst = False
+    while True:
+        c = stream.read(1)
+        if notfirst or (c == b"\xff"):
+            data_out += c
+        if c != b"\xff":
+            continue
+        else:
+            notfirst = True
+        c = stream.read(1)
+        data_out += c
+        if c == b"\xff":
+            stream.seek(-1, 1)  # pragma: no cover
+        elif c == b"\x00":  # stuffing
+            pass
+        elif c == b"\xd9":  # end
+            break
+        elif c in (
+            b"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc9\xca\xcb\xcc\xcd\xce\xcf"
+            b"\xda\xdb\xdc\xdd\xde\xdf"
+            b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe"
+        ):
+            c = stream.read(2)
+            data_out += c
+            sz = c[0] * 256 + c[1]
+            data_out += stream.read(sz - 2)
+        # else: pass
+
+    ei_tok = read_non_whitespace(stream)
+    ei_tok += stream.read(2)
+    stream.seek(-3, 1)
+    if ei_tok[0:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES):
+        raise PdfReadError("EI stream not found")
+    return data_out
+
+
+def extract_inline_default(stream: StreamType) -> bytes:
+    """
+    Legacy method
+    used by default
+    """
+    stream_out = BytesIO()
+    # Read the inline image, while checking for EI (End Image) operator.
+    while True:
+        data_buffered = stream.read(BUFFER_SIZE)
+        if not data_buffered:
+            raise PdfReadError("Unexpected end of stream")
+        pos_ei = data_buffered.find(
+            b"E"
+        )  # we can not look straight for "EI" because it may not have been loaded in the buffer
+
+        if pos_ei == -1:
+            stream_out.write(data_buffered)
+        else:
+            # Write out everything including E (the one from EI to be removed).
+            stream_out.write(data_buffered[0 : pos_ei + 1])
+            sav_pos_ei = stream_out.tell() - 1
+            # Seek back in the stream to read the E next.
+            stream.seek(pos_ei + 1 - len(data_buffered), 1)
+            saved_pos = stream.tell()
+            # Check for End Image
+            tok2 = stream.read(1)  # I of "EI"
+            if tok2 != b"I":
+                stream.seek(saved_pos, 0)
+                continue
+            tok3 = stream.read(1)  # possible space after "EI"
+            if tok3 not in WHITESPACES:
+                stream.seek(saved_pos, 0)
+                continue
+            while tok3 in WHITESPACES:
+                tok3 = stream.read(1)
+            if data_buffered[pos_ei - 1 : pos_ei] not in WHITESPACES and tok3 not in {
+                b"Q",
+                b"E",
+            }:  # for Q ou EMC
+                stream.seek(saved_pos, 0)
+                continue
+            # Data contains [\s]EI[\s](Q|EMC): 4 chars are sufficients
+            # remove E(I) wrongly inserted earlier
+            stream_out.truncate(sav_pos_ei)
+            break
+
+    return stream_out.getvalue()