about | summary | refs | log | tree | commit | diff
path: root/.venv/lib/python3.12/site-packages/pdf2image
diff options
context:
space:
mode:
Diffstat (limited to '.venv/lib/python3.12/site-packages/pdf2image')
-rw-r--r-- .venv/lib/python3.12/site-packages/pdf2image/__init__.py 8
-rw-r--r-- .venv/lib/python3.12/site-packages/pdf2image/exceptions.py 33
-rw-r--r-- .venv/lib/python3.12/site-packages/pdf2image/generators.py 46
-rw-r--r-- .venv/lib/python3.12/site-packages/pdf2image/parsers.py 98
-rw-r--r-- .venv/lib/python3.12/site-packages/pdf2image/pdf2image.py 683
-rw-r--r-- .venv/lib/python3.12/site-packages/pdf2image/py.typed 0
6 files changed, 868 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/pdf2image/__init__.py b/.venv/lib/python3.12/site-packages/pdf2image/__init__.py
new file mode 100644
index 00000000..72601399
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pdf2image/__init__.py
@@ -0,0 +1,8 @@
+"""
+ __init__ of the pdf2image module
+"""
+
+from .pdf2image import convert_from_bytes as convert_from_bytes
+from .pdf2image import convert_from_path as convert_from_path
+from .pdf2image import pdfinfo_from_bytes as pdfinfo_from_bytes
+from .pdf2image import pdfinfo_from_path as pdfinfo_from_path
diff --git a/.venv/lib/python3.12/site-packages/pdf2image/exceptions.py b/.venv/lib/python3.12/site-packages/pdf2image/exceptions.py
new file mode 100644
index 00000000..bf201089
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pdf2image/exceptions.py
@@ -0,0 +1,33 @@
+"""
+ Define exceptions specific to pdf2image
+"""
+
+
+class PopplerNotInstalledError(Exception):
+    """Error type signalling that poppler is not installed."""
+
+
+class PDFInfoNotInstalledError(PopplerNotInstalledError):
+    """Error type signalling that the pdfinfo utility is not installed."""
+
+
+class PDFPageCountError(Exception):
+    """Error type signalling that pdfinfo could not provide the page count."""
+
+
+class PDFSyntaxError(Exception):
+    """Error type signalling a syntax error reported while rendering a PDF."""
+
+
+class PDFPopplerTimeoutError(Exception):
+    """Error type signalling that converting a PDF exceeded the timeout."""
diff --git a/.venv/lib/python3.12/site-packages/pdf2image/generators.py b/.venv/lib/python3.12/site-packages/pdf2image/generators.py
new file mode 100644
index 00000000..5d79a3ce
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pdf2image/generators.py
@@ -0,0 +1,46 @@
+"""
+ pdf2image filename generators
+"""
+
+import uuid
+import threading
+
+
+class ThreadSafeGenerator:
+    """Iterator wrapper that advances the wrapped generator under a lock.
+
+    Every ``next()`` call acquires ``self.lock`` first, so concurrent callers
+    never advance ``self.gen`` at the same time.
+    """
+
+    def __init__(self, gen):
+        # One lock per wrapped generator; all advances go through it.
+        self.lock = threading.Lock()
+        self.gen = gen
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        self.lock.acquire()
+        try:
+            return next(self.gen)
+        finally:
+            self.lock.release()
+
+
+def threadsafe(f):
+    """Decorator to make generator threadsafe. Fix #125
+
+    :param f: Generator function to wrap
+    :return: Function with the same arguments as ``f`` whose result is the
+        generator produced by ``f`` wrapped in a ThreadSafeGenerator
+    """
+    # Imported locally so the block does not depend on the module import list.
+    import functools
+
+    # wraps() keeps the name and docstring of ``f`` on the returned function.
+    @functools.wraps(f)
+    def g(*a, **kw):
+        return ThreadSafeGenerator(f(*a, **kw))
+
+    return g
+
+
+@threadsafe
+def uuid_generator():
+    """Yield an endless sequence of random UUID4 values as strings."""
+    # The sentinel None never equals a string, so this iterator never stops.
+    yield from iter(lambda: str(uuid.uuid4()), None)
+
+
+@threadsafe
+def counter_generator(prefix="", suffix="", padding_goal=4):
+    """Yield prefix + zero-padded counter + suffix forever, counting from 1.
+
+    The counter is left-padded with zeros to ``padding_goal`` characters.
+    """
+    number = 1
+    while True:
+        yield f"{prefix!s}{str(number).zfill(padding_goal)}{suffix!s}"
+        number += 1
diff --git a/.venv/lib/python3.12/site-packages/pdf2image/parsers.py b/.venv/lib/python3.12/site-packages/pdf2image/parsers.py
new file mode 100644
index 00000000..72f51250
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pdf2image/parsers.py
@@ -0,0 +1,98 @@
+"""
+ pdf2image custom buffer parsers
+"""
+
+from io import BytesIO
+from typing import List
+
+from PIL import Image
+
+
+def parse_buffer_to_ppm(data: bytes) -> List[Image.Image]:
+    """Split a buffer of concatenated PPM files into Pillow images
+
+    :param data: pdftoppm/pdftocairo output bytes
+    :type data: bytes
+    :return: List of PPM images parsed from the output
+    :rtype: List[Image.Image]
+    """
+
+    frames = []
+    offset = 0
+    total = len(data)
+
+    while offset < total:
+        # The header is three newline-terminated fields read from the first 40 bytes.
+        header_fields = data[offset : offset + 40].split(b"\n")
+        magic, dimensions, depth = header_fields[:3]
+        width, height = dimensions.split(b" ")
+        # Header fields plus their three newlines, then three bytes per pixel.
+        header_len = len(magic) + len(dimensions) + len(depth) + 3
+        end = offset + header_len + int(width) * int(height) * 3
+        frames.append(Image.open(BytesIO(data[offset:end])))
+        offset = end
+
+    return frames
+
+
+def parse_buffer_to_pgm(data: bytes) -> List[Image.Image]:
+    """Split a buffer of concatenated PGM files into Pillow images
+
+    :param data: pdftoppm/pdftocairo output bytes
+    :type data: bytes
+    :return: List of PGM images parsed from the output
+    :rtype: List[Image.Image]
+    """
+
+    frames = []
+    offset = 0
+    total = len(data)
+
+    while offset < total:
+        # The header is three newline-terminated fields read from the first 40 bytes.
+        header_fields = data[offset : offset + 40].split(b"\n")
+        magic, dimensions, depth = header_fields[:3]
+        width, height = dimensions.split(b" ")
+        # Header fields plus their three newlines, then one byte per pixel.
+        header_len = len(magic) + len(dimensions) + len(depth) + 3
+        end = offset + header_len + int(width) * int(height)
+        frames.append(Image.open(BytesIO(data[offset:end])))
+        offset = end
+
+    return frames
+
+
+def parse_buffer_to_jpeg(data: bytes) -> List[Image.Image]:
+    """Split a buffer of concatenated JPEG files into Pillow images
+
+    :param data: pdftoppm/pdftocairo output bytes
+    :type data: bytes
+    :return: List of JPEG images parsed from the output
+    :rtype: List[Image.Image]
+    """
+
+    end_marker = b"\xff\xd9"
+    pieces = data.split(end_marker)
+    # Whatever follows the final marker is dropped (empty when data ends with it).
+    pieces.pop()
+
+    images = []
+    for piece in pieces:
+        # split() removed the marker, so it is put back before decoding.
+        images.append(Image.open(BytesIO(piece + end_marker)))
+    return images
+
+
+def parse_buffer_to_png(data: bytes) -> List[Image.Image]:
+    """Parse PNG file bytes to Pillow Image
+
+    Parsing stops at the first point where no terminating IEND chunk can be
+    found, so a truncated buffer yields the images parsed so far instead of
+    scanning past the end of the data forever.
+
+    :param data: pdftoppm/pdftocairo output bytes
+    :type data: bytes
+    :return: List of PNG images parsed from the output
+    :rtype: List[Image.Image]
+    """
+
+    images = []
+
+    image_start = 0
+    search_from = 0
+    data_len = len(data)
+    while image_start < data_len:
+        iend = data.find(b"IEND", search_from)
+        if iend == -1:
+            # No further IEND marker: the remaining bytes are not a complete PNG.
+            break
+
+        # The image ends 8 bytes after the start of the IEND marker.
+        image_end = iend + 8
+        # IEND can appear in a PNG without being the actual end, so it only
+        # counts when it closes the buffer or is followed by the next file's
+        # "PNG" signature bytes.
+        if image_end == data_len or data[iend + 9 : iend + 12] == b"PNG":
+            images.append(Image.open(BytesIO(data[image_start:image_end])))
+            image_start = image_end
+            search_from = image_end
+        else:
+            search_from = iend + 1
+
+    return images
diff --git a/.venv/lib/python3.12/site-packages/pdf2image/pdf2image.py b/.venv/lib/python3.12/site-packages/pdf2image/pdf2image.py
new file mode 100644
index 00000000..21d19c88
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pdf2image/pdf2image.py
@@ -0,0 +1,683 @@
+"""
+ pdf2image is a light wrapper for the poppler-utils tools that can convert your
+ PDFs into Pillow images.
+"""
+
+import os
+import platform
+import tempfile
+import types
+import shutil
+import subprocess
+from subprocess import Popen, PIPE, TimeoutExpired
+from typing import Any, Union, Tuple, List, Dict, Callable
+from pathlib import PurePath
+from PIL import Image
+
+from pdf2image.generators import uuid_generator, counter_generator, ThreadSafeGenerator
+
+from pdf2image.parsers import (
+ parse_buffer_to_pgm,
+ parse_buffer_to_ppm,
+ parse_buffer_to_jpeg,
+ parse_buffer_to_png,
+)
+
+from pdf2image.exceptions import (
+ PDFInfoNotInstalledError,
+ PDFPageCountError,
+ PDFSyntaxError,
+ PDFPopplerTimeoutError,
+)
+
+TRANSPARENT_FILE_TYPES = ["png", "tiff"]
+PDFINFO_CONVERT_TO_INT = ["Pages"]
+
+
+def convert_from_path(
+    pdf_path: Union[str, PurePath],
+    dpi: int = 200,
+    output_folder: Union[str, PurePath] = None,
+    first_page: int = None,
+    last_page: int = None,
+    fmt: str = "ppm",
+    jpegopt: Dict = None,
+    thread_count: int = 1,
+    userpw: str = None,
+    ownerpw: str = None,
+    use_cropbox: bool = False,
+    strict: bool = False,
+    transparent: bool = False,
+    single_file: bool = False,
+    output_file: Any = uuid_generator(),
+    poppler_path: Union[str, PurePath] = None,
+    grayscale: bool = False,
+    size: Union[Tuple, int] = None,
+    paths_only: bool = False,
+    use_pdftocairo: bool = False,
+    timeout: int = None,
+    hide_annotations: bool = False,
+) -> List[Image.Image]:
+    """Function wrapping pdftoppm and pdftocairo
+
+    The requested page range is split across ``thread_count`` poppler
+    processes that run concurrently. Any process still running when this
+    function exits (timeout, strict syntax error, failure to spawn) is killed.
+
+    :param pdf_path: Path to the PDF that you want to convert
+    :type pdf_path: Union[str, PurePath]
+    :param dpi: Image quality in DPI, defaults to 200
+    :type dpi: int, optional
+    :param output_folder: Write the resulting images to a folder (instead of directly in memory), defaults to None
+    :type output_folder: Union[str, PurePath], optional
+    :param first_page: First page to process, defaults to None
+    :type first_page: int, optional
+    :param last_page: Last page to process before stopping, defaults to None
+    :type last_page: int, optional
+    :param fmt: Output image format, defaults to "ppm"
+    :type fmt: str, optional
+    :param jpegopt: jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format, ignored for poppler 0.57 and older), defaults to None
+    :type jpegopt: Dict, optional
+    :param thread_count: How many threads we are allowed to spawn for processing, defaults to 1
+    :type thread_count: int, optional
+    :param userpw: PDF's password, defaults to None
+    :type userpw: str, optional
+    :param ownerpw: PDF's owner password, defaults to None
+    :type ownerpw: str, optional
+    :param use_cropbox: Use cropbox instead of mediabox, defaults to False
+    :type use_cropbox: bool, optional
+    :param strict: When a Syntax Error is thrown, it will be raised as an Exception, defaults to False
+    :type strict: bool, optional
+    :param transparent: Output with a transparent background instead of a white one, defaults to False
+    :type transparent: bool, optional
+    :param single_file: Uses the -singlefile option from pdftoppm/pdftocairo, defaults to False
+    :type single_file: bool, optional
+    :param output_file: What is the output filename or generator, defaults to uuid_generator()
+    :type output_file: Any, optional
+    :param poppler_path: Path to look for poppler binaries, defaults to None
+    :type poppler_path: Union[str, PurePath], optional
+    :param grayscale: Output grayscale image(s), defaults to False
+    :type grayscale: bool, optional
+    :param size: Size of the resulting image(s), uses the Pillow (width, height) standard, defaults to None
+    :type size: Union[Tuple, int], optional
+    :param paths_only: Don't load image(s), return paths instead (requires output_folder), defaults to False
+    :type paths_only: bool, optional
+    :param use_pdftocairo: Use pdftocairo instead of pdftoppm, may help performance, defaults to False
+    :type use_pdftocairo: bool, optional
+    :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None
+    :type timeout: int, optional
+    :param hide_annotations: Hide PDF annotations in the output (ignored for poppler 0.83 and older), defaults to False
+    :type hide_annotations: bool, optional
+    :raises NotImplementedError: Raised when conflicting parameters are given (hide_annotations for pdftocairo)
+    :raises PDFPopplerTimeoutError: Raised after the timeout for the image processing is exceeded
+    :raises PDFSyntaxError: Raised if there is a syntax error in the PDF and strict=True
+    :return: A list of Pillow images (or of file paths when paths_only is set), one for each page between first_page and last_page
+    :rtype: List[Image.Image]
+    """
+
+    if use_pdftocairo and fmt == "ppm":
+        fmt = "png"
+
+    # We make sure that if passed arguments are Path objects, they're converted to strings
+    if isinstance(pdf_path, PurePath):
+        pdf_path = pdf_path.as_posix()
+
+    if isinstance(output_folder, PurePath):
+        output_folder = output_folder.as_posix()
+
+    if isinstance(poppler_path, PurePath):
+        poppler_path = poppler_path.as_posix()
+
+    page_count = pdfinfo_from_path(
+        pdf_path, userpw, ownerpw, poppler_path=poppler_path
+    )["Pages"]
+
+    # We start by getting the output format, the buffer processing function and if we need pdftocairo
+    parsed_fmt, final_extension, parse_buffer_func, use_pdfcairo_format = _parse_format(
+        fmt, grayscale
+    )
+
+    # We use pdftocairo if the format requires it OR we need a transparent output
+    use_pdfcairo = (
+        use_pdftocairo
+        or use_pdfcairo_format
+        or (transparent and parsed_fmt in TRANSPARENT_FILE_TYPES)
+    )
+
+    poppler_version_major, poppler_version_minor = _get_poppler_version(
+        "pdftocairo" if use_pdfcairo else "pdftoppm", poppler_path=poppler_path
+    )
+
+    # Options are dropped for poppler releases at or below these versions.
+    if poppler_version_major == 0 and poppler_version_minor <= 57:
+        jpegopt = None
+
+    if poppler_version_major == 0 and poppler_version_minor <= 83:
+        hide_annotations = False
+
+    # If output_file isn't a generator, it will be turned into one
+    if not isinstance(output_file, (types.GeneratorType, ThreadSafeGenerator)):
+        if single_file:
+            output_file = iter([output_file])
+            thread_count = 1
+        else:
+            output_file = counter_generator(output_file)
+
+    if thread_count < 1:
+        thread_count = 1
+
+    if first_page is None or first_page < 1:
+        first_page = 1
+
+    if last_page is None or last_page > page_count:
+        last_page = page_count
+
+    if first_page > last_page:
+        return []
+
+    # Recalculate page count based on first and last page
+    page_count = last_page - first_page + 1
+
+    if thread_count > page_count:
+        thread_count = page_count
+
+    # The environment and startup info are the same for every process.
+    # Add poppler path to LD_LIBRARY_PATH
+    env = os.environ.copy()
+    if poppler_path is not None:
+        env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")
+
+    startupinfo = None
+    if platform.system() == "Windows":
+        # this startupinfo structure prevents a console window from popping up on Windows
+        startupinfo = subprocess.STARTUPINFO()
+        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
+
+    auto_temp_dir = False
+    processes = []
+    images = []
+
+    try:
+        if output_folder is None and use_pdfcairo:
+            output_folder = tempfile.mkdtemp()
+            auto_temp_dir = True
+
+        # Pages that do not divide evenly are handed out one per process.
+        remainder = page_count % thread_count
+        current_page = first_page
+        for _ in range(thread_count):
+            thread_output_file = next(output_file)
+
+            # Get the number of pages the thread will be processing
+            thread_page_count = page_count // thread_count + int(remainder > 0)
+            # Build the command accordingly
+            args = _build_command(
+                ["-r", str(dpi), pdf_path],
+                output_folder,
+                current_page,
+                current_page + thread_page_count - 1,
+                parsed_fmt,
+                jpegopt,
+                thread_output_file,
+                userpw,
+                ownerpw,
+                use_cropbox,
+                transparent,
+                single_file,
+                grayscale,
+                size,
+                hide_annotations,
+            )
+
+            if use_pdfcairo:
+                if hide_annotations:
+                    raise NotImplementedError(
+                        "Hide annotations flag not implemented in pdftocairo."
+                    )
+                args = [_get_command_path("pdftocairo", poppler_path)] + args
+            else:
+                args = [_get_command_path("pdftoppm", poppler_path)] + args
+
+            # Update page values
+            current_page = current_page + thread_page_count
+            remainder -= int(remainder > 0)
+
+            # Spawn the process and save its uuid
+            processes.append(
+                (
+                    thread_output_file,
+                    Popen(
+                        args, env=env, stdout=PIPE, stderr=PIPE, startupinfo=startupinfo
+                    ),
+                )
+            )
+
+        for uid, proc in processes:
+            try:
+                data, err = proc.communicate(timeout=timeout)
+            except TimeoutExpired:
+                proc.kill()
+                proc.communicate()
+                raise PDFPopplerTimeoutError("Run poppler timeout.")
+
+            if b"Syntax Error" in err and strict:
+                raise PDFSyntaxError(err.decode("utf8", "ignore"))
+
+            if output_folder is not None:
+                images += _load_from_output_folder(
+                    output_folder,
+                    uid,
+                    final_extension,
+                    paths_only,
+                    in_memory=auto_temp_dir,
+                )
+            else:
+                images += parse_buffer_func(data)
+    finally:
+        # After an error some processes may still be running with unread pipes;
+        # kill and reap them so they cannot outlive this call. On the success
+        # path every process has already exited and this loop does nothing.
+        for _, proc in processes:
+            if proc.poll() is None:
+                proc.kill()
+                proc.communicate()
+        if auto_temp_dir:
+            shutil.rmtree(output_folder)
+
+    return images
+
+
+def convert_from_bytes(
+    pdf_file: bytes,
+    dpi: int = 200,
+    output_folder: Union[str, PurePath] = None,
+    first_page: int = None,
+    last_page: int = None,
+    fmt: str = "ppm",
+    jpegopt: Dict = None,
+    thread_count: int = 1,
+    userpw: str = None,
+    ownerpw: str = None,
+    use_cropbox: bool = False,
+    strict: bool = False,
+    transparent: bool = False,
+    single_file: bool = False,
+    output_file: Union[str, PurePath] = uuid_generator(),
+    poppler_path: Union[str, PurePath] = None,
+    grayscale: bool = False,
+    size: Union[Tuple, int] = None,
+    paths_only: bool = False,
+    use_pdftocairo: bool = False,
+    timeout: int = None,
+    hide_annotations: bool = False,
+) -> List[Image.Image]:
+    """Function wrapping pdftoppm and pdftocairo.
+
+    The bytes are written to a temporary file, which is converted with
+    convert_from_path and removed again before returning.
+
+    :param pdf_file: Bytes of the PDF that you want to convert
+    :type pdf_file: bytes
+    :param dpi: Image quality in DPI, defaults to 200
+    :type dpi: int, optional
+    :param output_folder: Write the resulting images to a folder (instead of directly in memory), defaults to None
+    :type output_folder: Union[str, PurePath], optional
+    :param first_page: First page to process, defaults to None
+    :type first_page: int, optional
+    :param last_page: Last page to process before stopping, defaults to None
+    :type last_page: int, optional
+    :param fmt: Output image format, defaults to "ppm"
+    :type fmt: str, optional
+    :param jpegopt: jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format), defaults to None
+    :type jpegopt: Dict, optional
+    :param thread_count: How many threads we are allowed to spawn for processing, defaults to 1
+    :type thread_count: int, optional
+    :param userpw: PDF's password, defaults to None
+    :type userpw: str, optional
+    :param ownerpw: PDF's owner password, defaults to None
+    :type ownerpw: str, optional
+    :param use_cropbox: Use cropbox instead of mediabox, defaults to False
+    :type use_cropbox: bool, optional
+    :param strict: When a Syntax Error is thrown, it will be raised as an Exception, defaults to False
+    :type strict: bool, optional
+    :param transparent: Output with a transparent background instead of a white one, defaults to False
+    :type transparent: bool, optional
+    :param single_file: Uses the -singlefile option from pdftoppm/pdftocairo, defaults to False
+    :type single_file: bool, optional
+    :param output_file: What is the output filename or generator, defaults to uuid_generator()
+    :type output_file: Any, optional
+    :param poppler_path: Path to look for poppler binaries, defaults to None
+    :type poppler_path: Union[str, PurePath], optional
+    :param grayscale: Output grayscale image(s), defaults to False
+    :type grayscale: bool, optional
+    :param size: Size of the resulting image(s), uses the Pillow (width, height) standard, defaults to None
+    :type size: Union[Tuple, int], optional
+    :param paths_only: Don't load image(s), return paths instead (requires output_folder), defaults to False
+    :type paths_only: bool, optional
+    :param use_pdftocairo: Use pdftocairo instead of pdftoppm, may help performance, defaults to False
+    :type use_pdftocairo: bool, optional
+    :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None
+    :type timeout: int, optional
+    :param hide_annotations: Hide PDF annotations in the output, defaults to False
+    :type hide_annotations: bool, optional
+    :raises NotImplementedError: Raised when conflicting parameters are given (hide_annotations for pdftocairo)
+    :raises PDFPopplerTimeoutError: Raised after the timeout for the image processing is exceeded
+    :raises PDFSyntaxError: Raised if there is a syntax error in the PDF and strict=True
+    :return: A list of Pillow images, one for each page between first_page and last_page
+    :rtype: List[Image.Image]
+    """
+
+    fh, temp_filename = tempfile.mkstemp()
+    try:
+        # Write through the descriptor mkstemp already opened; leaving the
+        # with-block closes it, so the file is complete and no longer held
+        # open when poppler reads it.
+        with os.fdopen(fh, "wb") as f:
+            f.write(pdf_file)
+        return convert_from_path(
+            temp_filename,
+            dpi=dpi,
+            output_folder=output_folder,
+            first_page=first_page,
+            last_page=last_page,
+            fmt=fmt,
+            jpegopt=jpegopt,
+            thread_count=thread_count,
+            userpw=userpw,
+            ownerpw=ownerpw,
+            use_cropbox=use_cropbox,
+            strict=strict,
+            transparent=transparent,
+            single_file=single_file,
+            output_file=output_file,
+            poppler_path=poppler_path,
+            grayscale=grayscale,
+            size=size,
+            paths_only=paths_only,
+            use_pdftocairo=use_pdftocairo,
+            timeout=timeout,
+            hide_annotations=hide_annotations,
+        )
+    finally:
+        os.remove(temp_filename)
+
+
+def _build_command(
+    args: List,
+    output_folder: str,
+    first_page: int,
+    last_page: int,
+    fmt: str,
+    jpegopt: Dict,
+    output_file: str,
+    userpw: str,
+    ownerpw: str,
+    use_cropbox: bool,
+    transparent: bool,
+    single_file: bool,
+    grayscale: bool,
+    size: Union[int, Tuple[int, int]],
+    hide_annotations: bool,
+) -> List[str]:
+    """Append the command line flags for the given options to ``args``.
+
+    ``args`` is extended in place and the same list is returned.
+
+    :raises ValueError: Raised when size is neither None, a 1- or 2-tuple, nor a number
+    :return: The extended argument list
+    :rtype: List[str]
+    """
+    # Plain on/off switches, emitted in this order.
+    leading_switches = (
+        (use_cropbox, "-cropbox"),
+        (hide_annotations, "-hide-annotations"),
+        (transparent and fmt in TRANSPARENT_FILE_TYPES, "-transp"),
+    )
+    for enabled, flag in leading_switches:
+        if enabled:
+            args.append(flag)
+
+    for flag, page in (("-f", first_page), ("-l", last_page)):
+        if page is not None:
+            args.extend([flag, str(page)])
+
+    # pgm/ppm get no format flag.
+    if fmt not in ("pgm", "ppm"):
+        args.append("-" + fmt)
+
+    if fmt in ("jpeg", "jpg") and jpegopt:
+        args.extend(["-jpegopt", _parse_jpegopt(jpegopt)])
+
+    if single_file:
+        args.append("-singlefile")
+
+    if output_folder is not None:
+        args.append(os.path.join(output_folder, output_file))
+
+    for flag, password in (("-upw", userpw), ("-opw", ownerpw)):
+        if password is not None:
+            args.extend([flag, password])
+
+    if grayscale:
+        args.append("-gray")
+
+    if size is not None:
+        if isinstance(size, tuple) and len(size) == 2:
+            # A missing dimension is passed to the tool as -1.
+            for flag, dimension in zip(("-scale-to-x", "-scale-to-y"), size):
+                if dimension is None:
+                    args.extend([flag, str(-1)])
+                else:
+                    args.extend([flag, str(int(dimension))])
+        elif isinstance(size, tuple) and len(size) == 1:
+            args.extend(["-scale-to", str(int(size[0]))])
+        elif isinstance(size, (int, float)):
+            args.extend(["-scale-to", str(int(size))])
+        else:
+            raise ValueError(f"Size {size} is not a tuple or an integer")
+
+    return args
+
+
+def _parse_format(fmt: str, grayscale: bool = False) -> Tuple[str, str, Callable, bool]:
+    """Resolve a requested output format.
+
+    The format is matched case-insensitively and one leading dot is ignored.
+    An unrecognised format, including the empty string, falls back to ppm.
+
+    :param fmt: Requested format name or extension
+    :type fmt: str
+    :param grayscale: Whether grayscale output was requested, defaults to False
+    :type grayscale: bool, optional
+    :return: Tuple of (format name, file extension, buffer parsing function or
+        None for tiff, whether the format requires pdftocairo)
+    :rtype: Tuple[str, str, Callable, bool]
+    """
+    fmt = fmt.lower()
+    # startswith() is safe for an empty string, unlike fmt[0].
+    if fmt.startswith("."):
+        fmt = fmt[1:]
+    if fmt in ("jpeg", "jpg"):
+        return "jpeg", "jpg", parse_buffer_to_jpeg, False
+    if fmt == "png":
+        return "png", "png", parse_buffer_to_png, False
+    if fmt in ("tif", "tiff"):
+        return "tiff", "tif", None, True
+    if fmt == "ppm" and grayscale:
+        return "pgm", "pgm", parse_buffer_to_pgm, False
+    # Unable to parse the format so we'll use the default
+    return "ppm", "ppm", parse_buffer_to_ppm, False
+
+
+def _parse_jpegopt(jpegopt: Dict) -> str:
+    """Render the jpeg options as a comma-separated ``key=value`` string.
+
+    The values True and False are written as ``y`` and ``n``; every other
+    value is formatted as is.
+    """
+
+    def render(value):
+        # Identity checks on purpose: 1 and 0 are not rewritten.
+        if value is True:
+            return "y"
+        if value is False:
+            return "n"
+        return value
+
+    return ",".join(
+        "{}={}".format(name, render(value)) for name, value in jpegopt.items()
+    )
+
+
+def _get_command_path(command: str, poppler_path: str = None) -> str:
+    """Return the name or path used to invoke a poppler command.
+
+    On Windows ``.exe`` is appended to the command; when ``poppler_path`` is
+    given the command is joined onto it.
+    """
+    executable = command + ".exe" if platform.system() == "Windows" else command
+    if poppler_path is None:
+        return executable
+    return os.path.join(poppler_path, executable)
+
+
+def _get_poppler_version(
+    command: str, poppler_path: str = None, timeout: int = None
+) -> Tuple[int, int]:
+    """Run ``<command> -v`` and parse the version from its stderr output.
+
+    :param command: Name of the poppler command to query
+    :type command: str
+    :param poppler_path: Path to look for poppler binaries, defaults to None
+    :type poppler_path: str, optional
+    :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None
+    :type timeout: int, optional
+    :raises PDFPopplerTimeoutError: Raised when the command exceeds the timeout
+    :return: (major, minor) version, or (0, 17) when the output cannot be parsed
+    :rtype: Tuple[int, int]
+    """
+    version_command = [_get_command_path(command, poppler_path), "-v"]
+
+    env = os.environ.copy()
+    if poppler_path is not None:
+        env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")
+    proc = Popen(version_command, env=env, stdout=PIPE, stderr=PIPE)
+
+    try:
+        _, err = proc.communicate(timeout=timeout)
+    except TimeoutExpired:
+        proc.kill()
+        proc.communicate()
+        raise PDFPopplerTimeoutError("Run poppler poppler timeout.")
+
+    try:
+        # TODO: Make this more robust
+        # The version is the last space-separated token of the first stderr line.
+        version = err.decode("utf8", "ignore").split("\n")[0].split(" ")[-1].split(".")
+        return int(version[0]), int(version[1])
+    except (IndexError, ValueError):
+        # Only parse failures are handled here: a non-numeric token or a
+        # version without a minor part.
+        # Lowest version that includes pdftocairo (2011)
+        return 0, 17
+
+
+def pdfinfo_from_path(
+    pdf_path: str,
+    userpw: str = None,
+    ownerpw: str = None,
+    poppler_path: str = None,
+    rawdates: bool = False,
+    timeout: int = None,
+    first_page: int = None,
+    last_page: int = None,
+) -> Dict:
+    """Function wrapping poppler's pdfinfo utility and returns the result as a dictionary.
+
+    Each ``key: value`` line printed by pdfinfo becomes one dictionary entry;
+    the "Pages" value is converted to an integer.
+
+    :param pdf_path: Path to the PDF that you want to convert
+    :type pdf_path: str
+    :param userpw: PDF's password, defaults to None
+    :type userpw: str, optional
+    :param ownerpw: PDF's owner password, defaults to None
+    :type ownerpw: str, optional
+    :param poppler_path: Path to look for poppler binaries, defaults to None
+    :type poppler_path: Union[str, PurePath], optional
+    :param rawdates: Return the undecoded data strings, defaults to False
+    :type rawdates: bool, optional
+    :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None
+    :type timeout: int, optional
+    :param first_page: First page to process, defaults to None
+    :type first_page: int, optional
+    :param last_page: Last page to process before stopping, defaults to None
+    :type last_page: int, optional
+    :raises PDFPopplerTimeoutError: Raised after the timeout for the image processing is exceeded
+    :raises PDFInfoNotInstalledError: Raised if pdfinfo is not installed
+    :raises PDFPageCountError: Raised if the output could not be parsed
+    :return: Dictionary containing various information on the PDF
+    :rtype: Dict
+    """
+    # Bound up front so the ValueError handler below can always format it,
+    # even when the error is raised before pdfinfo produced any output.
+    err = b""
+    try:
+        command = [_get_command_path("pdfinfo", poppler_path), pdf_path]
+
+        if userpw is not None:
+            command.extend(["-upw", userpw])
+
+        if ownerpw is not None:
+            command.extend(["-opw", ownerpw])
+
+        if rawdates:
+            command.extend(["-rawdates"])
+
+        if first_page:
+            command.extend(["-f", str(first_page)])
+
+        if last_page:
+            command.extend(["-l", str(last_page)])
+
+        # Add poppler path to LD_LIBRARY_PATH
+        env = os.environ.copy()
+        if poppler_path is not None:
+            env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")
+        proc = Popen(command, env=env, stdout=PIPE, stderr=PIPE)
+
+        try:
+            out, err = proc.communicate(timeout=timeout)
+        except TimeoutExpired:
+            proc.kill()
+            proc.communicate()
+            raise PDFPopplerTimeoutError("Run poppler poppler timeout.")
+
+        d = {}
+        for field in out.decode("utf8", "ignore").split("\n"):
+            # Split on the first colon only; values may contain colons themselves.
+            key, _, value = field.partition(":")
+            if key != "":
+                d[key] = (
+                    int(value.strip())
+                    if key in PDFINFO_CONVERT_TO_INT
+                    else value.strip()
+                )
+
+        if "Pages" not in d:
+            raise ValueError
+
+        return d
+
+    except OSError as exc:
+        raise PDFInfoNotInstalledError(
+            "Unable to get page count. Is poppler installed and in PATH?"
+        ) from exc
+    except ValueError as exc:
+        raise PDFPageCountError(
+            f"Unable to get page count.\n{err.decode('utf8', 'ignore')}"
+        ) from exc
+
+
+def pdfinfo_from_bytes(
+    pdf_bytes: bytes,
+    userpw: str = None,
+    ownerpw: str = None,
+    poppler_path: str = None,
+    rawdates: bool = False,
+    timeout: int = None,
+    first_page: int = None,
+    last_page: int = None,
+) -> Dict:
+    """Function wrapping poppler's pdfinfo utility and returns the result as a dictionary.
+
+    The bytes are written to a temporary file, which is inspected with
+    pdfinfo_from_path and removed again before returning.
+
+    :param pdf_bytes: Bytes of the PDF that you want to convert
+    :type pdf_bytes: bytes
+    :param userpw: PDF's password, defaults to None
+    :type userpw: str, optional
+    :param ownerpw: PDF's owner password, defaults to None
+    :type ownerpw: str, optional
+    :param poppler_path: Path to look for poppler binaries, defaults to None
+    :type poppler_path: Union[str, PurePath], optional
+    :param rawdates: Return the undecoded data strings, defaults to False
+    :type rawdates: bool, optional
+    :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None
+    :type timeout: int, optional
+    :param first_page: First page to process, defaults to None
+    :type first_page: int, optional
+    :param last_page: Last page to process before stopping, defaults to None
+    :type last_page: int, optional
+    :return: Dictionary containing various information on the PDF
+    :rtype: Dict
+    """
+    fh, temp_filename = tempfile.mkstemp()
+    try:
+        # Write through the descriptor mkstemp already opened; leaving the
+        # with-block closes it, so the file is complete and no longer held
+        # open when pdfinfo reads it.
+        with os.fdopen(fh, "wb") as f:
+            f.write(pdf_bytes)
+        return pdfinfo_from_path(
+            temp_filename,
+            userpw=userpw,
+            ownerpw=ownerpw,
+            poppler_path=poppler_path,
+            rawdates=rawdates,
+            timeout=timeout,
+            first_page=first_page,
+            last_page=last_page,
+        )
+    finally:
+        os.remove(temp_filename)
+
+
+def _load_from_output_folder(
+    output_folder: str,
+    output_file: str,
+    ext: str,
+    paths_only: bool,
+    in_memory: bool = False,
+) -> List[Image.Image]:
+    """Collect the files in ``output_folder`` that belong to one output name.
+
+    A file matches when its name starts with ``output_file`` and the text
+    after its last dot equals ``ext``. Matches are processed in sorted name
+    order. With ``paths_only`` the file paths are returned; otherwise each
+    file is opened with Pillow, and read fully when ``in_memory`` is set.
+    """
+    results = []
+    for name in sorted(os.listdir(output_folder)):
+        if not name.startswith(output_file) or name.split(".")[-1] != ext:
+            continue
+        full_path = os.path.join(output_folder, name)
+        if paths_only:
+            results.append(full_path)
+            continue
+        image = Image.open(full_path)
+        results.append(image)
+        if in_memory:
+            # Read the pixel data now rather than on first access.
+            image.load()
+    return results
diff --git a/.venv/lib/python3.12/site-packages/pdf2image/py.typed b/.venv/lib/python3.12/site-packages/pdf2image/py.typed
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pdf2image/py.typed