diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/pdf2image')
6 files changed, 868 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/pdf2image/__init__.py b/.venv/lib/python3.12/site-packages/pdf2image/__init__.py new file mode 100644 index 00000000..72601399 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pdf2image/__init__.py @@ -0,0 +1,8 @@ +""" + __init__ of the pdf2image module +""" + +from .pdf2image import convert_from_bytes as convert_from_bytes +from .pdf2image import convert_from_path as convert_from_path +from .pdf2image import pdfinfo_from_bytes as pdfinfo_from_bytes +from .pdf2image import pdfinfo_from_path as pdfinfo_from_path diff --git a/.venv/lib/python3.12/site-packages/pdf2image/exceptions.py b/.venv/lib/python3.12/site-packages/pdf2image/exceptions.py new file mode 100644 index 00000000..bf201089 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pdf2image/exceptions.py @@ -0,0 +1,33 @@ +""" + Define exceptions specific to pdf2image +""" + + +class PopplerNotInstalledError(Exception): + """Raised when poppler is not installed""" + + pass + + +class PDFInfoNotInstalledError(PopplerNotInstalledError): + """Raised when pdfinfo is not installed""" + + pass + + +class PDFPageCountError(Exception): + """Raised when the pdfinfo was unable to retrieve the page count""" + + pass + + +class PDFSyntaxError(Exception): + """Raised when a syntax error was thrown during rendering""" + + pass + + +class PDFPopplerTimeoutError(Exception): + """Raised when the timeout is exceeded while converting a PDF""" + + pass diff --git a/.venv/lib/python3.12/site-packages/pdf2image/generators.py b/.venv/lib/python3.12/site-packages/pdf2image/generators.py new file mode 100644 index 00000000..5d79a3ce --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pdf2image/generators.py @@ -0,0 +1,46 @@ +""" + pdf2image filename generators +""" + +import uuid +import threading + + +class ThreadSafeGenerator(object): + """Wrapper around generator that protects concurrent access""" + + def __init__(self, gen): + self.gen = gen + self.lock = threading.Lock() + + def __iter__(self): + return self + + def __next__(self): + with self.lock: + return next(self.gen) + + +def threadsafe(f): + """Decorator to make generator threadsafe. Fix #125""" + + def g(*a, **kw): + return ThreadSafeGenerator(f(*a, **kw)) + + return g + + +@threadsafe +def uuid_generator(): + """Returns a UUID4""" + while True: + yield str(uuid.uuid4()) + + +@threadsafe +def counter_generator(prefix="", suffix="", padding_goal=4): + """Returns a joined prefix, iteration number, and suffix""" + i = 0 + while True: + i += 1 + yield str(prefix) + str(i).zfill(padding_goal) + str(suffix) diff --git a/.venv/lib/python3.12/site-packages/pdf2image/parsers.py b/.venv/lib/python3.12/site-packages/pdf2image/parsers.py new file mode 100644 index 00000000..72f51250 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pdf2image/parsers.py @@ -0,0 +1,98 @@ +""" + pdf2image custom buffer parsers +""" + +from io import BytesIO +from typing import List + +from PIL import Image + + +def parse_buffer_to_ppm(data: bytes) -> List[Image.Image]: + """Parse PPM file bytes to Pillow Image + + :param data: pdftoppm/pdftocairo output bytes + :type data: bytes + :return: List of PPM images parsed from the output + :rtype: List[Image.Image] + """ + + images = [] + + index = 0 + + while index < len(data): + code, size, rgb = tuple(data[index : index + 40].split(b"\n")[0:3]) + size_x, size_y = tuple(size.split(b" ")) + file_size = len(code) + len(size) + len(rgb) + 3 + int(size_x) * int(size_y) * 3 + images.append(Image.open(BytesIO(data[index : index + file_size]))) + index += file_size + + return images + + +def parse_buffer_to_pgm(data: bytes) -> List[Image.Image]: + """Parse PGM file bytes to Pillow Image + + :param data: pdftoppm/pdftocairo output bytes + :type data: bytes + :return: List of PGM images parsed from the output + :rtype: List[Image.Image] + """ + + images = [] + + index = 0 + + while index < len(data): + code, size, maxval = tuple(data[index : index + 40].split(b"\n")[0:3]) + size_x, size_y = tuple(size.split(b" ")) + file_size = len(code) + len(size) + len(maxval) + 3 + int(size_x) * int(size_y) + images.append(Image.open(BytesIO(data[index : index + file_size]))) + index += file_size + + return images + + +def parse_buffer_to_jpeg(data: bytes) -> List[Image.Image]: + """Parse JPEG file bytes to Pillow Image + + :param data: pdftoppm/pdftocairo output bytes + :type data: bytes + :return: List of JPEG images parsed from the output + :rtype: List[Image.Image] + """ + + return [ + Image.open(BytesIO(image_data + b"\xff\xd9")) + for image_data in data.split(b"\xff\xd9")[ + :-1 + ] # Last element is obviously empty + ] + + +def parse_buffer_to_png(data: bytes) -> List[Image.Image]: + """Parse PNG file bytes to Pillow Image + + :param data: pdftoppm/pdftocairo output bytes + :type data: bytes + :return: List of PNG images parsed from the output + :rtype: List[Image.Image] + """ + + images = [] + + c1 = 0 + c2 = 0 + data_len = len(data) + while c1 < data_len: + # IEND can appear in a PNG without being the actual end + if data[c2 : c2 + 4] == b"IEND" and ( + c2 + 8 == data_len or data[c2 + 9 : c2 + 12] == b"PNG" + ): + images.append(Image.open(BytesIO(data[c1 : c2 + 8]))) + c1 = c2 + 8 + c2 = c1 + c2 += 1 + + return images diff --git a/.venv/lib/python3.12/site-packages/pdf2image/pdf2image.py b/.venv/lib/python3.12/site-packages/pdf2image/pdf2image.py new file mode 100644 index 00000000..21d19c88 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pdf2image/pdf2image.py @@ -0,0 +1,683 @@ +""" + pdf2image is a light wrapper for the poppler-utils tools that can convert your + PDFs into Pillow images. +""" + +import os +import platform +import tempfile +import types +import shutil +import subprocess +from subprocess import Popen, PIPE, TimeoutExpired +from typing import Any, Union, Tuple, List, Dict, Callable +from pathlib import PurePath +from PIL import Image + +from pdf2image.generators import uuid_generator, counter_generator, ThreadSafeGenerator + +from pdf2image.parsers import ( + parse_buffer_to_pgm, + parse_buffer_to_ppm, + parse_buffer_to_jpeg, + parse_buffer_to_png, +) + +from pdf2image.exceptions import ( + PDFInfoNotInstalledError, + PDFPageCountError, + PDFSyntaxError, + PDFPopplerTimeoutError, +) + +TRANSPARENT_FILE_TYPES = ["png", "tiff"] +PDFINFO_CONVERT_TO_INT = ["Pages"] + + +def convert_from_path( + pdf_path: Union[str, PurePath], + dpi: int = 200, + output_folder: Union[str, PurePath] = None, + first_page: int = None, + last_page: int = None, + fmt: str = "ppm", + jpegopt: Dict = None, + thread_count: int = 1, + userpw: str = None, + ownerpw: str = None, + use_cropbox: bool = False, + strict: bool = False, + transparent: bool = False, + single_file: bool = False, + output_file: Any = uuid_generator(), + poppler_path: Union[str, PurePath] = None, + grayscale: bool = False, + size: Union[Tuple, int] = None, + paths_only: bool = False, + use_pdftocairo: bool = False, + timeout: int = None, + hide_annotations: bool = False, +) -> List[Image.Image]: + """Function wrapping pdftoppm and pdftocairo + + :param pdf_path: Path to the PDF that you want to convert + :type pdf_path: Union[str, PurePath] + :param dpi: Image quality in DPI (default 200), defaults to 200 + :type dpi: int, optional + :param output_folder: Write the resulting images to a folder (instead of directly in memory), defaults to None + :type output_folder: Union[str, PurePath], optional + :param first_page: First page to process, defaults to None + :type first_page: int, optional + :param last_page: Last page to process before stopping, defaults to None + :type last_page: int, optional + :param fmt: Output image format, defaults to "ppm" + :type fmt: str, optional + :param jpegopt: jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format), defaults to None + :type jpegopt: Dict, optional + :param thread_count: How many threads we are allowed to spawn for processing, defaults to 1 + :type thread_count: int, optional + :param userpw: PDF's password, defaults to None + :type userpw: str, optional + :param ownerpw: PDF's owner password, defaults to None + :type ownerpw: str, optional + :param use_cropbox: Use cropbox instead of mediabox, defaults to False + :type use_cropbox: bool, optional + :param strict: When a Syntax Error is thrown, it will be raised as an Exception, defaults to False + :type strict: bool, optional + :param transparent: Output with a transparent background instead of a white one, defaults to False + :type transparent: bool, optional + :param single_file: Uses the -singlefile option from pdftoppm/pdftocairo, defaults to False + :type single_file: bool, optional + :param output_file: What is the output filename or generator, defaults to uuid_generator() + :type output_file: Any, optional + :param poppler_path: Path to look for poppler binaries, defaults to None + :type poppler_path: Union[str, PurePath], optional + :param grayscale: Output grayscale image(s), defaults to False + :type grayscale: bool, optional + :param size: Size of the resulting image(s), uses the Pillow (width, height) standard, defaults to None + :type size: Union[Tuple, int], optional + :param paths_only: Don't load image(s), return paths instead (requires output_folder), defaults to False + :type paths_only: bool, optional + :param use_pdftocairo: Use pdftocairo instead of pdftoppm, may help performance, defaults to False + :type use_pdftocairo: bool, optional + :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None + :type timeout: int, optional + :param hide_annotations: Hide PDF annotations in the output, defaults to False + :type hide_annotations: bool, optional + :raises NotImplementedError: Raised when conflicting parameters are given (hide_annotations for pdftocairo) + :raises PDFPopplerTimeoutError: Raised after the timeout for the image processing is exceeded + :raises PDFSyntaxError: Raised if there is a syntax error in the PDF and strict=True + :return: A list of Pillow images, one for each page between first_page and last_page + :rtype: List[Image.Image] + """ + + if use_pdftocairo and fmt == "ppm": + fmt = "png" + + # We make sure that if passed arguments are Path objects, they're converted to strings + if isinstance(pdf_path, PurePath): + pdf_path = pdf_path.as_posix() + + if isinstance(output_folder, PurePath): + output_folder = output_folder.as_posix() + + if isinstance(poppler_path, PurePath): + poppler_path = poppler_path.as_posix() + + page_count = pdfinfo_from_path( + pdf_path, userpw, ownerpw, poppler_path=poppler_path + )["Pages"] + + # We start by getting the output format, the buffer processing function and if we need pdftocairo + parsed_fmt, final_extension, parse_buffer_func, use_pdfcairo_format = _parse_format( + fmt, grayscale + ) + + # We use pdftocairo is the format requires it OR we need a transparent output + use_pdfcairo = ( + use_pdftocairo + or use_pdfcairo_format + or (transparent and parsed_fmt in TRANSPARENT_FILE_TYPES) + ) + + poppler_version_major, poppler_version_minor = _get_poppler_version( + "pdftocairo" if use_pdfcairo else "pdftoppm", poppler_path=poppler_path + ) + + if poppler_version_major == 0 and poppler_version_minor <= 57: + jpegopt = None + + if poppler_version_major == 0 and poppler_version_minor <= 83: + hide_annotations = False + + # If output_file isn't a generator, it will be turned into one + if not isinstance(output_file, types.GeneratorType) and not isinstance( + output_file, ThreadSafeGenerator + ): + if single_file: + output_file = iter([output_file]) + thread_count = 1 + else: + output_file = counter_generator(output_file) + + if thread_count < 1: + thread_count = 1 + + if first_page is None or first_page < 1: + first_page = 1 + + if last_page is None or last_page > page_count: + last_page = page_count + + if first_page > last_page: + return [] + + try: + auto_temp_dir = False + if output_folder is None and use_pdfcairo: + output_folder = tempfile.mkdtemp() + auto_temp_dir = True + + # Recalculate page count based on first and last page + page_count = last_page - first_page + 1 + + if thread_count > page_count: + thread_count = page_count + + reminder = page_count % thread_count + current_page = first_page + processes = [] + for _ in range(thread_count): + thread_output_file = next(output_file) + + # Get the number of pages the thread will be processing + thread_page_count = page_count // thread_count + int(reminder > 0) + # Build the command accordingly + args = _build_command( + ["-r", str(dpi), pdf_path], + output_folder, + current_page, + current_page + thread_page_count - 1, + parsed_fmt, + jpegopt, + thread_output_file, + userpw, + ownerpw, + use_cropbox, + transparent, + single_file, + grayscale, + size, + hide_annotations, + ) + + if use_pdfcairo: + if hide_annotations: + raise NotImplementedError( + "Hide annotations flag not implemented in pdftocairo." + ) + args = [_get_command_path("pdftocairo", poppler_path)] + args + else: + args = [_get_command_path("pdftoppm", poppler_path)] + args + + # Update page values + current_page = current_page + thread_page_count + reminder -= int(reminder > 0) + # Add poppler path to LD_LIBRARY_PATH + env = os.environ.copy() + if poppler_path is not None: + env["LD_LIBRARY_PATH"] = ( + poppler_path + ":" + env.get("LD_LIBRARY_PATH", "") + ) + # Spawn the process and save its uuid + startupinfo = None + if platform.system() == "Windows": + # this startupinfo structure prevents a console window from popping up on Windows + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + processes.append( + ( + thread_output_file, + Popen( + args, env=env, stdout=PIPE, stderr=PIPE, startupinfo=startupinfo + ), + ) + ) + + images = [] + + for uid, proc in processes: + try: + data, err = proc.communicate(timeout=timeout) + except TimeoutExpired: + proc.kill() + outs, errs = proc.communicate() + raise PDFPopplerTimeoutError("Run poppler timeout.") + + if b"Syntax Error" in err and strict: + raise PDFSyntaxError(err.decode("utf8", "ignore")) + + if output_folder is not None: + images += _load_from_output_folder( + output_folder, + uid, + final_extension, + paths_only, + in_memory=auto_temp_dir, + ) + else: + images += parse_buffer_func(data) + finally: + if auto_temp_dir: + shutil.rmtree(output_folder) + + return images + + +def convert_from_bytes( + pdf_file: bytes, + dpi: int = 200, + output_folder: Union[str, PurePath] = None, + first_page: int = None, + last_page: int = None, + fmt: str = "ppm", + jpegopt: Dict = None, + thread_count: int = 1, + userpw: str = None, + ownerpw: str = None, + use_cropbox: bool = False, + strict: bool = False, + transparent: bool = False, + single_file: bool = False, + output_file: Union[str, PurePath] = uuid_generator(), + poppler_path: Union[str, PurePath] = None, + grayscale: bool = False, + size: Union[Tuple, int] = None, + paths_only: bool = False, + use_pdftocairo: bool = False, + timeout: int = None, + hide_annotations: bool = False, +) -> List[Image.Image]: + """Function wrapping pdftoppm and pdftocairo. + + :param pdf_bytes: Bytes of the PDF that you want to convert + :type pdf_bytes: bytes + :param dpi: Image quality in DPI (default 200), defaults to 200 + :type dpi: int, optional + :param output_folder: Write the resulting images to a folder (instead of directly in memory), defaults to None + :type output_folder: Union[str, PurePath], optional + :param first_page: First page to process, defaults to None + :type first_page: int, optional + :param last_page: Last page to process before stopping, defaults to None + :type last_page: int, optional + :param fmt: Output image format, defaults to "ppm" + :type fmt: str, optional + :param jpegopt: jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format), defaults to None + :type jpegopt: Dict, optional + :param thread_count: How many threads we are allowed to spawn for processing, defaults to 1 + :type thread_count: int, optional + :param userpw: PDF's password, defaults to None + :type userpw: str, optional + :param ownerpw: PDF's owner password, defaults to None + :type ownerpw: str, optional + :param use_cropbox: Use cropbox instead of mediabox, defaults to False + :type use_cropbox: bool, optional + :param strict: When a Syntax Error is thrown, it will be raised as an Exception, defaults to False + :type strict: bool, optional + :param transparent: Output with a transparent background instead of a white one, defaults to False + :type transparent: bool, optional + :param single_file: Uses the -singlefile option from pdftoppm/pdftocairo, defaults to False + :type single_file: bool, optional + :param output_file: What is the output filename or generator, defaults to uuid_generator() + :type output_file: Any, optional + :param poppler_path: Path to look for poppler binaries, defaults to None + :type poppler_path: Union[str, PurePath], optional + :param grayscale: Output grayscale image(s), defaults to False + :type grayscale: bool, optional + :param size: Size of the resulting image(s), uses the Pillow (width, height) standard, defaults to None + :type size: Union[Tuple, int], optional + :param paths_only: Don't load image(s), return paths instead (requires output_folder), defaults to False + :type paths_only: bool, optional + :param use_pdftocairo: Use pdftocairo instead of pdftoppm, may help performance, defaults to False + :type use_pdftocairo: bool, optional + :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None + :type timeout: int, optional + :param hide_annotations: Hide PDF annotations in the output, defaults to False + :type hide_annotations: bool, optional + :raises NotImplementedError: Raised when conflicting parameters are given (hide_annotations for pdftocairo) + :raises PDFPopplerTimeoutError: Raised after the timeout for the image processing is exceeded + :raises PDFSyntaxError: Raised if there is a syntax error in the PDF and strict=True + :return: A list of Pillow images, one for each page between first_page and last_page + :rtype: List[Image.Image] + """ + + fh, temp_filename = tempfile.mkstemp() + try: + with open(temp_filename, "wb") as f: + f.write(pdf_file) + f.flush() + return convert_from_path( + f.name, + dpi=dpi, + output_folder=output_folder, + first_page=first_page, + last_page=last_page, + fmt=fmt, + jpegopt=jpegopt, + thread_count=thread_count, + userpw=userpw, + ownerpw=ownerpw, + use_cropbox=use_cropbox, + strict=strict, + transparent=transparent, + single_file=single_file, + output_file=output_file, + poppler_path=poppler_path, + grayscale=grayscale, + size=size, + paths_only=paths_only, + use_pdftocairo=use_pdftocairo, + timeout=timeout, + hide_annotations=hide_annotations, + ) + finally: + os.close(fh) + os.remove(temp_filename) + + +def _build_command( + args: List, + output_folder: str, + first_page: int, + last_page: int, + fmt: str, + jpegopt: Dict, + output_file: str, + userpw: str, + ownerpw: str, + use_cropbox: bool, + transparent: bool, + single_file: bool, + grayscale: bool, + size: Union[int, Tuple[int, int]], + hide_annotations: bool, +) -> List[str]: + if use_cropbox: + args.append("-cropbox") + + if hide_annotations: + args.append("-hide-annotations") + + if transparent and fmt in TRANSPARENT_FILE_TYPES: + args.append("-transp") + + if first_page is not None: + args.extend(["-f", str(first_page)]) + + if last_page is not None: + args.extend(["-l", str(last_page)]) + + if fmt not in ["pgm", "ppm"]: + args.append("-" + fmt) + + if fmt in ["jpeg", "jpg"] and jpegopt: + args.extend(["-jpegopt", _parse_jpegopt(jpegopt)]) + + if single_file: + args.append("-singlefile") + + if output_folder is not None: + args.append(os.path.join(output_folder, output_file)) + + if userpw is not None: + args.extend(["-upw", userpw]) + + if ownerpw is not None: + args.extend(["-opw", ownerpw]) + + if grayscale: + args.append("-gray") + + if size is None: + pass + elif isinstance(size, tuple) and len(size) == 2: + if size[0] is not None: + args.extend(["-scale-to-x", str(int(size[0]))]) + else: + args.extend(["-scale-to-x", str(-1)]) + if size[1] is not None: + args.extend(["-scale-to-y", str(int(size[1]))]) + else: + args.extend(["-scale-to-y", str(-1)]) + elif isinstance(size, tuple) and len(size) == 1: + args.extend(["-scale-to", str(int(size[0]))]) + elif isinstance(size, int) or isinstance(size, float): + args.extend(["-scale-to", str(int(size))]) + else: + raise ValueError(f"Size {size} is not a tuple or an integer") + + return args + + +def _parse_format(fmt: str, grayscale: bool = False) -> Tuple[str, str, Callable, bool]: + fmt = fmt.lower() + if fmt[0] == ".": + fmt = fmt[1:] + if fmt in ("jpeg", "jpg"): + return "jpeg", "jpg", parse_buffer_to_jpeg, False + if fmt == "png": + return "png", "png", parse_buffer_to_png, False + if fmt in ("tif", "tiff"): + return "tiff", "tif", None, True + if fmt == "ppm" and grayscale: + return "pgm", "pgm", parse_buffer_to_pgm, False + # Unable to parse the format so we'll use the default + return "ppm", "ppm", parse_buffer_to_ppm, False + + +def _parse_jpegopt(jpegopt: Dict) -> str: + parts = [] + for k, v in jpegopt.items(): + if v is True: + v = "y" + if v is False: + v = "n" + parts.append("{}={}".format(k, v)) + return ",".join(parts) + + +def _get_command_path(command: str, poppler_path: str = None) -> str: + if platform.system() == "Windows": + command = command + ".exe" + + if poppler_path is not None: + command = os.path.join(poppler_path, command) + + return command + + +def _get_poppler_version( + command: str, poppler_path: str = None, timeout: int = None +) -> Tuple[int, int]: + command = [_get_command_path(command, poppler_path), "-v"] + + env = os.environ.copy() + if poppler_path is not None: + env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "") + proc = Popen(command, env=env, stdout=PIPE, stderr=PIPE) + + try: + data, err = proc.communicate(timeout=timeout) + except TimeoutExpired: + proc.kill() + outs, errs = proc.communicate() + raise PDFPopplerTimeoutError("Run poppler poppler timeout.") + + try: + # TODO: Make this more robust + version = err.decode("utf8", "ignore").split("\n")[0].split(" ")[-1].split(".") + return int(version[0]), int(version[1]) + except: + # Lowest version that includes pdftocairo (2011) + return 0, 17 + + +def pdfinfo_from_path( + pdf_path: str, + userpw: str = None, + ownerpw: str = None, + poppler_path: str = None, + rawdates: bool = False, + timeout: int = None, + first_page: int = None, + last_page: int = None, +) -> Dict: + """Function wrapping poppler's pdfinfo utility and returns the result as a dictionary. + + :param pdf_path: Path to the PDF that you want to convert + :type pdf_path: str + :param userpw: PDF's password, defaults to None + :type userpw: str, optional + :param ownerpw: PDF's owner password, defaults to None + :type ownerpw: str, optional + :param poppler_path: Path to look for poppler binaries, defaults to None + :type poppler_path: Union[str, PurePath], optional + :param rawdates: Return the undecoded data strings, defaults to False + :type rawdates: bool, optional + :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None + :type timeout: int, optional + :param first_page: First page to process, defaults to None + :type first_page: int, optional + :param last_page: Last page to process before stopping, defaults to None + :type last_page: int, optional + :raises PDFPopplerTimeoutError: Raised after the timeout for the image processing is exceeded + :raises PDFInfoNotInstalledError: Raised if pdfinfo is not installed + :raises PDFPageCountError: Raised if the output could not be parsed + :return: Dictionary containing various information on the PDF + :rtype: Dict + """ + try: + command = [_get_command_path("pdfinfo", poppler_path), pdf_path] + + if userpw is not None: + command.extend(["-upw", userpw]) + + if ownerpw is not None: + command.extend(["-opw", ownerpw]) + + if rawdates: + command.extend(["-rawdates"]) + + if first_page: + command.extend(["-f", str(first_page)]) + + if last_page: + command.extend(["-l", str(last_page)]) + + # Add poppler path to LD_LIBRARY_PATH + env = os.environ.copy() + if poppler_path is not None: + env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "") + proc = Popen(command, env=env, stdout=PIPE, stderr=PIPE) + + try: + out, err = proc.communicate(timeout=timeout) + except TimeoutExpired: + proc.kill() + outs, errs = proc.communicate() + raise PDFPopplerTimeoutError("Run poppler poppler timeout.") + + d = {} + for field in out.decode("utf8", "ignore").split("\n"): + sf = field.split(":") + key, value = sf[0], ":".join(sf[1:]) + if key != "": + d[key] = ( + int(value.strip()) + if key in PDFINFO_CONVERT_TO_INT + else value.strip() + ) + + if "Pages" not in d: + raise ValueError + + return d + + except OSError: + raise PDFInfoNotInstalledError( + "Unable to get page count. Is poppler installed and in PATH?" + ) + except ValueError: + raise PDFPageCountError( + f"Unable to get page count.\n{err.decode('utf8', 'ignore')}" + ) + + +def pdfinfo_from_bytes( + pdf_bytes: bytes, + userpw: str = None, + ownerpw: str = None, + poppler_path: str = None, + rawdates: bool = False, + timeout: int = None, + first_page: int = None, + last_page: int = None, +) -> Dict: + """Function wrapping poppler's pdfinfo utility and returns the result as a dictionary. + + :param pdf_bytes: Bytes of the PDF that you want to convert + :type pdf_bytes: bytes + :param userpw: PDF's password, defaults to None + :type userpw: str, optional + :param ownerpw: PDF's owner password, defaults to None + :type ownerpw: str, optional + :param poppler_path: Path to look for poppler binaries, defaults to None + :type poppler_path: Union[str, PurePath], optional + :param rawdates: Return the undecoded data strings, defaults to False + :type rawdates: bool, optional + :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None + :type timeout: int, optional + :param first_page: First page to process, defaults to None + :type first_page: int, optional + :param last_page: Last page to process before stopping, defaults to None + :type last_page: int, optional + :return: Dictionary containing various information on the PDF + :rtype: Dict + """ + fh, temp_filename = tempfile.mkstemp() + try: + with open(temp_filename, "wb") as f: + f.write(pdf_bytes) + f.flush() + return pdfinfo_from_path( + temp_filename, + userpw=userpw, + ownerpw=ownerpw, + poppler_path=poppler_path, + rawdates=rawdates, + timeout=timeout, + first_page=first_page, + last_page=last_page, + ) + finally: + os.close(fh) + os.remove(temp_filename) + + +def _load_from_output_folder( + output_folder: str, + output_file: str, + ext: str, + paths_only: bool, + in_memory: bool = False, +) -> List[Image.Image]: + images = [] + for f in sorted(os.listdir(output_folder)): + if f.startswith(output_file) and f.split(".")[-1] == ext: + if paths_only: + images.append(os.path.join(output_folder, f)) + else: + images.append(Image.open(os.path.join(output_folder, f))) + if in_memory: + images[-1].load() + return images diff --git a/.venv/lib/python3.12/site-packages/pdf2image/py.typed b/.venv/lib/python3.12/site-packages/pdf2image/py.typed new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pdf2image/py.typed |