diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/pdf_utils.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/pdf_utils.py | 79 |
1 files changed, 79 insertions, 0 deletions
# Recovered from the diff that adds
# .venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/pdf_utils.py
# (new file, blob 27dc5e03).
import io
import logging
from typing import Generator, Tuple, Optional

from pypdf import PdfReader, PdfWriter
from pypdf.errors import PdfReadError

from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
from unstructured_client.models import shared

logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME)

# Loading pdfs with strict=False can dump a lot of warnings
# We don't need to display these
pdf_logger = logging.getLogger("pypdf")
pdf_logger.setLevel(logging.ERROR)


def get_pdf_pages(
    pdf: PdfReader, split_size: int = 1, page_start: int = 1, page_end: Optional[int] = None
) -> Generator[Tuple[io.BytesIO, int, int], None, None]:
    """Split an opened PDF into in-memory PDFs of at most `split_size` pages each.

    Args:
        pdf: The already-parsed PDF to split.
        split_size: Maximum number of pages per produced document, e.g. a
            10-page file with `split_size=2` yields 5 documents of 2 pages.
            Defaults to one page per document. Must be at least 1.
        page_start: 1-based page number at which splitting begins. Must be
            at least 1.
        page_end: If provided (and non-zero), split up to and including this
            1-based page number; values beyond the last page are clamped to
            the document length. Otherwise split through the last page.

    Yields:
        Tuples of `(buffer, offset, offset_end)`: a `BytesIO` rewound to
        position 0 holding the chunk, the 0-based index of the chunk's first
        page in the original document, and the (clamped) end page of the
        whole split range.

    Raises:
        ValueError: If `split_size` or `page_start` is smaller than 1. Because
            this is a generator, the error surfaces on the first iteration.
    """
    # A non-positive split size would never advance `offset` and loop forever.
    if split_size < 1:
        raise ValueError(f"split_size must be at least 1, got {split_size}")
    # Pages are 1-based; anything lower would turn into a negative slice index.
    if page_start < 1:
        raise ValueError(f"page_start must be at least 1, got {page_start}")

    total_pages = len(pdf.pages)
    # Clamp so that an oversized page_end does not produce empty trailing PDFs.
    offset_end = min(page_end, total_pages) if page_end else total_pages
    offset = page_start - 1

    while offset < offset_end:
        end = min(offset + split_size, offset_end)

        new_pdf = PdfWriter()
        for page in pdf.pages[offset:end]:
            new_pdf.add_page(page)

        pdf_buffer = io.BytesIO()
        new_pdf.write(pdf_buffer)
        # Rewind so consumers can read the chunk from the beginning.
        pdf_buffer.seek(0)

        yield pdf_buffer, offset, offset_end
        offset += split_size


def is_pdf(file: shared.Files) -> bool:
    """Check whether the given file is a readable PDF.

    The file name must end with `.pdf` (compared case-insensitively); if it
    does, the content is parsed with `PdfReader` in strict mode and the file
    is accepted only when parsing raises no read or decode error.

    Args:
        file: The file to be checked; its `file_name` and `content` are read.

    Returns:
        True if the file is a PDF, False otherwise.
    """
    # Accept ".PDF" / ".Pdf" too; the content is still validated below.
    if not file.file_name.lower().endswith(".pdf"):
        logger.info("Given file doesn't have '.pdf' extension, so splitting is not enabled.")
        return False

    try:
        PdfReader(io.BytesIO(file.content), strict=True)
    except (PdfReadError, UnicodeDecodeError) as exc:
        logger.error(exc)
        logger.warning("The file does not appear to be a valid PDF.")
        return False

    return True