about summary refs log tree commit diff
path: root/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/pdf_utils.py
diff options
context:
space:
mode:
Diffstat (limited to '.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/pdf_utils.py')
-rw-r--r--.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/pdf_utils.py79
1 files changed, 79 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/pdf_utils.py b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/pdf_utils.py
new file mode 100644
index 00000000..27dc5e03
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/pdf_utils.py
@@ -0,0 +1,79 @@
+import io
+import logging
+from typing import Generator, Tuple, Optional
+
+from pypdf import PdfReader, PdfWriter
+from pypdf.errors import PdfReadError
+
+from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
+from unstructured_client.models import shared
+
+# pypdf can log a flood of warnings when loading PDFs with strict=False;
+# only its errors are worth surfacing, so raise that logger's threshold.
+pdf_logger = logging.getLogger("pypdf")
+pdf_logger.setLevel(logging.ERROR)
+
+logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME)
+
+
+def get_pdf_pages(
+    pdf: PdfReader, split_size: int = 1, page_start: int = 1, page_end: Optional[int] = None
+) -> Generator[Tuple[io.BytesIO, int, int], None, None]:
+    """Split a PDF into consecutive in-memory PDFs of at most `split_size` pages each.
+
+    Args:
+        pdf: Reader for the PDF to split.
+        split_size: Maximum pages per chunk, e.g. a 10-page file with a value of 2
+            yields 5 documents of 2 pages each. The default puts each page in its own file.
+        page_start: Begin splitting at this (1-based) page number.
+        page_end: If provided, the last page (inclusive) to split; clamped to the page count.
+
+    Yields:
+        Tuples of (rewound chunk buffer, 0-based index of the chunk's first page,
+        end of the split range, i.e. the clamped `page_end` or the page count).
+
+    Raises:
+        ValueError: On first iteration, if `split_size` or `page_start` is below 1.
+    """
+    # Reject values that cannot describe a forward page range (no progress / negative index).
+    if split_size < 1 or page_start < 1:
+        raise ValueError("split_size and page_start must both be at least 1")
+
+    page_count = len(pdf.pages)
+    # Clamp so a page_end beyond the document cannot yield trailing empty PDFs.
+    offset_end = min(page_end or page_count, page_count)
+
+    for offset in range(page_start - 1, offset_end, split_size):
+        new_pdf = PdfWriter()
+        for page in pdf.pages[offset : min(offset + split_size, offset_end)]:
+            new_pdf.add_page(page)
+        pdf_buffer = io.BytesIO()
+        new_pdf.write(pdf_buffer)
+        pdf_buffer.seek(0)
+        yield pdf_buffer, offset, offset_end
+
+
+def is_pdf(file: shared.Files) -> bool:
+    """Checks if the given file is a PDF.
+
+    First it checks the file extension (case-insensitively, so `.PDF` also counts), then
+    it tries to read that file. If there is no error then we assume it is a proper PDF.
+
+    Args:
+        file: The file to be checked.
+
+    Returns:
+        True if the file is a PDF, False otherwise.
+    """
+    if not file.file_name.lower().endswith(".pdf"):
+        logger.info("Given file doesn't have '.pdf' extension, so splitting is not enabled.")
+        return False
+
+    try:
+        PdfReader(io.BytesIO(file.content), strict=True)
+    except (PdfReadError, UnicodeDecodeError) as exc:
+        logger.error(exc)
+        logger.warning("The file does not appear to be a valid PDF.")
+        return False
+
+    return True