import io import logging from typing import Generator, Tuple, Optional from pypdf import PdfReader, PdfWriter from pypdf.errors import PdfReadError from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME from unstructured_client.models import shared logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME) # Loading pdfs with strict=False can dump a lot of warnings # We don't need to display these pdf_logger = logging.getLogger("pypdf") pdf_logger.setLevel(logging.ERROR) def get_pdf_pages( pdf: PdfReader, split_size: int = 1, page_start: int = 1, page_end: Optional[int] = None ) -> Generator[Tuple[io.BytesIO, int, int], None, None]: """Reads given bytes of a pdf file and split it into n file-like objects, each with `split_size` pages. Args: file_content: Content of the PDF file. split_size: Split size, e.g. if the given file has 10 pages and this value is set to 2 it will yield 5 documents, each containing 2 pages of the original document. By default it will split each page to a separate file. page_start: Begin splitting at this page number page_end: If provided, split up to and including this page number Yields: The file contents with their page number and overall pages number of the original document. """ offset = page_start - 1 offset_end = page_end or len(pdf.pages) while offset < offset_end: new_pdf = PdfWriter() pdf_buffer = io.BytesIO() end = min(offset + split_size, offset_end) for page in list(pdf.pages[offset:end]): new_pdf.add_page(page) new_pdf.write(pdf_buffer) pdf_buffer.seek(0) yield pdf_buffer, offset, offset_end offset += split_size def is_pdf(file: shared.Files) -> bool: """Checks if the given file is a PDF. First it checks the file extension and if it is equal to `.pdf`, then it tries to read that file. If there is no error then we assume it is a proper PDF. Args: file: The file to be checked. Returns: True if the file is a PDF, False otherwise. """ if not file.file_name.endswith(".pdf"): logger.info("Given file doesn't have '.pdf' extension, so splitting is not enabled.") return False try: PdfReader(io.BytesIO(file.content), strict=True) except (PdfReadError, UnicodeDecodeError) as exc: logger.error(exc) logger.warning("The file does not appear to be a valid PDF.") return False return True