1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
|
import io
import logging
from typing import Generator, Tuple, Optional
from pypdf import PdfReader, PdfWriter
from pypdf.errors import PdfReadError
from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
from unstructured_client.models import shared
# Logger used by this module, looked up by the shared client logger name.
logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME)
# Loading PDFs with strict=False can make pypdf dump a lot of warnings.
# We don't need to display these, so the "pypdf" logger is limited to
# ERROR and above.
pdf_logger = logging.getLogger("pypdf")
pdf_logger.setLevel(logging.ERROR)
def get_pdf_pages(
    pdf: PdfReader, split_size: int = 1, page_start: int = 1, page_end: Optional[int] = None
) -> Generator[Tuple[io.BytesIO, int, int], None, None]:
    """Split a PDF into in-memory documents of at most `split_size` pages each.

    Args:
        pdf: Reader for the PDF that should be split.
        split_size: Number of pages per yielded document, e.g. if the selected
            range has 10 pages and this value is set to 2 it will yield 5
            documents, each containing 2 pages of the original document. The
            last document may be shorter. By default each page is split into
            a separate document.
        page_start: Begin splitting at this (1-based) page number.
        page_end: If provided, split up to and including this page number.
            A falsy value (None or 0) means "through the last page".

    Yields:
        Tuples of `(pdf_buffer, offset, offset_end)` where `pdf_buffer` is a
        `BytesIO` holding the written chunk (rewound to position 0), `offset`
        is the zero-based index of the chunk's first page in the original
        document, and `offset_end` is the exclusive end index of the range
        being split (`page_end`, or the page count when it is not given).

    Raises:
        ValueError: If `split_size` is smaller than 1. Because this is a
            generator, the error surfaces on the first iteration.
    """
    # A non-positive split size would never advance `offset` past
    # `offset_end`, so the loop below would spin forever.
    if split_size < 1:
        raise ValueError(f"split_size must be at least 1, got {split_size}")

    offset = page_start - 1
    offset_end = page_end or len(pdf.pages)

    while offset < offset_end:
        # Never read past the requested range, even on the final short chunk.
        chunk_end = min(offset + split_size, offset_end)

        new_pdf = PdfWriter()
        for page in pdf.pages[offset:chunk_end]:
            new_pdf.add_page(page)

        pdf_buffer = io.BytesIO()
        new_pdf.write(pdf_buffer)
        # Rewind so consumers can read the chunk from the beginning.
        pdf_buffer.seek(0)

        yield pdf_buffer, offset, offset_end
        offset += split_size
def is_pdf(file: shared.Files) -> bool:
    """Check whether the given file is a readable PDF.

    First the file name is checked for a `.pdf` extension (compared
    case-insensitively, so `.PDF` is accepted as well). If it matches, the
    content is loaded with pypdf in strict mode; if that raises no read error
    we assume it is a proper PDF.

    Args:
        file: The file to be checked. Its `file_name` and `content` are read.

    Returns:
        True if the file is a PDF, False otherwise.
    """
    # Compare in lower case so names like "REPORT.PDF" are not rejected.
    if not file.file_name.lower().endswith(".pdf"):
        logger.info("Given file doesn't have '.pdf' extension, so splitting is not enabled.")
        return False

    try:
        # Only construction matters here: a read error means the content is not a usable PDF.
        PdfReader(io.BytesIO(file.content), strict=True)
    except (PdfReadError, UnicodeDecodeError) as exc:
        logger.error(exc)
        logger.warning("The file does not appear to be a valid PDF.")
        return False

    return True
|