1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
"""
pdf2image is a light wrapper around the poppler-utils tools that converts
PDFs into Pillow images.
"""
import os
import platform
import tempfile
import types
import shutil
import subprocess
from subprocess import Popen, PIPE, TimeoutExpired
from typing import Any, Union, Tuple, List, Dict, Callable
from pathlib import PurePath
from PIL import Image
from pdf2image.generators import uuid_generator, counter_generator, ThreadSafeGenerator
from pdf2image.parsers import (
parse_buffer_to_pgm,
parse_buffer_to_ppm,
parse_buffer_to_jpeg,
parse_buffer_to_png,
)
from pdf2image.exceptions import (
PDFInfoNotInstalledError,
PDFPageCountError,
PDFSyntaxError,
PDFPopplerTimeoutError,
)
# Output formats for which a transparent background can be requested; the
# "-transp" flag is only emitted (and pdftocairo only forced) for these.
TRANSPARENT_FILE_TYPES = ["png", "tiff"]
# pdfinfo output fields whose values are converted to int while parsing.
PDFINFO_CONVERT_TO_INT = ["Pages"]
def convert_from_path(
    pdf_path: Union[str, PurePath],
    dpi: int = 200,
    output_folder: Union[str, PurePath] = None,
    first_page: int = None,
    last_page: int = None,
    fmt: str = "ppm",
    jpegopt: Dict = None,
    thread_count: int = 1,
    userpw: str = None,
    ownerpw: str = None,
    use_cropbox: bool = False,
    strict: bool = False,
    transparent: bool = False,
    single_file: bool = False,
    output_file: Any = uuid_generator(),
    poppler_path: Union[str, PurePath] = None,
    grayscale: bool = False,
    size: Union[Tuple, int] = None,
    paths_only: bool = False,
    use_pdftocairo: bool = False,
    timeout: int = None,
    hide_annotations: bool = False,
) -> List[Image.Image]:
    """Function wrapping pdftoppm and pdftocairo

    The requested page range is split as evenly as possible across
    ``thread_count`` poppler processes that run concurrently. Whatever the
    outcome (success, timeout, syntax error), every spawned process is
    terminated and reaped before this function returns, and a temporary
    output folder created on the caller's behalf is removed.

    :param pdf_path: Path to the PDF that you want to convert
    :type pdf_path: Union[str, PurePath]
    :param dpi: Image quality in DPI (default 200), defaults to 200
    :type dpi: int, optional
    :param output_folder: Write the resulting images to a folder (instead of directly in memory), defaults to None
    :type output_folder: Union[str, PurePath], optional
    :param first_page: First page to process, defaults to None
    :type first_page: int, optional
    :param last_page: Last page to process before stopping, defaults to None
    :type last_page: int, optional
    :param fmt: Output image format, defaults to "ppm"
    :type fmt: str, optional
    :param jpegopt: jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format), defaults to None
    :type jpegopt: Dict, optional
    :param thread_count: How many poppler processes we are allowed to spawn for processing, defaults to 1
    :type thread_count: int, optional
    :param userpw: PDF's password, defaults to None
    :type userpw: str, optional
    :param ownerpw: PDF's owner password, defaults to None
    :type ownerpw: str, optional
    :param use_cropbox: Use cropbox instead of mediabox, defaults to False
    :type use_cropbox: bool, optional
    :param strict: When a Syntax Error is thrown, it will be raised as an Exception, defaults to False
    :type strict: bool, optional
    :param transparent: Output with a transparent background instead of a white one, defaults to False
    :type transparent: bool, optional
    :param single_file: Uses the -singlefile option from pdftoppm/pdftocairo, defaults to False
    :type single_file: bool, optional
    :param output_file: What is the output filename or generator, defaults to uuid_generator()
    :type output_file: Any, optional
    :param poppler_path: Path to look for poppler binaries, defaults to None
    :type poppler_path: Union[str, PurePath], optional
    :param grayscale: Output grayscale image(s), defaults to False
    :type grayscale: bool, optional
    :param size: Size of the resulting image(s), uses the Pillow (width, height) standard, defaults to None
    :type size: Union[Tuple, int], optional
    :param paths_only: Don't load image(s), return paths instead (requires output_folder), defaults to False
    :type paths_only: bool, optional
    :param use_pdftocairo: Use pdftocairo instead of pdftoppm, may help performance, defaults to False
    :type use_pdftocairo: bool, optional
    :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None
    :type timeout: int, optional
    :param hide_annotations: Hide PDF annotations in the output, defaults to False
    :type hide_annotations: bool, optional
    :raises NotImplementedError: Raised when conflicting parameters are given (hide_annotations for pdftocairo)
    :raises PDFPopplerTimeoutError: Raised after the timeout for the image processing is exceeded
    :raises PDFSyntaxError: Raised if there is a syntax error in the PDF and strict=True
    :return: A list of Pillow images, one for each page between first_page and last_page
    :rtype: List[Image.Image]
    """
    # pdftocairo cannot emit ppm, so fall back to png when it is requested explicitly
    if use_pdftocairo and fmt == "ppm":
        fmt = "png"

    # We make sure that if passed arguments are Path objects, they're converted to strings
    if isinstance(pdf_path, PurePath):
        pdf_path = pdf_path.as_posix()

    if isinstance(output_folder, PurePath):
        output_folder = output_folder.as_posix()

    if isinstance(poppler_path, PurePath):
        poppler_path = poppler_path.as_posix()

    page_count = pdfinfo_from_path(
        pdf_path, userpw, ownerpw, poppler_path=poppler_path
    )["Pages"]

    # We start by getting the output format, the buffer processing function and if we need pdftocairo
    parsed_fmt, final_extension, parse_buffer_func, use_pdfcairo_format = _parse_format(
        fmt, grayscale
    )

    # We use pdftocairo if the format requires it OR we need a transparent output
    use_pdfcairo = (
        use_pdftocairo
        or use_pdfcairo_format
        or (transparent and parsed_fmt in TRANSPARENT_FILE_TYPES)
    )

    poppler_version_major, poppler_version_minor = _get_poppler_version(
        "pdftocairo" if use_pdfcairo else "pdftoppm", poppler_path=poppler_path
    )

    # Silently drop options that the installed poppler release is too old to understand
    if poppler_version_major == 0 and poppler_version_minor <= 57:
        jpegopt = None

    if poppler_version_major == 0 and poppler_version_minor <= 83:
        hide_annotations = False

    # If output_file isn't a generator, it will be turned into one
    if not isinstance(output_file, types.GeneratorType) and not isinstance(
        output_file, ThreadSafeGenerator
    ):
        if single_file:
            # A single output file cannot be shared by several processes
            output_file = iter([output_file])
            thread_count = 1
        else:
            output_file = counter_generator(output_file)

    if thread_count < 1:
        thread_count = 1

    if first_page is None or first_page < 1:
        first_page = 1

    if last_page is None or last_page > page_count:
        last_page = page_count

    if first_page > last_page:
        return []

    auto_temp_dir = False
    processes = []
    images = []

    try:
        # pdftocairo can only write to disk, so give it a scratch folder if needed
        if output_folder is None and use_pdfcairo:
            output_folder = tempfile.mkdtemp()
            auto_temp_dir = True

        if use_pdfcairo and hide_annotations:
            raise NotImplementedError(
                "Hide annotations flag not implemented in pdftocairo."
            )

        binary = _get_command_path(
            "pdftocairo" if use_pdfcairo else "pdftoppm", poppler_path
        )

        # Add poppler path to LD_LIBRARY_PATH (identical for every process, so built once)
        env = os.environ.copy()
        if poppler_path is not None:
            env["LD_LIBRARY_PATH"] = (
                poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")
            )

        is_windows = platform.system() == "Windows"

        # Recalculate page count based on first and last page
        page_count = last_page - first_page + 1

        if thread_count > page_count:
            thread_count = page_count

        # The first `reminder` processes each take one extra page
        reminder = page_count % thread_count
        current_page = first_page

        for _ in range(thread_count):
            thread_output_file = next(output_file)

            # Get the number of pages the process will be handling
            thread_page_count = page_count // thread_count + int(reminder > 0)

            # Build the command accordingly
            args = [binary] + _build_command(
                ["-r", str(dpi), pdf_path],
                output_folder,
                current_page,
                current_page + thread_page_count - 1,
                parsed_fmt,
                jpegopt,
                thread_output_file,
                userpw,
                ownerpw,
                use_cropbox,
                transparent,
                single_file,
                grayscale,
                size,
                hide_annotations,
            )

            # Update page values
            current_page = current_page + thread_page_count
            reminder -= int(reminder > 0)

            startupinfo = None
            if is_windows:
                # this startupinfo structure prevents a console window from popping up on Windows
                startupinfo = subprocess.STARTUPINFO()
                startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW

            # Spawn the process and remember which output name it writes to
            processes.append(
                (
                    thread_output_file,
                    Popen(
                        args, env=env, stdout=PIPE, stderr=PIPE, startupinfo=startupinfo
                    ),
                )
            )

        for uid, proc in processes:
            try:
                data, err = proc.communicate(timeout=timeout)
            except TimeoutExpired as exc:
                proc.kill()
                # Reap the killed process and drain its pipes
                proc.communicate()
                raise PDFPopplerTimeoutError("Run poppler timeout.") from exc

            if b"Syntax Error" in err and strict:
                raise PDFSyntaxError(err.decode("utf8", "ignore"))

            if output_folder is not None:
                images += _load_from_output_folder(
                    output_folder,
                    uid,
                    final_extension,
                    paths_only,
                    in_memory=auto_temp_dir,
                )
            else:
                images += parse_buffer_func(data)
    finally:
        # On an error path some processes may still be running: stop and reap
        # them so they neither leak nor keep writing into a folder being removed.
        for _, proc in processes:
            if proc.poll() is None:
                proc.kill()
                proc.communicate()

        if auto_temp_dir:
            shutil.rmtree(output_folder)

    return images
def convert_from_bytes(
    pdf_file: bytes,
    dpi: int = 200,
    output_folder: Union[str, PurePath] = None,
    first_page: int = None,
    last_page: int = None,
    fmt: str = "ppm",
    jpegopt: Dict = None,
    thread_count: int = 1,
    userpw: str = None,
    ownerpw: str = None,
    use_cropbox: bool = False,
    strict: bool = False,
    transparent: bool = False,
    single_file: bool = False,
    output_file: Any = uuid_generator(),
    poppler_path: Union[str, PurePath] = None,
    grayscale: bool = False,
    size: Union[Tuple, int] = None,
    paths_only: bool = False,
    use_pdftocairo: bool = False,
    timeout: int = None,
    hide_annotations: bool = False,
) -> List[Image.Image]:
    """Function wrapping pdftoppm and pdftocairo.

    The bytes are written to a temporary file which is then handed to
    :func:`convert_from_path`; the temporary file is removed afterwards.

    :param pdf_file: Bytes of the PDF that you want to convert
    :type pdf_file: bytes
    :param dpi: Image quality in DPI (default 200), defaults to 200
    :type dpi: int, optional
    :param output_folder: Write the resulting images to a folder (instead of directly in memory), defaults to None
    :type output_folder: Union[str, PurePath], optional
    :param first_page: First page to process, defaults to None
    :type first_page: int, optional
    :param last_page: Last page to process before stopping, defaults to None
    :type last_page: int, optional
    :param fmt: Output image format, defaults to "ppm"
    :type fmt: str, optional
    :param jpegopt: jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format), defaults to None
    :type jpegopt: Dict, optional
    :param thread_count: How many threads we are allowed to spawn for processing, defaults to 1
    :type thread_count: int, optional
    :param userpw: PDF's password, defaults to None
    :type userpw: str, optional
    :param ownerpw: PDF's owner password, defaults to None
    :type ownerpw: str, optional
    :param use_cropbox: Use cropbox instead of mediabox, defaults to False
    :type use_cropbox: bool, optional
    :param strict: When a Syntax Error is thrown, it will be raised as an Exception, defaults to False
    :type strict: bool, optional
    :param transparent: Output with a transparent background instead of a white one, defaults to False
    :type transparent: bool, optional
    :param single_file: Uses the -singlefile option from pdftoppm/pdftocairo, defaults to False
    :type single_file: bool, optional
    :param output_file: What is the output filename or generator, defaults to uuid_generator()
    :type output_file: Any, optional
    :param poppler_path: Path to look for poppler binaries, defaults to None
    :type poppler_path: Union[str, PurePath], optional
    :param grayscale: Output grayscale image(s), defaults to False
    :type grayscale: bool, optional
    :param size: Size of the resulting image(s), uses the Pillow (width, height) standard, defaults to None
    :type size: Union[Tuple, int], optional
    :param paths_only: Don't load image(s), return paths instead (requires output_folder), defaults to False
    :type paths_only: bool, optional
    :param use_pdftocairo: Use pdftocairo instead of pdftoppm, may help performance, defaults to False
    :type use_pdftocairo: bool, optional
    :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None
    :type timeout: int, optional
    :param hide_annotations: Hide PDF annotations in the output, defaults to False
    :type hide_annotations: bool, optional
    :raises NotImplementedError: Raised when conflicting parameters are given (hide_annotations for pdftocairo)
    :raises PDFPopplerTimeoutError: Raised after the timeout for the image processing is exceeded
    :raises PDFSyntaxError: Raised if there is a syntax error in the PDF and strict=True
    :return: A list of Pillow images, one for each page between first_page and last_page
    :rtype: List[Image.Image]
    """
    fh, temp_filename = tempfile.mkstemp()
    try:
        # Write through the descriptor mkstemp already opened and close it
        # before poppler reads the file, so no second handle is needed and
        # the data is fully on disk when the conversion starts.
        with os.fdopen(fh, "wb") as f:
            f.write(pdf_file)

        return convert_from_path(
            temp_filename,
            dpi=dpi,
            output_folder=output_folder,
            first_page=first_page,
            last_page=last_page,
            fmt=fmt,
            jpegopt=jpegopt,
            thread_count=thread_count,
            userpw=userpw,
            ownerpw=ownerpw,
            use_cropbox=use_cropbox,
            strict=strict,
            transparent=transparent,
            single_file=single_file,
            output_file=output_file,
            poppler_path=poppler_path,
            grayscale=grayscale,
            size=size,
            paths_only=paths_only,
            use_pdftocairo=use_pdftocairo,
            timeout=timeout,
            hide_annotations=hide_annotations,
        )
    finally:
        os.remove(temp_filename)
def _build_command(
    args: List,
    output_folder: str,
    first_page: int,
    last_page: int,
    fmt: str,
    jpegopt: Dict,
    output_file: str,
    userpw: str,
    ownerpw: str,
    use_cropbox: bool,
    transparent: bool,
    single_file: bool,
    grayscale: bool,
    size: Union[int, Tuple[int, int]],
    hide_annotations: bool,
) -> List[str]:
    """Build the pdftoppm/pdftocairo argument list (without the executable).

    :param args: Leading arguments; they are copied, the caller's list is left untouched
    :param output_folder: Folder for the output files, or None to not pass an output path
    :param first_page: First page to process (``-f``), skipped when None
    :param last_page: Last page to process (``-l``), skipped when None
    :param fmt: Parsed output format; anything other than "pgm"/"ppm" is passed as ``-<fmt>``
    :param jpegopt: jpeg options, only used for the jpeg format
    :param output_file: Output file name, joined to output_folder
    :param userpw: PDF's user password (``-upw``), skipped when None
    :param ownerpw: PDF's owner password (``-opw``), skipped when None
    :param use_cropbox: Add ``-cropbox``
    :param transparent: Add ``-transp`` when fmt is one of TRANSPARENT_FILE_TYPES
    :param single_file: Add ``-singlefile``
    :param grayscale: Add ``-gray``
    :param size: None, a number (``-scale-to``), a 1-tuple (``-scale-to``) or a
        (width, height) tuple where a None entry is passed as -1
    :param hide_annotations: Add ``-hide-annotations``
    :raises ValueError: If size is none of the accepted shapes
    :return: A new list holding the complete argument list
    """
    # Work on a copy so the caller's list is never modified behind its back
    command = list(args)

    if use_cropbox:
        command.append("-cropbox")

    if hide_annotations:
        command.append("-hide-annotations")

    if transparent and fmt in TRANSPARENT_FILE_TYPES:
        command.append("-transp")

    if first_page is not None:
        command.extend(["-f", str(first_page)])

    if last_page is not None:
        command.extend(["-l", str(last_page)])

    # No format flag is passed for ppm/pgm
    if fmt not in ["pgm", "ppm"]:
        command.append("-" + fmt)

    if fmt in ["jpeg", "jpg"] and jpegopt:
        command.extend(["-jpegopt", _parse_jpegopt(jpegopt)])

    if single_file:
        command.append("-singlefile")

    if output_folder is not None:
        command.append(os.path.join(output_folder, output_file))

    if userpw is not None:
        command.extend(["-upw", userpw])

    if ownerpw is not None:
        command.extend(["-opw", ownerpw])

    if grayscale:
        command.append("-gray")

    if size is None:
        pass
    elif isinstance(size, tuple) and len(size) == 2:
        # A None dimension is passed as -1
        width, height = size
        command.extend(["-scale-to-x", str(int(width)) if width is not None else str(-1)])
        command.extend(["-scale-to-y", str(int(height)) if height is not None else str(-1)])
    elif isinstance(size, tuple) and len(size) == 1:
        command.extend(["-scale-to", str(int(size[0]))])
    elif isinstance(size, (int, float)):
        command.extend(["-scale-to", str(int(size))])
    else:
        raise ValueError(f"Size {size} is not a tuple or an integer")

    return command
def _parse_format(fmt: str, grayscale: bool = False) -> Tuple[str, str, Callable, bool]:
    """Normalise a user supplied image format.

    The format is matched case-insensitively and one leading dot is ignored.
    Unknown formats, including the empty string, fall back to ppm.

    :param fmt: Requested format, e.g. "jpg", ".PNG", "tiff"
    :param grayscale: When True, "ppm" is turned into "pgm"
    :return: Tuple of (format passed to poppler, output file extension,
        buffer parsing function or None, whether pdftocairo is required)
    """
    fmt = fmt.lower()
    # startswith() rather than fmt[0] so an empty string does not raise IndexError
    if fmt.startswith("."):
        fmt = fmt[1:]
    if fmt in ("jpeg", "jpg"):
        return "jpeg", "jpg", parse_buffer_to_jpeg, False
    if fmt == "png":
        return "png", "png", parse_buffer_to_png, False
    if fmt in ("tif", "tiff"):
        # No buffer parser is provided for tiff; pdftocairo is flagged as required
        return "tiff", "tif", None, True
    if fmt == "ppm" and grayscale:
        return "pgm", "pgm", parse_buffer_to_pgm, False
    # Unable to parse the format so we'll use the default
    return "ppm", "ppm", parse_buffer_to_ppm, False
def _parse_jpegopt(jpegopt: Dict) -> str:
    """Serialise jpeg options as a comma separated ``key=value`` string.

    The booleans True and False are rendered as "y" and "n"; every other
    value is formatted as is.
    """

    def _render(value):
        # Identity checks on purpose: only real booleans are translated
        if value is True:
            return "y"
        if value is False:
            return "n"
        return value

    return ",".join(
        "{}={}".format(name, _render(value)) for name, value in jpegopt.items()
    )
def _get_command_path(command: str, poppler_path: str = None) -> str:
    """Return the name or path used to invoke a poppler tool.

    ".exe" is appended on Windows, and the result is joined to poppler_path
    when one is given.
    """
    executable = command + ".exe" if platform.system() == "Windows" else command
    if poppler_path is None:
        return executable
    return os.path.join(poppler_path, executable)
def _get_poppler_version(
    command: str, poppler_path: str = None, timeout: int = None
) -> Tuple[int, int]:
    """Return the (major, minor) version reported by a poppler tool.

    The tool is run with ``-v`` and the version is read from the last
    space-separated token of the first line written to stderr.

    :param command: Name of the poppler tool, e.g. "pdftoppm"
    :param poppler_path: Path to look for poppler binaries, defaults to None
    :param timeout: Seconds to wait for the tool before giving up, defaults to None
    :raises PDFPopplerTimeoutError: Raised when the tool does not finish within timeout
    :return: (major, minor), or (0, 17) when the output cannot be parsed
    """
    argv = [_get_command_path(command, poppler_path), "-v"]

    # Add poppler path to LD_LIBRARY_PATH
    env = os.environ.copy()
    if poppler_path is not None:
        env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")
    proc = Popen(argv, env=env, stdout=PIPE, stderr=PIPE)

    try:
        _, err = proc.communicate(timeout=timeout)
    except TimeoutExpired as exc:
        proc.kill()
        # Reap the killed process and drain its pipes
        proc.communicate()
        raise PDFPopplerTimeoutError("Run poppler timeout.") from exc

    try:
        # TODO: Make this more robust
        version = err.decode("utf8", "ignore").split("\n")[0].split(" ")[-1].split(".")
        return int(version[0]), int(version[1])
    except (IndexError, ValueError):
        # Only parsing failures are expected here; anything else should propagate.
        # Lowest version that includes pdftocairo (2011)
        return 0, 17
def pdfinfo_from_path(
    pdf_path: str,
    userpw: str = None,
    ownerpw: str = None,
    poppler_path: str = None,
    rawdates: bool = False,
    timeout: int = None,
    first_page: int = None,
    last_page: int = None,
) -> Dict:
    """Function wrapping poppler's pdfinfo utility and returns the result as a dictionary.

    Each ``key: value`` line printed by pdfinfo becomes one dictionary entry;
    fields listed in PDFINFO_CONVERT_TO_INT are converted to int.

    :param pdf_path: Path to the PDF that you want to convert
    :type pdf_path: Union[str, PurePath]
    :param userpw: PDF's password, defaults to None
    :type userpw: str, optional
    :param ownerpw: PDF's owner password, defaults to None
    :type ownerpw: str, optional
    :param poppler_path: Path to look for poppler binaries, defaults to None
    :type poppler_path: Union[str, PurePath], optional
    :param rawdates: Return the undecoded data strings, defaults to False
    :type rawdates: bool, optional
    :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None
    :type timeout: int, optional
    :param first_page: First page to process, defaults to None
    :type first_page: int, optional
    :param last_page: Last page to process before stopping, defaults to None
    :type last_page: int, optional
    :raises PDFPopplerTimeoutError: Raised after the timeout for the image processing is exceeded
    :raises PDFInfoNotInstalledError: Raised if pdfinfo is not installed
    :raises PDFPageCountError: Raised if the output could not be parsed
    :return: Dictionary containing various information on the PDF
    :rtype: Dict
    """
    # Path objects are documented as accepted; the string concatenation below
    # needs plain strings.
    if isinstance(pdf_path, PurePath):
        pdf_path = pdf_path.as_posix()
    if isinstance(poppler_path, PurePath):
        poppler_path = poppler_path.as_posix()

    # Bound up front so the ValueError handler can always format it, even if
    # the failure happens before pdfinfo produced any output.
    err = b""

    try:
        command = [_get_command_path("pdfinfo", poppler_path), pdf_path]

        if userpw is not None:
            command.extend(["-upw", userpw])

        if ownerpw is not None:
            command.extend(["-opw", ownerpw])

        if rawdates:
            command.extend(["-rawdates"])

        if first_page:
            command.extend(["-f", str(first_page)])

        if last_page:
            command.extend(["-l", str(last_page)])

        # Add poppler path to LD_LIBRARY_PATH
        env = os.environ.copy()
        if poppler_path is not None:
            env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")
        proc = Popen(command, env=env, stdout=PIPE, stderr=PIPE)

        try:
            out, err = proc.communicate(timeout=timeout)
        except TimeoutExpired as exc:
            proc.kill()
            # Reap the killed process and drain its pipes
            proc.communicate()
            raise PDFPopplerTimeoutError("Run poppler timeout.") from exc

        d = {}
        for field in out.decode("utf8", "ignore").split("\n"):
            # Split on the first colon only: values (e.g. dates) may contain colons
            key, _, value = field.partition(":")
            if key != "":
                d[key] = (
                    int(value.strip())
                    if key in PDFINFO_CONVERT_TO_INT
                    else value.strip()
                )

        if "Pages" not in d:
            # Handled below together with failed int conversions
            raise ValueError

        return d

    except OSError as exc:
        raise PDFInfoNotInstalledError(
            "Unable to get page count. Is poppler installed and in PATH?"
        ) from exc
    except ValueError as exc:
        raise PDFPageCountError(
            f"Unable to get page count.\n{err.decode('utf8', 'ignore')}"
        ) from exc
def pdfinfo_from_bytes(
    pdf_bytes: bytes,
    userpw: str = None,
    ownerpw: str = None,
    poppler_path: str = None,
    rawdates: bool = False,
    timeout: int = None,
    first_page: int = None,
    last_page: int = None,
) -> Dict:
    """Function wrapping poppler's pdfinfo utility and returns the result as a dictionary.

    The bytes are written to a temporary file which is then handed to
    :func:`pdfinfo_from_path`; the temporary file is removed afterwards.

    :param pdf_bytes: Bytes of the PDF that you want to convert
    :type pdf_bytes: bytes
    :param userpw: PDF's password, defaults to None
    :type userpw: str, optional
    :param ownerpw: PDF's owner password, defaults to None
    :type ownerpw: str, optional
    :param poppler_path: Path to look for poppler binaries, defaults to None
    :type poppler_path: Union[str, PurePath], optional
    :param rawdates: Return the undecoded data strings, defaults to False
    :type rawdates: bool, optional
    :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None
    :type timeout: int, optional
    :param first_page: First page to process, defaults to None
    :type first_page: int, optional
    :param last_page: Last page to process before stopping, defaults to None
    :type last_page: int, optional
    :raises PDFPopplerTimeoutError: Raised after the timeout for the image processing is exceeded
    :raises PDFInfoNotInstalledError: Raised if pdfinfo is not installed
    :raises PDFPageCountError: Raised if the output could not be parsed
    :return: Dictionary containing various information on the PDF
    :rtype: Dict
    """
    fh, temp_filename = tempfile.mkstemp()
    try:
        # Write through the descriptor mkstemp already opened and close it
        # before pdfinfo reads the file, so no second handle is needed and
        # the data is fully on disk when pdfinfo starts.
        with os.fdopen(fh, "wb") as f:
            f.write(pdf_bytes)

        return pdfinfo_from_path(
            temp_filename,
            userpw=userpw,
            ownerpw=ownerpw,
            poppler_path=poppler_path,
            rawdates=rawdates,
            timeout=timeout,
            first_page=first_page,
            last_page=last_page,
        )
    finally:
        os.remove(temp_filename)
def _load_from_output_folder(
    output_folder: str,
    output_file: str,
    ext: str,
    paths_only: bool,
    in_memory: bool = False,
) -> List[Image.Image]:
    """Collect the files in output_folder that belong to one poppler run.

    A file is selected when its name starts with output_file and the text
    after its last dot equals ext. Files are visited in sorted name order.

    :param output_folder: Folder to scan
    :param output_file: File name prefix to match
    :param ext: Extension (without dot) to match
    :param paths_only: Return the file paths instead of opening the images
    :param in_memory: Call ``load()`` on each opened image right away
    :return: List of paths (paths_only) or of opened Pillow images
    """
    collected = []
    for entry in sorted(os.listdir(output_folder)):
        if not entry.startswith(output_file):
            continue
        if entry.split(".")[-1] != ext:
            continue

        full_path = os.path.join(output_folder, entry)
        if paths_only:
            collected.append(full_path)
            continue

        collected.append(Image.open(full_path))
        if in_memory:
            # Read the pixel data now rather than lazily on first access
            collected[-1].load()
    return collected
|