import string
from io import BytesIO
from typing import AsyncGenerator
from r2r.base.abstractions.document import DataType
from r2r.base.parsers.base_parser import AsyncParser
class PDFParser(AsyncParser[DataType]):
    """A parser for PDF data that yields the text of each page.

    `pypdf` is imported inside the constructor rather than at module level,
    so a missing installation surfaces when a parser is instantiated.
    """

    # Built once: set membership is O(1), whereas `ch in string.printable`
    # scans the whole printable string for every character tested.
    _PRINTABLE_CHARS = frozenset(string.printable)

    def __init__(self):
        """Bind `pypdf.PdfReader` to the instance as `self.PdfReader`.

        Raises:
            ValueError: If `pypdf` cannot be imported. The original
                `ImportError` is attached as the exception's cause.
        """
        try:
            from pypdf import PdfReader
        except ImportError as err:
            # Keep ValueError (callers may catch it) but chain the cause so
            # the underlying import failure stays visible in the traceback.
            raise ValueError(
                "Error, `pypdf` is required to run `PyPDFParser`. Please install it using `pip install pypdf`."
            ) from err
        self.PdfReader = PdfReader

    async def ingest(self, data: DataType) -> AsyncGenerator[str, None]:
        """Ingest PDF data and yield text from each page.

        Args:
            data: The raw PDF content. It is wrapped in `BytesIO`, so it
                must be bytes-like; `str` input is rejected.

        Yields:
            One string per page, reduced to the characters found in
            `string.printable`. Pages for which `extract_text()` returns
            `None` are skipped so that only `str` values are yielded, as
            the return annotation promises.

        Raises:
            ValueError: If `data` is a `str`.
        """
        if isinstance(data, str):
            raise ValueError("PDF data must be in bytes format.")

        printable = self._PRINTABLE_CHARS
        pdf = self.PdfReader(BytesIO(data))
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text is None:
                continue
            # Drop every character outside string.printable (this removes
            # all non-ASCII text as well as control characters).
            yield "".join(ch for ch in page_text if ch in printable)