import string
from io import BytesIO
from typing import AsyncGenerator

from r2r.base.abstractions.document import DataType
from r2r.base.parsers.base_parser import AsyncParser

# Built once at import time so the per-character membership test in
# `PDFParser.ingest` is an O(1) set lookup instead of a linear scan of
# `string.printable` for every character of every page.
_PRINTABLE_CHARS = frozenset(string.printable)


class PDFParser(AsyncParser[DataType]):
    """A parser for PDF data.

    Uses `pypdf` to read a PDF from an in-memory bytes payload and yields
    the extracted text one page at a time.
    """

    def __init__(self):
        """Bind `pypdf.PdfReader` to the instance.

        The import is done lazily here so that `pypdf` is only required
        when this parser is actually instantiated.

        Raises:
            ValueError: If `pypdf` is not installed. The underlying
                `ImportError` is chained as the cause.
        """
        try:
            from pypdf import PdfReader
        except ImportError as err:
            raise ValueError(
                "Error, `pypdf` is required to run `PDFParser`. Please install it using `pip install pypdf`."
            ) from err
        self.PdfReader = PdfReader

    async def ingest(self, data: DataType) -> AsyncGenerator[str, None]:
        """Ingest PDF data and yield text from each page.

        Args:
            data: The raw PDF content. It is wrapped in a `BytesIO`, so it
                must be a bytes-like object rather than a `str`.

        Yields:
            The text extracted from each page, in page order, restricted
            to the characters in `string.printable`. Note that this drops
            every non-ASCII character from the extracted text. Pages for
            which `extract_text()` returns `None` are skipped so that only
            `str` values are ever yielded.

        Raises:
            ValueError: If `data` is a `str`.
        """
        if isinstance(data, str):
            raise ValueError("PDF data must be in bytes format.")

        pdf = self.PdfReader(BytesIO(data))
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text is None:
                # Nothing extractable on this page; never yield None from
                # a generator annotated as producing `str`.
                continue
            yield "".join(ch for ch in page_text if ch in _PRINTABLE_CHARS)