blob: 9edced810120fc90862228f2a59f6c9570fd3df9 (
about) (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
from io import BytesIO
from typing import AsyncGenerator
from r2r.base.abstractions.document import DataType
from r2r.base.parsers.base_parser import AsyncParser
class DOCXParser(AsyncParser[DataType]):
    """A parser for DOCX data.

    The `python-docx` dependency is imported when the parser is constructed,
    and its `Document` factory is stored on the instance for use by `ingest`.
    """

    def __init__(self):
        """Import `python-docx` and keep a reference to its `Document` factory.

        Raises:
            ValueError: If `python-docx` cannot be imported. The original
                `ImportError` is attached as the cause.
        """
        # Keep the try body limited to the import, the only statement here
        # that can raise ImportError.
        try:
            from docx import Document
        except ImportError as err:
            # Chain the cause so the underlying import failure stays visible
            # in the traceback; the exception type callers see is unchanged.
            raise ValueError(
                "Error, `python-docx` is required to run `DOCXParser`. Please install it using `pip install python-docx`."
            ) from err
        self.Document = Document

    async def ingest(self, data: DataType) -> AsyncGenerator[str, None]:
        """Ingest DOCX data and yield text from each paragraph.

        Args:
            data: The raw DOCX content. It is wrapped in a `BytesIO` before
                being handed to `Document`, so `str` input is rejected.

        Yields:
            The `text` of each entry in the document's `paragraphs`, in order.

        Raises:
            ValueError: If `data` is a `str`.
        """
        if isinstance(data, str):
            raise ValueError("DOCX data must be in bytes format.")

        # Wrap the payload in an in-memory binary stream for `Document`.
        doc = self.Document(BytesIO(data))
        for paragraph in doc.paragraphs:
            yield paragraph.text
|