aboutsummaryrefslogtreecommitdiff
path: root/R2R/r2r/parsers/media/docx_parser.py
blob: 9edced810120fc90862228f2a59f6c9570fd3df9 (about) (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from io import BytesIO
from typing import AsyncGenerator

from r2r.base.abstractions.document import DataType
from r2r.base.parsers.base_parser import AsyncParser


class DOCXParser(AsyncParser[DataType]):
    """A parser for DOCX data.

    The `python-docx` dependency is imported when the parser is
    instantiated rather than at module import time, and the resulting
    `Document` factory is kept on the instance for use by `ingest`.
    """

    def __init__(self):
        """Import `python-docx` and store its `Document` factory.

        Raises:
            ValueError: If `python-docx` cannot be imported. The original
                `ImportError` is attached as the exception's cause.
        """
        try:
            from docx import Document
        except ImportError as err:
            # Keep raising ValueError (callers may already catch it), but
            # chain the ImportError so the traceback shows the real cause.
            raise ValueError(
                "Error, `python-docx` is required to run `DOCXParser`. Please install it using `pip install python-docx`."
            ) from err

        self.Document = Document

    async def ingest(self, data: DataType) -> AsyncGenerator[str, None]:
        """Ingest DOCX data and yield text from each paragraph.

        Args:
            data: The raw DOCX content. It is wrapped in a `BytesIO`
                before being handed to `Document`, so it must not be a
                `str`.

        Yields:
            The `text` of each entry in the document's `paragraphs`, in
            iteration order.

        Raises:
            ValueError: If `data` is a `str`.
        """
        if isinstance(data, str):
            raise ValueError("DOCX data must be in bytes format.")

        doc = self.Document(BytesIO(data))
        # Only `doc.paragraphs` is iterated here; nothing else from the
        # document object is read.
        for paragraph in doc.paragraphs:
            yield paragraph.text