blob: 988f8341abd992e8b0a071ebdcee61762c0dad5a (
about) (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
|
# type: ignore
from io import BytesIO
from typing import AsyncGenerator
from docx import Document
from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
DatabaseProvider,
IngestionConfig,
)
class DOCXParser(AsyncParser[str | bytes]):
    """Parser that extracts paragraph text from DOCX content."""

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: DatabaseProvider,
        llm_provider: CompletionProvider,
    ):
        """Store the ingestion config and the provider handles.

        Args:
            config: Ingestion configuration; kept on ``self.config``.
            database_provider: Kept on ``self.database_provider``.
            llm_provider: Kept on ``self.llm_provider``.
        """
        self.config = config
        self.llm_provider = llm_provider
        self.database_provider = database_provider
        # The document factory is held on the instance and is what
        # ``ingest`` calls to open the DOCX payload.
        self.Document = Document

    async def ingest(
        self, data: str | bytes, *args, **kwargs
    ) -> AsyncGenerator[str, None]:  # type: ignore
        """Yield the text of every paragraph found in a DOCX payload.

        Only the document's ``paragraphs`` collection is walked; each
        item's ``text`` is yielded in iteration order.

        Args:
            data: Raw DOCX content. ``str`` input is rejected.
            *args: Accepted but unused by this method.
            **kwargs: Accepted but unused by this method.

        Yields:
            The ``text`` attribute of each paragraph.

        Raises:
            ValueError: If ``data`` is a ``str``. Because this is an async
                generator, the error surfaces on first iteration rather
                than at call time.
        """
        if isinstance(data, str):
            raise ValueError("DOCX data must be in bytes format.")

        # Wrap the payload in an in-memory binary stream before handing it
        # to the document factory.
        stream = BytesIO(data)
        document = self.Document(stream)

        for para in document.paragraphs:
            yield para.text
|