aboutsummaryrefslogtreecommitdiff
path: root/.venv/lib/python3.12/site-packages/core/parsers/media/odt_parser.py
blob: cb1464649507ce7040dcb4d1b7e2b2a4a9d961ab (about) (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# type: ignore
import xml.etree.ElementTree as ET
import zipfile
from typing import AsyncGenerator

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
    CompletionProvider,
    DatabaseProvider,
    IngestionConfig,
)


class ODTParser(AsyncParser[str | bytes]):
    """Parser that extracts plain text from OpenDocument Text (ODT) data.

    The input is treated as a zip archive whose ``content.xml`` member holds
    the document body. Every non-empty ``text:p`` (paragraph) and ``text:h``
    (heading) element is emitted as one whitespace-stripped string.
    """

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: DatabaseProvider,
        llm_provider: CompletionProvider,
    ):
        """Store the configuration and providers on the instance.

        Args:
            config: Ingestion configuration; stored, not read by ``ingest``.
            database_provider: Stored, not read by ``ingest``.
            llm_provider: Stored, not read by ``ingest``.
        """
        self.database_provider = database_provider
        self.llm_provider = llm_provider
        self.config = config
        # Module handles are kept as attributes; ``ingest`` reaches the
        # zip and XML machinery through them.
        self.zipfile = zipfile
        self.ET = ET

    async def ingest(
        self, data: str | bytes, **kwargs
    ) -> AsyncGenerator[str, None]:
        """Yield the text of each paragraph and heading in document order.

        Args:
            data: Raw bytes of the ODT archive. ``str`` input is rejected.
            **kwargs: Accepted for interface compatibility; not used here.

        Yields:
            The stripped text of each ``text:p`` / ``text:h`` element that
            contains non-whitespace text, in the order the elements appear
            in ``content.xml``.

        Raises:
            ValueError: If ``data`` is a ``str``, or if any ``Exception``
                occurs while the archive is open (bad zip, missing
                ``content.xml``, malformed XML, ...). The original exception
                is chained as ``__cause__``.
        """
        if isinstance(data, str):
            raise ValueError("ODT data must be in bytes format.")

        from io import BytesIO

        file_obj = BytesIO(data)

        # ElementTree exposes qualified tags as "{namespace-uri}localname".
        text_ns = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
        wanted_tags = {f"{{{text_ns}}}p", f"{{{text_ns}}}h"}

        try:
            with self.zipfile.ZipFile(file_obj) as odt:
                # ODT files are zip archives containing content.xml
                content = odt.read("content.xml")
                root = self.ET.fromstring(content)

                # A single document-order walk keeps each heading next to
                # the paragraphs that follow it. Two separate passes (all
                # paragraphs, then all headings) would emit the same chunks
                # but lose the reading order.
                for element in root.iter():
                    # Only descendants are considered; the root element
                    # itself is never emitted.
                    if element is root or element.tag not in wanted_tags:
                        continue
                    text = "".join(element.itertext()).strip()
                    if text:
                        yield text

        except Exception as e:
            raise ValueError(f"Error processing ODT file: {str(e)}") from e
        finally:
            file_obj.close()