blob: 8685c8fb365407b38b5fabb9575b9d4f034de962 (
about) (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
|
# type: ignore
from io import BytesIO
from typing import AsyncGenerator
from pptx import Presentation
from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
DatabaseProvider,
IngestionConfig,
)
class PPTXParser(AsyncParser[str | bytes]):
    """Parser that pulls the text of slide shapes out of PowerPoint data."""

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: DatabaseProvider,
        llm_provider: CompletionProvider,
    ):
        """Keep references to the ingestion config and the two providers.

        Args:
            config: Ingestion configuration, stored as ``self.config``.
            database_provider: Stored as ``self.database_provider``.
            llm_provider: Stored as ``self.llm_provider``.
        """
        # The python-pptx constructor is exposed on the instance; ingest()
        # builds the presentation through this attribute.
        self.Presentation = Presentation
        self.config = config
        self.llm_provider = llm_provider
        self.database_provider = database_provider

    async def ingest(
        self, data: str | bytes, **kwargs
    ) -> AsyncGenerator[str, None]:  # type: ignore
        """Yield the text of every text-bearing shape, slide by slide.

        Slides are visited in presentation order and shapes in the order
        each slide reports them. Shapes without a ``text`` attribute are
        skipped; the text value itself is yielded unmodified.

        Args:
            data: Raw presentation content as bytes.
            **kwargs: Accepted but not used by this parser.

        Yields:
            The ``text`` attribute of each shape that has one.

        Raises:
            ValueError: If ``data`` is a ``str`` rather than bytes.
        """
        if isinstance(data, str):
            raise ValueError("PPT data must be in bytes format.")

        # Wrap the bytes in an in-memory file object for the loader.
        presentation = self.Presentation(BytesIO(data))

        # Flatten slides -> shapes into one lazy stream.
        all_shapes = (
            candidate
            for slide in presentation.slides
            for candidate in slide.shapes
        )
        for shape in all_shapes:
            if not hasattr(shape, "text"):
                continue
            yield shape.text
|