blob: 68a3bdc6efb39cfc298eb4653903ab89fac454b9 (
about) (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
from io import BytesIO
from typing import AsyncGenerator
from r2r.base.abstractions.document import DataType
from r2r.base.parsers.base_parser import AsyncParser
class XLSXParser(AsyncParser[DataType]):
    """A parser for XLSX data.

    Every row of every worksheet is emitted as a single string: the row's
    cell values are converted with ``str`` and joined with ``", "``.
    """

    def __init__(self):
        """Bind ``openpyxl.load_workbook`` to the instance.

        The import is performed here rather than at module level, so a
        missing ``openpyxl`` is reported when the parser is constructed.

        Raises:
            ValueError: If ``openpyxl`` cannot be imported.
        """
        try:
            from openpyxl import load_workbook
        except ImportError as exc:
            # Chain explicitly so the underlying import failure is kept as
            # the direct cause of the ValueError seen by the caller.
            raise ValueError(
                "Error, `openpyxl` is required to run `XLSXParser`. Please install it using `pip install openpyxl`."
            ) from exc
        self.load_workbook = load_workbook

    async def ingest(self, data: bytes) -> AsyncGenerator[str, None]:
        """Ingest XLSX data and yield text from each row.

        Args:
            data: Raw bytes of the XLSX file.

        Yields:
            One string per row, iterating ``wb.worksheets`` and then each
            sheet's rows; the row's values are joined with ``", "``.

        Raises:
            ValueError: If ``data`` is a ``str`` instead of bytes.
        """
        if isinstance(data, str):
            raise ValueError("XLSX data must be in bytes format.")

        # The loader is handed a file-like object, so wrap the raw bytes
        # in an in-memory buffer.
        wb = self.load_workbook(filename=BytesIO(data))
        for sheet in wb.worksheets:
            # values_only=True makes each row a sequence of plain values.
            # NOTE(review): empty cells presumably come back as None and are
            # therefore rendered as the text "None" — confirm this is intended.
            for row in sheet.iter_rows(values_only=True):
                yield ", ".join(map(str, row))
|