diff options
author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
---|---|---|
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /R2R/r2r/parsers/structured | |
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
download | gn-ai-master.tar.gz |
Diffstat (limited to 'R2R/r2r/parsers/structured')
-rwxr-xr-x | R2R/r2r/parsers/structured/__init__.py | 0 | ||||
-rwxr-xr-x | R2R/r2r/parsers/structured/csv_parser.py | 25 | ||||
-rwxr-xr-x | R2R/r2r/parsers/structured/json_parser.py | 49 | ||||
-rwxr-xr-x | R2R/r2r/parsers/structured/xlsx_parser.py | 29 |
4 files changed, 103 insertions, 0 deletions
```diff
diff --git a/R2R/r2r/parsers/structured/__init__.py b/R2R/r2r/parsers/structured/__init__.py
new file mode 100755
index 00000000..e69de29b
--- /dev/null
+++ b/R2R/r2r/parsers/structured/__init__.py
diff --git a/R2R/r2r/parsers/structured/csv_parser.py b/R2R/r2r/parsers/structured/csv_parser.py
new file mode 100755
index 00000000..343d9fbf
--- /dev/null
+++ b/R2R/r2r/parsers/structured/csv_parser.py
@@ -0,0 +1,25 @@
+from typing import AsyncGenerator, Union
+
+from r2r.base.abstractions.document import DataType
+from r2r.base.parsers.base_parser import AsyncParser
+
+
+class CSVParser(AsyncParser[DataType]):
+    """A parser for CSV data."""
+
+    def __init__(self):
+        import csv
+        from io import StringIO
+
+        self.csv = csv
+        self.StringIO = StringIO
+
+    async def ingest(
+        self, data: Union[str, bytes]
+    ) -> AsyncGenerator[str, None]:
+        """Ingest CSV data and yield text from each row."""
+        if isinstance(data, bytes):
+            data = data.decode("utf-8")
+        csv_reader = self.csv.reader(self.StringIO(data))
+        for row in csv_reader:
+            yield ", ".join(row)
diff --git a/R2R/r2r/parsers/structured/json_parser.py b/R2R/r2r/parsers/structured/json_parser.py
new file mode 100755
index 00000000..23d63065
--- /dev/null
+++ b/R2R/r2r/parsers/structured/json_parser.py
@@ -0,0 +1,49 @@
+import json
+from typing import AsyncGenerator
+
+from r2r.base.abstractions.document import DataType
+from r2r.base.parsers.base_parser import AsyncParser
+
+
+class JSONParser(AsyncParser[DataType]):
+    """A parser for JSON data."""
+
+    async def ingest(self, data: DataType) -> AsyncGenerator[str, None]:
+        """Ingest JSON data and yield a formatted text representation."""
+        if isinstance(data, bytes):
+            data = data.decode("utf-8")
+        yield self._parse_json(json.loads(data))
+
+    def _parse_json(self, data: dict) -> str:
+        def remove_objects_with_null(obj):
+            if not isinstance(obj, dict):
+                return obj
+            result = obj.copy()
+            for key, value in obj.items():
+                if isinstance(value, dict):
+                    result[key] = remove_objects_with_null(value)
+                elif value is None:
+                    del result[key]
+            return result
+
+        def format_json_as_text(obj, indent=0):
+            lines = []
+            indent_str = " " * indent
+
+            if isinstance(obj, dict):
+                for key, value in obj.items():
+                    if isinstance(value, (dict, list)):
+                        nested = format_json_as_text(value, indent + 2)
+                        lines.append(f"{indent_str}{key}:\n{nested}")
+                    else:
+                        lines.append(f"{indent_str}{key}: {value}")
+            elif isinstance(obj, list):
+                for item in obj:
+                    nested = format_json_as_text(item, indent + 2)
+                    lines.append(f"{nested}")
+            else:
+                return f"{indent_str}{obj}"
+
+            return "\n".join(lines)
+
+        return format_json_as_text(remove_objects_with_null(data))
diff --git a/R2R/r2r/parsers/structured/xlsx_parser.py b/R2R/r2r/parsers/structured/xlsx_parser.py
new file mode 100755
index 00000000..68a3bdc6
--- /dev/null
+++ b/R2R/r2r/parsers/structured/xlsx_parser.py
@@ -0,0 +1,29 @@
+from io import BytesIO
+from typing import AsyncGenerator
+
+from r2r.base.abstractions.document import DataType
+from r2r.base.parsers.base_parser import AsyncParser
+
+
+class XLSXParser(AsyncParser[DataType]):
+    """A parser for XLSX data."""
+
+    def __init__(self):
+        try:
+            from openpyxl import load_workbook
+
+            self.load_workbook = load_workbook
+        except ImportError:
+            raise ValueError(
+                "Error, `openpyxl` is required to run `XLSXParser`. Please install it using `pip install openpyxl`."
+            )
+
+    async def ingest(self, data: bytes) -> AsyncGenerator[str, None]:
+        """Ingest XLSX data and yield text from each row."""
+        if isinstance(data, str):
+            raise ValueError("XLSX data must be in bytes format.")
+
+        wb = self.load_workbook(filename=BytesIO(data))
+        for sheet in wb.worksheets:
+            for row in sheet.iter_rows(values_only=True):
+                yield ", ".join(map(str, row))
```