diff options
Diffstat (limited to 'R2R/r2r/parsers/structured/json_parser.py')
-rwxr-xr-x | R2R/r2r/parsers/structured/json_parser.py | 49 |
1 files changed, 49 insertions, 0 deletions
# Source: R2R/r2r/parsers/structured/json_parser.py (new file, mode 100755)
import json
from typing import AsyncGenerator

from r2r.base.abstractions.document import DataType
from r2r.base.parsers.base_parser import AsyncParser


class JSONParser(AsyncParser[DataType]):
    """A parser for JSON data."""

    async def ingest(self, data: DataType) -> AsyncGenerator[str, None]:
        """Ingest JSON data and yield a formatted text representation.

        Args:
            data: The JSON document. ``bytes`` input is decoded as UTF-8
                first; anything else is handed to ``json.loads`` unchanged.

        Yields:
            Exactly one string: the indented plain-text rendering of the
            parsed document.

        Raises:
            UnicodeDecodeError: If ``data`` is bytes that are not valid UTF-8.
            json.JSONDecodeError: If the payload is not valid JSON.
        """
        if isinstance(data, bytes):
            # "utf-8-sig" strips a leading byte-order mark when present and
            # otherwise behaves exactly like "utf-8". With plain "utf-8" the
            # BOM survives as U+FEFF and json.loads() rejects the document.
            data = data.decode("utf-8-sig")
        yield self._parse_json(json.loads(data))

    def _parse_json(self, data: dict) -> str:
        """Render parsed JSON as indented ``key: value`` text.

        Dict entries whose value is ``None`` are dropped at every nesting
        level (including dicts stored inside lists) before formatting.
        ``None`` elements of a list are kept, so list positions stay intact.

        Args:
            data: The parsed JSON value. Usually a dict, but lists and
                scalars are formatted as well.

        Returns:
            The text rendering, one line per scalar value, with nested
            containers indented by two spaces per level.
        """

        def remove_objects_with_null(obj):
            """Return a copy of *obj* without dict entries that are None."""
            if isinstance(obj, dict):
                return {
                    key: remove_objects_with_null(value)
                    for key, value in obj.items()
                    if value is not None
                }
            if isinstance(obj, list):
                # Lists are walked too, so dicts nested inside arrays get the
                # same null-stripping as dicts nested inside dicts.
                return [remove_objects_with_null(item) for item in obj]
            return obj

        def format_json_as_text(obj, indent=0):
            """Format *obj* recursively, indenting each line by *indent*."""
            indent_str = " " * indent

            # Scalars render as a single indented line.
            if not isinstance(obj, (dict, list)):
                return f"{indent_str}{obj}"

            lines = []
            if isinstance(obj, dict):
                for key, value in obj.items():
                    if isinstance(value, (dict, list)):
                        # Containers go on the lines below their key, one
                        # indentation step deeper.
                        nested = format_json_as_text(value, indent + 2)
                        lines.append(f"{indent_str}{key}:\n{nested}")
                    else:
                        lines.append(f"{indent_str}{key}: {value}")
            else:
                # List items have no key; each is rendered one step deeper.
                for item in obj:
                    lines.append(format_json_as_text(item, indent + 2))

            return "\n".join(lines)

        return format_json_as_text(remove_objects_with_null(data))