aboutsummaryrefslogtreecommitdiff
path: root/R2R/tests/test_parser.py
diff options
context:
space:
mode:
author: S. Solomon Darnell, 2025-03-28 21:52:21 -0500
committer: S. Solomon Darnell, 2025-03-28 21:52:21 -0500
commit: 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree: ee3dc5af3b6313e921cd920906356f5d4febc4ed /R2R/tests/test_parser.py
parent: cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download: gn-ai-4a52a71956a8d46fcb7294ac71734504bb09bcc2.tar.gz
two version of R2R are here (HEAD, master)
Diffstat (limited to 'R2R/tests/test_parser.py')
-rwxr-xr-x R2R/tests/test_parser.py 159
1 files changed, 159 insertions, 0 deletions
diff --git a/R2R/tests/test_parser.py b/R2R/tests/test_parser.py
new file mode 100755
index 00000000..6965c5a9
--- /dev/null
+++ b/R2R/tests/test_parser.py
@@ -0,0 +1,159 @@
+import asyncio
+import json
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from r2r.parsers.media.docx_parser import DOCXParser
+from r2r.parsers.media.pdf_parser import PDFParser
+from r2r.parsers.media.ppt_parser import PPTParser
+from r2r.parsers.structured.csv_parser import CSVParser
+from r2r.parsers.structured.json_parser import JSONParser
+from r2r.parsers.structured.xlsx_parser import XLSXParser
+from r2r.parsers.text.html_parser import HTMLParser
+from r2r.parsers.text.md_parser import MDParser
+from r2r.parsers.text.text_parser import TextParser
+
+
+@pytest.fixture(scope="session", autouse=True)
+def event_loop_policy():
+ asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
+
+
+@pytest.fixture(scope="function")
+def event_loop():
+ loop = asyncio.get_event_loop_policy().new_event_loop()
+ yield loop
+ loop.close()
+ asyncio.set_event_loop(None)
+
+
+@pytest.fixture(scope="session", autouse=True)
+async def cleanup_tasks():
+ yield
+ for task in asyncio.all_tasks():
+ if task is not asyncio.current_task():
+ task.cancel()
+ try:
+ await task
+ except asyncio.CancelledError:
+ pass
+
+
+@pytest.mark.asyncio
+async def test_text_parser():
+ try:
+ parser = TextParser()
+ data = "Simple text"
+ async for result in parser.ingest(data):
+ assert result == "Simple text"
+ except asyncio.CancelledError:
+ pass
+
+
+@pytest.mark.asyncio
+async def test_json_parser():
+ try:
+ parser = JSONParser()
+ data = json.dumps({"key": "value", "null_key": None})
+ async for result in parser.ingest(data):
+ assert "key: value" in result
+ assert "null_key" not in result
+ except asyncio.CancelledError:
+ pass
+
+
+@pytest.mark.asyncio
+async def test_html_parser():
+ try:
+ parser = HTMLParser()
+ data = "<html><body><p>Hello World</p></body></html>"
+ async for result in parser.ingest(data):
+ assert result.strip() == "Hello World"
+ except asyncio.CancelledError:
+ pass
+
+
+@pytest.mark.asyncio
+@patch("pypdf.PdfReader")
+async def test_pdf_parser(mock_pdf_reader):
+ try:
+ parser = PDFParser()
+ mock_pdf_reader.return_value.pages = [
+ MagicMock(extract_text=lambda: "Page text")
+ ]
+ data = b"fake PDF data"
+ async for result in parser.ingest(data):
+ assert result == "Page text"
+ except asyncio.CancelledError:
+ pass
+
+
+@pytest.mark.asyncio
+@patch("pptx.Presentation")
+async def test_ppt_parser(mock_presentation):
+ try:
+ mock_slide = MagicMock()
+ mock_shape = MagicMock(text="Slide text")
+ mock_slide.shapes = [mock_shape]
+ mock_presentation.return_value.slides = [mock_slide]
+ parser = PPTParser()
+ data = b"fake PPT data"
+ async for result in parser.ingest(data):
+ assert result == "Slide text"
+ except asyncio.CancelledError:
+ pass
+
+
+@pytest.mark.asyncio
+@patch("docx.Document")
+async def test_docx_parser(mock_document):
+ try:
+ mock_paragraph = MagicMock(text="Paragraph text")
+ mock_document.return_value.paragraphs = [mock_paragraph]
+ parser = DOCXParser()
+ data = b"fake DOCX data"
+ async for result in parser.ingest(data):
+ assert result == "Paragraph text"
+ except asyncio.CancelledError:
+ pass
+
+
+@pytest.mark.asyncio
+async def test_csv_parser():
+ try:
+ parser = CSVParser()
+ data = "col1,col2\nvalue1,value2"
+ async for result in parser.ingest(data):
+ assert result == "col1, col2"
+ break
+ except asyncio.CancelledError:
+ pass
+
+
+@pytest.mark.asyncio
+@patch("openpyxl.load_workbook")
+async def test_xlsx_parser(mock_load_workbook):
+ try:
+ mock_sheet = MagicMock()
+ mock_sheet.iter_rows.return_value = [(1, 2), (3, 4)]
+ mock_workbook = MagicMock(worksheets=[mock_sheet])
+ mock_load_workbook.return_value = mock_workbook
+ parser = XLSXParser()
+ data = b"fake XLSX data"
+ async for result in parser.ingest(data):
+ assert result == "1, 2"
+ break
+ except asyncio.CancelledError:
+ pass
+
+
+@pytest.mark.asyncio
+async def test_markdown_parser():
+ try:
+ parser = MDParser()
+ data = "# Header\nContent"
+ async for result in parser.ingest(data):
+ assert result.strip() == "Header\nContent"
+ except asyncio.CancelledError:
+ pass