about summary refs log tree commit diff
path: root/R2R/tests/test_parser.py
diff options
context:
space:
mode:
author S. Solomon Darnell 2025-03-28 21:52:21 -0500
committer S. Solomon Darnell 2025-03-28 21:52:21 -0500
commit 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree ee3dc5af3b6313e921cd920906356f5d4febc4ed /R2R/tests/test_parser.py
parent cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download gn-ai-4a52a71956a8d46fcb7294ac71734504bb09bcc2.tar.gz
two versions of R2R are here HEAD master
Diffstat (limited to 'R2R/tests/test_parser.py')
-rwxr-xr-x R2R/tests/test_parser.py 159
1 files changed, 159 insertions, 0 deletions
diff --git a/R2R/tests/test_parser.py b/R2R/tests/test_parser.py
new file mode 100755
index 00000000..6965c5a9
--- /dev/null
+++ b/R2R/tests/test_parser.py
@@ -0,0 +1,159 @@
+import asyncio
+import json
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from r2r.parsers.media.docx_parser import DOCXParser
+from r2r.parsers.media.pdf_parser import PDFParser
+from r2r.parsers.media.ppt_parser import PPTParser
+from r2r.parsers.structured.csv_parser import CSVParser
+from r2r.parsers.structured.json_parser import JSONParser
+from r2r.parsers.structured.xlsx_parser import XLSXParser
+from r2r.parsers.text.html_parser import HTMLParser
+from r2r.parsers.text.md_parser import MDParser
+from r2r.parsers.text.text_parser import TextParser
+
+
+@pytest.fixture(scope="session", autouse=True)
+def event_loop_policy():
+    """Install the default asyncio event loop policy for the test session.
+
+    Returns:
+        The policy instance that was installed, so that anything requesting
+        this fixture receives a usable policy object rather than ``None``.
+    """
+    policy = asyncio.DefaultEventLoopPolicy()
+    asyncio.set_event_loop_policy(policy)
+    # NOTE(review): pytest-asyncio documents a fixture of this name as
+    # returning the policy; confirm against the plugin version in use.
+    return policy
+
+
+@pytest.fixture(scope="function")
+def event_loop():
+    """Yield a newly created event loop and dispose of it after the test."""
+    policy = asyncio.get_event_loop_policy()
+    fresh_loop = policy.new_event_loop()
+    yield fresh_loop
+    # Teardown: close the loop, then clear the current-loop reference.
+    fresh_loop.close()
+    asyncio.set_event_loop(None)
+
+
+@pytest.fixture(scope="session", autouse=True)
+async def cleanup_tasks():
+    """Cancel every task that is still pending when the fixture tears down.
+
+    All tasks other than the one running this teardown are cancelled first
+    and then awaited together, so one task failing while it unwinds cannot
+    leave the remaining tasks uncancelled.
+    """
+    # NOTE(review): this is an async generator under a plain
+    # ``pytest.fixture``; presumably the asyncio plugin is configured to
+    # drive it -- confirm against the project's pytest settings.
+    yield
+    current = asyncio.current_task()
+    pending = [task for task in asyncio.all_tasks() if task is not current]
+    for task in pending:
+        task.cancel()
+    if pending:
+        # return_exceptions=True collects CancelledError (and any error a
+        # task raises during cancellation) as results instead of aborting
+        # the cleanup part-way through.
+        await asyncio.gather(*pending, return_exceptions=True)
+
+
+@pytest.mark.asyncio
+async def test_text_parser():
+    """Check that each item yielded by TextParser.ingest equals the input."""
+    yielded = False
+    try:
+        parser = TextParser()
+        data = "Simple text"
+        async for result in parser.ingest(data):
+            yielded = True
+            assert result == "Simple text"
+        # Without this the test would pass even if nothing was yielded.
+        assert yielded, "TextParser.ingest yielded no results"
+    except asyncio.CancelledError:
+        # NOTE(review): cancellation is tolerated as in the original test;
+        # confirm this is still required.
+        pass
+
+
+@pytest.mark.asyncio
+async def test_json_parser():
+    """Check JSONParser output keeps "key: value" and omits the null key."""
+    yielded = False
+    try:
+        parser = JSONParser()
+        data = json.dumps({"key": "value", "null_key": None})
+        async for result in parser.ingest(data):
+            yielded = True
+            assert "key: value" in result
+            assert "null_key" not in result
+        # Without this the test would pass even if nothing was yielded.
+        assert yielded, "JSONParser.ingest yielded no results"
+    except asyncio.CancelledError:
+        # NOTE(review): cancellation is tolerated as in the original test;
+        # confirm this is still required.
+        pass
+
+
+@pytest.mark.asyncio
+async def test_html_parser():
+    """Check that HTMLParser yields the paragraph text of a small document."""
+    yielded = False
+    try:
+        parser = HTMLParser()
+        data = "<html><body><p>Hello World</p></body></html>"
+        async for result in parser.ingest(data):
+            yielded = True
+            assert result.strip() == "Hello World"
+        # Without this the test would pass even if nothing was yielded.
+        assert yielded, "HTMLParser.ingest yielded no results"
+    except asyncio.CancelledError:
+        # NOTE(review): cancellation is tolerated as in the original test;
+        # confirm this is still required.
+        pass
+
+
+@pytest.mark.asyncio
+@patch("pypdf.PdfReader")
+async def test_pdf_parser(mock_pdf_reader):
+    """Check PDFParser yields the text of a mocked single-page reader."""
+    yielded = False
+    try:
+        parser = PDFParser()
+        # The patched reader exposes one page whose extract_text() returns
+        # a fixed string.
+        mock_pdf_reader.return_value.pages = [
+            MagicMock(extract_text=lambda: "Page text")
+        ]
+        data = b"fake PDF data"
+        async for result in parser.ingest(data):
+            yielded = True
+            assert result == "Page text"
+        # Without this the test would pass even if nothing was yielded.
+        assert yielded, "PDFParser.ingest yielded no results"
+    except asyncio.CancelledError:
+        # NOTE(review): cancellation is tolerated as in the original test;
+        # confirm this is still required.
+        pass
+
+
+@pytest.mark.asyncio
+@patch("pptx.Presentation")
+async def test_ppt_parser(mock_presentation):
+    """Check PPTParser yields the text of a mocked one-slide presentation."""
+    yielded = False
+    try:
+        # One slide holding one shape whose ``text`` is a fixed string.
+        mock_slide = MagicMock()
+        mock_shape = MagicMock(text="Slide text")
+        mock_slide.shapes = [mock_shape]
+        mock_presentation.return_value.slides = [mock_slide]
+        parser = PPTParser()
+        data = b"fake PPT data"
+        async for result in parser.ingest(data):
+            yielded = True
+            assert result == "Slide text"
+        # Without this the test would pass even if nothing was yielded.
+        assert yielded, "PPTParser.ingest yielded no results"
+    except asyncio.CancelledError:
+        # NOTE(review): cancellation is tolerated as in the original test;
+        # confirm this is still required.
+        pass
+
+
+@pytest.mark.asyncio
+@patch("docx.Document")
+async def test_docx_parser(mock_document):
+    """Check DOCXParser yields the text of a mocked single paragraph."""
+    yielded = False
+    try:
+        mock_paragraph = MagicMock(text="Paragraph text")
+        mock_document.return_value.paragraphs = [mock_paragraph]
+        parser = DOCXParser()
+        data = b"fake DOCX data"
+        async for result in parser.ingest(data):
+            yielded = True
+            assert result == "Paragraph text"
+        # Without this the test would pass even if nothing was yielded.
+        assert yielded, "DOCXParser.ingest yielded no results"
+    except asyncio.CancelledError:
+        # NOTE(review): cancellation is tolerated as in the original test;
+        # confirm this is still required.
+        pass
+
+
+@pytest.mark.asyncio
+async def test_csv_parser():
+    """Check that the first item CSVParser yields is the joined header row."""
+    yielded = False
+    try:
+        parser = CSVParser()
+        data = "col1,col2\nvalue1,value2"
+        async for result in parser.ingest(data):
+            yielded = True
+            assert result == "col1, col2"
+            # Only the first yielded row is checked.
+            break
+        # Without this the test would pass even if nothing was yielded.
+        assert yielded, "CSVParser.ingest yielded no results"
+    except asyncio.CancelledError:
+        # NOTE(review): cancellation is tolerated as in the original test;
+        # confirm this is still required.
+        pass
+
+
+@pytest.mark.asyncio
+@patch("openpyxl.load_workbook")
+async def test_xlsx_parser(mock_load_workbook):
+    """Check the first item XLSXParser yields for a mocked two-row sheet."""
+    yielded = False
+    try:
+        # A workbook with one sheet whose iter_rows() returns two rows.
+        mock_sheet = MagicMock()
+        mock_sheet.iter_rows.return_value = [(1, 2), (3, 4)]
+        mock_workbook = MagicMock(worksheets=[mock_sheet])
+        mock_load_workbook.return_value = mock_workbook
+        parser = XLSXParser()
+        data = b"fake XLSX data"
+        async for result in parser.ingest(data):
+            yielded = True
+            assert result == "1, 2"
+            # Only the first yielded row is checked.
+            break
+        # Without this the test would pass even if nothing was yielded.
+        assert yielded, "XLSXParser.ingest yielded no results"
+    except asyncio.CancelledError:
+        # NOTE(review): cancellation is tolerated as in the original test;
+        # confirm this is still required.
+        pass
+
+
+@pytest.mark.asyncio
+async def test_markdown_parser():
+    """Check that MDParser yields the header and body text without markup."""
+    yielded = False
+    try:
+        parser = MDParser()
+        data = "# Header\nContent"
+        async for result in parser.ingest(data):
+            yielded = True
+            assert result.strip() == "Header\nContent"
+        # Without this the test would pass even if nothing was yielded.
+        assert yielded, "MDParser.ingest yielded no results"
+    except asyncio.CancelledError:
+        # NOTE(review): cancellation is tolerated as in the original test;
+        # confirm this is still required.
+        pass