diff options
author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
---|---|---|
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /R2R/tests/test_parser.py | |
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
download | gn-ai-4a52a71956a8d46fcb7294ac71734504bb09bcc2.tar.gz |
Diffstat (limited to 'R2R/tests/test_parser.py')
-rwxr-xr-x | R2R/tests/test_parser.py | 159 |
1 files changed, 159 insertions, 0 deletions
diff --git a/R2R/tests/test_parser.py b/R2R/tests/test_parser.py new file mode 100755 index 00000000..6965c5a9 --- /dev/null +++ b/R2R/tests/test_parser.py @@ -0,0 +1,159 @@ +import asyncio +import json +from unittest.mock import MagicMock, patch + +import pytest + +from r2r.parsers.media.docx_parser import DOCXParser +from r2r.parsers.media.pdf_parser import PDFParser +from r2r.parsers.media.ppt_parser import PPTParser +from r2r.parsers.structured.csv_parser import CSVParser +from r2r.parsers.structured.json_parser import JSONParser +from r2r.parsers.structured.xlsx_parser import XLSXParser +from r2r.parsers.text.html_parser import HTMLParser +from r2r.parsers.text.md_parser import MDParser +from r2r.parsers.text.text_parser import TextParser + + +@pytest.fixture(scope="session", autouse=True) +def event_loop_policy(): + asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy()) + + +@pytest.fixture(scope="function") +def event_loop(): + loop = asyncio.get_event_loop_policy().new_event_loop() + yield loop + loop.close() + asyncio.set_event_loop(None) + + +@pytest.fixture(scope="session", autouse=True) +async def cleanup_tasks(): + yield + for task in asyncio.all_tasks(): + if task is not asyncio.current_task(): + task.cancel() + try: + await task + except asyncio.CancelledError: + pass + + +@pytest.mark.asyncio +async def test_text_parser(): + try: + parser = TextParser() + data = "Simple text" + async for result in parser.ingest(data): + assert result == "Simple text" + except asyncio.CancelledError: + pass + + +@pytest.mark.asyncio +async def test_json_parser(): + try: + parser = JSONParser() + data = json.dumps({"key": "value", "null_key": None}) + async for result in parser.ingest(data): + assert "key: value" in result + assert "null_key" not in result + except asyncio.CancelledError: + pass + + +@pytest.mark.asyncio +async def test_html_parser(): + try: + parser = HTMLParser() + data = "<html><body><p>Hello World</p></body></html>" + async for result in parser.ingest(data): + assert result.strip() == "Hello World" + except asyncio.CancelledError: + pass + + +@pytest.mark.asyncio +@patch("pypdf.PdfReader") +async def test_pdf_parser(mock_pdf_reader): + try: + parser = PDFParser() + mock_pdf_reader.return_value.pages = [ + MagicMock(extract_text=lambda: "Page text") + ] + data = b"fake PDF data" + async for result in parser.ingest(data): + assert result == "Page text" + except asyncio.CancelledError: + pass + + +@pytest.mark.asyncio +@patch("pptx.Presentation") +async def test_ppt_parser(mock_presentation): + try: + mock_slide = MagicMock() + mock_shape = MagicMock(text="Slide text") + mock_slide.shapes = [mock_shape] + mock_presentation.return_value.slides = [mock_slide] + parser = PPTParser() + data = b"fake PPT data" + async for result in parser.ingest(data): + assert result == "Slide text" + except asyncio.CancelledError: + pass + + +@pytest.mark.asyncio +@patch("docx.Document") +async def test_docx_parser(mock_document): + try: + mock_paragraph = MagicMock(text="Paragraph text") + mock_document.return_value.paragraphs = [mock_paragraph] + parser = DOCXParser() + data = b"fake DOCX data" + async for result in parser.ingest(data): + assert result == "Paragraph text" + except asyncio.CancelledError: + pass + + +@pytest.mark.asyncio +async def test_csv_parser(): + try: + parser = CSVParser() + data = "col1,col2\nvalue1,value2" + async for result in parser.ingest(data): + assert result == "col1, col2" + break + except asyncio.CancelledError: + pass + + +@pytest.mark.asyncio +@patch("openpyxl.load_workbook") +async def test_xlsx_parser(mock_load_workbook): + try: + mock_sheet = MagicMock() + mock_sheet.iter_rows.return_value = [(1, 2), (3, 4)] + mock_workbook = MagicMock(worksheets=[mock_sheet]) + mock_load_workbook.return_value = mock_workbook + parser = XLSXParser() + data = b"fake XLSX data" + async for result in parser.ingest(data): + assert result == "1, 2" + break + except asyncio.CancelledError: + pass + + +@pytest.mark.asyncio +async def test_markdown_parser(): + try: + parser = MDParser() + data = "# Header\nContent" + async for result in parser.ingest(data): + assert result.strip() == "Header\nContent" + except asyncio.CancelledError: + pass |