import asyncio import json from unittest.mock import MagicMock, patch import pytest from r2r.parsers.media.docx_parser import DOCXParser from r2r.parsers.media.pdf_parser import PDFParser from r2r.parsers.media.ppt_parser import PPTParser from r2r.parsers.structured.csv_parser import CSVParser from r2r.parsers.structured.json_parser import JSONParser from r2r.parsers.structured.xlsx_parser import XLSXParser from r2r.parsers.text.html_parser import HTMLParser from r2r.parsers.text.md_parser import MDParser from r2r.parsers.text.text_parser import TextParser @pytest.fixture(scope="session", autouse=True) def event_loop_policy(): asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy()) @pytest.fixture(scope="function") def event_loop(): loop = asyncio.get_event_loop_policy().new_event_loop() yield loop loop.close() asyncio.set_event_loop(None) @pytest.fixture(scope="session", autouse=True) async def cleanup_tasks(): yield for task in asyncio.all_tasks(): if task is not asyncio.current_task(): task.cancel() try: await task except asyncio.CancelledError: pass @pytest.mark.asyncio async def test_text_parser(): try: parser = TextParser() data = "Simple text" async for result in parser.ingest(data): assert result == "Simple text" except asyncio.CancelledError: pass @pytest.mark.asyncio async def test_json_parser(): try: parser = JSONParser() data = json.dumps({"key": "value", "null_key": None}) async for result in parser.ingest(data): assert "key: value" in result assert "null_key" not in result except asyncio.CancelledError: pass @pytest.mark.asyncio async def test_html_parser(): try: parser = HTMLParser() data = "<html><body><p>Hello World</p></body></html>" async for result in parser.ingest(data): assert result.strip() == "Hello World" except asyncio.CancelledError: pass @pytest.mark.asyncio @patch("pypdf.PdfReader") async def test_pdf_parser(mock_pdf_reader): try: parser = PDFParser() mock_pdf_reader.return_value.pages = [ MagicMock(extract_text=lambda: "Page text") ] data = b"fake PDF data" async for result in parser.ingest(data): assert result == "Page text" except asyncio.CancelledError: pass @pytest.mark.asyncio @patch("pptx.Presentation") async def test_ppt_parser(mock_presentation): try: mock_slide = MagicMock() mock_shape = MagicMock(text="Slide text") mock_slide.shapes = [mock_shape] mock_presentation.return_value.slides = [mock_slide] parser = PPTParser() data = b"fake PPT data" async for result in parser.ingest(data): assert result == "Slide text" except asyncio.CancelledError: pass @pytest.mark.asyncio @patch("docx.Document") async def test_docx_parser(mock_document): try: mock_paragraph = MagicMock(text="Paragraph text") mock_document.return_value.paragraphs = [mock_paragraph] parser = DOCXParser() data = b"fake DOCX data" async for result in parser.ingest(data): assert result == "Paragraph text" except asyncio.CancelledError: pass @pytest.mark.asyncio async def test_csv_parser(): try: parser = CSVParser() data = "col1,col2\nvalue1,value2" async for result in parser.ingest(data): assert result == "col1, col2" break except asyncio.CancelledError: pass @pytest.mark.asyncio @patch("openpyxl.load_workbook") async def test_xlsx_parser(mock_load_workbook): try: mock_sheet = MagicMock() mock_sheet.iter_rows.return_value = [(1, 2), (3, 4)] mock_workbook = MagicMock(worksheets=[mock_sheet]) mock_load_workbook.return_value = mock_workbook parser = XLSXParser() data = b"fake XLSX data" async for result in parser.ingest(data): assert result == "1, 2" break except asyncio.CancelledError: pass @pytest.mark.asyncio async def test_markdown_parser(): try: parser = MDParser() data = "# Header\nContent" async for result in parser.ingest(data): assert result.strip() == "Header\nContent" except asyncio.CancelledError: pass