aboutsummaryrefslogtreecommitdiff
path: root/R2R/tests/test_parser.py
blob: 6965c5a94c10fc8f333c358aeb6f6d9f725d0209 (about) (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import asyncio
import json
from unittest.mock import MagicMock, patch

import pytest

from r2r.parsers.media.docx_parser import DOCXParser
from r2r.parsers.media.pdf_parser import PDFParser
from r2r.parsers.media.ppt_parser import PPTParser
from r2r.parsers.structured.csv_parser import CSVParser
from r2r.parsers.structured.json_parser import JSONParser
from r2r.parsers.structured.xlsx_parser import XLSXParser
from r2r.parsers.text.html_parser import HTMLParser
from r2r.parsers.text.md_parser import MDParser
from r2r.parsers.text.text_parser import TextParser


@pytest.fixture(scope="session", autouse=True)
def event_loop_policy():
    """Install asyncio's default event loop policy for the test session.

    Session-scoped and autouse, so it runs once without being requested.
    The fixture itself returns nothing.
    """
    # NOTE(review): some pytest-asyncio releases ship a fixture with this same
    # name that is expected to return a policy -- confirm against the pinned
    # plugin version that returning None here is intended.
    default_policy = asyncio.DefaultEventLoopPolicy()
    asyncio.set_event_loop_policy(default_policy)


@pytest.fixture(scope="function")
def event_loop():
    """Provide a brand-new event loop to each test function.

    Yields:
        A loop created from the currently installed event loop policy.

    After the test, the loop is closed and the current event loop is reset
    to ``None``.
    """
    active_policy = asyncio.get_event_loop_policy()
    fresh_loop = active_policy.new_event_loop()
    yield fresh_loop
    # Teardown: release the loop, then clear the current-loop reference.
    fresh_loop.close()
    asyncio.set_event_loop(None)


@pytest.fixture(scope="session", autouse=True)
async def cleanup_tasks():
    """Cancel leftover asyncio tasks during session teardown.

    Nothing happens before the ``yield``. Afterwards every task reported by
    ``asyncio.all_tasks()`` except the currently running one is cancelled and
    awaited, one at a time.
    """
    yield
    for pending in asyncio.all_tasks():
        # Never cancel the task that is executing this teardown.
        if pending is asyncio.current_task():
            continue
        pending.cancel()
        try:
            # Wait for the cancellation to actually be processed.
            await pending
        except asyncio.CancelledError:
            # Expected result of awaiting a task we just cancelled.
            pass


@pytest.mark.asyncio
async def test_text_parser():
    """Check that TextParser yields the input string unchanged.

    The parser output is collected before asserting, so the test fails when
    nothing is yielded instead of passing without running any assertion.
    """
    try:
        parser = TextParser()
        data = "Simple text"
        results = [chunk async for chunk in parser.ingest(data)]
        assert results, "TextParser yielded no output"
        for result in results:
            assert result == "Simple text"
    except asyncio.CancelledError:
        # Retained from the original test: a cancellation during ingestion
        # is tolerated rather than reported as a failure.
        pass


@pytest.mark.asyncio
async def test_json_parser():
    """Check that JSONParser renders key/value pairs and omits null entries.

    The parser output is collected before asserting, so the test fails when
    nothing is yielded instead of passing without running any assertion.
    """
    try:
        parser = JSONParser()
        data = json.dumps({"key": "value", "null_key": None})
        results = [chunk async for chunk in parser.ingest(data)]
        assert results, "JSONParser yielded no output"
        for result in results:
            assert "key: value" in result
            # Keys whose value is null must not appear in the rendered text.
            assert "null_key" not in result
    except asyncio.CancelledError:
        # Retained from the original test: a cancellation during ingestion
        # is tolerated rather than reported as a failure.
        pass


@pytest.mark.asyncio
async def test_html_parser():
    """Check that HTMLParser reduces simple markup to its visible text.

    The parser output is collected before asserting, so the test fails when
    nothing is yielded instead of passing without running any assertion.
    """
    try:
        parser = HTMLParser()
        data = "<html><body><p>Hello World</p></body></html>"
        results = [chunk async for chunk in parser.ingest(data)]
        assert results, "HTMLParser yielded no output"
        for result in results:
            # Surrounding whitespace is not part of the contract being tested.
            assert result.strip() == "Hello World"
    except asyncio.CancelledError:
        # Retained from the original test: a cancellation during ingestion
        # is tolerated rather than reported as a failure.
        pass


@pytest.mark.asyncio
@patch("pypdf.PdfReader")
async def test_pdf_parser(mock_pdf_reader):
    """Check that PDFParser yields the text extracted from each page.

    ``pypdf.PdfReader`` is patched so no real PDF is needed; the fake reader
    exposes a single page whose ``extract_text`` returns ``"Page text"``.

    The parser output is collected before asserting, so the test fails when
    nothing is yielded instead of passing without running any assertion.

    Args:
        mock_pdf_reader: Mock injected by ``patch`` in place of
            ``pypdf.PdfReader``.
    """
    try:
        parser = PDFParser()
        mock_pdf_reader.return_value.pages = [
            MagicMock(extract_text=lambda: "Page text")
        ]
        data = b"fake PDF data"
        results = [chunk async for chunk in parser.ingest(data)]
        assert results, "PDFParser yielded no output"
        for result in results:
            assert result == "Page text"
    except asyncio.CancelledError:
        # Retained from the original test: a cancellation during ingestion
        # is tolerated rather than reported as a failure.
        pass


@pytest.mark.asyncio
@patch("pptx.Presentation")
async def test_ppt_parser(mock_presentation):
    """Check that PPTParser yields the text of each shape on each slide.

    ``pptx.Presentation`` is patched so no real deck is needed; the fake
    presentation has one slide holding one shape with text ``"Slide text"``.

    The parser output is collected before asserting, so the test fails when
    nothing is yielded instead of passing without running any assertion.

    Args:
        mock_presentation: Mock injected by ``patch`` in place of
            ``pptx.Presentation``.
    """
    try:
        mock_slide = MagicMock()
        mock_shape = MagicMock(text="Slide text")
        mock_slide.shapes = [mock_shape]
        mock_presentation.return_value.slides = [mock_slide]
        parser = PPTParser()
        data = b"fake PPT data"
        results = [chunk async for chunk in parser.ingest(data)]
        assert results, "PPTParser yielded no output"
        for result in results:
            assert result == "Slide text"
    except asyncio.CancelledError:
        # Retained from the original test: a cancellation during ingestion
        # is tolerated rather than reported as a failure.
        pass


@pytest.mark.asyncio
@patch("docx.Document")
async def test_docx_parser(mock_document):
    """Check that DOCXParser yields the text of each paragraph.

    ``docx.Document`` is patched so no real document is needed; the fake
    document has a single paragraph with text ``"Paragraph text"``.

    The parser output is collected before asserting, so the test fails when
    nothing is yielded instead of passing without running any assertion.

    Args:
        mock_document: Mock injected by ``patch`` in place of
            ``docx.Document``.
    """
    try:
        mock_paragraph = MagicMock(text="Paragraph text")
        mock_document.return_value.paragraphs = [mock_paragraph]
        parser = DOCXParser()
        data = b"fake DOCX data"
        results = [chunk async for chunk in parser.ingest(data)]
        assert results, "DOCXParser yielded no output"
        for result in results:
            assert result == "Paragraph text"
    except asyncio.CancelledError:
        # Retained from the original test: a cancellation during ingestion
        # is tolerated rather than reported as a failure.
        pass


@pytest.mark.asyncio
async def test_csv_parser():
    """Check that the first row CSVParser yields is the joined header row.

    Only the first yielded item is consumed, as before. It is captured and
    asserted after the loop, so the test fails when the parser yields
    nothing instead of passing without running any assertion.
    """
    try:
        parser = CSVParser()
        data = "col1,col2\nvalue1,value2"
        first_row = None  # stays None if the parser yields nothing
        async for result in parser.ingest(data):
            first_row = result
            break
        assert first_row == "col1, col2"
    except asyncio.CancelledError:
        # Retained from the original test: a cancellation during ingestion
        # is tolerated rather than reported as a failure.
        pass


@pytest.mark.asyncio
@patch("openpyxl.load_workbook")
async def test_xlsx_parser(mock_load_workbook):
    """Check that the first row XLSXParser yields is the joined first sheet row.

    ``openpyxl.load_workbook`` is patched so no real workbook is needed; the
    fake workbook has one sheet whose ``iter_rows`` returns ``(1, 2)`` and
    ``(3, 4)``.

    Only the first yielded item is consumed, as before. It is captured and
    asserted after the loop, so the test fails when the parser yields
    nothing instead of passing without running any assertion.

    Args:
        mock_load_workbook: Mock injected by ``patch`` in place of
            ``openpyxl.load_workbook``.
    """
    try:
        mock_sheet = MagicMock()
        mock_sheet.iter_rows.return_value = [(1, 2), (3, 4)]
        mock_workbook = MagicMock(worksheets=[mock_sheet])
        mock_load_workbook.return_value = mock_workbook
        parser = XLSXParser()
        data = b"fake XLSX data"
        first_row = None  # stays None if the parser yields nothing
        async for result in parser.ingest(data):
            first_row = result
            break
        assert first_row == "1, 2"
    except asyncio.CancelledError:
        # Retained from the original test: a cancellation during ingestion
        # is tolerated rather than reported as a failure.
        pass


@pytest.mark.asyncio
async def test_markdown_parser():
    """Check that MDParser strips Markdown syntax and keeps the text.

    The parser output is collected before asserting, so the test fails when
    nothing is yielded instead of passing without running any assertion.
    """
    try:
        parser = MDParser()
        data = "# Header\nContent"
        results = [chunk async for chunk in parser.ingest(data)]
        assert results, "MDParser yielded no output"
        for result in results:
            # Surrounding whitespace is not part of the contract being tested.
            assert result.strip() == "Header\nContent"
    except asyncio.CancelledError:
        # Retained from the original test: a cancellation during ingestion
        # is tolerated rather than reported as a failure.
        pass