# type: ignore
"""Text-format ingestion parsers: HTML, Markdown and plain text.

Reconstructed from a whitespace-collapsed patch:

    commit 4a52a71956a8d46fcb7294ac71734504bb09bcc2
    From: S. Solomon Darnell
    Date: Fri, 28 Mar 2025 21:52:21 -0500
    Subject: two version of R2R are here

The patch created four files (111 insertions) under
``.venv/lib/python3.12/site-packages/core/parsers/text/``:

* ``__init__.py``    -- re-exported the three parser classes from the
  sibling modules (``.html_parser``, ``.md_parser``, ``.text_parser``)
  and declared ``__all__``.
* ``html_parser.py`` -- ``HTMLParser``
* ``md_parser.py``   -- ``MDParser``
* ``text_parser.py`` -- ``TextParser``

Each original file started with ``# type: ignore``.
"""
from typing import AsyncGenerator

from bs4 import BeautifulSoup

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
    CompletionProvider,
    DatabaseProvider,
    IngestionConfig,
)

# Public names, in the order the original package ``__init__.py`` listed them.
__all__ = [
    "MDParser",
    "HTMLParser",
    "TextParser",
]


# --- core/parsers/text/html_parser.py ---------------------------------------


class HTMLParser(AsyncParser[str | bytes]):
    """A parser for HTML data."""

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: DatabaseProvider,
        llm_provider: CompletionProvider,
    ):
        """Store the ingestion config and the two providers on the instance.

        Args:
            config: Ingestion configuration, kept as ``self.config``.
            database_provider: Kept as ``self.database_provider``.
            llm_provider: Kept as ``self.llm_provider``.
        """
        self.database_provider = database_provider
        self.llm_provider = llm_provider
        self.config = config

    async def ingest(
        self, data: str | bytes, *args, **kwargs
    ) -> AsyncGenerator[str, None]:
        """Ingest HTML data and yield text.

        The input is handed to BeautifulSoup unchanged (``bytes`` are not
        decoded here) and parsed with the ``"html.parser"`` backend.

        Args:
            data: HTML markup as ``str`` or ``bytes``.
            *args: Accepted and ignored.
            **kwargs: Accepted and ignored.

        Yields:
            Exactly one string: the result of ``soup.get_text()``.
        """
        soup = BeautifulSoup(data, "html.parser")
        yield soup.get_text()


# --- core/parsers/text/md_parser.py -----------------------------------------


class MDParser(AsyncParser[str | bytes]):
    """A parser for Markdown data."""

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: DatabaseProvider,
        llm_provider: CompletionProvider,
    ):
        """Store the config/providers and bind the ``markdown`` module.

        Args:
            config: Ingestion configuration, kept as ``self.config``.
            database_provider: Kept as ``self.database_provider``.
            llm_provider: Kept as ``self.llm_provider``.

        Raises:
            ImportError: If the ``markdown`` package cannot be imported.
        """
        self.database_provider = database_provider
        self.llm_provider = llm_provider
        self.config = config

        # Imported at construction time instead of module level, so merely
        # importing this module does not require ``markdown``.
        # NOTE(review): presumably this keeps ``markdown`` an optional
        # dependency -- confirm before hoisting the import.
        import markdown

        self.markdown = markdown

    async def ingest(
        self, data: str | bytes, *args, **kwargs
    ) -> AsyncGenerator[str, None]:
        """Ingest Markdown data and yield text.

        The Markdown is rendered to HTML with ``markdown.markdown`` and the
        tags are then stripped with BeautifulSoup (``"html.parser"``).

        Args:
            data: Markdown source; ``bytes`` are decoded as UTF-8 first.
            *args: Accepted and ignored.
            **kwargs: Accepted and ignored.

        Yields:
            Exactly one string: the text content of the rendered HTML.

        Raises:
            UnicodeDecodeError: If ``data`` is ``bytes`` that are not valid
                UTF-8.
        """
        if isinstance(data, bytes):
            data = data.decode("utf-8")
        html = self.markdown.markdown(data)
        soup = BeautifulSoup(html, "html.parser")
        yield soup.get_text()


# --- core/parsers/text/text_parser.py ---------------------------------------


class TextParser(AsyncParser[str | bytes]):
    """A parser for raw text data."""

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: DatabaseProvider,
        llm_provider: CompletionProvider,
    ):
        """Store the ingestion config and the two providers on the instance.

        Args:
            config: Ingestion configuration, kept as ``self.config``.
            database_provider: Kept as ``self.database_provider``.
            llm_provider: Kept as ``self.llm_provider``.
        """
        self.database_provider = database_provider
        self.llm_provider = llm_provider
        self.config = config

    # The return annotation was ``AsyncGenerator[str | bytes, None]``; bytes
    # are always decoded before the single ``yield``, so only ``str`` is ever
    # produced. Narrowed to match the HTML and Markdown parsers.
    async def ingest(
        self, data: str | bytes, *args, **kwargs
    ) -> AsyncGenerator[str, None]:
        """Ingest raw text data and yield it as a string.

        Args:
            data: The text; ``bytes`` are decoded as UTF-8, ``str`` is passed
                through unchanged.
            *args: Accepted and ignored.
            **kwargs: Accepted and ignored.

        Yields:
            Exactly one string: the (decoded) input.

        Raises:
            UnicodeDecodeError: If ``data`` is ``bytes`` that are not valid
                UTF-8.
        """
        if isinstance(data, bytes):
            data = data.decode("utf-8")
        yield data