diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/core/parsers/text')
4 files changed, 111 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/core/parsers/text/__init__.py b/.venv/lib/python3.12/site-packages/core/parsers/text/__init__.py
new file mode 100644
index 00000000..8f85d046
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/core/parsers/text/__init__.py
@@ -0,0 +1,10 @@
+# type: ignore
+from .html_parser import HTMLParser
+from .md_parser import MDParser
+from .text_parser import TextParser
+
+__all__ = [
+    "MDParser",
+    "HTMLParser",
+    "TextParser",
+]
diff --git a/.venv/lib/python3.12/site-packages/core/parsers/text/html_parser.py b/.venv/lib/python3.12/site-packages/core/parsers/text/html_parser.py
new file mode 100644
index 00000000..a04331e0
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/core/parsers/text/html_parser.py
@@ -0,0 +1,32 @@
+# type: ignore
+from typing import AsyncGenerator
+
+from bs4 import BeautifulSoup
+
+from core.base.parsers.base_parser import AsyncParser
+from core.base.providers import (
+    CompletionProvider,
+    DatabaseProvider,
+    IngestionConfig,
+)
+
+
+class HTMLParser(AsyncParser[str | bytes]):
+    """A parser for HTML data."""
+
+    def __init__(
+        self,
+        config: IngestionConfig,
+        database_provider: DatabaseProvider,
+        llm_provider: CompletionProvider,
+    ):
+        self.database_provider = database_provider
+        self.llm_provider = llm_provider
+        self.config = config
+
+    async def ingest(
+        self, data: str | bytes, *args, **kwargs
+    ) -> AsyncGenerator[str, None]:
+        """Ingest HTML data and yield text."""
+        soup = BeautifulSoup(data, "html.parser")
+        yield soup.get_text()
diff --git a/.venv/lib/python3.12/site-packages/core/parsers/text/md_parser.py b/.venv/lib/python3.12/site-packages/core/parsers/text/md_parser.py
new file mode 100644
index 00000000..7ab11d92
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/core/parsers/text/md_parser.py
@@ -0,0 +1,39 @@
+# type: ignore
+from typing import AsyncGenerator
+
+from bs4 import BeautifulSoup
+
+from core.base.parsers.base_parser import AsyncParser
+from core.base.providers import (
+    CompletionProvider,
+    DatabaseProvider,
+    IngestionConfig,
+)
+
+
+class MDParser(AsyncParser[str | bytes]):
+    """A parser for Markdown data."""
+
+    def __init__(
+        self,
+        config: IngestionConfig,
+        database_provider: DatabaseProvider,
+        llm_provider: CompletionProvider,
+    ):
+        self.database_provider = database_provider
+        self.llm_provider = llm_provider
+        self.config = config
+
+        import markdown
+
+        self.markdown = markdown
+
+    async def ingest(
+        self, data: str | bytes, *args, **kwargs
+    ) -> AsyncGenerator[str, None]:
+        """Ingest Markdown data and yield text."""
+        if isinstance(data, bytes):
+            data = data.decode("utf-8")
+        html = self.markdown.markdown(data)
+        soup = BeautifulSoup(html, "html.parser")
+        yield soup.get_text()
diff --git a/.venv/lib/python3.12/site-packages/core/parsers/text/text_parser.py b/.venv/lib/python3.12/site-packages/core/parsers/text/text_parser.py
new file mode 100644
index 00000000..51ff1cbd
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/core/parsers/text/text_parser.py
@@ -0,0 +1,30 @@
+# type: ignore
+from typing import AsyncGenerator
+
+from core.base.parsers.base_parser import AsyncParser
+from core.base.providers import (
+    CompletionProvider,
+    DatabaseProvider,
+    IngestionConfig,
+)
+
+
+class TextParser(AsyncParser[str | bytes]):
+    """A parser for raw text data."""
+
+    def __init__(
+        self,
+        config: IngestionConfig,
+        database_provider: DatabaseProvider,
+        llm_provider: CompletionProvider,
+    ):
+        self.database_provider = database_provider
+        self.llm_provider = llm_provider
+        self.config = config
+
+    async def ingest(
+        self, data: str | bytes, *args, **kwargs
+    ) -> AsyncGenerator[str | bytes, None]:
+        if isinstance(data, bytes):
+            data = data.decode("utf-8")
+        yield data