blob: ada9ae57e7b20d567d8010c0a89ef6eacdec0519 (
about) (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
|
from typing import AsyncGenerator
from bs4 import BeautifulSoup
from r2r.base.abstractions.document import DataType
from r2r.base.parsers.base_parser import AsyncParser
class MDParser(AsyncParser[DataType]):
    """Parser that reduces Markdown input to its plain-text content."""

    def __init__(self):
        # Imported inside the constructor rather than at module level, so
        # the `markdown` package is loaded when an instance is created.
        import markdown as markdown_module

        self.markdown = markdown_module

    async def ingest(self, data: DataType) -> AsyncGenerator[str, None]:
        """Render Markdown to HTML, strip the markup, and yield the text.

        Args:
            data: The Markdown document. ``bytes`` input is decoded as
                UTF-8 first; any other value is handed to the Markdown
                renderer unchanged.

        Yields:
            A single string: the text of the rendered document with all
            HTML tags removed.
        """
        source = data.decode("utf-8") if isinstance(data, bytes) else data
        rendered_html = self.markdown.markdown(source)
        # Parse the rendered HTML so only the text nodes are kept.
        yield BeautifulSoup(rendered_html, "html.parser").get_text()
|