blob: 9c663fbed503e32a4143dd5abf6b932c8c401529 (
about) (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
from typing import AsyncGenerator
from bs4 import BeautifulSoup
from r2r.base.abstractions.document import DataType
from r2r.base.parsers.base_parser import AsyncParser
class HTMLParser(AsyncParser[DataType]):
    """Parser that reduces an HTML document to its text content."""

    async def ingest(self, data: DataType) -> AsyncGenerator[str, None]:
        """Parse ``data`` as HTML and yield the extracted text.

        The markup is handed to BeautifulSoup using the ``"html.parser"``
        backend, and the result of ``get_text()`` on the parsed document
        is yielded as a single item; the generator then finishes.

        Args:
            data: The HTML payload passed straight to ``BeautifulSoup``.

        Yields:
            One string: the text returned by ``get_text()`` for the
            whole parsed document.
        """
        # Build the tree and pull the text in one expression; exactly one
        # value is produced per call.
        yield BeautifulSoup(data, "html.parser").get_text()
|