# From 4a52a71956a8d46fcb7294ac71734504bb09bcc2 Mon Sep 17 00:00:00 2001
# From: S. Solomon Darnell
# Date: Fri, 28 Mar 2025 21:52:21 -0500
# Subject: two version of R2R are here
# ---
# R2R/r2r/examples/scripts/advanced_kg_cookbook.py | 194 +++++++++++++++++++++++
# 1 file changed, 194 insertions(+)
# create mode 100755 R2R/r2r/examples/scripts/advanced_kg_cookbook.py
# (limited to 'R2R/r2r/examples/scripts/advanced_kg_cookbook.py')
# diff --git a/R2R/r2r/examples/scripts/advanced_kg_cookbook.py b/R2R/r2r/examples/scripts/advanced_kg_cookbook.py
# new file mode 100755
# index 00000000..a4d59a79
# --- /dev/null
# +++ b/R2R/r2r/examples/scripts/advanced_kg_cookbook.py
# @@ -0,0 +1,194 @@
import json
import os

import fire
import requests
from bs4 import BeautifulSoup, Comment

from r2r import (
    EntityType,
    R2RClient,
    R2RPromptProvider,
    Relation,
    update_kg_prompt,
)


def escape_braces(text):
    """Return *text* with every ``{`` and ``}`` doubled.

    Presumably used so the text can be embedded in a ``str.format``-style
    template without its braces being read as placeholders -- confirm
    against the callers.
    """
    return text.replace("{", "{{").replace("}", "}}")


def get_all_yc_co_directory_urls():
    """Load the YC company directory URLs shipped next to this script.

    Reads ``../data/yc_companies.txt`` (relative to this file's directory),
    one URL per line.

    Returns:
        A dict mapping the last ``/``-separated segment of each URL to the
        full URL. Blank lines in the file are ignored.

    Raises:
        OSError: if the data file cannot be opened.
    """
    this_file_path = os.path.abspath(os.path.dirname(__file__))
    yc_company_dump_path = os.path.join(
        this_file_path, "..", "data", "yc_companies.txt"
    )

    # Explicit encoding keeps the read independent of the platform locale.
    with open(yc_company_dump_path, "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f]

    # Skip blank lines (e.g. a trailing empty line); otherwise they would
    # add a bogus {"": ""} entry to the mapping.
    return {url.split("/")[-1]: url for url in urls if url}


# Function to fetch and clean HTML content
def fetch_and_clean_yc_co_data(url):
    # Fetch the HTML content from the URL
    response = requests.get(url)
    response.raise_for_status()  # Raise an error for bad status codes
    html_content = response.text

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")

    # Remove all