import json
import os

import fire
import requests
from bs4 import BeautifulSoup, Comment
from r2r import (
    EntityType,
    R2RClient,
    R2RPromptProvider,
    Relation,
    update_kg_prompt,
)


def escape_braces(text):
    """Return *text* with every ``{`` and ``}`` doubled.

    NOTE(review): presumably used so the text survives ``str.format``-style
    templating unchanged -- confirm against callers.
    """
    return text.replace("{", "{{").replace("}", "}}")


def get_all_yc_co_directory_urls():
    """Load the YC company directory URLs from the bundled text dump.

    Reads ``../data/yc_companies.txt`` (resolved relative to this file's
    directory), treating each non-blank line as one URL.

    Returns:
        dict: maps the last path segment of each URL (the company slug) to
        the full URL as it appears in the file, with surrounding whitespace
        stripped.

    Raises:
        OSError: if the dump file cannot be opened.
    """
    this_file_path = os.path.abspath(os.path.dirname(__file__))
    yc_company_dump_path = os.path.join(
        this_file_path, "..", "data", "yc_companies.txt"
    )

    with open(yc_company_dump_path, "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f]

    # Skip blank lines (e.g. a trailing newline at EOF) so they do not create
    # a bogus "" entry, and ignore a trailing "/" when deriving the slug so
    # "https://host/companies/foo/" still maps to "foo" rather than "".
    return {url.rstrip("/").split("/")[-1]: url for url in urls if url}


# Function to fetch and clean HTML content
def fetch_and_clean_yc_co_data(url):
    # Fetch the HTML content from the URL
    response = requests.get(url)
    response.raise_for_status()  # Raise an error for bad status codes
    html_content = response.text

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")

    # Remove all