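# Scrape Y Combinator company directory pages, clean the HTML down to plain
# text, ingest each page into a running R2R server, and extract a knowledge
# graph using the entity types and relations declared in main() below.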
import os

import fire
import requests
from bs4 import BeautifulSoup, Comment

from r2r import (
    EntityType,
    R2RClient,
    R2RPromptProvider,
    Relation,
    update_kg_prompt,
)


def escape_braces(text):
    # Escape literal braces so scraped text can be embedded in a prompt
    # template without being mistaken for format placeholders
    return text.replace("{", "{{").replace("}", "}}")


def get_all_yc_co_directory_urls():
    this_file_path = os.path.abspath(os.path.dirname(__file__))
    yc_company_dump_path = os.path.join(
        this_file_path, "..", "data", "yc_companies.txt"
    )

    # yc_companies.txt is expected to contain one YC company URL per line
    with open(yc_company_dump_path, "r") as f:
        urls = [url.strip() for url in f.readlines()]

    # Map each company slug (the last path segment) to its full URL
    return {url.split("/")[-1]: url for url in urls}


# Fetch a company page and strip it down to readable text
def fetch_and_clean_yc_co_data(url):
    # Fetch the HTML content from the URL
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Raise an error for bad status codes
    html_content = response.text

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")

    # Remove all <script>, <style>, <meta>, <link>, <header>, <nav>, and
    # <footer> elements
    for element in soup(
        ["script", "style", "meta", "link", "header", "nav", "footer"]
    ):
        element.decompose()

    # Remove HTML comments ("string" replaces bs4's deprecated "text" argument)
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Select the main content (adjust the selector to match the structure of
    # your target pages)
    main_content = soup.select_one("main") or soup.body
    if not main_content:
        return "Main content not found"

    # Collect span and link text; it is appended under the "### Metadata"
    # heading below
    spans = main_content.find_all(["span", "a"])
    span_text = "\n".join(
        span.get_text(separator=" ", strip=True) for span in spans
    )

    # Extract the text content from the main content
    paragraphs = main_content.find_all(
        ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]
    )
    cleaned_text = (
        "### Bulk:\n\n"
        + "\n\n".join(
            paragraph.get_text(separator=" ", strip=True)
            for paragraph in paragraphs
        )
        + "\n\n### Metadata:\n\n"
        + span_text
    )
    return cleaned_text


# Utility for running ad-hoc queries against the graph store; not called in
# the flow below (provider.client exposes a Neo4j-style session API)
def execute_query(provider, query, params=None):
    print(f"Executing query: {query}")
    with provider.client.session(database=provider._database) as session:
        result = session.run(query, params or {})
        return [record.data() for record in result]


def main(
    max_entries=50,
    local_mode=True,
    base_url="http://localhost:8000",
):
    # Specify the entity types for the KG extraction prompt
    entity_types = [
        EntityType("COMPANY"),
        EntityType("SCHOOL"),
        EntityType("LOCATION"),
        EntityType("PERSON"),
        EntityType("DATE"),
        EntityType("OTHER"),
        EntityType("QUANTITY"),
        EntityType("EVENT"),
        EntityType("INDUSTRY"),
        EntityType("MEDIA"),
    ]

    # Specify the relations for the KG construction
    relations = [
        # Founder relations
        Relation("EDUCATED_AT"),
        Relation("WORKED_AT"),
        Relation("FOUNDED"),
        # Company relations
        Relation("RAISED"),
        Relation("REVENUE"),
        Relation("TEAM_SIZE"),
        Relation("LOCATION"),
        Relation("ACQUIRED_BY"),
        Relation("ANNOUNCED"),
        Relation("INDUSTRY"),
        # Product relations
        Relation("PRODUCT"),
        Relation("FEATURES"),
        Relation("TECHNOLOGY"),
        # Additional relations
        Relation("HAS"),
        Relation("AS_OF"),
        Relation("PARTICIPATED"),
        Relation("ASSOCIATED"),
    ]

    client = R2RClient(base_url=base_url)
    r2r_prompts = R2RPromptProvider()

    # Use the zero-shot extraction prompt in local mode, few-shot otherwise
    prompt_base = (
        "zero_shot_ner_kg_extraction"
        if local_mode
        else "few_shot_ner_kg_extraction"
    )
    update_kg_prompt(client, r2r_prompts, prompt_base, entity_types, relations)

    url_map = get_all_yc_co_directory_urls()

    # Fetch, clean, and ingest the data for each company, up to max_entries
    for i, (company, url) in enumerate(url_map.items()):
        if i >= max_entries:
            break
        company_data = fetch_and_clean_yc_co_data(url)

        try:
            # Write the cleaned text to a temporary file and ingest it as a
            # text document
            file_name = f"{company}.txt"
            with open(file_name, "w") as f:
                f.write(company_data)
            client.ingest_files(
                [file_name],
                metadatas=[{"title": company}],
            )
            os.remove(file_name)
        except Exception as e:
            # Skip companies that fail to ingest rather than aborting the run
            print(f"Failed to ingest {company}: {e}")
            continue
    print(client.inspect_knowledge_graph(1_000)["results"])

    if not local_mode:
        # In hosted mode, also refresh the KG agent prompt with the same schema
        update_kg_prompt(
            client, r2r_prompts, "kg_agent", entity_types, relations
        )

    result = client.search(
        query="Find up to 10 founders that worked at Google",
        use_kg_search=True,
    )["results"]
    print("Search Result:\n", result)
    print("KG Search Result:\n", result["kg_search_results"])

    result = client.rag(
        query="Find up to 10 founders that worked at Google",
        use_kg_search=True,
    )
    print("RAG Result:\n", result)


if __name__ == "__main__":
    fire.Fire(main)
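
# Example invocation (fire exposes main()'s keyword arguments as CLI flags;
# substitute this file's actual name):
#   python <this-script>.py --max_entries=10 --local_mode=False \
#       --base_url=http://localhost:8000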