Spaces:
Running
Running
Create GPT4KG.py
Browse files
GPT4KG.py
ADDED
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import pydot
|
3 |
+
from sentence_transformers import SentenceTransformer, util
|
4 |
+
import numpy as np
|
5 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
+
import torch
|
7 |
+
import openai
|
8 |
+
from PIL import Image
|
9 |
+
|
10 |
+
# System prompt sent with every extraction request (see
# KnowledgeGraph.text_to_data). It describes the JSON layout that
# KnowledgeGraph.update_graph parses: a list of dictionaries that map an
# entity name to a list of single-pair {relationship: other_entity}
# dictionaries, with a "description" entry for each entity.
system_text = """You are an expert AI that extracts knowledge graphs from text and outputs JSON files with the extracted knowledge, and nothing more. Here's how the JSON is broken down.
Entity dictionaries are organized in a list
Every entity mentioned in the text has its own entity dictionary, in which the name of the entity is the key, and the value is a list of relationships.
Each relationship contains a short word or two accurately describing the relationship to the other entity as the key, and then the other entity as a value.
All inverses of these relationships are represented in the relationship list of the other entities. This is REALLY IMPORTANT. For example if Apple created the iPhone, it is also important to note that the iPhone was created by Apple (each entity should have this relationship from their perspective).
Non specified relationships are also inferred (if person X is the son of person Y, and person Z is person X's sibling, person Z is also the child of person Y).
The JSON contains NO NEW LINES. All the data should be on one line.
Every entity has a "description" relationship which provides a short description of what it is in a few words. If the description references another entity, then this relationship MUST be graphed, even if it is redundant.
Relationships are only created about facts, not just any connection between two entities mentioned in the text.
Example output:
[{"Toki Pona": [{"description": "philosophical artistic constructed language"}, {"translated as": "the language of good"}, {"created by": "Sonja Lang"}, {"first published": "2001"}, {"complete form published in": "Toki Pona: The Language of Good"}, {"supplementary dictionary": "Toki Pona Dictionary"}], "Sonja Lang": [{"description": "Canadian linguist and translator"}, {"creator of": "Toki Pona"}], "Toki Pona: The Language of Good": [{"description": "book"}, {"published in": "2014"}, {"language": "Toki Pona"}], "Toki Pona Dictionary": [{"description": "dictionary"}, {"released in": "July 2021"}, {"based on": "community usage"}]}]"""
|
21 |
+
|
22 |
+
class KnowledgeGraph:
    """Knowledge graph extracted from text with GPT-4 and searched by embeddings.

    Entities and their facts are kept in ``self.entities``; the same data is
    mirrored as nodes and edges in a pydot digraph for rendering. Entity names
    are embedded with a SentenceTransformer model so they can be retrieved by
    cosine similarity.
    """

    def __init__(self, api_key, kg_file=""):
        """Create an empty graph, optionally restoring one saved by save_graph.

        Args:
            api_key: OpenAI API key; stored on the ``openai`` module.
            kg_file: Path of a file written by ``save_graph``. Nothing is
                loaded when it is empty.
        """
        openai.api_key = api_key
        self.system_text = system_text
        self.graph = pydot.Dot(graph_type="digraph")
        # entity name -> {relationship: value}; always holds a "description".
        self.entities = {}
        # (entity name, relationship) -> number of redundant facts merged into it.
        self.fact_scores = {}
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = SentenceTransformer('all-MiniLM-L6-v2').to(self.device)
        # entity name -> embedding of the name, as returned by model.encode.
        self.entity_embeddings = {}
        if kg_file:
            self.load_graph(kg_file)

    def add_entity(self, name, description):
        """Register a new entity with its description; known names are ignored."""
        if name in self.entities:
            return
        self.entities[name] = {"description": description}
        self.graph.add_node(pydot.Node(name, label=f"{name}\n({description})"))
        self.entity_embeddings[name] = self.model.encode(name)
        print("added embedding")

    def add_relationship(self, entity1, relationship, entity2):
        """Record ``entity1 --relationship--> entity2`` and draw the edge.

        Nothing happens when ``entity1`` has not been added yet. When the
        relationship already exists for ``entity1`` the new value is appended
        to the stored one, separated by ", ".
        """
        if entity1 not in self.entities:
            return
        facts = self.entities[entity1]
        if relationship in facts:
            facts[relationship] = f"{facts[relationship]}, {entity2}"
        else:
            facts[relationship] = entity2
        self.graph.add_edge(pydot.Edge(entity1, entity2, label=relationship))

    def update_graph(self, json_str, clean=True):
        """Merge a JSON knowledge-graph description into this graph.

        Args:
            json_str: JSON text holding a list of ``{entity: [relationship
                dictionaries]}`` dictionaries (a single such dictionary is
                accepted too).
            clean: When true, run ``clean_graph`` on every entity found in the
                JSON after merging. That step calls the OpenAI API.

        Invalid JSON is reported on stdout and otherwise ignored.
        """
        try:
            data = json.loads(json_str)
        except (TypeError, ValueError):
            data = None
        if isinstance(data, dict):
            data = [data]
        if not isinstance(data, list):
            print("GPT4 failed to create a valid JSON. Input may be too long for processing.")
            return

        merged = []
        for entity_dict in data:
            if not isinstance(entity_dict, dict):
                continue
            for entity, relationships in entity_dict.items():
                self._merge_entity(entity, relationships)
                merged.append(entity)

        if clean:
            for entity in merged:
                self.clean_graph(entity)

    def _merge_entity(self, entity, relationships):
        """Add one entity and all of its relationship dictionaries."""
        if not isinstance(relationships, list):
            relationships = []
        rel_dicts = [rel for rel in relationships if isinstance(rel, dict)]

        # The description normally comes first, but do not rely on it: use the
        # first one found so that no other relationship is silently dropped.
        description = ""
        description_at = None
        for position, rel in enumerate(rel_dicts):
            if "description" in rel:
                description = str(rel["description"])
                description_at = position
                break
        self.add_entity(entity, description)

        for position, rel in enumerate(rel_dicts):
            for relationship, other_entity in rel.items():
                if position == description_at and relationship == "description":
                    continue
                # The model sometimes answers with a list of targets.
                if isinstance(other_entity, (list, tuple)):
                    targets = other_entity
                else:
                    targets = [other_entity]
                for target in targets:
                    self.add_relationship(entity, relationship, str(target))

    def display_graph(self, output_file="knowledge_graph.png"):
        """Render the graph to ``output_file`` as PNG and open it in a viewer."""
        self.graph.write_png(output_file)
        img = Image.open(output_file)
        img.show()

    def _rank(self, query, embeddings, n):
        """Return ``(index, score)`` for the ``n`` rows most similar to ``query``.

        Results are ordered from most to least similar by cosine similarity.
        """
        scores = util.cos_sim(self.model.encode(query), embeddings).numpy()[0]
        n = min(n, len(scores))
        if n <= 0:
            return []
        best_first = np.argsort(scores)[-n:][::-1]
        return [(int(index), scores[index]) for index in best_first]

    def _fact_sentences(self, key):
        """Return one "entity: relationship value" sentence per stored fact."""
        return [
            key + ": " + str(rel).replace("description", "is") + " " + str(value)
            for rel, value in self.entities[key].items()
        ]

    def search(self, query, n=5):
        """Return up to ``n`` ``(entity name, similarity)`` pairs, best first.

        The query is compared with the embeddings of the entity names. An
        empty graph yields an empty list.
        """
        names = list(self.entity_embeddings)
        if not names:
            return []
        entity_matrix = np.stack([self.entity_embeddings[name] for name in names])
        return [(names[index], score) for index, score in self._rank(query, entity_matrix, n)]

    def related_entities(self, query, n=5):
        """Return up to ``n`` "name: description" strings most similar to ``query``."""
        if not self.entities:
            return []
        potentities = [
            f"{key}: {facts.get('description', '')}"
            for key, facts in self.entities.items()
        ]
        ranked = self._rank(query, self.model.encode(potentities), n)
        return [potentities[index] for index, _ in ranked]

    def text_to_data(self, text):
        """Ask GPT-4 for the knowledge-graph JSON describing ``text``.

        When the graph already holds related entities, their names are added
        to the request so the model reuses them. Returns the raw model output.
        """
        messages = [{"role": "system", "content": self.system_text}]
        try:
            related = self.related_entities(text)
        except Exception:
            # Best effort: the hint is optional, extraction still works without it.
            related = []
        if related:
            text = text+f"\n\nGenerate the JSON for the text above, remembering to add inverse relationships and inferences. Here are some related entities already in the graph. If you are adding information about any of them, refer to them by the names below (otherwise ignore this information):\n\n{str(related)}"
        messages.append({"role": "user", "content": text})
        output = openai.ChatCompletion.create(model="gpt-4", messages=messages)["choices"][0]["message"].to_dict()["content"]
        return output

    def learn(self, text, show_output=False):
        """Extract knowledge from ``text`` and merge it into the graph.

        Args:
            text: Source text to extract entities and relationships from.
            show_output: Print the raw JSON returned by the model.
        """
        json_str = self.text_to_data(text)
        if show_output:
            print(json_str)
        self.update_graph(json_str)

    def graph_search(self, query, n=5, path="subgraph.png"):
        """Render the ``n`` entities closest to ``query`` as their own graph image."""
        top_ents = [name for name, _ in self.search(query, n)]
        data = [
            {ent: [{key: value} for key, value in self.entities[ent].items()]}
            for ent in top_ents
        ]
        # The constructor requires the API key; reuse the one already configured.
        new = KnowledgeGraph(openai.api_key)
        new.update_graph(json.dumps(data), clean=False)
        new.display_graph(path)

    def text_search(self, query, n=3):
        """Print the stored facts of the ``n`` entities closest to ``query``."""
        for name, _ in self.search(query, n):
            print(name + ": " + str(self.entities[name]))

    def qa_search(self, query, n=5):
        """Return up to ``n`` fact sentences most relevant to ``query``.

        Candidate facts are taken from the ``n`` entities closest to the query
        and then ranked individually against it.
        """
        keys = [name for name, _ in self.search(query, n)]
        facts = [fact for key in keys for fact in self._fact_sentences(key)]
        if not facts:
            return []
        ranked = self._rank(query, self.model.encode(facts), n)
        return [facts[index] for index, _ in ranked]

    def chat_qa(self, query):
        """Answer ``query`` with gpt-3.5-turbo, grounded on the facts from qa_search."""
        results = self.qa_search(query)
        system = {"role": "system", "content": "You are a helpful chatbot that answers questions based on data in your fact database."}
        messages = [system]
        text = f"Question: {query}\n\nFact Data: \n{results}"
        messages.append({"role": "user", "content": text})
        output = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)["choices"][0]["message"].to_dict()["content"]
        return output

    def clean_graph(self, key):
        """Remove facts about ``key`` that GPT-4 judges to duplicate an earlier one.

        Fact pairs whose embeddings have a cosine similarity above 0.7 (and
        not rounding to 1.0) are sent to GPT-4. On a "yes" answer the later
        fact is deleted and the score of the kept fact in ``self.fact_scores``
        is incremented.
        """
        if key not in self.entities:
            return
        rels = list(self.entities[key])
        facts = self._fact_sentences(key)
        if len(facts) < 2:
            return

        fact_embs = self.model.encode(facts)
        scores = util.cos_sim(fact_embs, fact_embs)
        candidates = []
        for i in range(len(facts)):
            for j in range(i + 1, len(facts)):
                score = scores[i][j].item()
                if round(score, 3) != 1.0 and score > 0.7:
                    candidates.append((i, j))

        for good_index, bad_index in candidates:
            validated = rels[good_index]
            redundant = rels[bad_index]
            remaining = self.entities[key]
            if validated not in remaining or redundant not in remaining:
                # One side was already removed by an earlier merge.
                continue
            pair = (facts[good_index], facts[bad_index])
            system = {"role": "system", "content": "You are a helpful chatbot that only outputs YES or NO"}
            messages = [system]
            messages.append({"role": "user", "content": f"Do these two facts in our database express the same thing?: {pair}"})
            output = openai.ChatCompletion.create(model="gpt-4", messages=messages)["choices"][0]["message"].to_dict()["content"]
            if "yes" in output.lower():
                del remaining[redundant]
                score_key = (key, validated)
                self.fact_scores[score_key] = self.fact_scores.get(score_key, 0) + 1

    def load_graph(self, kg_file):
        """Restore a graph written by ``save_graph``.

        The file holds the DOT source followed by one last line with the
        ``repr`` of the entity dictionary. Entities are re-added (which
        rebuilds their embeddings) without the GPT-4 clean-up pass, so loading
        does not call the OpenAI API.

        Raises:
            ValueError: If the file is empty or its last line is not a
                dictionary literal.
        """
        import ast

        with open(kg_file) as f:
            lines = f.readlines()
        if not lines:
            raise ValueError(f"{kg_file} is empty")
        # readlines() keeps the line endings, so join without adding more.
        graph_data = "".join(lines[:-1])
        # literal_eval only accepts Python literals, unlike eval.
        ents = ast.literal_eval(lines[-1])
        if not isinstance(ents, dict):
            raise ValueError(f"{kg_file} does not end with an entity dictionary")
        data = [{ent: [{key: ents[ent][key]} for key in ents[ent]]} for ent in ents]
        json_string = json.dumps(data)
        print(json_string)
        self.update_graph(json_string, clean=False)
        self.graph = pydot.graph_from_dot_data(graph_data)[0]

    def save_graph(self, filename="mygraph.kg"):
        """Write the DOT graph to ``filename``, then the entity dict on a final line."""
        self.graph.write_dot(filename)
        with open(filename, "a") as f:
            f.write("\n")
            f.write(str(self.entities))
|