Spaces:
Runtime error
Runtime error
naveed-stockmark
committed on
Commit
•
acb2316
1
Parent(s):
3cf5a8c
Update app.py
Browse files
app.py
CHANGED
@@ -2,14 +2,24 @@ import pandas as pd
|
|
2 |
from utils import normalize_text
|
3 |
import streamlit as st
|
4 |
|
5 |
-
|
6 |
-
WIKIPEDIA_PATH = "./kensho_en_wiki_typing_technical.csv"
|
7 |
-
WIKIDATA_PATH = "./wikidata_ss_processed.csv"
|
8 |
-
REBEL_INFER_PATH = "./rebel_inference_processed_ss.csv"
|
9 |
-
ENTITY_LINKING_PATH = "./linking_df_technical_min.csv"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
st.title("Materials use case search app")
|
12 |
|
|
|
|
|
13 |
# User Input
|
14 |
input_text = st.text_input(
|
15 |
label="Enter the name of a material i.e steel, sand, plastic, etc and press Enter",
|
@@ -19,50 +29,63 @@ input_text = st.text_input(
|
|
19 |
|
20 |
st.write("preparing data ...")
|
21 |
|
|
|
22 |
@st.cache_data(persist="disk")
|
23 |
-
def get_wiki_df():
|
24 |
-
wiki_df = pd.read_csv(
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
# filter out technical articles
|
30 |
-
exclude_ids = set(wiki_df[(wiki_df.exclude == True) | (wiki_df.technical == False)].page_id.to_list())
|
31 |
-
include_skpes = set(wiki_df[wiki_df.page_id.apply(lambda x: x not in exclude_ids)].skpe_id.to_list())
|
32 |
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
35 |
|
36 |
-
|
37 |
|
|
|
38 |
@st.cache_data(persist="disk")
|
39 |
-
def get_wikidata_df():
|
40 |
-
wikidata_df = pd.read_csv(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
return wikidata_df
|
42 |
|
43 |
wikidata_df = get_wikidata_df()
|
44 |
|
45 |
-
# filter technical wikidata
|
46 |
-
wikidata_df = wikidata_df[wikidata_df.apply(lambda x: x.source_skpe in include_skpes and x.target_skpe in include_skpes, axis=1)]
|
47 |
-
|
48 |
@st.cache_data(persist="disk")
|
49 |
-
def get_rebel_infer_df():
|
50 |
-
rebel_infer_df = pd.read_csv(
|
51 |
-
|
52 |
-
|
53 |
-
rebel_infer_df =
|
54 |
-
|
55 |
-
# filter technical
|
56 |
-
rebel_infer_df = rebel_infer_df[rebel_infer_df.apply(lambda x: type(x.source_skpe_id) == str and type(x.target_skpe_id) == str, axis=1)]
|
57 |
|
58 |
-
rebel_infer_df = rebel_infer_df.
|
59 |
-
rebel_infer_df = rebel_infer_df.
|
|
|
|
|
60 |
|
|
|
|
|
|
|
61 |
|
62 |
-
|
63 |
-
|
|
|
64 |
|
65 |
-
rebel_infer_df =
|
66 |
|
67 |
kg_df = pd.concat([wikidata_df, rebel_infer_df])
|
68 |
|
@@ -71,22 +94,27 @@ def get_entity_linking_df():
|
|
71 |
linking_df = pd.read_csv(ENTITY_LINKING_PATH)
|
72 |
return linking_df
|
73 |
|
|
|
|
|
74 |
linking_df = get_entity_linking_df()
|
75 |
|
76 |
# normalise and match
|
77 |
text_norm = normalize_text(input_text)
|
78 |
match_df = linking_df[linking_df.text == text_norm]
|
79 |
|
|
|
|
|
|
|
80 |
# top match skpe
|
81 |
if len(match_df) > 0:
|
82 |
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
|
87 |
# Match list
|
88 |
-
wiki_match_df = wiki_df[wiki_df.
|
89 |
-
wiki_match_df['link_score'] = wiki_match_df['
|
90 |
wiki_match_df = wiki_match_df.sort_values(by='link_score', ascending=False)
|
91 |
|
92 |
# show similar results
|
@@ -95,11 +123,11 @@ if len(match_df) > 0:
|
|
95 |
|
96 |
# proceeding with top match
|
97 |
st.write("Performing use case extraction for the following top match ...")
|
98 |
-
wiki_df[wiki_df.
|
99 |
|
100 |
# Stuff that are made out of input
|
101 |
-
made_of_df = kg_df[(kg_df.relation == 'made_from_material') & (kg_df.
|
102 |
-
# made_of_list = made_of_df.
|
103 |
|
104 |
if len(made_of_df) > 0:
|
105 |
|
@@ -113,10 +141,10 @@ if len(match_df) > 0:
|
|
113 |
# iterate over first rows
|
114 |
for first_edge in made_of_df.itertuples():
|
115 |
|
116 |
-
first_item = first_edge.
|
117 |
|
118 |
# applications of stuff made out of first item
|
119 |
-
use_df = kg_df[((kg_df.relation == 'has_use') & (kg_df.
|
120 |
|
121 |
# add all 2 len paths
|
122 |
for second_edge in use_df.itertuples():
|
@@ -125,16 +153,16 @@ if len(match_df) > 0:
|
|
125 |
# expand to part of
|
126 |
|
127 |
# applications of stuff made out of steel # 1
|
128 |
-
part_df = kg_df[((kg_df.relation == 'has_part') & (kg_df.
|
129 |
|
130 |
# iterate over all parts of product
|
131 |
for second_edge in part_df.itertuples():
|
132 |
|
133 |
# select second item
|
134 |
-
second_item = second_edge.
|
135 |
|
136 |
# get uses of second item
|
137 |
-
use_df = kg_df[((kg_df.relation == 'has_use') & (kg_df.
|
138 |
|
139 |
# add all 3 len paths
|
140 |
for third_edge in use_df.itertuples():
|
@@ -142,17 +170,31 @@ if len(match_df) > 0:
|
|
142 |
|
143 |
if len(all_paths) > 0:
|
144 |
|
145 |
-
st.write(f"Found
|
146 |
st.write("------")
|
147 |
|
148 |
# print all paths
|
149 |
-
for path in all_paths:
|
150 |
material = path[0].target_en
|
|
|
|
|
|
|
151 |
use_case = path[-1].source_en if path[-1].relation == 'uses' else path[-1].target_en
|
|
|
|
|
|
|
|
|
152 |
|
153 |
for edge in path:
|
154 |
-
|
155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
st.write("------")
|
157 |
else:
|
158 |
st.write("Found no knowledge graph paths relevant to use cases")
|
|
|
2 |
from utils import normalize_text
|
3 |
import streamlit as st
|
4 |
|
5 |
+
### Data paths
# The three CSV paths below are now supplied as the default `path`
# argument of their loader functions, so they stay commented out here.
# WIKIPEDIA_PATH = "./kensho_en_wiki_typing_technical.csv"
# WIKIDATA_PATH = "./wikidata_ss_processed.csv"
# REBEL_INFER_PATH = "./rebel_inference_processed_ss.csv"

# Still read directly by get_entity_linking_df(); leaving it commented out
# makes that function fail with NameError when the app starts.
ENTITY_LINKING_PATH = "./linking_df_technical_min.csv"

# Relation name -> numeric id used to build
# https://www.wikidata.org/wiki/Property:P<id> links when rendering paths.
relation_to_id = {
    "uses": 2283,
    "has_use": 366,
    "part_of": 361,
    "has_part": 527,
    "made_from_material": 186,
}
|
18 |
|
19 |
st.title("Materials use case search app")
|
20 |
|
21 |
+
|
22 |
+
|
23 |
# User Input
|
24 |
input_text = st.text_input(
|
25 |
label="Enter the name of a material i.e steel, sand, plastic, etc and press Enter",
|
|
|
29 |
|
30 |
st.write("preparing data ...")
|
31 |
|
32 |
+
# Wikipedia metadata
|
33 |
@st.cache_data(persist="disk")
|
34 |
+
def get_wiki_df(path="./kensho_en_wiki_typing_technical.csv"):
    """Load the Wikipedia metadata CSV and derive the lookup structures.

    Args:
        path: CSV file to read. The code below requires the columns
            ``exclude``, ``technical``, ``page_id``, ``skpe_id``,
            ``item_id``, ``Unnamed: 0``, ``en_probs`` and ``title_x``.

    Returns:
        A tuple ``(wiki_df, include_skpes, skpe_to_wikidata)``:

        * ``wiki_df`` - every row of the CSV, minus the ``Unnamed: 0``,
          ``en_probs`` and ``exclude`` columns, with ``title_x`` renamed
          to ``en_title``. Rows are NOT filtered.
        * ``include_skpes`` - set of ``skpe_id`` values whose page is
          neither flagged ``exclude`` nor marked non-technical.
        * ``skpe_to_wikidata`` - dict mapping every ``skpe_id`` in the
          CSV to its ``item_id``.
    """
    wiki_df = pd.read_csv(path)

    # Pages to leave out: explicitly excluded, or not technical.
    # `.eq(True)` / `.eq(False)` keep the original `== True` / `== False`
    # semantics (a missing value matches neither).
    rejected = wiki_df.exclude.eq(True) | wiki_df.technical.eq(False)
    exclude_ids = set(wiki_df.loc[rejected, 'page_id'].to_list())

    # Vectorized membership test instead of a per-element Python lambda.
    kept = ~wiki_df.page_id.isin(exclude_ids)
    include_skpes = set(wiki_df.loc[kept, 'skpe_id'].to_list())

    # Built from the unfiltered frame, so excluded pages are mapped too.
    skpe_to_wikidata = dict(zip(wiki_df.skpe_id.to_list(), wiki_df.item_id.to_list()))

    wiki_df = wiki_df.drop(columns=['Unnamed: 0', 'en_probs', 'exclude'])
    wiki_df = wiki_df.rename(columns={'title_x': 'en_title'})

    return wiki_df, include_skpes, skpe_to_wikidata
|
47 |
|
48 |
+
wiki_df, include_skpes, skpe_to_wikidata = get_wiki_df()
|
49 |
|
50 |
+
# KG data source 1: Wikidata
|
51 |
@st.cache_data(persist="disk")
|
52 |
+
def get_wikidata_df(path="./wikidata_ss_processed.csv"):
    """Load the Wikidata edge CSV and re-key its endpoints by Wikidata id.

    Reads the module-level ``include_skpes`` and ``skpe_to_wikidata``
    (both produced by ``get_wiki_df``), so it must be called after them.

    Args:
        path: CSV file to read. Must contain ``source_skpe`` and
            ``target_skpe`` columns.

    Returns:
        A DataFrame holding only the edges whose two endpoints are both in
        ``include_skpes``. ``source_skpe`` / ``target_skpe`` are replaced
        by ``source_wikidata`` / ``target_wikidata``, and a constant
        ``source`` column set to ``'wikidata'`` is added.
    """
    wikidata_df = pd.read_csv(path)

    # Keep edges with both endpoints allowed. Column-wise `isin` always
    # yields a boolean Series, whereas a row-wise `apply(axis=1)` returns a
    # DataFrame when the input has no rows.
    both_included = (
        wikidata_df.source_skpe.isin(include_skpes)
        & wikidata_df.target_skpe.isin(include_skpes)
    )
    # Explicit copy: the columns added below must not be written onto a
    # view of the frame returned by read_csv.
    wikidata_df = wikidata_df[both_included].copy()

    wikidata_df['source_wikidata'] = wikidata_df.source_skpe.map(skpe_to_wikidata)
    wikidata_df['target_wikidata'] = wikidata_df.target_skpe.map(skpe_to_wikidata)
    wikidata_df = wikidata_df.drop(columns=['source_skpe', 'target_skpe'])

    # Provenance tag, shown next to each edge when a path is rendered.
    wikidata_df['source'] = 'wikidata'

    return wikidata_df
|
65 |
|
66 |
wikidata_df = get_wikidata_df()
|
67 |
|
|
|
|
|
|
|
68 |
@st.cache_data(persist="disk")
|
69 |
+
def get_rebel_infer_df(path="./rebel_inference_processed_ss.csv"):
    """Load the REBEL-inferred edge CSV and re-key it by Wikidata id.

    Reads the module-level ``skpe_to_wikidata`` (produced by
    ``get_wiki_df``), so it must be called after it.

    Args:
        path: CSV file to read. The code below requires the columns
            ``instance_id``, ``source_text``, ``target_text``,
            ``page_skpe_id``, ``source_skpe_id`` and ``target_skpe_id``;
            ``source`` / ``target`` are renamed when present.

    Returns:
        A DataFrame of edges whose two endpoints are string ids known to
        ``skpe_to_wikidata`` and that do not link an item to itself. It
        carries ``source_wikidata`` / ``target_wikidata`` columns, has
        ``source`` / ``target`` renamed to ``source_en`` / ``target_en``,
        and has a constant ``source`` column set to ``'rebel_wikipedia'``.
    """
    rebel_infer_df = pd.read_csv(path)

    source_ids = rebel_infer_df.source_skpe_id
    target_ids = rebel_infer_df.target_skpe_id

    def _is_str(value):
        return isinstance(value, str)

    # Endpoints must be strings (missing ids are read as float NaN) ...
    has_ids = source_ids.map(_is_str) & target_ids.map(_is_str)
    # ... and must be resolvable to a Wikidata item.
    known_ids = list(skpe_to_wikidata)
    resolvable = source_ids.isin(known_ids) & target_ids.isin(known_ids)

    # Explicit copy: the columns added below must not be written onto a
    # view of the frame returned by read_csv.
    rebel_infer_df = rebel_infer_df[has_ids & resolvable].copy()

    rebel_infer_df['source_wikidata'] = rebel_infer_df.source_skpe_id.map(skpe_to_wikidata)
    rebel_infer_df['target_wikidata'] = rebel_infer_df.target_skpe_id.map(skpe_to_wikidata)

    rebel_infer_df = rebel_infer_df.drop(columns=[
        'instance_id',
        'source_text',
        'target_text',
        'page_skpe_id',
        'source_skpe_id',
        'target_skpe_id',
    ])
    # The *_skpe_id columns were dropped just above, so only the display
    # name columns are left to rename.
    rebel_infer_df = rebel_infer_df.rename(columns={'source': 'source_en', 'target': 'target_en'})

    # Discard self-loops: both mentions resolved to the same item.
    rebel_infer_df = rebel_infer_df[rebel_infer_df.source_wikidata != rebel_infer_df.target_wikidata].copy()

    # Provenance tag, shown next to each edge when a path is rendered.
    rebel_infer_df['source'] = 'rebel_wikipedia'

    return rebel_infer_df
|
87 |
|
88 |
+
rebel_infer_df = get_rebel_infer_df()
|
89 |
|
90 |
kg_df = pd.concat([wikidata_df, rebel_infer_df])
|
91 |
|
|
|
94 |
linking_df = pd.read_csv(ENTITY_LINKING_PATH)
|
95 |
return linking_df
|
96 |
|
97 |
+
st.write("matching input text ...")
|
98 |
+
|
99 |
linking_df = get_entity_linking_df()
|
100 |
|
101 |
# normalise and match
|
102 |
text_norm = normalize_text(input_text)
|
103 |
match_df = linking_df[linking_df.text == text_norm]
|
104 |
|
105 |
+
match_df = match_df[match_df.skpe_id.apply(lambda x: x in skpe_to_wikidata.keys())]
|
106 |
+
match_df['wikidata_id'] = match_df.skpe_id.apply(lambda x: skpe_to_wikidata[x])
|
107 |
+
|
108 |
# top match skpe
|
109 |
if len(match_df) > 0:
|
110 |
|
111 |
+
top_wikidata = match_df.wikidata_id.mode()[0]
|
112 |
+
all_wikidata = set(match_df.wikidata_id.to_list())
|
113 |
+
wikidata_to_count = dict(match_df.wikidata_id.value_counts())
|
114 |
|
115 |
# Match list
|
116 |
+
wiki_match_df = wiki_df[wiki_df.item_id.apply(lambda x: x in all_wikidata)].copy()
|
117 |
+
wiki_match_df['link_score'] = wiki_match_df['item_id'].apply(lambda x: wikidata_to_count[x] / sum(wikidata_to_count.values()))
|
118 |
wiki_match_df = wiki_match_df.sort_values(by='link_score', ascending=False)
|
119 |
|
120 |
# show similar results
|
|
|
123 |
|
124 |
# proceeding with top match
|
125 |
st.write("Performing use case extraction for the following top match ...")
|
126 |
+
wiki_df[wiki_df.item_id.apply(lambda x: x == top_wikidata)]
|
127 |
|
128 |
# Stuff that are made out of input
|
129 |
+
made_of_df = kg_df[(kg_df.relation == 'made_from_material') & (kg_df.target_wikidata == top_wikidata)].copy()
|
130 |
+
# made_of_list = made_of_df.source_wikidata.to_list()
|
131 |
|
132 |
if len(made_of_df) > 0:
|
133 |
|
|
|
141 |
# iterate over first rows
|
142 |
for first_edge in made_of_df.itertuples():
|
143 |
|
144 |
+
first_item = first_edge.source_wikidata
|
145 |
|
146 |
# applications of stuff made out of first item
|
147 |
+
use_df = kg_df[((kg_df.relation == 'has_use') & (kg_df.source_wikidata == first_item)) | ((kg_df.relation == 'uses') & (kg_df.target_wikidata == first_item))]
|
148 |
|
149 |
# add all 2 len paths
|
150 |
for second_edge in use_df.itertuples():
|
|
|
153 |
# expand to part of
|
154 |
|
155 |
# applications of stuff made out of steel # 1
|
156 |
+
part_df = kg_df[((kg_df.relation == 'has_part') & (kg_df.target_wikidata == first_item)) | (kg_df.relation == 'part_of') & (kg_df.source_wikidata == first_item)]
|
157 |
|
158 |
# iterate over all parts of product
|
159 |
for second_edge in part_df.itertuples():
|
160 |
|
161 |
# select second item
|
162 |
+
second_item = second_edge.source_wikidata if second_edge.relation == 'has_part' else second_edge.target_wikidata
|
163 |
|
164 |
# get uses of second item
|
165 |
+
use_df = kg_df[((kg_df.relation == 'has_use') & (kg_df.source_wikidata == second_item)) | ((kg_df.relation == 'uses') & (kg_df.target_wikidata == second_item))]
|
166 |
|
167 |
# add all 3 len paths
|
168 |
for third_edge in use_df.itertuples():
|
|
|
170 |
|
171 |
if len(all_paths) > 0:
|
172 |
|
173 |
+
st.write(f"Found {len(all_paths)} knowledge graph paths relevant to use cases of {input_text}")
|
174 |
st.write("------")
|
175 |
|
176 |
# print all paths
|
177 |
+
for i, path in enumerate(all_paths):
|
178 |
material = path[0].target_en
|
179 |
+
material_wikidata = path[0].target_wikidata
|
180 |
+
material_url = f"https://www.wikidata.org/wiki/Q{material_wikidata}"
|
181 |
+
|
182 |
use_case = path[-1].source_en if path[-1].relation == 'uses' else path[-1].target_en
|
183 |
+
use_case_wikidata = path[-1].source_wikidata if path[-1].relation == 'uses' else path[-1].target_wikidata
|
184 |
+
use_case_url = f"https://www.wikidata.org/wiki/Q{use_case_wikidata}"
|
185 |
+
|
186 |
+
st.write(f"**Reasoning Path {i+1}:**")
|
187 |
|
188 |
for edge in path:
|
189 |
+
|
190 |
+
source_url = f"https://www.wikidata.org/wiki/Q{edge.source_wikidata}"
|
191 |
+
target_url = f"https://www.wikidata.org/wiki/Q{edge.target_wikidata}"
|
192 |
+
|
193 |
+
relation_url = f"https://www.wikidata.org/wiki/Property:P{relation_to_id[edge.relation]}"
|
194 |
+
|
195 |
+
st.markdown(f"[{edge.source_en}]({source_url}) --[{edge.relation}]({relation_url})--> [{edge.target_en}]({target_url}) (source: {edge.source})")
|
196 |
+
st.write("**Conclusion:**")
|
197 |
+
st.write(f"[{material}]({material_url}) is useful for [{use_case}]({use_case_url})")
|
198 |
st.write("------")
|
199 |
else:
|
200 |
st.write("Found no knowledge graph paths relevant to use cases")
|