naveed-stockmark committed on
Commit acb2316
1 Parent(s): 3cf5a8c

Update app.py

Files changed (1):
  1. app.py +94 -52
app.py CHANGED
@@ -2,14 +2,24 @@ import pandas as pd
 from utils import normalize_text
 import streamlit as st
 
-# Data paths
-WIKIPEDIA_PATH = "./kensho_en_wiki_typing_technical.csv"
-WIKIDATA_PATH = "./wikidata_ss_processed.csv"
-REBEL_INFER_PATH = "./rebel_inference_processed_ss.csv"
-ENTITY_LINKING_PATH = "./linking_df_technical_min.csv"
+### Data paths
+# WIKIPEDIA_PATH = "./kensho_en_wiki_typing_technical.csv"
+# WIKIDATA_PATH = "./wikidata_ss_processed.csv"
+# REBEL_INFER_PATH = "./rebel_inference_processed_ss.csv"
+# ENTITY_LINKING_PATH = "./linking_df_technical_min.csv"
+
+relation_to_id = {
+    "uses": 2283,
+    "has_use": 366,
+    "part_of": 361,
+    "has_part": 527,
+    "made_from_material": 186
+}
 
 st.title("Materials use case search app")
 
+
+
 # User Input
 input_text = st.text_input(
     label="Enter the name of a material i.e steel, sand, plastic, etc and press Enter",
@@ -19,50 +29,63 @@ input_text = st.text_input(
 
 st.write("preparing data ...")
 
+# Wikipedia metadata
 @st.cache_data(persist="disk")
-def get_wiki_df():
-    wiki_df = pd.read_csv(WIKIPEDIA_PATH)
-    return wiki_df
-
-wiki_df = get_wiki_df()
-
-# filter out technical articles
-exclude_ids = set(wiki_df[(wiki_df.exclude == True) | (wiki_df.technical == False)].page_id.to_list())
-include_skpes = set(wiki_df[wiki_df.page_id.apply(lambda x: x not in exclude_ids)].skpe_id.to_list())
+def get_wiki_df(path="./kensho_en_wiki_typing_technical.csv"):
+    wiki_df = pd.read_csv(path)
+
+    # filter out technical articles
+    exclude_ids = set(wiki_df[(wiki_df.exclude == True) | (wiki_df.technical == False)].page_id.to_list())
+    include_skpes = set(wiki_df[wiki_df.page_id.apply(lambda x: x not in exclude_ids)].skpe_id.to_list())
 
-wiki_df = wiki_df.drop(columns=['Unnamed: 0', 'en_probs', 'exclude'])
-wiki_df = wiki_df.rename(columns={'title_x': 'en_title'})
+    skpe_to_wikidata = dict(zip(wiki_df.skpe_id.to_list(), wiki_df.item_id.to_list()))
 
-# load kg df
+    wiki_df = wiki_df.drop(columns=['Unnamed: 0', 'en_probs', 'exclude'])
+    wiki_df = wiki_df.rename(columns={'title_x': 'en_title'})
+
+    return wiki_df, include_skpes, skpe_to_wikidata
 
+wiki_df, include_skpes, skpe_to_wikidata = get_wiki_df()
+
+# KG data source 1: Wikidata
 @st.cache_data(persist="disk")
-def get_wikidata_df():
-    wikidata_df = pd.read_csv(WIKIDATA_PATH)
+def get_wikidata_df(path="./wikidata_ss_processed.csv"):
+    wikidata_df = pd.read_csv(path)
+
+    # filter technical wikidata
+    wikidata_df = wikidata_df[wikidata_df.apply(lambda x: x.source_skpe in include_skpes and x.target_skpe in include_skpes, axis=1)]
+
+    wikidata_df['source_wikidata'] = wikidata_df.source_skpe.apply(lambda x: skpe_to_wikidata[x])
+    wikidata_df['target_wikidata'] = wikidata_df.target_skpe.apply(lambda x: skpe_to_wikidata[x])
+    wikidata_df = wikidata_df.drop(columns=['source_skpe', 'target_skpe'])
+
+    wikidata_df['source'] = 'wikidata'
+
     return wikidata_df
 
 wikidata_df = get_wikidata_df()
 
-# filter technical wikidata
-wikidata_df = wikidata_df[wikidata_df.apply(lambda x: x.source_skpe in include_skpes and x.target_skpe in include_skpes, axis=1)]
-
 @st.cache_data(persist="disk")
-def get_rebel_infer_df():
-    rebel_infer_df = pd.read_csv(REBEL_INFER_PATH)
-    return rebel_infer_df
-
-rebel_infer_df = get_rebel_infer_df()
-
-# filter technical
-rebel_infer_df = rebel_infer_df[rebel_infer_df.apply(lambda x: type(x.source_skpe_id) == str and type(x.target_skpe_id) == str, axis=1)]
-
-rebel_infer_df = rebel_infer_df.drop(columns=['instance_id', 'source_text', 'target_text'])
-rebel_infer_df = rebel_infer_df.rename(columns={'source_skpe_id': 'source_skpe', 'target_skpe_id': 'target_skpe', 'source': 'source_en', 'target': 'target_en'})
-
-wikidata_df['source'] = 'wikidata'
-rebel_infer_df['source'] = 'rebel_wikipedia'
-
-rebel_infer_df = rebel_infer_df[rebel_infer_df.source_skpe != rebel_infer_df.target_skpe]
+def get_rebel_infer_df(path="./rebel_inference_processed_ss.csv"):
+    rebel_infer_df = pd.read_csv(path)
+
+    # filter technical
+    rebel_infer_df = rebel_infer_df[rebel_infer_df.apply(lambda x: type(x.source_skpe_id) == str and type(x.target_skpe_id) == str, axis=1)]
+
+    rebel_infer_df = rebel_infer_df[rebel_infer_df.apply(lambda x: x.source_skpe_id in skpe_to_wikidata.keys() and x.target_skpe_id in skpe_to_wikidata.keys(), axis=1)]
+    rebel_infer_df['source_wikidata'] = rebel_infer_df.source_skpe_id.apply(lambda x: skpe_to_wikidata[x])
+    rebel_infer_df['target_wikidata'] = rebel_infer_df.target_skpe_id.apply(lambda x: skpe_to_wikidata[x])
+    # rebel_infer_df['title_page_id'] = rebel_infer_df.page_skpe_id.apply(lambda x: skpe_to_wikidata[x])
+
+    rebel_infer_df = rebel_infer_df.drop(columns=['instance_id', 'source_text', 'target_text', 'page_skpe_id', 'source_skpe_id', 'target_skpe_id'])
+    rebel_infer_df = rebel_infer_df.rename(columns={'source_skpe_id': 'source_skpe', 'target_skpe_id': 'target_skpe', 'source': 'source_en', 'target': 'target_en'})
+    rebel_infer_df = rebel_infer_df[rebel_infer_df.source_wikidata != rebel_infer_df.target_wikidata]
+
+    rebel_infer_df['source'] = 'rebel_wikipedia'
+
+    return rebel_infer_df
+
+rebel_infer_df = get_rebel_infer_df()
 
 kg_df = pd.concat([wikidata_df, rebel_infer_df])
 
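This hunk moves the filtering and skpe-to-Wikidata mapping inside the `@st.cache_data(persist="disk")` loaders, so the expensive per-row work runs once and later reruns read the result from the disk cache; each loader now takes its CSV path as a default argument instead of a module-level constant. One caveat: `st.cache_data` keys on a function's arguments and source, so the globals `include_skpes` and `skpe_to_wikidata` that `get_wikidata_df` reads are not part of its cache key. The row-wise `apply(..., axis=1)` membership tests could also likely be vectorized; a sketch on a toy frame, assuming the column names used here:

```python
import pandas as pd

# Toy stand-in for wikidata_ss_processed.csv (illustration only)
wikidata_df = pd.DataFrame({
    "source_skpe": ["a", "b", "c"],
    "target_skpe": ["b", "x", "a"],
    "relation": ["uses", "uses", "has_part"],
})
include_skpes = {"a", "b", "c"}

# Vectorized equivalent of the apply(..., axis=1) membership filter above
mask = wikidata_df.source_skpe.isin(include_skpes) & wikidata_df.target_skpe.isin(include_skpes)
wikidata_df = wikidata_df[mask]  # keeps rows whose endpoints are both technical
```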
 
@@ -71,22 +94,27 @@ def get_entity_linking_df():
     linking_df = pd.read_csv(ENTITY_LINKING_PATH)
     return linking_df
 
+st.write("matching input text ...")
+
 linking_df = get_entity_linking_df()
 
 # normalise and match
 text_norm = normalize_text(input_text)
 match_df = linking_df[linking_df.text == text_norm]
 
+match_df = match_df[match_df.skpe_id.apply(lambda x: x in skpe_to_wikidata.keys())]
+match_df['wikidata_id'] = match_df.skpe_id.apply(lambda x: skpe_to_wikidata[x])
+
 # top match skpe
 if len(match_df) > 0:
 
-    top_skpe = match_df.skpe_id.mode()[0]
-    all_skpe = set(match_df.skpe_id.to_list())
-    skpe_to_count = dict(match_df.skpe_id.value_counts())
+    top_wikidata = match_df.wikidata_id.mode()[0]
+    all_wikidata = set(match_df.wikidata_id.to_list())
+    wikidata_to_count = dict(match_df.wikidata_id.value_counts())
 
     # Match list
-    wiki_match_df = wiki_df[wiki_df.skpe_id.apply(lambda x: x in all_skpe)].copy()
-    wiki_match_df['link_score'] = wiki_match_df['skpe_id'].apply(lambda x: skpe_to_count[x] / sum(skpe_to_count.values()))
+    wiki_match_df = wiki_df[wiki_df.item_id.apply(lambda x: x in all_wikidata)].copy()
+    wiki_match_df['link_score'] = wiki_match_df['item_id'].apply(lambda x: wikidata_to_count[x] / sum(wikidata_to_count.values()))
     wiki_match_df = wiki_match_df.sort_values(by='link_score', ascending=False)
 
     # show similar results
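A caveat in this hunk: the top of the diff comments out `ENTITY_LINKING_PATH`, but the unchanged `get_entity_linking_df()` still reads that constant, which would raise a `NameError` at runtime unless the name survives elsewhere in the file. Giving the loader a default path, as the other three loaders now do, would be the consistent fix; adding `.copy()` to the `match_df` slice would also avoid pandas' `SettingWithCopyWarning` when `wikidata_id` is assigned. A sketch under those assumptions, in the context of this file:

```python
@st.cache_data(persist="disk")
def get_entity_linking_df(path="./linking_df_technical_min.csv"):
    # same default-path pattern as get_wiki_df / get_wikidata_df above
    return pd.read_csv(path)

linking_df = get_entity_linking_df()

text_norm = normalize_text(input_text)
match_df = linking_df[linking_df.text == text_norm].copy()  # .copy() avoids SettingWithCopyWarning
match_df = match_df[match_df.skpe_id.isin(skpe_to_wikidata.keys())]
match_df["wikidata_id"] = match_df.skpe_id.map(skpe_to_wikidata)  # vectorized dict lookup
```

The `link_score` computed above is then each candidate's share of the mention counts (the counts are normalized to sum to 1), which is what makes the descending sort a ranking of linking confidence.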
@@ -95,11 +123,11 @@ if len(match_df) > 0:
 
     # proceeding with top match
     st.write("Performing use case extraction for the following top match ...")
-    wiki_df[wiki_df.skpe_id.apply(lambda x: x == top_skpe)]
+    wiki_df[wiki_df.item_id.apply(lambda x: x == top_wikidata)]
 
     # Stuff that are made out of input
-    made_of_df = kg_df[(kg_df.relation == 'made_from_material') & (kg_df.target_skpe == top_skpe)].copy()
-    # made_of_list = made_of_df.source_skpe.to_list()
+    made_of_df = kg_df[(kg_df.relation == 'made_from_material') & (kg_df.target_wikidata == top_wikidata)].copy()
+    # made_of_list = made_of_df.source_wikidata.to_list()
 
     if len(made_of_df) > 0:
 
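The bare `wiki_df[wiki_df.item_id.apply(...)]` line renders the dataframe through Streamlit's "magic" display of standalone expressions; an explicit, vectorized equivalent (a stylistic suggestion, not part of the commit):

```python
# Explicit rendering of the top-match row(s); same result as the bare expression
st.dataframe(wiki_df[wiki_df.item_id == top_wikidata])
```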
 
@@ -113,10 +141,10 @@
         # iterate over first rows
         for first_edge in made_of_df.itertuples():
 
-            first_item = first_edge.source_skpe
+            first_item = first_edge.source_wikidata
 
             # applications of stuff made out of first item
-            use_df = kg_df[((kg_df.relation == 'has_use') & (kg_df.source_skpe == first_item)) | ((kg_df.relation == 'uses') & (kg_df.target_skpe == first_item))]
+            use_df = kg_df[((kg_df.relation == 'has_use') & (kg_df.source_wikidata == first_item)) | ((kg_df.relation == 'uses') & (kg_df.target_wikidata == first_item))]
 
             # add all 2 len paths
             for second_edge in use_df.itertuples():
@@ -125,16 +153,16 @@
             # expand to part of
 
             # applications of stuff made out of steel # 1
-            part_df = kg_df[((kg_df.relation == 'has_part') & (kg_df.target_skpe == first_item)) | (kg_df.relation == 'part_of') & (kg_df.source_skpe == first_item)]
+            part_df = kg_df[((kg_df.relation == 'has_part') & (kg_df.target_wikidata == first_item)) | (kg_df.relation == 'part_of') & (kg_df.source_wikidata == first_item)]
 
             # iterate over all parts of product
             for second_edge in part_df.itertuples():
 
                 # select second item
-                second_item = second_edge.source_skpe if second_edge.relation == 'has_part' else second_edge.target_skpe
+                second_item = second_edge.source_wikidata if second_edge.relation == 'has_part' else second_edge.target_wikidata
 
                 # get uses of second item
-                use_df = kg_df[((kg_df.relation == 'has_use') & (kg_df.source_skpe == second_item)) | ((kg_df.relation == 'uses') & (kg_df.target_skpe == second_item))]
+                use_df = kg_df[((kg_df.relation == 'has_use') & (kg_df.source_wikidata == second_item)) | ((kg_df.relation == 'uses') & (kg_df.target_wikidata == second_item))]
 
                 # add all 3 len paths
                 for third_edge in use_df.itertuples():
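Taken together, these two hunks enumerate paths of the form material ← made_from_material ← product, optionally hopping via has_part/part_of to a containing product, then following has_use/uses to an application. The same traversal as a standalone function, a sketch with a hypothetical name (`find_use_paths`) and the `*_wikidata` columns this commit introduces:

```python
def find_use_paths(kg_df, top_wikidata):
    """Collect 2- and 3-edge paths from a material to its use cases (sketch)."""
    paths = []
    made_of = kg_df[(kg_df.relation == "made_from_material") & (kg_df.target_wikidata == top_wikidata)]
    for first in made_of.itertuples():
        item = first.source_wikidata
        # direct uses of the product made from the material
        uses = kg_df[((kg_df.relation == "has_use") & (kg_df.source_wikidata == item)) |
                     ((kg_df.relation == "uses") & (kg_df.target_wikidata == item))]
        paths.extend([first, u] for u in uses.itertuples())
        # things the product is part of, then their uses
        parts = kg_df[((kg_df.relation == "has_part") & (kg_df.target_wikidata == item)) |
                      ((kg_df.relation == "part_of") & (kg_df.source_wikidata == item))]
        for p in parts.itertuples():
            second = p.source_wikidata if p.relation == "has_part" else p.target_wikidata
            uses2 = kg_df[((kg_df.relation == "has_use") & (kg_df.source_wikidata == second)) |
                          ((kg_df.relation == "uses") & (kg_df.target_wikidata == second))]
            paths.extend([first, p, u] for u in uses2.itertuples())
    return paths
```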
@@ -142,17 +170,31 @@
 
     if len(all_paths) > 0:
 
-        st.write(f"Found following knowledge graph paths relevant to use cases of {input_text}")
+        st.write(f"Found {len(all_paths)} knowledge graph paths relevant to use cases of {input_text}")
         st.write("------")
 
         # print all paths
-        for path in all_paths:
+        for i, path in enumerate(all_paths):
             material = path[0].target_en
+            material_wikidata = path[0].target_wikidata
+            material_url = f"https://www.wikidata.org/wiki/Q{material_wikidata}"
+
             use_case = path[-1].source_en if path[-1].relation == 'uses' else path[-1].target_en
+            use_case_wikidata = path[-1].source_wikidata if path[-1].relation == 'uses' else path[-1].target_wikidata
+            use_case_url = f"https://www.wikidata.org/wiki/Q{use_case_wikidata}"
+
+            st.write(f"**Reasoning Path {i+1}:**")
 
             for edge in path:
-                st.write(f"{edge.source_en} --{edge.relation}--> {edge.target_en} (source: {edge.source})")
-            st.write(f"**Conclusion: {material} is useful for {use_case}**")
+
+                source_url = f"https://www.wikidata.org/wiki/Q{edge.source_wikidata}"
+                target_url = f"https://www.wikidata.org/wiki/Q{edge.target_wikidata}"
+
+                relation_url = f"https://www.wikidata.org/wiki/Property:P{relation_to_id[edge.relation]}"
+
+                st.markdown(f"[{edge.source_en}]({source_url}) --[{edge.relation}]({relation_url})--> [{edge.target_en}]({target_url}) (source: {edge.source})")
+            st.write("**Conclusion:**")
+            st.write(f"[{material}]({material_url}) is useful for [{use_case}]({use_case_url})")
             st.write("------")
     else:
         st.write("Found no knowledge graph paths relevant to use cases")
 