# Spaces:
# Runtime error
# Runtime error
import pandas as pd
import streamlit as st

from utils import normalize_text
# Load the Wikipedia article table.
wiki_df = pd.read_csv("../knowledge_platform/wiki_output/kensho_en_wiki_typing_technical.csv")

# Collect the page ids to drop: pages explicitly flagged `exclude` or flagged as
# non-technical. `.eq(...)` keeps the original `== True` / `== False` semantics,
# so a missing flag matches neither condition.
excluded_rows = wiki_df.exclude.eq(True) | wiki_df.technical.eq(False)
exclude_ids = set(wiki_df[excluded_rows].page_id.to_list())

# skpe ids of every page whose page_id was not excluded; used later to restrict
# the knowledge graph to in-scope entities. Vectorised `isin` replaces a
# per-row Python lambda.
kept_rows = ~wiki_df.page_id.isin(exclude_ids)
include_skpes = set(wiki_df[kept_rows].skpe_id.to_list())

# Drop helper columns that are no longer needed and give the title column an
# explicit language-tagged name.
wiki_df = wiki_df.drop(columns=['Unnamed: 0', 'en_probs', 'exclude'])
wiki_df = wiki_df.rename(columns={'title_x': 'en_title'})
# Build the knowledge-graph edge table `kg_df` from two sources: Wikidata
# edges and REBEL-inferred edges.
# NOTE(review): the two bare string statements below are kept byte-for-byte;
# with Streamlit "magic" enabled such expressions are rendered into the app.
# Confirm whether they were meant as on-page labels or as plain comments.
"""Load wikidata"""
wikidata_df = pd.read_csv("../knowledge_platform/kg_data/wikidata_ss_processed.csv")

# Keep only edges whose two endpoints are both in-scope skpe ids. Column-wise
# `isin` replaces a row-wise `apply`; the `notna` terms make sure a missing id
# never counts as in scope, even if `include_skpes` happens to contain NaN.
in_scope = (
    wikidata_df.source_skpe.isin(include_skpes)
    & wikidata_df.target_skpe.isin(include_skpes)
    & wikidata_df.source_skpe.notna()
    & wikidata_df.target_skpe.notna()
)
# `.copy()` so the column assignment below writes to an independent frame
# rather than to a slice of the frame read from disk.
wikidata_df = wikidata_df[in_scope].copy()
wikidata_df['source'] = 'wikidata'

"""KG Infer data"""
rebel_infer_df = pd.read_csv("../knowledge_platform/kg_data/rebel_inference_processed_ss.csv")

# Keep only edges where both endpoint ids are strings (this drops rows whose
# id is missing or otherwise not a string).
has_both_ids = (
    rebel_infer_df.source_skpe_id.map(lambda v: isinstance(v, str)).astype(bool)
    & rebel_infer_df.target_skpe_id.map(lambda v: isinstance(v, str)).astype(bool)
)
rebel_infer_df = rebel_infer_df[has_both_ids]

# Align the column layout with `wikidata_df`. The original `source` column is
# renamed to `source_en` first, which frees the name `source` for the
# provenance tag assigned right after.
rebel_infer_df = rebel_infer_df.drop(columns=['instance_id', 'source_text', 'target_text'])
rebel_infer_df = rebel_infer_df.rename(columns={'source_skpe_id': 'source_skpe', 'target_skpe_id': 'target_skpe', 'source': 'source_en', 'target': 'target_en'})
rebel_infer_df['source'] = 'rebel_wikipedia'

# Remove self-loops (an entity related to itself).
rebel_infer_df = rebel_infer_df[rebel_infer_df.source_skpe != rebel_infer_df.target_skpe]

kg_df = pd.concat([wikidata_df, rebel_infer_df])
# TODO(review): the original author left an unexplained "???" marker here.
# load entity linking dictionary
linking_df = pd.read_csv('./linking_df_technical_min.csv')

# User Input
input_text = st.text_input(
    label="Enter first entity name",
    value="semiconductor",
    key="ent",
)

# Normalise the query and look it up in the linking dictionary.
text_norm = normalize_text(input_text)
match_df = linking_df[linking_df.text == text_norm]

# Rows without an skpe id cannot be linked to anything, so they do not count
# as matches.
matched_ids = match_df.skpe_id.dropna()
if matched_ids.empty:
    st.write("no matches")
    # Everything below needs `top_skpe` / `wiki_match_df`; halt this script run
    # here instead of failing later with a NameError.
    st.stop()

# Most frequently linked skpe id for this text (ties resolved by `mode()`).
top_skpe = matched_ids.mode()[0]
all_skpe = set(matched_ids.to_list())
skpe_to_count = dict(matched_ids.value_counts())
# Total number of links; computed once rather than once per row.
total_links = sum(skpe_to_count.values())

# Candidate Wikipedia pages, scored by the share of links pointing to each one.
wiki_match_df = wiki_df[wiki_df.skpe_id.isin(all_skpe)].copy()
wiki_match_df['link_score'] = wiki_match_df['skpe_id'].map(skpe_to_count) / total_links
wiki_match_df = wiki_match_df.sort_values(by='link_score', ascending=False)

# Show the five most viewed candidates.
st.write(wiki_match_df.sort_values(by='views', ascending=False).head(5))
# Edges of the form "<product> --made_from_material--> <input entity>",
# i.e. things that are made out of the input.
made_of_df = kg_df[(kg_df.relation == 'made_from_material') & (kg_df.target_skpe == top_skpe)].copy()


def _use_edges(item):
    """Return the rows of ``kg_df`` describing a use of ``item``.

    A row qualifies when it is ``item --has_use--> *`` or ``* --uses--> item``.
    """
    return kg_df[
        ((kg_df.relation == 'has_use') & (kg_df.source_skpe == item))
        | ((kg_df.relation == 'uses') & (kg_df.target_skpe == item))
    ]


def _containing_edges(item):
    """Return the rows of ``kg_df`` in which ``item`` is a part of something.

    A row qualifies when it is ``* --has_part--> item`` or
    ``item --part_of--> *``.
    """
    return kg_df[
        ((kg_df.relation == 'has_part') & (kg_df.target_skpe == item))
        | ((kg_df.relation == 'part_of') & (kg_df.source_skpe == item))
    ]


# Each path is a list of edge tuples (rows of `kg_df` from `itertuples()`).
all_paths = []
for first_edge in made_of_df.itertuples():
    first_item = first_edge.source_skpe

    # 2-edge paths: input -> product -> use of the product
    all_paths.extend(
        [first_edge, second_edge]
        for second_edge in _use_edges(first_item).itertuples()
    )

    # 3-edge paths: input -> product -> thing containing the product -> its use
    for second_edge in _containing_edges(first_item).itertuples():
        # The containing entity sits on the source side of a `has_part` edge
        # and on the target side of a `part_of` edge.
        if second_edge.relation == 'has_part':
            second_item = second_edge.source_skpe
        else:
            second_item = second_edge.target_skpe

        all_paths.extend(
            [first_edge, second_edge, third_edge]
            for third_edge in _use_edges(second_item).itertuples()
        )

# Print every path edge by edge, with a separator line after each path.
for path in all_paths:
    for edge in path:
        st.write(f"{edge.source_en} --{edge.relation}--> {edge.target_en} | source: {edge.source}")
    st.write("------")