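"""Streamlit demo: link a user-entered entity name to technical Wikipedia
articles, then enumerate 2- and 3-hop knowledge-graph paths of the form
made_from_material -> (has_use | uses), optionally via has_part / part_of.

Run with: streamlit run <this file>
(the ../knowledge_platform CSV paths assume the original repo layout)
"""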
import pandas as pd
from utils import normalize_text
import streamlit as st
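# normalize_text is a local helper (utils.py); it is assumed to apply the
# same normalization that was used to build the entity-linking table below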
# load wikipedia data
wiki_df = pd.read_csv("../knowledge_platform/wiki_output/kensho_en_wiki_typing_technical.csv")
# drop excluded and non-technical articles; keep skpe ids for the rest
exclude_ids = set(wiki_df[(wiki_df.exclude == True) | (wiki_df.technical == False)].page_id.to_list())
include_skpes = set(wiki_df[~wiki_df.page_id.isin(exclude_ids)].skpe_id.to_list())
wiki_df = wiki_df.drop(columns=['Unnamed: 0', 'en_probs', 'exclude'])
wiki_df = wiki_df.rename(columns={'title_x': 'en_title'})
# load wikidata KG edges
wikidata_df = pd.read_csv("../knowledge_platform/kg_data/wikidata_ss_processed.csv")
# keep only edges whose endpoints are both technical articles
wikidata_df = wikidata_df[wikidata_df.source_skpe.isin(include_skpes) & wikidata_df.target_skpe.isin(include_skpes)]
"""KG Infer data"""
rebel_infer_df = pd.read_csv("../knowledge_platform/kg_data/rebel_inference_processed_ss.csv")
# keep rows where both endpoints resolved to skpe ids (drop NaN endpoints)
rebel_infer_df = rebel_infer_df[rebel_infer_df.apply(lambda x: isinstance(x.source_skpe_id, str) and isinstance(x.target_skpe_id, str), axis=1)]
rebel_infer_df = rebel_infer_df.drop(columns=['instance_id', 'source_text', 'target_text'])
rebel_infer_df = rebel_infer_df.rename(columns={'source_skpe_id': 'source_skpe', 'target_skpe_id': 'target_skpe', 'source': 'source_en', 'target': 'target_en'})
wikidata_df['source'] = 'wikidata'
rebel_infer_df['source'] = 'rebel_wikipedia'
# drop self-loops
rebel_infer_df = rebel_infer_df[rebel_infer_df.source_skpe != rebel_infer_df.target_skpe]
kg_df = pd.concat([wikidata_df, rebel_infer_df])
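# kg_df columns used below: source_skpe, target_skpe, relation,
# source_en, target_en, and source (provenance: wikidata / rebel_wikipedia)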
# load entity linking dictionary
linking_df = pd.read_csv('./linking_df_technical_min.csv')
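# linking_df maps normalized surface text to entity ids
# (columns used below: text, skpe_id)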
# User Input
input_text = st.text_input(
    label="Enter first entity name",
    value="semiconductor",
    key="ent",
)
# normalise and match
text_norm = normalize_text(input_text)
match_df = linking_df[linking_df.text == text_norm]
# disambiguate: the most frequent skpe id for this surface form wins
if len(match_df) > 0:
    top_skpe = match_df.skpe_id.mode()[0]
    all_skpe = set(match_df.skpe_id.to_list())
    skpe_to_count = dict(match_df.skpe_id.value_counts())
    # candidate articles, scored by how often the text links to each skpe id
    wiki_match_df = wiki_df[wiki_df.skpe_id.isin(all_skpe)].copy()
    wiki_match_df['link_score'] = wiki_match_df['skpe_id'].apply(lambda x: skpe_to_count[x] / sum(skpe_to_count.values()))
    wiki_match_df = wiki_match_df.sort_values(by='link_score', ascending=False)
else:
    st.write("no matches")
    st.stop()  # halt this run: nothing below can work without a match
# show the 5 most-viewed matches (bare expression rendered by Streamlit magic)
wiki_match_df.sort_values(by='views', ascending=False)[:5]
# things made from the input material (input is the target of made_from_material)
made_of_df = kg_df[(kg_df.relation == 'made_from_material') & (kg_df.target_skpe == top_skpe)].copy()
all_paths = []
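# Two path shapes are collected below, all drawn from kg_df:
#   length 2: X --made_from_material--> input, then a has_use/uses edge of X
#   length 3: same first hop, then a product X is part of, then that product's use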
# iterate over first-hop edges
for first_edge in made_of_df.itertuples():
    first_item = first_edge.source_skpe
    # uses of the first item (has_use out-edge or uses in-edge)
    use_df = kg_df[((kg_df.relation == 'has_use') & (kg_df.source_skpe == first_item)) | ((kg_df.relation == 'uses') & (kg_df.target_skpe == first_item))]
    # add all length-2 paths
    for second_edge in use_df.itertuples():
        all_paths.append([first_edge, second_edge])
    # expand via part-of: products the first item is a part of
    part_df = kg_df[((kg_df.relation == 'has_part') & (kg_df.target_skpe == first_item)) | ((kg_df.relation == 'part_of') & (kg_df.source_skpe == first_item))]
    # iterate over all products containing the first item
    for second_edge in part_df.itertuples():
        # the product is the other endpoint of the part edge
        second_item = second_edge.source_skpe if second_edge.relation == 'has_part' else second_edge.target_skpe
        # uses of that product
        use_df = kg_df[((kg_df.relation == 'has_use') & (kg_df.source_skpe == second_item)) | ((kg_df.relation == 'uses') & (kg_df.target_skpe == second_item))]
        # add all length-3 paths
        for third_edge in use_df.itertuples():
            all_paths.append([first_edge, second_edge, third_edge])
# render each path as a chain of edges
for path in all_paths:
    for edge in path:
        st.write(f"{edge.source_en} --{edge.relation}--> {edge.target_en} | source: {edge.source}")
    st.write("------")