Spaces:
Running
Running
##### Imports ##### | |
import graphviz | |
import streamlit as st | |
import pandas as pd | |
# import numpy as np | |
# custom component | |
from st_keyup import st_keyup | |
##### Data Functions ##### | |
# What does persist = disk do ? | |
# @st.cache_data(persist="disk") | |
### For competitor analysis | |
def load_pandas_xlsx(path): | |
data = pd.read_excel(path) | |
return data | |
def build_company_df(input_df): | |
# build company df | |
output_df = input_df[['companyLabel', 'companyLabelJA', 'company']].drop_duplicates() | |
return output_df | |
### For industry analysis | |
def build_industry_df(input_df): | |
# Pre compute unique number of companies per industry | |
output_df = input_df[['company', 'companyLabel', 'companyLabelJA', 'industry', 'industryLabel', 'industryLabelJA']].drop_duplicates().groupby(['industry', 'industryLabel', 'industryLabelJA'])['company'].count().sort_values(ascending=False).reset_index().copy() | |
output_df = output_df.rename(columns={'company': 'n_competitors'}) | |
return output_df | |
### For product analysis | |
def build_product_df(input_df): | |
# Pre compute unique number of companies per product | |
output_df = input_df[['company', 'companyLabel', 'companyLabelJA', 'product', 'productLabel', 'productLabelJA']].drop_duplicates().groupby(['product', 'productLabel', 'productLabelJA'])['company'].count().sort_values(ascending=False).reset_index().copy() | |
output_df = output_df.rename(columns={'company': 'n_competitors'}) | |
return output_df | |
### For customer analysis | |
def build_company_product_kg(company_product_path, product_manufacturer_path): | |
company_product_df = pd.read_csv(company_product_path) | |
product_manufacturer_df = pd.read_csv(product_manufacturer_path) | |
output_df = pd.concat([company_product_df, product_manufacturer_df]) | |
return output_df | |
def build_company_df_2(input_df): | |
# build company df | |
output_df = input_df[['companyLabel', 'companyLabelJA', 'company']].drop_duplicates() | |
return output_df | |
property_mapping = { | |
'http://www.wikidata.org/prop/direct/P186': 'made_from_material', | |
'http://www.wikidata.org/prop/direct/P527': 'has_part', | |
'http://www.wikidata.org/prop/direct/P2283': 'uses', | |
'http://www.wikidata.org/prop/direct/P31': 'instance_of', | |
'http://www.wikidata.org/prop/direct/P366': 'has_use', | |
'http://www.wikidata.org/prop/direct/P361': 'part_of' | |
} | |
def build_product_kg_df(): | |
product_df_1 = pd.read_csv('data/product_manufacturer_relations_haspart_uses_madefrom_out.csv') | |
product_df_2 = pd.read_csv('data/product_relations_haspart_uses_madefrom_out.csv') | |
product_df_3 = pd.read_csv('data/product_relations_partof_hasuse_out.csv') | |
product_df_4 = pd.read_csv('data/product_manufacturer_relations_hasuse_partof_out.csv') | |
product_kg_df = pd.concat([product_df_1, product_df_2, product_df_3, product_df_4]).drop_duplicates() | |
product_kg_df['propertyLabel'] = product_kg_df.propertyLabel.apply(lambda x: property_mapping[x]) | |
return product_kg_df | |
def build_product_instance_df(): | |
product_instance_df_1 = pd.read_csv('data/product_relations_instance_out.csv') | |
product_instance_df_2 = pd.read_csv('data/product_manufacturer_relations_instance_out.csv') | |
product_instance_df = pd.concat([product_instance_df_1, product_instance_df_2]).drop_duplicates() | |
product_instance_df['propertyLabel'] = product_instance_df.propertyLabel.apply(lambda x: property_mapping[x]) | |
return product_instance_df | |
### For searching | |
def search_df(inp, df, col): | |
mask = df[col].str.contains(inp, case=False, regex=False) | |
select_df = df[mask] | |
return select_df | |
##### Data Logic ##### | |
# For competitor and industry analysis | |
COMPETITOR_PATH = 'data/merged_competitors_all_20241122.xlsx' | |
INDUSTRY_PATH = 'data/industry_hierarchy_20241125.xlsx' | |
# For customer analysis | |
COMPANY_PRODUCT_PATH = 'data/company_product_pairs.csv' | |
PRODUCT_MANUFACTURER_PATH = 'data/product_manufacturer_pair.csv' | |
# Load data | |
with st.spinner(text="Loading competitor data ..."): | |
competitor_df = load_pandas_xlsx(COMPETITOR_PATH) | |
st.success("Competitor Data Loaded!") | |
# Load data | |
with st.spinner(text="Build company df ..."): | |
company_df = build_company_df(competitor_df) | |
st.success("Company Data Loaded!") | |
industry_hierarchy = load_pandas_xlsx(INDUSTRY_PATH) | |
# industry_data = pd.read_excel('data/industry_hierarchy_20241125.xlsx') | |
### Pre computation Steps ### | |
# Pre compute unique number of companies per industry | |
industry_to_counts = build_industry_df(competitor_df) | |
# Pre compute unique number of companies per industry | |
product_to_counts = build_product_df(competitor_df) | |
### end ### | |
# Title | |
st.title('3C Competitor / Customer Analysis Demo') | |
option = st.selectbox( | |
"Analysis Mode", | |
("Customer Analysis", "Competitor Analysis", "Industry Analysis", "Product Analysis") | |
) | |
st.write("You selected:", option) | |
##### App Logic ##### | |
if option == "Competitor Analysis": | |
st.title("Searching by Company") | |
# Get input | |
inp = st_keyup("Enter a company name", value="amazon", key="0", debounce=500) | |
# Perform search | |
select_df = search_df(inp, company_df, 'companyLabel') | |
# def show_data(): | |
# select_value = st.session_state.value | |
# row_id=select_value['selection']['rows'][0] | |
# st.write(company_df.iloc[row_id]) | |
# Show search results | |
with st.status("Searching ...", state="running", expanded=False) as status: | |
status.update(label=f"{len(select_df)} results found", state="complete", expanded=True) | |
### Selection for Company ### | |
st.dataframe(select_df, on_select="rerun", key="value", selection_mode="single-row") | |
# Expand if company is selected | |
select_value = st.session_state.value | |
if len(select_value['selection']['rows']) > 0: | |
st.title("Company Data") | |
row_id = select_value['selection']['rows'][0] | |
row = select_df.iloc[row_id] | |
entries = competitor_df[competitor_df.company == row.company] | |
st.write(f"Company Name: {row.companyLabel}") | |
st.write(f"Japanese Name: {row.companyLabelJA}") | |
st.write(f"Wikidata URL: {row.company}") | |
st.write(f"Cities where Offices Located In: {list(entries.headquartersLabel.unique())}") | |
st.write(f"Countries Operated in: {list(entries.countryLabel.unique())}") | |
st.write(f"Industries belongs to: {list(set(list(entries.industryLabel.unique()) + list(entries.industryLabelJA.unique())))}") | |
st.write(f"Products or Services Provided: {list(set(list(entries.productLabel.unique()) + list(entries.productLabelJA.unique())))}") | |
st.title("Competitive Analysis by Industry") | |
# new: select from pre-computed list | |
associated_industries = set(list(entries.industry.unique())) | |
wikidata_industries = industry_to_counts[industry_to_counts.industry.apply(lambda x: x in associated_industries)] | |
# old: direct computation | |
# wikidata_industries = entries[['industry', 'industryLabel', 'industryLabelJA']].drop_duplicates().copy() | |
### selection for industry ### | |
st.dataframe(wikidata_industries, on_select="rerun", key="industry", selection_mode="single-row") | |
select_industry = st.session_state.industry | |
# expand if industry if selected | |
if len(select_industry['selection']['rows']) > 0: | |
industry_id = select_industry['selection']['rows'][0] | |
industry = wikidata_industries.iloc[industry_id] | |
st.title(f"All Competitors for {industry.industryLabel}") | |
competitors = competitor_df[competitor_df.industry == industry.industry][['companyLabel', 'companyLabelJA', 'company', 'country', 'countryLabel']].drop_duplicates().copy() | |
st.dataframe(competitors) | |
# print("------") | |
st.title("Analysis by country") | |
competitors['countryLabel'] = competitors.countryLabel.fillna('undefined') | |
competitors['country'] = competitors.country.fillna('undefined') | |
# country_counts = competitors.groupby(['country', 'countryLabel'])['companyLabel'].count().sort_values(ascending=False).reset_index()[:50] | |
country_counts = competitors.groupby(['countryLabel', 'country'])['companyLabel'].count().sort_values(ascending=False).reset_index()[:50] | |
country_counts = country_counts.rename(columns={'countryLabel': 'countryName', 'country': 'countryWikidata', 'companyLabel': 'countryCounts'}) | |
### selection for country ### | |
st.dataframe(country_counts, on_select="rerun", key="country", selection_mode="single-row") | |
select_country = st.session_state.country | |
# expand if country is selected | |
if len(select_country['selection']['rows']) > 0: | |
select_country_id = select_country['selection']['rows'][0] | |
select_country = country_counts.iloc[select_country_id] | |
competitors_by_country = competitors[competitors.country == select_country.countryWikidata][['companyLabel', 'companyLabelJA', 'company']].copy() | |
st.title(f"Companies with industry {industry.industryLabel} in country {select_country.countryName}") | |
st.dataframe(competitors_by_country) | |
elif option == "Industry Analysis": | |
st.title("Searching by Industry") | |
# Get input | |
industry_input = st_keyup("Enter an industry name", value="retail", key="1", debounce=500) | |
# Perform search | |
industry_select_df = search_df(industry_input, industry_to_counts, 'industryLabel') | |
# Show search results | |
with st.status("Searching ...", state="running", expanded=False) as status: | |
status.update(label=f"{len(industry_select_df)} results found", state="complete", expanded=True) | |
### Selection for Industry ### | |
st.dataframe(industry_select_df, on_select="rerun", key="industry", selection_mode="single-row") | |
select_industry = st.session_state.industry | |
# expand if industry if selected | |
if len(select_industry['selection']['rows']) > 0: | |
industry_id = select_industry['selection']['rows'][0] | |
industry = industry_select_df.iloc[industry_id] | |
st.title(f"All Competitors for {industry.industryLabel}") | |
competitors = competitor_df[competitor_df.industry == industry.industry][['companyLabel', 'companyLabelJA', 'company', 'country', 'countryLabel']].drop_duplicates().copy() | |
st.dataframe(competitors) | |
st.title(f'Industry Superclasses of {industry.industryLabel}') | |
superclasses = industry_hierarchy[industry_hierarchy.subject == industry.industry][['object', 'objectLabel', 'objectLabelJa']] | |
st.dataframe(superclasses) | |
st.title(f'Industry Subclasses of {industry.industryLabel}') | |
subclasses = industry_hierarchy[industry_hierarchy.object == industry.industry][['subject', 'subjectLabel', 'subjectLabelJa']] | |
st.dataframe(subclasses) | |
# st.title(f'Hierarchy Graph') | |
# Create a graphlib graph object | |
# graph = graphviz.Digraph() | |
# for sup in superclasses.itertuples(): | |
# graph.edge(sup.industryLabel, industry.industryLabel) | |
# for sub in subclasses.itertuples(): | |
# graph.edge(industry.industryLabel, sub.industryLabel) | |
# st.graphviz_chart(graph) | |
st.title(f'Test Network Graph') | |
# Create a graphlib graph object | |
graph = graphviz.Digraph() | |
graph.edge("run", "intr") | |
graph.edge("intr", "runbl") | |
graph.edge("runbl", "run") | |
graph.edge("run", "kernel") | |
graph.edge("kernel", "zombie") | |
graph.edge("kernel", "sleep") | |
graph.edge("kernel", "runmem") | |
graph.edge("sleep", "swap") | |
graph.edge("swap", "runswap") | |
graph.edge("runswap", "new") | |
graph.edge("runswap", "runmem") | |
graph.edge("new", "runmem") | |
graph.edge("sleep", "runmem") | |
st.graphviz_chart(graph) | |
elif option == "Product Analysis": | |
st.title("Searching by Product") | |
# Get input | |
product_input = st_keyup("Enter an product name", value="computer", key="2", debounce=500) | |
# Perform search | |
product_select_df = search_df(product_input, product_to_counts, 'productLabel') | |
# Show search results | |
with st.status("Searching ...", state="running", expanded=False) as status: | |
status.update(label=f"{len(product_select_df)} results found", state="complete", expanded=True) | |
### Selection for Product ### | |
st.dataframe(product_select_df, on_select="rerun", key="product", selection_mode="single-row") | |
select_product = st.session_state.product | |
# expand if product if selected | |
if len(select_product['selection']['rows']) > 0: | |
product_id = select_product['selection']['rows'][0] | |
product = product_select_df.iloc[product_id] | |
st.title(f"All Competitors for {product.productLabel}") | |
competitors = competitor_df[competitor_df['product'] == product['product']][['companyLabel', 'companyLabelJA', 'company', 'country', 'countryLabel']].drop_duplicates().copy() | |
st.dataframe(competitors) | |
elif option == "Customer Analysis": | |
# Load data | |
with st.spinner(text="Build company product knowledge graph ..."): | |
company_product_kg_df = build_company_product_kg(COMPANY_PRODUCT_PATH, PRODUCT_MANUFACTURER_PATH) | |
company_df_2 = build_company_df_2(company_product_kg_df) | |
st.success("Company Product Knowledge Graph Loaded!") | |
with st.spinner(text="Build product relationship knowledge graph ..."): | |
product_kg_df = build_product_kg_df() | |
st.success("Product Relationship Knowledge Graph Loaded!") | |
with st.spinner(text="Build product instance knowledge graph ..."): | |
product_instance_df = build_product_instance_df() | |
st.success("Product Instance Knowledge Graph Loaded!") | |
### Search Start | |
st.title("Searching by Company") | |
# Get input | |
inp = st_keyup("Enter a company name", value="toshiba", key="0", debounce=500) | |
# Perform search | |
select_df = search_df(inp, company_df_2, 'companyLabel') | |
# def show_data(): | |
# select_value = st.session_state.value | |
# row_id=select_value['selection']['rows'][0] | |
# st.write(company_df.iloc[row_id]) | |
# Show search results | |
with st.status("Searching ...", state="running", expanded=False) as status: | |
status.update(label=f"{len(select_df)} results found", state="complete", expanded=True) | |
### Selection for Company ### | |
st.dataframe(select_df, on_select="rerun", key="value", selection_mode="single-row") | |
# Expand if company is selected | |
select_value = st.session_state.value | |
if len(select_value['selection']['rows']) > 0: | |
st.title("Company Data") | |
row_id = select_value['selection']['rows'][0] | |
row = select_df.iloc[row_id] | |
entries = company_product_kg_df[company_product_kg_df.company == row.company] | |
st.write(f"Company Name: {row.companyLabel}") | |
st.write(f"Japanese Name: {row.companyLabelJA}") | |
st.write(f"Wikidata URL: {row.company}") | |
# st.write(f"Products or Services Provided: {list(set(list(entries.productLabel.unique()) + list(entries.productLabelJA.unique())))}") | |
st.write(f"Products and services provided by {row.companyLabel}") | |
product_select_df = company_product_kg_df[(company_product_kg_df.company == row.company) & (company_product_kg_df.propertyLabel == 'product_or_service_provided')][['productLabel', 'productLabelJA', 'product', 'company', 'companyLabel', 'companyLabelJA']] | |
### Selection for Product ### | |
st.dataframe(product_select_df, on_select="rerun", key="product", selection_mode="single-row") | |
select_product = st.session_state.product | |
# expand if product if selected | |
if len(select_product['selection']['rows']) > 0: | |
product_id = select_product['selection']['rows'][0] | |
target_product = product_select_df.iloc[product_id] | |
# st.title(f"All Product Categories produced by {row.companyLabel}") | |
# st.dataframe(competitors) | |
# Hypothesis | |
# for incoming relations: 'uses' of 'has_part' is useful, since it lists services that have selected product as a component | |
# for outgoing relations: 'has_use' and 'part_of' is useful, since it lists services that have selected product as a component | |
####### Build kg paths | |
### Step 1 ### | |
start_df = pd.DataFrame() | |
start_df[['company_start', 'companyLabel_start', 'companyLabelJA_start', 'product_start', 'productLabel_start', 'productLabelJA_start']] = [[target_product.company, target_product.companyLabel, target_product.companyLabelJA, target_product['product'], target_product.productLabel, target_product.productLabelJA]] | |
### Step 2 ### | |
related_out_df = product_kg_df[(product_kg_df['product'] == target_product['product']) & (product_kg_df['propertyLabel'].apply(lambda x: x in ['has_use', 'part_of']))] | |
related_in_df = product_kg_df[(product_kg_df['object'] == target_product['product']) & (product_kg_df['propertyLabel'].apply(lambda x: x in ['uses', 'has_part']))] | |
path_df = pd.concat( | |
[ | |
start_df.merge(related_out_df[['product', 'object', 'objectLabel', 'objectLabelJa']], left_on='product_start', right_on='product').drop(columns=['product']).rename(columns={'object': 'product_second', 'objectLabel': 'productLabel_second', 'objectLabelJa': 'productLabelJa_second'}), | |
start_df.merge(related_in_df[['object', 'product', 'productLabel', 'productLabelJa']], left_on='product_start', right_on='object').drop(columns=['object']).rename(columns={'product': 'product_second', 'productLabel': 'productLabel_second', 'productLabelJa': 'productLabelJa_second'}), | |
] | |
) | |
# merge 1 | |
### Step 3a ### | |
path_df_1 = path_df.merge(company_product_kg_df[['company', 'companyLabel', 'companyLabelJA', 'product']], left_on='product_second', right_on='product').drop(columns=['product']) | |
### Step 3b ### | |
path_df_2 = path_df.merge(product_instance_df[['object', 'objectLabel', 'objectLabelJa', 'product']], left_on='product_second', right_on='product').drop(columns=['product']).rename(columns={'object': 'product_third', 'objectLabel': 'productLabel_third', 'objectLabelJa': 'productLabelJa_third'}) | |
path_df_2 = path_df_2.merge(company_product_kg_df[['company', 'companyLabel', 'companyLabelJA', 'product']], left_on='product_third', right_on='product').drop(columns=['product']) | |
### Step 3c ### | |
path_df_3 = path_df.merge(product_instance_df[['object', 'objectLabel', 'objectLabelJa', 'product']], left_on='product_second', right_on='product').drop(columns=['product']).rename(columns={'object': 'product_third', 'objectLabel': 'productLabel_third', 'objectLabelJa': 'productLabelJa_third'}) | |
path_df_3 = path_df_3.merge(product_instance_df[['product', 'productLabel', 'productLabelJa', 'object']], left_on='product_third', right_on='object').drop(columns=['object']).rename(columns={'product': 'product_fourth', 'productLabel': 'productLabel_fourth', 'productLabelJa': 'productLabelJa_fourth'}) | |
path_df_3 = path_df_3.merge(company_product_kg_df[['company', 'companyLabel', 'companyLabelJA', 'product']], left_on='product_fourth', right_on='product').drop(columns=['product']) | |
### Step 5 ### | |
path_df_1['length'] = 4 | |
path_df_2['length'] = 5 | |
path_df_3['length'] = 6 | |
final_path_df = pd.concat([path_df_1, path_df_2, path_df_3]) | |
final_path_df = final_path_df.reset_index(drop=True) | |
final_path_df['path_id'] = final_path_df.index | |
#final_path_df = final_path_df.set_index('path_id', drop=False) | |
final_company_df = final_path_df[['path_id', 'company', 'companyLabel', 'companyLabelJA']].copy() | |
### Step 6 ### | |
st.title(f"Potential Customers for {target_product.companyLabel} for product {target_product.productLabel}") | |
# st.dataframe(final_company_df) | |
st.dataframe(final_company_df, on_select="rerun", key="customer", selection_mode="single-row") | |
select_customer = st.session_state.customer | |
if len(select_customer['selection']['rows']) > 0: | |
customer_id = select_customer['selection']['rows'][0] | |
target_customer = final_company_df.iloc[customer_id] | |
customer_df = final_path_df[final_path_df.path_id == target_customer.path_id].iloc[0] | |
# import graphviz | |
# Create a graphlib graph object | |
graph = graphviz.Digraph() | |
graph.edge(customer_df.companyLabel_start, customer_df.productLabel_start, label=' produces') | |
graph.edge(customer_df.productLabel_start, customer_df.productLabel_second, label=' part of') | |
if customer_df.length == 4: | |
graph.edge(customer_df.productLabel_second, customer_df.companyLabel, label= ' produced by') | |
elif customer_df.length == 5: | |
graph.edge(customer_df.productLabel_second, customer_df.productLabel_third, label=' instance of') | |
graph.edge(customer_df.productLabel_third, customer_df.companyLabel, label= ' produced by') | |
if customer_df.length == 6: | |
graph.edge(customer_df.productLabel_second, customer_df.productLabel_third, label=' instance of') | |
graph.edge(customer_df.productLabel_fourth, customer_df.productLabel_third, ' instance of') | |
graph.edge(customer_df.productLabel_fourth, customer_df.companyLabel, label= ' produced by') | |
st.graphviz_chart(graph) | |
else: | |
st.write("no option selected") | |