llm-arch / pages /020_Data_Browser.py
alfraser's picture
Added page level comments to describe the purpose of each page
4f07f72
raw
history blame
6.24 kB
"""
This page allows users to browse the test data set. Mainly this is just to get a sense of the size,
content and composition of the dataset behind the project.
"""
import streamlit as st
import pandas as pd
import plotly.express as px
from src.st_helpers import st_setup
from src.datatypes import *
def show_db_selector_and_summary_in_container(container) -> None:
    """
    Renders the database picker and the headline record counts into the given container.

    The picker is only drawn when DataLoader reports more than one available database; the
    chosen entry is passed to DataLoader.set_db_name. Below it the number of categories,
    products, features and reviews is listed, followed by a button which calls
    DataLoader.load_data(True) and reruns the page.

    :param container: streamlit container to draw into
    """
    with container:
        available = DataLoader.available_dbs()
        if len(available) > 1:
            st.write('**:1234: Database Selector**')
            # Pre-select the database which is currently active
            current_position = available.index(DataLoader.active_db)
            chosen = st.selectbox("Connected to:", available, index=current_position,
                                  label_visibility="collapsed")
            DataLoader.set_db_name(chosen)

        st.write('**:1234: Summary Statistics**')
        bullet_lines = [
            f'- **{len(Category.all):,}** categories',
            f'- **{len(Product.all):,}** products',
            f'- **{len(Feature.all):,}** features',
            f'- **{len(Review.all):,}** reviews',
        ]
        st.markdown('\n'.join(bullet_lines))

        reload_requested = st.button('Force data reload')
        if reload_requested:
            DataLoader.load_data(True)
            st.rerun()
def _show_box_plot_expander(title, rows, value_label) -> None:
    """
    Draws one expander holding a per-category box plot.

    :param title: label passed unchanged to st.expander
    :param rows: list of [category name, value] pairs, one entry per product
    :param value_label: dataframe column name for the value, also used as the y axis
    """
    with st.expander(title):
        df = pd.DataFrame(rows, columns=['Category', value_label])
        fig = px.box(df, x="Category", y=value_label)
        # Rotate the category tick labels so they run vertically
        fig.update_xaxes(tickangle=-90)
        st.plotly_chart(fig, use_container_width=True)


def show_data_summary_charts_in_container(container) -> None:
    """
    Renders three expanders of summary charts into the given container:

    - Review Counts: bar chart of the total review count per category
    - Product Ratings: box plot of the products' average ratings per category
    - Product Prices: box plot of the product prices per category

    :param container: streamlit container to draw into
    """
    with container:
        cats = Category.all_sorted()

        with st.expander("**Review Counts**"):
            review_totals = [(c.name, sum(p.review_count for p in c.products)) for c in cats]
            df = pd.DataFrame(review_totals, columns=["Category", "Review Count"])
            st.bar_chart(df, x="Category", y="Review Count")

        # The two box plots differ only in title, plotted attribute and axis label
        _show_box_plot_expander("**Product Ratings**",
                                [[c.name, p.average_rating] for c in cats for p in c.products],
                                'Mean Product Rating')
        _show_box_plot_expander("**Product Prices**",
                                [[c.name, p.price] for c in cats for p in c.products],
                                'Price')
def show_top_section() -> None:
    """
    Renders the upper part of the page inside its own container: a narrow left-hand column
    with the database selector and summary numbers, and a wider right-hand column (1:3 split)
    with the summary charts.
    """
    with st.container():
        stats_col, charts_col = st.columns([1, 3])
        show_db_selector_and_summary_in_container(stats_col)
        show_data_summary_charts_in_container(charts_col)
def get_user_selected_category(container) -> Category:
    """
    Draws a radio list with one entry per category (in Category.all_sorted() order) into the
    given container and looks up the chosen entry by its name.

    :param container: streamlit container to draw the radio list into
    :return: whatever Category.by_name yields for the selected category name
    """
    with container:
        st.write('**Category**')
        names = [f"{cat.name}" for cat in Category.all_sorted()]
        # The heading above already labels the control, so the radio's own label is collapsed
        picked_name = st.radio("**Category**", names, label_visibility="collapsed")
    return Category.by_name(picked_name)
def show_category_datatable_in_container(category, container) -> None:
    """
    Displays a category of products in the given container. Lists the category's features, then
    shows the products themselves as a table and, if any products are ticked in the
    'Show Reviews?' column, displays the reviews belonging to those products as a second table.

    :param category: the category whose features, products and reviews are displayed
    :param container: streamlit container to draw into
    """
    with container:
        features = sorted(f.name for f in category.features)
        st.write(f"**{category.singular_name} Features ({len(features)}):**")
        st.write('; '.join(features))

        # Snapshot the products once so index, rows and totals are built from the same sequence
        category_products = list(category.products)
        prod_columns = ['Name', 'Price', 'Feature Count', 'Features', 'Review Count', 'Average Rating',
                        'Show Reviews?', 'Description']
        prod_index = [p.id for p in category_products]
        prod_data = [[p.name, p.price, p.feature_count, ', '.join(str(f) for f in p.features), p.review_count,
                      p.average_rating, False, p.description] for p in category_products]
        prod_df = pd.DataFrame(prod_data, index=prod_index, columns=prod_columns)
        total_reviews = sum(p.review_count for p in category_products)
        st.write(f"**{category.name} ({len(prod_index)}). Having {total_reviews} reviews in total:**")

        # Only the tick-box column is editable; every other column is read-only
        read_only_columns = tuple(c for c in prod_columns if c != 'Show Reviews?')
        edited_df = st.data_editor(prod_df, disabled=read_only_columns)

        # Coerce to bool before masking: a frame built from zero products has object-dtype
        # columns, which pandas does not treat as a boolean mask
        ticked = edited_df['Show Reviews?'].astype(bool)
        selected_product_count = ticked.sum()
        selected_review_count = edited_df[ticked]['Review Count'].sum()
        st.write(f"**{category.singular_name} Reviews ({selected_review_count} from {selected_product_count} products):**")

        if selected_product_count == 0:
            st.write("Check boxes in the table above to see reviews for products.")
        elif selected_review_count == 0:
            # Boxes are ticked, so prompting the user to tick boxes would be misleading
            st.write("The selected products do not have any reviews.")
        else:
            selected_ids = list(edited_df[ticked].index)
            rev_index = []
            rev_data = []
            for p in Product.for_ids(selected_ids):
                for r in p.reviews:
                    rev_index.append(r.id)
                    rev_data.append([p.name, r.rating, r.review_text])
            rev_columns = ['Product', 'Review Rating', 'Review Text']
            rev_df = pd.DataFrame(rev_data, index=rev_index, columns=rev_columns)
            # Deliberately oversized width, presumably so the table stretches across the column
            st.dataframe(rev_df, width=10000)
def show_bottom_section() -> None:
    """
    Renders the lower part of the page: a heading naming the selected category, a narrow
    left-hand column with the category picker and a wider right-hand column (1:3 split) with
    the product and review tables for that category.
    """
    # Create the heading slot before the columns so it is placed above them; it can only be
    # filled once the category picker has been evaluated
    heading_slot = st.container()
    picker_col, table_col = st.columns([1, 3])

    chosen_category = get_user_selected_category(picker_col)
    heading_slot.write(f'### {chosen_category.name}')
    show_category_datatable_in_container(chosen_category, table_col)
def _render_data_browser_page() -> None:
    """
    Loads the data through DataLoader if it is not flagged as loaded yet, then writes the page
    title followed by the top and bottom sections.
    """
    if not DataLoader.loaded:
        DataLoader.load_data()
    st.write("# Data Browser")
    show_top_section()
    show_bottom_section()


# The page is only drawn when st_setup returns a truthy value
if st_setup('LLM Arch'):
    _render_data_browser_page()