# Hugging Face Spaces page status (scrape residue, not part of the app): Sleeping
import os

import streamlit as st
from datasets import load_dataset
# Hugging Face access token read from the environment; None when the variable
# is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Page chrome: wide layout, title and a short description of how the clusters
# shown below were produced.
st.set_page_config(page_title="FW Clusters inspection", layout="wide")
st.title("FW clusters inspection (on AFAIK topics)")
st.markdown("""
We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering).
Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material.
Additionally, the model was tasked with assigning a category to each cluster from 23 predefined categories found in [AFAIK](https://afaik.io/).
Sometimes, the model may define its own category. This can happen either within the context of AFAIK topics or separately. Hence the `Select Category Type` dropdown in our interface.
""")
def load_data(educational_topic):
    """Load the cluster dataset, optionally restricted by the educational flag.

    Args:
        educational_topic: ``'Yes'`` or ``'No'`` to keep only the rows whose
            ``is_topic_educational`` field equals that value. Any other value
            leaves the dataset unfiltered.

    Returns:
        The ``train`` split of ``HuggingFaceTB/FW_clusters_under_afaik_topics``,
        filtered as described above.
    """
    ds = load_dataset(
        "HuggingFaceTB/FW_clusters_under_afaik_topics",
        split="train",
        token=HF_TOKEN,
        num_proc=2,
    )
    if educational_topic in ('Yes', 'No'):
        ds = ds.filter(lambda x: x['is_topic_educational'] == educational_topic)
    # Returned at function level so unrecognised values still yield the
    # full dataset instead of None.
    return ds
def get_categories_by_type(_ds, category_type):
    """Return the distinct categories of the rows with the given category type.

    Args:
        _ds: Dataset-like object exposing ``filter(predicate)`` and column
            access through ``_ds['category']``.
        category_type: Value the ``category_type`` field of a row must equal.

    Returns:
        A list of the unique ``category`` values among the matching rows, in
        sorted order so the resulting dropdown options are stable between runs.
        The list is empty when no row matches.
    """
    filtered_ds = _ds.filter(lambda x: x['category_type'] == category_type)
    # key=str keeps the sort well-defined even if a category is not a string
    # (e.g. a missing value stored as None).
    return sorted(set(filtered_ds['category']), key=str)
st.subheader("Cluster information")

# Three side-by-side selectors: educational flag -> category type -> category.
col_1, col_2, col_3 = st.columns(3)
with col_1:
    educational_topic = st.selectbox(
        'Are the topics deemed educational by the LLM?', ["Yes", "No"]
    )
    ds = load_data(educational_topic)

with col_2:
    category_types = ['afaik', 'defined_by_llm', 'defined_by_llm_under_afaik']
    # Preselect 'afaik' for educational topics and 'defined_by_llm' otherwise.
    default_index = 0 if educational_topic == "Yes" else 1
    selected_category_type = st.selectbox(
        "Select Category Type", category_types, index=default_index
    )

with col_3:
    categories = get_categories_by_type(ds, selected_category_type)
    selected_category = st.selectbox("Select Category", categories)

# NOTE(review): clusters are matched on category only, not on the selected
# category type - confirm that category names never repeat across types.
selected_cluster = ds.filter(lambda x: x['category'] == selected_category)

# Select sample index
n_samples = len(selected_cluster)
if n_samples > 0:
    col_1, col_2 = st.columns(2)
    with col_1:
        index_cluster = st.number_input(
            f"Found {n_samples} clusters, choose one",
            min_value=0,
            max_value=n_samples - 1,
            value=0,
            step=1,
        )
    files = selected_cluster[index_cluster]["examples"]
    if files:
        with col_2:
            index_example = st.number_input(
                f"Found {len(files)} files in the cluster, choose one",
                min_value=0,
                max_value=len(files) - 1,
                value=0,
                step=1,
            )
        st.markdown(files[index_example])
    else:
        # A cluster without examples would give max_value=-1 < min_value=0,
        # which number_input rejects; show the fallback message instead.
        st.markdown("No files found, change the cluster.")
else:
    st.markdown("No files found, change the cluster.")