asoria's picture
asoria HF staff
Open in spaces
64583bd
raw
history blame
21.9 kB
import spaces
import gradio as gr
import logging
import os
import datamapplot
import duckdb
import numpy as np
import requests
from dotenv import load_dotenv
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.representation import TextGeneration
from cuml.manifold import UMAP
from cuml.cluster import HDBSCAN
from huggingface_hub import HfApi, SpaceCard
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from prompts import REPRESENTATION_PROMPT
from torch import cuda, bfloat16
from transformers import (
BitsAndBytesConfig,
AutoTokenizer,
AutoModelForCausalLM,
pipeline,
)
"""
TODOs:
- Improve representation layer (Try with llamacpp or TextGeneration)
- Make it run on Zero GPU
- Try with more rows (Current: 50_000/10_000 -> Minimal Targett: 1_000_000/20_000)
- Export interactive plots and serve their HTML content (It doesn't work with gr.HTML)
"""
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
EXPORTS_REPOSITORY = os.getenv("EXPORTS_REPOSITORY")
assert (
EXPORTS_REPOSITORY is not None
), "You need to set EXPORTS_REPOSITORY in your environment variables"
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
MAX_ROWS = 50_000
CHUNK_SIZE = 10_000
api = HfApi(token=HF_TOKEN)
session = requests.Session()
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
# Representation model
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
bnb_4bit_compute_dtype=bfloat16,
)
model_id = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
model_id,
trust_remote_code=True,
quantization_config=bnb_config,
device_map="auto",
)
model.eval()
generator = pipeline(
model=model,
tokenizer=tokenizer,
task="text-generation",
temperature=0.1,
max_new_tokens=500,
repetition_penalty=1.1,
)
representation_model = TextGeneration(generator, prompt=REPRESENTATION_PROMPT)
# End of representation model
vectorizer_model = CountVectorizer(stop_words="english")
def get_split_rows(dataset, config, split):
config_size = session.get(
f"https://datasets-server.huggingface.co/size?dataset={dataset}&config={config}",
timeout=20,
).json()
if "error" in config_size:
raise Exception(f"Error fetching config size: {config_size['error']}")
split_size = next(
(s for s in config_size["size"]["splits"] if s["split"] == split),
None,
)
if split_size is None:
raise Exception(f"Error fetching split {split} in config {config}")
return split_size["num_rows"]
def get_parquet_urls(dataset, config, split):
parquet_files = session.get(
f"https://datasets-server.huggingface.co/parquet?dataset={dataset}&config={config}&split={split}",
timeout=20,
).json()
if "error" in parquet_files:
raise Exception(f"Error fetching parquet files: {parquet_files['error']}")
parquet_urls = [file["url"] for file in parquet_files["parquet_files"]]
logging.debug(f"Parquet files: {parquet_urls}")
return ",".join(f"'{url}'" for url in parquet_urls)
def get_docs_from_parquet(parquet_urls, column, offset, limit):
SQL_QUERY = f"SELECT {column} FROM read_parquet([{parquet_urls}]) LIMIT {limit} OFFSET {offset};"
df = duckdb.sql(SQL_QUERY).to_df()
logging.debug(f"Dataframe: {df.head(5)}")
return df[column].tolist()
# @spaces.GPU
def calculate_embeddings(docs):
return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
def calculate_n_neighbors_and_components(n_rows):
n_neighbors = min(max(n_rows // 20, 15), 100)
n_components = 10 if n_rows > 1000 else 5 # Higher components for larger datasets
return n_neighbors, n_components
# @spaces.GPU
def fit_model(docs, embeddings, n_neighbors, n_components):
umap_model = UMAP(
n_neighbors=n_neighbors,
n_components=n_components,
min_dist=0.0,
metric="cosine",
random_state=42,
)
hdbscan_model = HDBSCAN(
min_cluster_size=max(
5, n_neighbors // 2
), # Reducing min_cluster_size for fewer outliers
metric="euclidean",
cluster_selection_method="eom",
prediction_data=True,
)
new_model = BERTopic(
language="english",
# Sub-models
embedding_model=sentence_model, # Step 1 - Extract embeddings
umap_model=umap_model, # Step 2 - UMAP model
hdbscan_model=hdbscan_model, # Step 3 - Cluster reduced embeddings
vectorizer_model=vectorizer_model, # Step 4 - Tokenize topics
representation_model=representation_model, # Step 5 - Label topics
# Hyperparameters
top_n_words=10,
verbose=True,
min_topic_size=n_neighbors, # Coherent with n_neighbors?
)
logging.info("Fitting new model")
new_model.fit(docs, embeddings)
logging.info("End fitting new model")
return new_model
def _push_to_hub(
dataset_id,
file_path,
):
logging.info(f"Pushing file to hub: {dataset_id} on file {file_path}")
file_name = file_path.split("/")[-1]
try:
logging.info(f"About to push {file_path} - {dataset_id}")
api.upload_file(
path_or_fileobj=file_path,
path_in_repo=file_name,
repo_id=EXPORTS_REPOSITORY,
repo_type="dataset",
)
except Exception as e:
logging.info("Failed to push file", e)
raise
def create_space_with_content(dataset_id, html_file_path):
# TODO: Parameterize organization name
repo_id = f"datasets-topics/{dataset_id.replace('/', '-')}"
logging.info(f"Creating space with content: {repo_id} on file {html_file_path}")
api.create_repo(
repo_id=repo_id,
repo_type="space",
private=False,
exist_ok=True,
token=HF_TOKEN,
space_sdk="static",
)
SPACE_REPO_CARD_CONTENT = """
---
title: {dataset_id} topic modeling
sdk: static
pinned: false
datasets:
- {dataset_id}
---
"""
SpaceCard(
content=SPACE_REPO_CARD_CONTENT.format(dataset_id=dataset_id)
).push_to_hub(repo_id=repo_id, repo_type="space", token=HF_TOKEN)
api.upload_file(
path_or_fileobj=html_file_path,
path_in_repo="index.html",
repo_type="space",
repo_id=repo_id,
token=HF_TOKEN,
)
logging.info(f"Space created done")
return repo_id
def generate_topics(dataset, config, split, column, nested_column, plot_type):
logging.info(
f"Generating topics for {dataset} with config {config} {split} {column} {nested_column}"
)
parquet_urls = get_parquet_urls(dataset, config, split)
split_rows = get_split_rows(dataset, config, split)
logging.info(f"Split rows: {split_rows}")
limit = min(split_rows, MAX_ROWS)
n_neighbors, n_components = calculate_n_neighbors_and_components(limit)
reduce_umap_model = UMAP(
n_neighbors=n_neighbors,
n_components=2, # For visualization, keeping it for 2D
min_dist=0.0,
metric="cosine",
random_state=42,
)
offset = 0
rows_processed = 0
base_model = None
all_docs = []
reduced_embeddings_list = []
topics_info, topic_plot = None, None
full_processing = split_rows <= MAX_ROWS
message = (
f"⚙️ Processing full dataset: 0 of ({split_rows} rows)"
if full_processing
else f"⚙️ Processing partial dataset 0 of ({limit} rows)"
)
yield (
gr.Accordion(open=False),
gr.DataFrame(value=[], interactive=False, visible=True),
gr.Plot(value=None, visible=True),
gr.Label({message: rows_processed / limit}, visible=True),
"",
"",
)
while offset < limit:
docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
if not docs:
break
logging.info(
f"----> Processing chunk: {offset=} {CHUNK_SIZE=} with {len(docs)} docs"
)
embeddings = calculate_embeddings(docs)
new_model = fit_model(docs, embeddings, n_neighbors, n_components)
if base_model is None:
base_model = new_model
else:
updated_model = BERTopic.merge_models([base_model, new_model])
nr_new_topics = len(set(updated_model.topics_)) - len(
set(base_model.topics_)
)
new_topics = list(updated_model.topic_labels_.values())[-nr_new_topics:]
logging.info(f"The following topics are newly found: {new_topics}")
base_model = updated_model
reduced_embeddings = reduce_umap_model.fit_transform(embeddings)
reduced_embeddings_list.append(reduced_embeddings)
all_docs.extend(docs)
reduced_embeddings_array = np.vstack(reduced_embeddings_list)
topics_info = base_model.get_topic_info()
all_topics, _ = base_model.transform(all_docs)
all_topics = np.array(all_topics)
topic_plot = (
base_model.visualize_document_datamap(
docs=all_docs,
reduced_embeddings=reduced_embeddings_array,
title=dataset,
font_family="Montserrat Thin",
width=800,
height=700,
arrowprops={
"arrowstyle": "wedge,tail_width=0.5",
"connectionstyle": "arc3,rad=0.05",
"linewidth": 0,
"fc": "#33333377",
},
# TODO: Make it configurable in UI
dynamic_label_size=False,
# label_wrap_width=12,
# label_over_points=True,
# dynamic_label_size=True,
# max_font_size=36,
# min_font_size=4,
)
if plot_type == "DataMapPlot"
else base_model.visualize_documents(
docs=all_docs,
reduced_embeddings=reduced_embeddings_array,
custom_labels=True,
title=dataset,
font_family="Montserrat Thin",
)
)
rows_processed += len(docs)
progress = min(rows_processed / limit, 1.0)
logging.info(f"Progress: {progress} % - {rows_processed} of {limit}")
message = (
f"⚙️ Processing full dataset: {rows_processed} of {limit}"
if full_processing
else f"⚙️ Processing partial dataset: {rows_processed} of {limit} rows"
)
yield (
gr.Accordion(open=False),
topics_info,
topic_plot,
gr.Label({message: progress}, visible=True),
"",
"",
)
offset += CHUNK_SIZE
logging.info("Finished processing all data")
plot_png = f"{dataset.replace('/', '-')}-{plot_type.lower()}.png"
if plot_type == "DataMapPlot":
topic_plot.savefig(plot_png, format="png", dpi=300)
else:
topic_plot.write_image(plot_png)
_push_to_hub(dataset, plot_png)
all_topics, _ = base_model.transform(all_docs)
topic_info = base_model.get_topic_info()
topic_names = {row["Topic"]: row["Name"] for index, row in topic_info.iterrows()}
topic_names_array = np.array(
[
topic_names.get(topic, "No Topic").split("_")[1].strip("-")
for topic in all_topics
]
)
dataset_clear_name = dataset.replace("/", "-")
interactive_plot = datamapplot.create_interactive_plot(
reduced_embeddings_array,
topic_names_array,
hover_text=all_docs,
title=dataset,
enable_search=True,
font_family="Montserrat Thin",
# TODO: Export data to .arrow and also serve it
inline_data=True,
# offline_data_prefix=dataset_clear_name,
initial_zoom_fraction=0.9,
)
html_content = str(interactive_plot)
html_file_path = f"{dataset_clear_name}.html"
with open(html_file_path, "w", encoding="utf-8") as html_file:
html_file.write(html_content)
space_id = create_space_with_content(dataset, html_file_path)
plot_png_link = (
f"https://huggingface.co/datasets/{EXPORTS_REPOSITORY}/blob/main/{plot_png}"
)
space_link = f"https://huggingface.co/spaces/{space_id}"
yield (
gr.Accordion(open=False),
topics_info,
topic_plot,
gr.Label(
{f"✅ Done: {rows_processed} rows have been processed": 1.0}, visible=True
),
f"[![Download as PNG](https://img.shields.io/badge/Download_as-PNG-red)]({plot_png_link})",
f"[![Go to interactive plot](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Space-blue)]({space_link})",
)
cuda.empty_cache()
with gr.Blocks() as demo:
gr.Markdown("# 💠 Dataset Topic Discovery 🔭")
gr.Markdown("## Select dataset and text column")
data_details_accordion = gr.Accordion("Data details", open=True)
with data_details_accordion:
with gr.Row():
with gr.Column(scale=3):
dataset_name = HuggingfaceHubSearch(
label="Hub Dataset ID",
placeholder="Search for dataset id on Huggingface",
search_type="dataset",
)
subset_dropdown = gr.Dropdown(label="Subset", visible=False)
split_dropdown = gr.Dropdown(label="Split", visible=False)
with gr.Accordion("Dataset preview", open=False):
@gr.render(inputs=[dataset_name, subset_dropdown, split_dropdown])
def embed(name, subset, split):
html_code = f"""
<iframe
src="https://huggingface.co/datasets/{name}/embed/viewer/{subset}/{split}"
frameborder="0"
width="100%"
height="600px"
></iframe>
"""
return gr.HTML(value=html_code)
with gr.Row():
text_column_dropdown = gr.Dropdown(label="Text column name")
nested_text_column_dropdown = gr.Dropdown(
label="Nested text column name", visible=False
)
plot_type_radio = gr.Radio(
["DataMapPlot", "Plotly"],
value="DataMapPlot",
label="Choose the plot type",
interactive=True,
)
generate_button = gr.Button("Generate Topics", variant="primary")
gr.Markdown("## Data map")
full_topics_generation_label = gr.Label(visible=False, show_label=False)
with gr.Row():
open_png_label = gr.Markdown()
open_space_label = gr.Markdown()
topics_plot = gr.Plot()
with gr.Accordion("Topics Info", open=False):
topics_df = gr.DataFrame(interactive=False, visible=True)
generate_button.click(
generate_topics,
inputs=[
dataset_name,
subset_dropdown,
split_dropdown,
text_column_dropdown,
nested_text_column_dropdown,
plot_type_radio,
],
outputs=[
data_details_accordion,
topics_df,
topics_plot,
full_topics_generation_label,
open_png_label,
open_space_label,
],
)
def _resolve_dataset_selection(
dataset: str, default_subset: str, default_split: str, text_feature
):
if "/" not in dataset.strip().strip("/"):
return {
subset_dropdown: gr.Dropdown(visible=False),
split_dropdown: gr.Dropdown(visible=False),
text_column_dropdown: gr.Dropdown(label="Text column name"),
nested_text_column_dropdown: gr.Dropdown(visible=False),
}
info_resp = session.get(
f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=20
).json()
if "error" in info_resp:
return {
subset_dropdown: gr.Dropdown(visible=False),
split_dropdown: gr.Dropdown(visible=False),
text_column_dropdown: gr.Dropdown(label="Text column name"),
nested_text_column_dropdown: gr.Dropdown(visible=False),
}
subsets: list[str] = list(info_resp["dataset_info"])
subset = default_subset if default_subset in subsets else subsets[0]
splits: list[str] = list(info_resp["dataset_info"][subset]["splits"])
split = default_split if default_split in splits else splits[0]
features = info_resp["dataset_info"][subset]["features"]
def _is_string_feature(feature):
return isinstance(feature, dict) and feature.get("dtype") == "string"
text_features = [
feature_name
for feature_name, feature in features.items()
if _is_string_feature(feature)
]
nested_features = [
feature_name
for feature_name, feature in features.items()
if isinstance(feature, dict)
and isinstance(next(iter(feature.values())), dict)
]
nested_text_features = [
feature_name
for feature_name in nested_features
if any(
_is_string_feature(nested_feature)
for nested_feature in features[feature_name].values()
)
]
if not text_feature:
return {
subset_dropdown: gr.Dropdown(
value=subset, choices=subsets, visible=len(subsets) > 1
),
split_dropdown: gr.Dropdown(
value=split, choices=splits, visible=len(splits) > 1
),
text_column_dropdown: gr.Dropdown(
choices=text_features + nested_text_features,
label="Text column name",
),
nested_text_column_dropdown: gr.Dropdown(visible=False),
}
if text_feature in nested_text_features:
nested_keys = [
feature_name
for feature_name, feature in features[text_feature].items()
if _is_string_feature(feature)
]
return {
subset_dropdown: gr.Dropdown(
value=subset, choices=subsets, visible=len(subsets) > 1
),
split_dropdown: gr.Dropdown(
value=split, choices=splits, visible=len(splits) > 1
),
text_column_dropdown: gr.Dropdown(
choices=text_features + nested_text_features,
label="Text column name",
),
nested_text_column_dropdown: gr.Dropdown(
value=nested_keys[0],
choices=nested_keys,
label="Nested text column name",
visible=True,
),
}
return {
subset_dropdown: gr.Dropdown(
value=subset, choices=subsets, visible=len(subsets) > 1
),
split_dropdown: gr.Dropdown(
value=split, choices=splits, visible=len(splits) > 1
),
text_column_dropdown: gr.Dropdown(
choices=text_features + nested_text_features, label="Text column name"
),
nested_text_column_dropdown: gr.Dropdown(visible=False),
}
@dataset_name.change(
inputs=[dataset_name],
outputs=[
subset_dropdown,
split_dropdown,
text_column_dropdown,
nested_text_column_dropdown,
],
)
def show_input_from_subset_dropdown(dataset: str) -> dict:
return _resolve_dataset_selection(
dataset, default_subset="default", default_split="train", text_feature=None
)
@subset_dropdown.change(
inputs=[dataset_name, subset_dropdown],
outputs=[
subset_dropdown,
split_dropdown,
text_column_dropdown,
nested_text_column_dropdown,
],
)
def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
return _resolve_dataset_selection(
dataset, default_subset=subset, default_split="train", text_feature=None
)
@split_dropdown.change(
inputs=[dataset_name, subset_dropdown, split_dropdown],
outputs=[
subset_dropdown,
split_dropdown,
text_column_dropdown,
nested_text_column_dropdown,
],
)
def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
return _resolve_dataset_selection(
dataset, default_subset=subset, default_split=split, text_feature=None
)
@text_column_dropdown.change(
inputs=[dataset_name, subset_dropdown, split_dropdown, text_column_dropdown],
outputs=[
subset_dropdown,
split_dropdown,
text_column_dropdown,
nested_text_column_dropdown,
],
)
def show_input_from_text_column_dropdown(
dataset: str, subset: str, split: str, text_column
) -> dict:
return _resolve_dataset_selection(
dataset,
default_subset=subset,
default_split=split,
text_feature=text_column,
)
demo.launch()