|
import gradio as gr |
|
import httpx |
|
from toolz import groupby |
|
import plotly.express as px |
|
import pandas as pd |
|
from functools import lru_cache |
|
|
|
# Framework/tag names suggested in the dropdown, kept in alphabetical order.
choices = [
    "art",
    "biology",
    "code",
    "distilabel",
    "fiftyone",
    "legal",
    "medical",
    "sentence-transformers",
    "synthetic",
]
choices.sort()
|
|
|
|
|
@lru_cache(maxsize=100)
def fetch_data(framework):
    """Fetch Hub datasets matching a framework/tag and group them by author.

    Results are memoized per ``framework`` value, so the returned objects are
    shared between calls and should not be mutated by callers.

    Args:
        framework: Tag or framework name sent as the ``filter`` query
            parameter of the Hub datasets API. Must be hashable (it is the
            cache key).

    Returns:
        A ``(data, grouped)`` tuple. ``data`` is the decoded JSON response
        (iterated as a sequence of records that each have an ``"author"``
        key); ``grouped`` maps each author to the list of their records,
        ordered from the author with the most records to the fewest.

    Raises:
        httpx.HTTPStatusError: If the API responds with a 4xx/5xx status.
    """
    response = httpx.get(
        "https://huggingface.co/api/datasets",
        # Pass the value through ``params`` so httpx URL-encodes it; the tag
        # may be arbitrary text and must not be spliced raw into the URL.
        params={"filter": framework},
    )
    # Fail loudly on an error response instead of grouping an error payload.
    response.raise_for_status()
    data = response.json()

    by_author = groupby(lambda record: record["author"], data)
    # Most prolific authors first.
    grouped = dict(
        sorted(by_author.items(), key=lambda item: len(item[1]), reverse=True)
    )
    return data, grouped
|
|
|
|
|
def generate_dashboard(data, grouped, framework):
    """Render a Markdown summary of datasets per author.

    Args:
        data: Unused; accepted so the signature stays compatible with
            existing callers.
        grouped: Mapping of author name to the collection of that author's
            datasets. Authors are listed in the mapping's iteration order.
        framework: Framework/tag name shown in the heading.

    Returns:
        A Markdown string with the heading, the total number of datasets and
        authors, and one bullet per author with a nested bullet giving that
        author's dataset count.
    """
    total_datasets = sum(len(datasets) for datasets in grouped.values())

    parts = [
        f"## Hugging Face datasets for {framework} \n\n",
        f"**Total number of datasets: {total_datasets}**\n\n",
        f"**Total number of authors: {len(grouped)}**\n\n",
        "### Datasets per Author\n\n",
    ]
    for author, datasets in grouped.items():
        parts.append(f"- **Author:** [{author}](https://huggingface.co/{author})\n")
        # Two spaces align the sub-bullet with the parent item's content so
        # Markdown renders it as a nested list rather than a sibling item.
        parts.append(f"  - **Number of datasets:** {len(datasets)}\n")

    return "".join(parts)
|
|
|
|
|
def _monthly_dataset_counts(data):
    """Aggregate dataset records into per-month creation counts.

    Returns a DataFrame with the columns ``month`` ("YYYY-MM" strings),
    ``count``, ``cumulative_count`` and ``growth_rate`` (fractional change of
    ``count`` versus the previous row). The frame is empty when there are no
    records or none fall before the current month.
    """
    df = pd.DataFrame(data)
    if df.empty or "createdAt" not in df.columns:
        # No records (e.g. an unknown tag): return an empty frame with the
        # expected columns instead of failing on the missing column.
        return pd.DataFrame(
            columns=["month", "count", "cumulative_count", "growth_rate"]
        )

    # Normalize to naive UTC before taking periods; converting tz-aware
    # timestamps to periods directly makes pandas warn about dropping the tz.
    created = pd.to_datetime(df["createdAt"], utc=True).dt.tz_convert(None)
    df["month"] = created.dt.to_period("M").astype(str)

    # Keep only months strictly before the current one (presumably because
    # the running month is still incomplete). "YYYY-MM" strings compare in
    # chronological order.
    current_month = pd.Period.now("M").strftime("%Y-%m")
    df = df[df["month"] < current_month]

    df_counts = df.groupby("month").size().reset_index(name="count")
    df_counts["cumulative_count"] = df_counts["count"].cumsum()
    df_counts["growth_rate"] = df_counts["count"].pct_change()
    return df_counts


def plot_datasets_growth(data, framework):
    """Plot cumulative dataset count and month-over-month growth.

    Args:
        data: Dataset records convertible to a DataFrame; each record's
            ``createdAt`` value is parsed as a timestamp.
        framework: Framework/tag name used in the figure and axis titles.

    Returns:
        A Plotly figure with the cumulative count as a line on the left
        axis and the growth rate as a second trace on a right-hand axis.
    """
    df_counts = _monthly_dataset_counts(data)

    fig = px.line(df_counts, x="month", y="cumulative_count", title="Dataset Growth")
    fig.add_scatter(
        x=df_counts["month"],
        y=df_counts["growth_rate"],
        name="Growth Rate",
        yaxis="y2",
    )
    fig.update_layout(
        xaxis_title="Month",
        yaxis=dict(title=f"Cumulative Number of Datasets ({framework})"),
        # Secondary axis for the growth-rate trace, drawn over the primary.
        yaxis2=dict(
            title="Month-over-Month Growth Rate",
            overlaying="y",
            side="right",
            tickformat=",.0%",
        ),
        legend=dict(
            title="", orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
        ),
    )
    fig.update_layout(
        title={
            "text": f"Dataset Growth for {framework} datasets",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        title_font=dict(size=24),
        annotations=[
            dict(
                x=0.5,
                y=0.85,
                xref="paper",
                yref="paper",
                text="Cumulative number of datasets and month-over-month growth rate",
                showarrow=False,
                font=dict(size=14),
            )
        ],
    )
    return fig
|
|
|
|
|
def update_dashboard(framework):
    """Produce the growth figure and Markdown summary for *framework*.

    Fetches the records for the given framework/tag, renders the per-author
    summary and the growth plot from them, and returns ``(figure, summary)``.
    """
    records, by_author = fetch_data(framework)
    summary_md = generate_dashboard(records, by_author, framework)
    growth_fig = plot_datasets_growth(records, framework)
    return growth_fig, summary_md
|
|
|
|
|
# Page layout: heading and description, a tag dropdown (free text allowed),
# then the plot and Markdown outputs refreshed whenever the dropdown changes.
with gr.Blocks() as demo:
    page_heading = "# Dataset frameworks/tags on the Hub"
    page_description = (
        "This dashboard displays the number of datasets per author and the "
        "growth of datasets over time for a given framework/tag."
    )
    gr.Markdown(page_heading)
    gr.Markdown(page_description)

    framework = gr.Dropdown(
        label="Select a framework/tag",
        choices=choices,
        allow_custom_value=True,
    )
    plot = gr.Plot(label="Growth of datasets over time")
    markdown = gr.Markdown(label="summary")

    framework.change(
        fn=update_dashboard,
        inputs=[framework],
        outputs=[plot, markdown],
    )

demo.launch()
|
|