File size: 3,105 Bytes
38fcf43 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
import gradio as gr
import httpx
from toolz import groupby
from datetime import datetime
import plotly.express as px
import pandas as pd
def fetch_data(framework):
    """Fetch Hub datasets tagged with *framework* and group them by author.

    Args:
        framework: Tag sent as the ``filter`` query parameter of the
            Hugging Face ``/api/datasets`` endpoint.

    Returns:
        A ``(data, grouped)`` tuple. ``data`` is the decoded JSON response;
        ``grouped`` maps each author to the list of that author's entries,
        ordered from the author with the most datasets to the fewest.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
        httpx.HTTPError: On other request failures (e.g. timeouts).
    """
    # Pass the tag through ``params`` so httpx URL-encodes it; interpolating
    # it into the URL breaks on values containing "&", "#", spaces, etc.
    r = httpx.get(
        "https://huggingface.co/api/datasets",
        params={"filter": framework},
        timeout=30.0,
    )
    # Fail loudly on an error response instead of grouping an error payload.
    r.raise_for_status()
    data = r.json()
    # Defensive: an entry without an "author" key is grouped under "unknown"
    # rather than aborting the whole dashboard with a KeyError.
    grouped = groupby(lambda x: x.get("author", "unknown"), data)
    grouped = dict(sorted(grouped.items(), key=lambda x: len(x[1]), reverse=True))
    return data, grouped
def generate_dashboard(data, grouped, framework):
    """Build the Markdown summary of datasets per author.

    Args:
        data: Raw dataset records. Unused here; kept so the signature stays
            compatible with existing callers.
        grouped: Mapping of author name to the list of that author's
            datasets. Authors are listed in the mapping's iteration order.
        framework: Tag name shown in the heading.

    Returns:
        A Markdown string with a heading, the total dataset count, and one
        bullet per author with a nested bullet for that author's count.
    """
    total_datasets = sum(len(datasets) for datasets in grouped.values())
    parts = [
        f"## Hugging Face Datasets for {framework} \n\n",
        f"**Total number of datasets: {total_datasets}**\n\n",
        "### Datasets per Author\n\n",
    ]
    for author, datasets in grouped.items():
        parts.append(f"- **Author:** [{author}](https://huggingface.co/{author})\n")
        # Two spaces of indent are required for Markdown to nest this bullet
        # under the author bullet; with one space it renders as a sibling.
        parts.append(f"  - **Number of datasets:** {len(datasets)}\n")
    # Join once instead of growing the string with repeated ``+=``.
    return "".join(parts)
def _monthly_dataset_counts(data):
    """Return per-month new, cumulative and growth-rate figures for *data*.

    The result has the columns ``month`` ("YYYY-MM"), ``count``,
    ``cumulative_count`` and ``growth_rate``. It is empty (but has those
    columns) when *data* is empty or carries no ``createdAt`` field.
    """
    columns = ["month", "count", "cumulative_count", "growth_rate"]
    df = pd.DataFrame(data)
    if df.empty or "createdAt" not in df.columns:
        return pd.DataFrame(columns=columns)

    # Format the month directly instead of going through ``to_period``,
    # which warns about dropping timezone info on tz-aware timestamps.
    # "YYYY-MM" strings sort chronologically, so groupby order is by date.
    created = pd.to_datetime(df["createdAt"], utc=True)
    df["month"] = created.dt.strftime("%Y-%m")

    df_counts = df.groupby("month").size().reset_index(name="count")
    df_counts["cumulative_count"] = df_counts["count"].cumsum()
    # NOTE: only months with at least one dataset appear as rows, so this is
    # the change relative to the previous *listed* month.
    df_counts["growth_rate"] = df_counts["count"].pct_change()
    return df_counts


def plot_datasets_growth(data, framework):
    """Plot cumulative dataset count and month-over-month growth.

    Args:
        data: Dataset records, each expected to carry a ``createdAt``
            timestamp that ``pandas.to_datetime`` can parse.
        framework: Tag name used in the chart title and y-axis label.

    Returns:
        A Plotly figure with the cumulative count as a line on the left
        axis and the growth rate as a second trace on a right-hand axis.
    """
    df_counts = _monthly_dataset_counts(data)

    fig = px.line(df_counts, x="month", y="cumulative_count")
    fig.add_scatter(
        x=df_counts["month"],
        y=df_counts["growth_rate"],
        name="Growth Rate",
        yaxis="y2",
    )
    fig.update_layout(
        title={
            "text": f"Dataset Growth for {framework} datasets",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        title_font=dict(size=24),
        xaxis_title="Month",
        # Set once, with the closing parenthesis the original label lacked.
        yaxis_title=f"Cumulative Number of Datasets ({framework})",
        # Secondary axis for the growth-rate trace, drawn over the primary.
        yaxis2=dict(
            title="Month-over-Month Growth Rate",
            overlaying="y",
            side="right",
            tickformat=",.0%",
        ),
        legend=dict(
            title="", orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
        ),
        annotations=[
            dict(
                x=0.5,
                y=0.85,
                xref="paper",
                yref="paper",
                text="Cumulative number of datasets and month-over-month growth rate",
                showarrow=False,
                font=dict(size=14),
            )
        ],
    )
    return fig
def update_dashboard(framework):
    """Refresh the plot and Markdown summary for the selected framework tag.

    Args:
        framework: Tag chosen or typed in the dropdown. May be ``None`` or
            blank when the input has been cleared.

    Returns:
        A ``(figure, markdown)`` tuple for the plot and Markdown outputs.
        When no tag is given, the figure is ``None`` and the Markdown is a
        short prompt; no request is made.
    """
    framework = (framework or "").strip()
    if not framework:
        # Nothing selected: skip the API call rather than querying with an
        # empty or "None" filter.
        return None, "Select or enter a framework tag to see its datasets."

    data, grouped = fetch_data(framework)
    dashboard = generate_dashboard(data, grouped, framework)
    fig = plot_datasets_growth(data, framework)
    return fig, dashboard
# UI: a dropdown of framework tags (custom values allowed) that drives a
# growth plot and a Markdown summary through ``update_dashboard``.
with gr.Blocks() as demo:
    framework = gr.Dropdown(
        choices=["distilabel", "sentence-transformers", "synthetic"],
        label="Framework tag",
        allow_custom_value=True,
    )
    plot = gr.Plot()
    markdown = gr.Markdown()
    framework.change(update_dashboard, inputs=[framework], outputs=[plot, markdown])

# Only start the server when run as a script, so importing this module
# (e.g. to reuse ``demo``) does not launch it as a side effect.
if __name__ == "__main__":
    demo.launch()
|