davanstrien HF staff committed on
Commit
38fcf43
·
1 Parent(s): e8921da

chore: Add Hugging Face dataset dashboard with Gradio interface

Browse files
Files changed (1) hide show
  1. app.py +100 -0
app.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import httpx
3
+ from toolz import groupby
4
+ from datetime import datetime
5
+ import plotly.express as px
6
+ import pandas as pd
7
+
8
+
9
def fetch_data(framework):
    """Fetch the Hub datasets matching ``framework`` and group them by author.

    Args:
        framework: Value sent as the ``filter`` query parameter of the
            Hugging Face ``/api/datasets`` endpoint.

    Returns:
        A ``(data, grouped)`` tuple. ``data`` is the decoded JSON response;
        ``grouped`` maps each record's ``"author"`` value to the list of
        records sharing it, ordered by descending list length.

    Raises:
        httpx.HTTPStatusError: If the endpoint answers with a 4xx/5xx status.
        KeyError: If a returned record has no ``"author"`` key.
    """
    # Passing the filter through ``params`` lets httpx URL-encode it, so
    # free-form values containing spaces, ``&`` or ``#`` cannot corrupt the
    # query string.
    response = httpx.get(
        "https://huggingface.co/api/datasets",
        params={"filter": framework},
    )
    # Fail loudly on an error response instead of trying to group an
    # error payload as if it were a list of datasets.
    response.raise_for_status()
    data = response.json()

    by_author = groupby(lambda record: record["author"], data)
    # Most prolific authors first; dicts preserve insertion order.
    grouped = dict(
        sorted(by_author.items(), key=lambda item: len(item[1]), reverse=True)
    )
    return data, grouped
15
+
16
+
17
def generate_dashboard(data, grouped, framework):
    """Render a Markdown summary of how many datasets each author has.

    Args:
        data: Raw dataset records. Not used by this function; the parameter
            is kept so existing callers keep working.
        grouped: Mapping of author name to the list of that author's
            datasets. Iteration order determines the order of the listing.
        framework: Label interpolated into the heading.

    Returns:
        A Markdown string with a heading, the total dataset count, and one
        entry per author giving their dataset count.
    """
    total_datasets = sum(len(datasets) for datasets in grouped.values())

    # Collect the fragments and join once rather than growing a string with
    # ``+=`` on every iteration.
    parts = [
        f"## Hugging Face Datasets for {framework} \n\n",
        f"**Total number of datasets: {total_datasets}**\n\n",
        "### Datasets per Author\n\n",
    ]
    for author, datasets in grouped.items():
        parts.append(f"- **Author:** [{author}](https://huggingface.co/{author})\n")
        parts.append(f" - **Number of datasets:** {len(datasets)}\n")

    return "".join(parts)
29
+
30
+
31
def plot_datasets_growth(data, framework):
    """Plot cumulative dataset count and month-over-month growth rate.

    Args:
        data: Dataset records convertible to a ``pandas.DataFrame``. The
            ``createdAt`` field of each record is parsed as a timestamp;
            records where it is missing are skipped.
        framework: Label interpolated into the chart title and the y-axis
            title.

    Returns:
        A Plotly figure with the cumulative count as a line on the primary
        y-axis and the growth rate as a second trace bound to ``yaxis2``
        (drawn on the right, formatted as a percentage).
    """
    df = pd.DataFrame(data)
    if "createdAt" not in df.columns:
        # An empty response yields a frame with no columns at all; add an
        # empty date column so the pipeline below does not raise KeyError.
        df["createdAt"] = pd.NaT

    # Normalise to UTC, skip records without a date, then drop the timezone
    # explicitly: ``to_period`` discards it anyway but emits a warning when
    # handed timezone-aware values.
    created = pd.to_datetime(df["createdAt"], utc=True).dropna()
    month = created.dt.tz_localize(None).dt.to_period("M").astype(str)

    monthly = pd.DataFrame({"month": month})
    df_counts = monthly.groupby("month").size().reset_index(name="count")
    df_counts["cumulative_count"] = df_counts["count"].cumsum()
    # The first month has no predecessor, so its growth rate is NaN.
    df_counts["growth_rate"] = df_counts["count"].pct_change()

    fig = px.line(df_counts, x="month", y="cumulative_count", title="Dataset Growth")
    fig.update_layout(
        xaxis_title="Month",
        # Single source for the y-axis title (a separate ``yaxis_title`` was
        # being overridden by this entry); closing parenthesis restored.
        yaxis=dict(title=f"Cumulative Number of Datasets ({framework})"),
        yaxis2=dict(
            title="Month-over-Month Growth Rate",
            overlaying="y",
            side="right",
            tickformat=",.0%",
        ),
        legend=dict(
            title="", orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
        ),
    )

    # Growth rate goes on the secondary axis configured above.
    fig.add_scatter(
        x=df_counts["month"], y=df_counts["growth_rate"], name="Growth Rate", yaxis="y2"
    )

    fig.update_layout(
        title={
            "text": f"Dataset Growth for {framework} datasets",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        title_font=dict(size=24),
        annotations=[
            dict(
                x=0.5,
                y=0.85,
                xref="paper",
                yref="paper",
                text="Cumulative number of datasets and month-over-month growth rate",
                showarrow=False,
                font=dict(size=14),
            )
        ],
    )

    return fig
82
+
83
+
84
def update_dashboard(framework):
    """Rebuild the growth chart and the Markdown summary for ``framework``.

    Args:
        framework: Filter value chosen or typed in the dropdown; may be
            ``None`` or blank when the field has been cleared.

    Returns:
        A ``(figure, markdown)`` tuple in the order the event handler's
        outputs are declared. When ``framework`` is empty, ``(None, "")`` is
        returned so both outputs are cleared.
    """
    # A blank filter would query the Hub without any filter at all; clear
    # the outputs instead of fetching and plotting unrelated datasets.
    if not framework or not framework.strip():
        return None, ""

    data, grouped = fetch_data(framework)
    dashboard = generate_dashboard(data, grouped, framework)
    fig = plot_datasets_growth(data, framework)
    return fig, dashboard
89
+
90
+
91
# UI layout: a dropdown selecting the dataset filter, followed by the growth
# chart and the per-author Markdown summary.
with gr.Blocks() as demo:
    framework = gr.Dropdown(
        allow_custom_value=True,  # users may type a filter not in the list
        choices=["distilabel", "sentence-transformers", "synthetic"],
    )
    plot = gr.Plot()
    markdown = gr.Markdown()

    # Refresh both outputs whenever the dropdown value changes.
    framework.change(
        fn=update_dashboard,
        inputs=[framework],
        outputs=[plot, markdown],
    )

demo.launch()