File size: 3,105 Bytes
38fcf43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import gradio as gr
import httpx
from toolz import groupby
from datetime import datetime
import plotly.express as px
import pandas as pd


def fetch_data(framework):
    """Fetch Hub datasets matching a filter and group them by author.

    Args:
        framework: Value sent as the ``filter`` query parameter of the
            Hugging Face ``/api/datasets`` endpoint.

    Returns:
        A ``(data, grouped)`` tuple. ``data`` is the decoded JSON response;
        ``grouped`` maps each entry's ``"author"`` value to the list of
        entries with that author, ordered from the largest group to the
        smallest.

    Raises:
        httpx.HTTPStatusError: If the API responds with a 4xx/5xx status.
    """
    # Send the filter through ``params`` so httpx URL-encodes it. The UI
    # dropdown allows custom values, so the text may contain characters such
    # as ``&``, ``#`` or spaces that would corrupt a hand-built query string.
    r = httpx.get(
        "https://huggingface.co/api/datasets", params={"filter": framework}
    )
    # Fail loudly on an error response instead of trying to group an error
    # payload as if it were a list of datasets.
    r.raise_for_status()
    data = r.json()
    grouped = groupby(lambda x: x["author"], data)
    # Most prolific authors first.
    grouped = dict(sorted(grouped.items(), key=lambda x: len(x[1]), reverse=True))
    return data, grouped


def generate_dashboard(data, grouped, framework):
    """Render a Markdown summary of dataset counts per author.

    Args:
        data: Accepted for signature compatibility; not read by this function.
        grouped: Mapping of author name to that author's list of datasets.
            Authors are listed in the mapping's iteration order.
        framework: Label interpolated into the heading.

    Returns:
        A Markdown string with a heading, the total number of datasets
        (sum of all group sizes) and one bullet pair per author.
    """
    per_author = {author: len(items) for author, items in grouped.items()}

    parts = [
        f"## Hugging Face Datasets for {framework} \n\n",
        f"**Total number of datasets: {sum(per_author.values())}**\n\n",
        "### Datasets per Author\n\n",
    ]
    for author, n_datasets in per_author.items():
        parts.append(f"- **Author:** [{author}](https://huggingface.co/{author})\n")
        parts.append(f"  - **Number of datasets:** {n_datasets}\n")

    return "".join(parts)


def plot_datasets_growth(data, framework):
    """Plot the cumulative dataset count and its monthly growth rate.

    Args:
        data: List of dataset records. Each record is expected to carry a
            ``"createdAt"`` value that ``pandas.to_datetime`` can parse.
        framework: Label interpolated into the figure title and the primary
            y-axis title.

    Returns:
        A Plotly figure. The cumulative number of datasets per month is drawn
        as a line on the primary y-axis; the percentage change of the
        per-month count is drawn as a second trace on a right-hand y-axis.
        If ``data`` yields no ``"createdAt"`` column (for example an empty
        result list), an empty figure with an explanatory title is returned
        instead of raising ``KeyError``.
    """
    df = pd.DataFrame(data)
    if df.empty or "createdAt" not in df.columns:
        # Nothing to aggregate: the dropdown accepts arbitrary filters, so an
        # empty result is a normal outcome rather than an error.
        import plotly.graph_objects as go

        empty_fig = go.Figure()
        empty_fig.update_layout(
            title={
                "text": f"No datasets found for {framework}",
                "x": 0.5,
                "xanchor": "center",
            },
        )
        return empty_fig

    df["createdAt"] = pd.to_datetime(df["createdAt"])
    df["month"] = df["createdAt"].dt.to_period("M").astype(str)
    df_counts = df.groupby("month").size().reset_index(name="count")
    df_counts["cumulative_count"] = df_counts["count"].cumsum()
    # NOTE: months without any dataset produce no row here, so the change is
    # measured against the previous month that has at least one dataset.
    df_counts["growth_rate"] = df_counts["count"].pct_change()

    fig = px.line(df_counts, x="month", y="cumulative_count", title="Dataset Growth")
    fig.update_layout(
        xaxis_title="Month",
        # Single source for the primary y-axis title (previously set twice,
        # and the framework label was missing its closing parenthesis).
        yaxis=dict(title=f"Cumulative Number of Datasets ({framework})"),
        yaxis2=dict(
            title="Month-over-Month Growth Rate",
            overlaying="y",
            side="right",
            tickformat=",.0%",
        ),
        legend=dict(
            title="", orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
        ),
    )

    # Growth rate shares the x-axis but is scaled on the secondary y-axis.
    fig.add_scatter(
        x=df_counts["month"], y=df_counts["growth_rate"], name="Growth Rate", yaxis="y2"
    )

    fig.update_layout(
        title={
            "text": f"Dataset Growth for {framework} datasets",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        title_font=dict(size=24),
        annotations=[
            dict(
                x=0.5,
                y=0.85,
                xref="paper",
                yref="paper",
                text="Cumulative number of datasets and month-over-month growth rate",
                showarrow=False,
                font=dict(size=14),
            )
        ],
    )

    return fig


def update_dashboard(framework):
    """Refresh both dashboard outputs for the given filter value.

    Args:
        framework: Filter value forwarded to ``fetch_data`` and used as a
            label by the rendering helpers.

    Returns:
        A ``(figure, markdown)`` tuple: the figure from
        ``plot_datasets_growth`` and the summary from ``generate_dashboard``.
    """
    datasets, by_author = fetch_data(framework)
    summary_md = generate_dashboard(datasets, by_author, framework)
    growth_fig = plot_datasets_growth(datasets, framework)
    # Order matches the ``outputs`` wiring of the change handler: plot first.
    return growth_fig, summary_md


# UI layout: one filter dropdown driving a plot and a Markdown summary.
with gr.Blocks() as demo:
    # The listed choices are presets; custom values are also accepted.
    framework = gr.Dropdown(
        allow_custom_value=True,
        choices=["distilabel", "sentence-transformers", "synthetic"],
    )
    plot = gr.Plot()
    markdown = gr.Markdown()
    # Recompute both outputs whenever the dropdown value changes.
    framework.change(
        fn=update_dashboard,
        inputs=[framework],
        outputs=[plot, markdown],
    )

demo.launch()