File size: 3,799 Bytes
38fcf43
 
 
 
 
195a3cd
38fcf43
842e3d0
 
 
 
 
 
 
 
 
 
 
 
 
 
38fcf43
195a3cd
38fcf43
 
 
 
 
 
 
 
 
 
 
842e3d0
38fcf43
bb1ee8a
38fcf43
 
 
 
 
 
 
 
 
 
 
 
 
e19490a
 
 
 
 
38fcf43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e19490a
 
 
 
38fcf43
842e3d0
38fcf43
e19490a
38fcf43
e19490a
 
38fcf43
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import gradio as gr
import httpx
from toolz import groupby
import plotly.express as px
import pandas as pd
from functools import lru_cache

# Tags offered in the dropdown. Sorted in place so the menu is alphabetical
# regardless of the order in which entries are added here.
choices = [
    "art",
    "biology",
    "code",
    "distilabel",
    "fiftyone",
    "legal",
    "medical",
    "sentence-transformers",
    "synthetic",
]
choices.sort()


@lru_cache(maxsize=100)
def fetch_data(framework):
    """Fetch the Hub datasets carrying a tag and group them by author.

    Results are memoised per ``framework`` value (up to 100 entries), so the
    returned objects are shared between calls and should be treated as
    read-only by callers.

    Args:
        framework: Tag/framework name used as the ``filter`` query parameter
            of the Hugging Face datasets API.

    Returns:
        A ``(data, grouped)`` tuple. ``data`` is the decoded JSON response;
        ``grouped`` maps each ``"author"`` value to the list of its dataset
        records, ordered from the most to the fewest datasets. Both are
        empty when ``framework`` is empty or ``None``.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
    """
    if not framework:
        # Nothing meaningful to filter on; skip the network round-trip
        # instead of sending an empty or "None" filter to the API.
        return [], {}

    r = httpx.get(
        "https://huggingface.co/api/datasets",
        # Let httpx build the query string so free-text tags (the dropdown
        # accepts custom values) are URL-encoded instead of pasted raw.
        params={"filter": framework},
        # httpx's default timeout is 5 seconds; allow more headroom for
        # tags that return many records.
        timeout=30.0,
    )
    # Fail loudly on an error response rather than trying to group an
    # error payload as if it were a list of datasets.
    r.raise_for_status()
    data = r.json()
    grouped = groupby(lambda x: x["author"], data)
    # Re-insert the groups in descending size so iteration order is
    # "most prolific author first".
    grouped = dict(sorted(grouped.items(), key=lambda x: len(x[1]), reverse=True))
    return data, grouped


def generate_dashboard(data, grouped, framework):
    """Render a Markdown summary of datasets per author for a tag.

    Args:
        data: Not used by this function; accepted alongside ``grouped``.
        grouped: Mapping of author name to that author's dataset records.
            Authors are listed in the mapping's iteration order.
        framework: Tag/framework name shown in the heading.

    Returns:
        A Markdown string with a heading, the total dataset and author
        counts, and one bullet (with a nested count) per author.
    """
    per_author = {author: len(items) for author, items in grouped.items()}

    parts = [
        f"## Hugging Face datasets for {framework} \n\n",
        f"**Total number of datasets: {sum(per_author.values())}**\n\n",
        f"**Total number of authors: {len(per_author)}**\n\n",
        "### Datasets per Author\n\n",
    ]
    for author, n_datasets in per_author.items():
        parts.append(f"- **Author:** [{author}](https://huggingface.co/{author})\n")
        parts.append(f"  - **Number of datasets:** {n_datasets}\n")

    return "".join(parts)


def _monthly_dataset_counts(data):
    """Aggregate dataset records into per-month counts for past months.

    Returns a DataFrame with columns ``month`` (``"YYYY-MM"`` strings),
    ``count``, ``cumulative_count`` and ``growth_rate`` (fractional change of
    ``count`` versus the previous row). The frame is empty when there are no
    records, no ``createdAt`` field, or nothing before the current month.
    """
    columns = ["month", "count", "cumulative_count", "growth_rate"]
    df = pd.DataFrame(data)
    if "createdAt" not in df.columns:
        # An empty API result produces a column-less frame; indexing
        # "createdAt" on it would raise KeyError.
        return pd.DataFrame(columns=columns)

    df["createdAt"] = pd.to_datetime(df["createdAt"])
    df["month"] = df["createdAt"].dt.to_period("M").astype(str)

    # Exclude the current month (presumably because it is still in progress
    # and would show up as a misleading drop). "YYYY-MM" strings compare
    # chronologically, so a plain string comparison is sufficient.
    current_month = pd.Period.now("M").strftime("%Y-%m")
    df = df[df["month"] < current_month]
    if df.empty:
        return pd.DataFrame(columns=columns)

    df_counts = df.groupby("month").size().reset_index(name="count")
    df_counts["cumulative_count"] = df_counts["count"].cumsum()
    df_counts["growth_rate"] = df_counts["count"].pct_change()
    return df_counts


def plot_datasets_growth(data, framework):
    """Build a Plotly figure of dataset growth over time for a tag.

    The figure shows the cumulative number of datasets per month on the left
    axis and the month-over-month change in new datasets on a secondary
    right-hand axis.

    Args:
        data: Iterable of dataset records (dicts) that carry a ``createdAt``
            timestamp, as returned by the Hub datasets API.
        framework: Tag/framework name used in the figure and axis titles.

    Returns:
        A Plotly figure. When there is nothing to plot, a blank figure with
        an explanatory annotation is returned instead of raising.
    """
    title = {
        "text": f"Dataset Growth for {framework} datasets",
        "y": 0.95,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    }

    df_counts = _monthly_dataset_counts(data)
    if df_counts.empty:
        # Local import: only needed on this fallback path, and plotly is
        # already a dependency of the file via plotly.express.
        import plotly.graph_objects as go

        fig = go.Figure()
        fig.update_layout(
            title=title,
            title_font=dict(size=24),
            xaxis=dict(visible=False),
            yaxis=dict(visible=False),
            annotations=[
                dict(
                    x=0.5,
                    y=0.5,
                    xref="paper",
                    yref="paper",
                    text="No datasets from completed months to plot",
                    showarrow=False,
                    font=dict(size=14),
                )
            ],
        )
        return fig

    fig = px.line(df_counts, x="month", y="cumulative_count", title="Dataset Growth")
    # Growth rate goes on a secondary axis ("y2") because its scale
    # (fractions) is unrelated to the cumulative counts.
    fig.add_scatter(
        x=df_counts["month"], y=df_counts["growth_rate"], name="Growth Rate", yaxis="y2"
    )
    fig.update_layout(
        title=title,
        title_font=dict(size=24),
        xaxis_title="Month",
        yaxis=dict(title=f"Cumulative Number of Datasets ({framework})"),
        yaxis2=dict(
            title="Month-over-Month Growth Rate",
            overlaying="y",
            side="right",
            tickformat=",.0%",
        ),
        legend=dict(
            title="", orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
        ),
        annotations=[
            dict(
                x=0.5,
                y=0.85,
                xref="paper",
                yref="paper",
                text="Cumulative number of datasets and month-over-month growth rate",
                showarrow=False,
                font=dict(size=14),
            )
        ],
    )
    return fig


def update_dashboard(framework):
    """Produce the growth figure and Markdown summary for a tag.

    Args:
        framework: Tag/framework name selected (or typed) in the dropdown.

    Returns:
        A ``(figure, markdown)`` tuple, in the order expected by the
        ``outputs`` wired to the dropdown's change event.
    """
    records, by_author = fetch_data(framework)
    summary_md = generate_dashboard(records, by_author, framework)
    growth_fig = plot_datasets_growth(records, framework)
    return growth_fig, summary_md


# UI definition: components are created inside the Blocks context in the
# order they should appear on the page.
with gr.Blocks() as demo:
    gr.Markdown("# Dataset frameworks/tags on the Hub")
    gr.Markdown(
        "This dashboard displays the number of datasets per author and the growth of datasets over time for a given framework/tag."
    )
    # Dropdown is pre-populated with `choices`; allow_custom_value=True also
    # lets the user enter a tag that is not in the list.
    framework = gr.Dropdown(
        choices=choices,
        allow_custom_value=True,
        label="Select a framework/tag",
    )
    plot = gr.Plot(label="Growth of datasets over time")
    markdown = gr.Markdown(label="summary")
    # Re-run update_dashboard whenever the dropdown value changes; its
    # (figure, markdown) return value fills `plot` and `markdown` in order.
    framework.change(update_dashboard, inputs=[framework], outputs=[plot, markdown])

# Start the app at import/run time (no __main__ guard in this script).
demo.launch()