Spaces:

librarian-bots
/

dataset-framework-growth

Sleeping

File size: 3,799 Bytes

import gradio as gr
import httpx
from toolz import groupby
import plotly.express as px
import pandas as pd
from functools import lru_cache

# Suggested framework/tag names offered in the dropdown, in alphabetical order.
choices = sorted(
    (
        "art",
        "biology",
        "code",
        "distilabel",
        "fiftyone",
        "legal",
        "medical",
        "sentence-transformers",
        "synthetic",
    )
)


@lru_cache(maxsize=100)
def fetch_data(framework):
    """Fetch the datasets carrying a given tag and group them by author.

    Results are memoised per ``framework`` value (up to 100 distinct values)
    for the lifetime of the process, so repeated selections do not hit the
    API again. Failed requests raise and are therefore not cached.

    Args:
        framework: Tag/framework name sent as the ``filter`` query parameter.

    Returns:
        A ``(data, grouped)`` tuple: ``data`` is the decoded JSON response and
        ``grouped`` maps each author to their list of dataset records, ordered
        from most to fewest datasets.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
        httpx.HTTPError: On transport-level failures (timeout, connection).
    """
    # Pass the tag via `params` so httpx URL-encodes it; the dropdown accepts
    # free-form values that may contain '&', '#', spaces, etc.
    r = httpx.get(
        "https://huggingface.co/api/datasets",
        params={"filter": framework},
        timeout=30.0,
    )
    # Fail loudly instead of trying to group an error payload.
    r.raise_for_status()
    data = r.json()
    # NOTE(review): assumes every record has an "author" key — confirm
    # against the API response schema.
    grouped = groupby(lambda x: x["author"], data)
    # Most prolific authors first.
    grouped = dict(sorted(grouped.items(), key=lambda x: len(x[1]), reverse=True))
    return data, grouped


def generate_dashboard(data, grouped, framework):
    """Render a Markdown summary of datasets per author.

    Args:
        data: Raw dataset records; accepted for signature compatibility but
            not read by this function.
        grouped: Mapping of author name to that author's dataset records.
            Authors are listed in the mapping's iteration order.
        framework: Tag/framework name shown in the heading.

    Returns:
        A Markdown string with the totals followed by one bullet per author.
    """
    dataset_total = 0
    author_lines = []
    # Single pass: accumulate the grand total while emitting each author entry.
    for author, records in grouped.items():
        n_records = len(records)
        dataset_total += n_records
        author_lines.append(
            f"- **Author:** [{author}](https://huggingface.co/{author})\n"
        )
        author_lines.append(f"  - **Number of datasets:** {n_records}\n")

    header_parts = (
        f"## Hugging Face datasets for {framework} \n\n",
        f"**Total number of datasets: {dataset_total}**\n\n",
        f"**Total number of authors: {len(grouped)}**\n\n",
        "### Datasets per Author\n\n",
    )
    return "".join(header_parts) + "".join(author_lines)


def _monthly_dataset_counts(data):
    """Aggregate dataset records into per-month creation statistics.

    Args:
        data: Iterable of dataset records, each expected to carry a
            ``createdAt`` timestamp.

    Returns:
        A DataFrame with columns ``month`` (``YYYY-MM`` string), ``count``
        (datasets created that month), ``cumulative_count`` (running total)
        and ``growth_rate`` (fractional change of ``count`` versus the
        previous row). The current month is excluded. The frame is empty when
        there are no records or no ``createdAt`` column.
    """
    columns = ["month", "count", "cumulative_count", "growth_rate"]
    df = pd.DataFrame(data)
    # A tag with no datasets yields an empty frame without a "createdAt"
    # column; return an empty result instead of raising KeyError.
    if df.empty or "createdAt" not in df.columns:
        return pd.DataFrame(columns=columns)

    # Normalise to UTC and drop the timezone explicitly so that to_period()
    # does not have to discard it implicitly (which emits a pandas warning).
    created = pd.to_datetime(df["createdAt"], utc=True).dt.tz_localize(None)
    df["month"] = created.dt.to_period("M").astype(str)

    # Exclude the current month. "YYYY-MM" strings order chronologically
    # under plain string comparison.
    current_month = pd.Period.now("M").strftime("%Y-%m")
    df = df[df["month"] < current_month]

    df_counts = df.groupby("month").size().reset_index(name="count")
    df_counts["cumulative_count"] = df_counts["count"].cumsum()
    # Change in the number of *new* datasets relative to the previous row.
    df_counts["growth_rate"] = df_counts["count"].pct_change()
    return df_counts


def plot_datasets_growth(data, framework):
    """Plot cumulative dataset count and month-over-month growth for a tag.

    Args:
        data: Iterable of dataset records with a ``createdAt`` timestamp.
        framework: Tag/framework name used in the figure and axis titles.

    Returns:
        A Plotly figure with the cumulative count as a line on the left axis
        and the growth rate as a second trace on a right-hand axis. With no
        records the figure has the same layout but no data points.
    """
    df_counts = _monthly_dataset_counts(data)

    # The title given here is a placeholder; it is replaced further down.
    fig = px.line(df_counts, x="month", y="cumulative_count", title="Dataset Growth")
    fig.update_layout(
        xaxis_title="Month",
        yaxis=dict(title=f"Cumulative Number of Datasets ({framework})"),
        # Secondary axis drawn over the primary one, on the right-hand side.
        yaxis2=dict(
            title="Month-over-Month Growth Rate",
            overlaying="y",
            side="right",
            tickformat=",.0%",
        ),
        legend=dict(
            title="", orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
        ),
    )
    fig.add_scatter(
        x=df_counts["month"], y=df_counts["growth_rate"], name="Growth Rate", yaxis="y2"
    )
    fig.update_layout(
        title={
            "text": f"Dataset Growth for {framework} datasets",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        title_font=dict(size=24),
        # Subtitle rendered as an annotation in paper coordinates.
        annotations=[
            dict(
                x=0.5,
                y=0.85,
                xref="paper",
                yref="paper",
                text="Cumulative number of datasets and month-over-month growth rate",
                showarrow=False,
                font=dict(size=14),
            )
        ],
    )
    return fig


def update_dashboard(framework):
    """Build the plot and Markdown summary for the selected framework/tag.

    Args:
        framework: Tag chosen or typed in the dropdown. May be ``None`` or
            blank when the selection is cleared.

    Returns:
        A ``(fig, dashboard)`` tuple. When no tag is provided, ``fig`` is
        ``None`` and ``dashboard`` is a short prompt, and no request is made.
    """
    # Trim stray whitespace so " code " and "code" are treated as the same tag.
    tag = (framework or "").strip()
    if not tag:
        # Nothing selected: skip the API call rather than querying with a
        # meaningless filter.
        return None, "Select a framework/tag to see its datasets."

    data, grouped = fetch_data(tag)
    dashboard = generate_dashboard(data, grouped, tag)
    fig = plot_datasets_growth(data, tag)
    return fig, dashboard


# Build the Gradio UI. Components are created in this order: heading,
# description, tag dropdown, growth plot, Markdown summary.
with gr.Blocks() as demo:
    gr.Markdown("# Dataset frameworks/tags on the Hub")
    gr.Markdown(
        "This dashboard displays the number of datasets per author and the growth of datasets over time for a given framework/tag."
    )
    # allow_custom_value=True: the user is not restricted to the entries in
    # `choices` and may type any tag.
    framework = gr.Dropdown(
        choices=choices,
        allow_custom_value=True,
        label="Select a framework/tag",
    )
    plot = gr.Plot(label="Growth of datasets over time")
    markdown = gr.Markdown(label="summary")
    # Recompute on every dropdown change; update_dashboard returns
    # (fig, dashboard), matching the order of `outputs`.
    framework.change(update_dashboard, inputs=[framework], outputs=[plot, markdown])

# Start the app. This runs whenever the module is executed or imported, as
# there is no __main__ guard.
demo.launch()