Spaces:

mariagrandury
/

language-gap-in-hf-hub

Sleeping

App Files Files Community

mariagrandury committed on May 15, 2024

Commit

1f70be8

1 Parent(s): 6f2c797

first commit

Browse files

Files changed (7) hide show

.gitignore +1 -0
README.md +1 -3
app.py +75 -0
numero_datasets_hub.ipynb +781 -0
numero_datasets_hub_output.ipynb +918 -0
plots/datasets_hub.png +0 -0
requirements.txt +3 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ venv

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: Language Gap In The Hub
 emoji: 📊
 colorFrom: pink
 colorTo: purple
@@ -9,5 +9,3 @@ app_file: app.py
 pinned: false
 license: apache-2.0
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Language Gap In The Hugging Face Hub
 emoji: 📊
 colorFrom: pink
 colorTo: purple
 pinned: false
 license: apache-2.0
 ---

app.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import gradio as gr
+import papermill as pm
+def run_notebook():
+    try:
+        # Execute the notebook
+        pm.execute_notebook(
+            "numero_datasets_hub.ipynb",
+            "numero_datasets_hub_output.ipynb",  # This will save the output in a new notebook
+        )
+        return "Notebook executed successfully!"
+    except Exception as e:
+        return f"Failed to execute notebook: {str(e)}"
+def create_app():
+    with gr.Blocks() as app:
+        gr.Markdown("# The language gap in the Hugging Face Hub")
+        # Button to run the notebook
+        run_button = gr.Button("Run Notebook")
+        output_label = gr.Label()  # To display the result of running the notebook
+        run_button.click(run_notebook, outputs=output_label)
+        # Create a 2x2 grid for images
+        with gr.Row():
+            with gr.Column():
+                image1 = gr.Image(
+                    value="plots/datasets_hub.png",
+                    label="Image 1",
+                    show_label=True,
+                    show_download_button=True,
+                    show_share_button=True,
+                )
+                image2 = gr.Image(
+                    value="datasets_hub.png",
+                    label="Image 2",
+                )
+            with gr.Column():
+                image3 = gr.Image(
+                    value="datasets_hub.png",
+                    label="Image 3",
+                )
+                image4 = gr.Image(
+                    value="datasets_hub.png",
+                    label="Image 4",
+                )
+        # Description for images
+        gr.Markdown("### Image Descriptions")
+        gr.Markdown("Description for Image 1")
+        gr.Markdown("Description for Image 2")
+        gr.Markdown("Description for Image 3")
+        gr.Markdown("Description for Image 4")
+        # Collapsible block for citation
+        with gr.Accordion("Citation Information"):
+            gr.Markdown(
+                """
+                        If you use the images or code please cite:
+                        ```
+                        fjdlsafd
+                        ```
+                        """
+            )
+    return app
+# Build the interface when the module is loaded and start serving it.
+app = create_app()
+app.launch()

numero_datasets_hub.ipynb ADDED Viewed

	@@ -0,0 +1,781 @@

+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "bCPvBCk_VLoi",
+        "outputId": "48174b27-072f-4cf9-bfcc-2a7cb12f60ba"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Requirement already satisfied: huggingface_hub in /usr/local/lib/python3.10/dist-packages (0.20.3)\n",
+            "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (3.14.0)\n",
+            "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (2023.6.0)\n",
+            "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (2.31.0)\n",
+            "Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (4.66.4)\n",
+            "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (6.0.1)\n",
+            "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (4.11.0)\n",
+            "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (24.0)\n",
+            "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub) (3.3.2)\n",
+            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub) (3.7)\n",
+            "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub) (2.0.7)\n",
+            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub) (2024.2.2)\n"
+          ]
+        }
+      ],
+      "source": [
+        "!pip install huggingface_hub"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {
+        "id": "NbQeXxudVJW9"
+      },
+      "outputs": [],
+      "source": [
+        "from datetime import datetime\n",
+        "\n",
+        "import matplotlib.pyplot as plt\n",
+        "import pandas as pd\n",
+        "from huggingface_hub import HfApi\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "ogyTHBYJVZ8I",
+        "outputId": "f23a554a-7328-4e50-d87c-90368294467d"
+      },
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n",
+            "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
+            "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
+            "You will be able to reuse this secret in all of your notebooks.\n",
+            "Please note that authentication is recommended but still optional to access public models or datasets.\n",
+            "  warnings.warn(\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "145101\n"
+          ]
+        }
+      ],
+      "source": [
+        "hf_api = HfApi()\n",
+        "\n",
+        "all_datasets = hf_api.list_datasets(full=True)\n",
+        "\n",
+        "total_count = len(list(all_datasets))\n",
+        "print(total_count)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "metadata": {
+        "id": "GXDMUU-4XmaI"
+      },
+      "outputs": [],
+      "source": [
+        "# language_filter = filter(lambda dataset: 'language:es' in dataset.tags, all_datasets) # 882\n",
+        "\n",
+        "# spanish_filter = filter(lambda d: \"language:es\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:es\" for tag in d.tags), all_datasets) # 317\n",
+        "\n",
+        "#filtered_datasets_2 = filter(lambda dataset: \"es\" in dataset.card_data.language, all_datasets) # 882\n",
+        "\n",
+        "#filtered_datasets_3 = filter(lambda dataset: dataset.card_data.language == [\"es\"], all_datasets) #\n",
+        "\n",
+        "#for dataset in spanish_only_datasets:\n",
+        "#    print(dataset)\n",
+        "#    break"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 5,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "pjCvHVq_hChx",
+        "outputId": "d279d0bc-a3c6-4994-f23c-a7274b1f4ee8"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "318\n"
+          ]
+        }
+      ],
+      "source": [
+        "hf_api = HfApi()\n",
+        "\n",
+        "all_datasets = hf_api.list_datasets(full=True)\n",
+        "\n",
+        "spanish_filter = filter(lambda d: \"language:es\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:es\" for tag in d.tags), all_datasets) # 317\n",
+        "spanish_datasets = list(spanish_filter)\n",
+        "spanish_count = len(list(spanish_datasets))\n",
+        "print(spanish_count)\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 6,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "WANGkTpGRw8t",
+        "outputId": "da8931bf-7ae2-438d-8188-20190f568193"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "8357\n"
+          ]
+        }
+      ],
+      "source": [
+        "hf_api = HfApi()\n",
+        "\n",
+        "all_datasets = hf_api.list_datasets(full=True)\n",
+        "\n",
+        "english_filter = filter(lambda d: \"language:en\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:en\" for tag in d.tags), all_datasets)\n",
+        "english_datasets = list(english_filter)\n",
+        "english_count = len(list(english_datasets))\n",
+        "print(english_count)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 7,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "yPtF0G7SWS53",
+        "outputId": "a2a51160-c803-4e7f-a6dc-8879eea1dd69"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "568\n"
+          ]
+        }
+      ],
+      "source": [
+        "hf_api = HfApi()\n",
+        "\n",
+        "all_datasets = hf_api.list_datasets(full=True)\n",
+        "\n",
+        "chinese_filter = filter(lambda d: \"language:zh\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:zh\" for tag in d.tags), all_datasets)\n",
+        "chinese_datasets = list(chinese_filter)\n",
+        "chinese_count = len(list(chinese_datasets))\n",
+        "print(chinese_count)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 8,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "RlxAlOOsW7p9",
+        "outputId": "f1c12edd-5502-4018-b9a7-149f9fc29322"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "436\n"
+          ]
+        }
+      ],
+      "source": [
+        "hf_api = HfApi()\n",
+        "\n",
+        "all_datasets = hf_api.list_datasets(full=True)\n",
+        "\n",
+        "french_filter = filter(lambda d: \"language:fr\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:fr\" for tag in d.tags), all_datasets)\n",
+        "french_datasets = list(french_filter)\n",
+        "french_count = len(list(french_datasets))\n",
+        "print(french_count)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 9,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "OMQfBXjUYBPz",
+        "outputId": "8cd3fdb9-0bc8-4d82-d25b-fb9eef7118ed"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "13886\n"
+          ]
+        }
+      ],
+      "source": [
+        "hf_api = HfApi()\n",
+        "\n",
+        "all_datasets = hf_api.list_datasets(full=True)\n",
+        "\n",
+        "mono_filter = filter(lambda dataset: sum(tag.startswith('language:') for tag in dataset.tags) == 1, all_datasets)\n",
+        "mono_datasets = list(mono_filter)\n",
+        "mono_count = len(list(mono_datasets))\n",
+        "print(mono_count)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 10,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 180
+        },
+        "id": "sTPechkdWmYS",
+        "outputId": "bb49f9f4-150b-4a29-d58e-faff4f88cce3"
+      },
+      "outputs": [
+        {
+          "ename": "AssertionError",
+          "evalue": "",
+          "output_type": "error",
+          "traceback": [
+            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+            "\u001b[0;31mAssertionError\u001b[0m                            Traceback (most recent call last)",
+            "\u001b[0;32m<ipython-input-10-da38b5a6b412>\u001b[0m in \u001b[0;36m<cell line: 7>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0mcreation_dates_english\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0md\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreated_at\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0md\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menglish_datasets\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcreation_dates_english\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m8336\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+            "\u001b[0;31mAssertionError\u001b[0m: "
+          ]
+        }
+      ],
+      "source": [
+        "# Extract creation date\n",
+        "\n",
+        "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
+        "# Check against the counts computed in the earlier cells: hard-coded totals go stale as the Hub grows and abort the run\n",
+        "assert len(creation_dates_spanish) == spanish_count\n",
+        "\n",
+        "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
+        "assert len(creation_dates_english) == english_count"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "hefZVynDSjjE"
+      },
+      "outputs": [],
+      "source": [
+        "print(creation_dates_spanish[0])"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "aFaEBlkkSbrs"
+      },
+      "source": [
+        "## Bar Chart\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "dYJ2zd4dShYh"
+      },
+      "outputs": [],
+      "source": [
+        "import matplotlib.pyplot as plt\n",
+        "from collections import Counter\n",
+        "\n",
+        "# Sample data (replace with your actual data)\n",
+        "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
+        "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
+        "\n",
+        "# Extract years from the creation dates\n",
+        "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
+        "english_counts = Counter(date.year for date in creation_dates_english)\n",
+        "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
+        "\n",
+        "# Plotting the bar chart\n",
+        "plt.figure(figsize=(10, 6))\n",
+        "plt.bar(years, [english_counts[year] for year in years], width=0.4, label='English Datasets', color='blue')\n",
+        "plt.bar(years, [spanish_counts[year] for year in years], width=0.4, label='Spanish Datasets', color='orange', bottom=[english_counts[year] for year in years])\n",
+        "\n",
+        "# Adding labels and title\n",
+        "plt.xlabel('Year')\n",
+        "plt.ylabel('Number of Datasets')\n",
+        "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
+        "plt.xticks(years)\n",
+        "plt.legend()\n",
+        "\n",
+        "# Save before displaying: under the inline notebook backend plt.show() closes the figure, so a later savefig writes a blank image\n",
+        "plt.grid(True)\n",
+        "plt.tight_layout()\n",
+        "plt.savefig(\"plots/bar_stack.png\")\n",
+        "plt.show()\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "wViEE4wCUVgs"
+      },
+      "outputs": [],
+      "source": [
+        "import matplotlib.pyplot as plt\n",
+        "import numpy as np\n",
+        "from collections import Counter\n",
+        "\n",
+        "# Sample data (replace with your actual data)\n",
+        "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
+        "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
+        "\n",
+        "# Extract years from the creation dates\n",
+        "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
+        "english_counts = Counter(date.year for date in creation_dates_english)\n",
+        "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
+        "\n",
+        "# Define the width of each bar\n",
+        "bar_width = 0.4\n",
+        "\n",
+        "# Define the x-coordinates for the bars\n",
+        "years_index = np.arange(len(years))\n",
+        "\n",
+        "# Plotting the side-by-side bar chart\n",
+        "plt.figure(figsize=(10, 6))\n",
+        "plt.bar(years_index - bar_width/2, [english_counts[year] for year in years], width=bar_width, label='English Datasets', color='blue')\n",
+        "plt.bar(years_index + bar_width/2, [spanish_counts[year] for year in years], width=bar_width, label='Spanish Datasets', color='orange')\n",
+        "\n",
+        "# Adding labels and title\n",
+        "plt.xlabel('Year')\n",
+        "plt.ylabel('Number of Datasets')\n",
+        "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
+        "plt.xticks(years_index, years)\n",
+        "plt.legend()\n",
+        "\n",
+        "# Save before displaying: under the inline notebook backend plt.show() closes the figure, so a later savefig writes a blank image\n",
+        "plt.grid(True)\n",
+        "plt.tight_layout()\n",
+        "plt.savefig(\"plots/bar_width.png\")\n",
+        "plt.show()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Hp8vNA6LUA1E"
+      },
+      "source": [
+        "# Stacked Area Chart\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "CWgCunzGUCot"
+      },
+      "outputs": [],
+      "source": [
+        "import matplotlib.pyplot as plt\n",
+        "from collections import Counter\n",
+        "\n",
+        "# Sample data (replace with your actual data)\n",
+        "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
+        "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
+        "\n",
+        "# Extract years from the creation dates\n",
+        "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
+        "english_counts = Counter(date.year for date in creation_dates_english)\n",
+        "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
+        "\n",
+        "# Calculate cumulative counts\n",
+        "english_datasets_cumulative = [english_counts[year] for year in years]\n",
+        "spanish_datasets_cumulative = [spanish_counts[year] for year in years]\n",
+        "for i in range(1, len(years)):\n",
+        "    english_datasets_cumulative[i] += english_datasets_cumulative[i-1]\n",
+        "    spanish_datasets_cumulative[i] += spanish_datasets_cumulative[i-1]\n",
+        "\n",
+        "# Plotting the stacked area chart\n",
+        "plt.figure(figsize=(10, 6))\n",
+        "plt.stackplot(years, english_datasets_cumulative, spanish_datasets_cumulative, labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
+        "\n",
+        "# Adding labels and title\n",
+        "plt.xlabel('Year')\n",
+        "plt.ylabel('Cumulative Number of Datasets')\n",
+        "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
+        "plt.xticks(years)\n",
+        "plt.legend(loc='upper left')\n",
+        "\n",
+        "# Save before displaying: under the inline notebook backend plt.show() closes the figure, so a later savefig writes a blank image\n",
+        "plt.grid(True)\n",
+        "plt.tight_layout()\n",
+        "plt.savefig(\"plots/stack_area_1.png\")\n",
+        "\n",
+        "plt.show()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "GwRpZwYWhau3"
+      },
+      "outputs": [],
+      "source": [
+        "import matplotlib.pyplot as plt\n",
+        "import pandas as pd\n",
+        "from collections import Counter\n",
+        "\n",
+        "# Sample data (replace with your actual data)\n",
+        "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
+        "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
+        "\n",
+        "# Extract months from the creation dates\n",
+        "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
+        "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
+        "\n",
+        "# Count the occurrences of each month\n",
+        "english_counts = Counter(months_english)\n",
+        "spanish_counts = Counter(months_spanish)\n",
+        "\n",
+        "# Create a DataFrame for English datasets\n",
+        "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
+        "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
+        "df_english = df_english.sort_index()\n",
+        "\n",
+        "# Create a DataFrame for Spanish datasets\n",
+        "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
+        "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
+        "df_spanish = df_spanish.sort_index()\n",
+        "\n",
+        "# Merge the DataFrames\n",
+        "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
+        "\n",
+        "# Convert index to datetime\n",
+        "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
+        "\n",
+        "# Calculate cumulative sum\n",
+        "df_cumulative = df.cumsum()\n",
+        "\n",
+        "# Plotting the stacked area chart\n",
+        "plt.figure(figsize=(8, 5))\n",
+        "plt.stackplot(df_cumulative.index, df_cumulative['English'], df_cumulative['Spanish'], labels=['English', 'Spanish'], colors=['orange', 'blue'])\n",
+        "\n",
+        "# Adding labels and title\n",
+        "plt.xlabel('Creation date')\n",
+        "plt.ylabel('Cumulative number of monolingual datasets')\n",
+        "plt.title('Cumulative growth of monolingual English and Spanish datasets in the Hugging Face Hub over time')\n",
+        "\n",
+        "# Save before displaying: under the inline notebook backend plt.show() closes the figure, so a later savefig writes a blank image\n",
+        "plt.xticks(rotation=45)\n",
+        "plt.legend(loc='upper left')\n",
+        "plt.grid(False)\n",
+        "plt.tight_layout()\n",
+        "plt.savefig(\"plots/stack_area_2.png\")\n",
+        "\n",
+        "plt.show()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "kJQ0OgRtglOQ"
+      },
+      "outputs": [],
+      "source": [
+        "import matplotlib.pyplot as plt\n",
+        "import pandas as pd\n",
+        "from collections import Counter\n",
+        "\n",
+        "# Sample data (replace with your actual data)\n",
+        "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
+        "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
+        "\n",
+        "# Extract months from the creation dates\n",
+        "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
+        "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
+        "\n",
+        "# Count the occurrences of each month\n",
+        "english_counts = Counter(months_english)\n",
+        "spanish_counts = Counter(months_spanish)\n",
+        "\n",
+        "# Create a DataFrame for English datasets\n",
+        "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
+        "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
+        "df_english = df_english.sort_index()\n",
+        "\n",
+        "# Create a DataFrame for Spanish datasets\n",
+        "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
+        "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
+        "df_spanish = df_spanish.sort_index()\n",
+        "\n",
+        "# Merge the DataFrames\n",
+        "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
+        "\n",
+        "# Convert index to datetime\n",
+        "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
+        "\n",
+        "# Plotting the stacked area chart\n",
+        "plt.figure(figsize=(10, 6))\n",
+        "plt.stackplot(df.index, df['English'], df['Spanish'], labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
+        "\n",
+        "# Adding labels and title\n",
+        "plt.xlabel('Date')\n",
+        "plt.ylabel('Cumulative Number of Datasets')\n",
+        "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
+        "\n",
+        "# Save before displaying: under the inline notebook backend plt.show() closes the figure, so a later savefig writes a blank image\n",
+        "plt.xticks(rotation=45)\n",
+        "plt.legend(loc='upper left')\n",
+        "plt.grid(True)\n",
+        "plt.tight_layout()\n",
+        "plt.savefig(\"plots/stack_area_3.png\")\n",
+        "\n",
+        "plt.show()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "IAnFHiPlgnRE"
+      },
+      "source": [
+        "## Pie Chart"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "8tKR1x-kVeZT"
+      },
+      "outputs": [],
+      "source": [
+        "import matplotlib.pyplot as plt\n",
+        "from collections import Counter\n",
+        "\n",
+        "# Calculate the count of \"other\" datasets\n",
+        "other_count = mono_count - (english_count + spanish_count + chinese_count + french_count)\n",
+        "\n",
+        "# Pie chart data\n",
+        "labels = ['English', 'Chinese', 'French', 'Spanish', 'Other']\n",
+        "sizes = [english_count, chinese_count, french_count, spanish_count, other_count]\n",
+        "\n",
+        "# Plotting the pie chart\n",
+        "plt.figure(figsize=(8, 8))\n",
+        "plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=180, colors=['blue', 'red', 'green', 'orange', 'purple'])\n",
+        "plt.title('Distribution of Monolingual Datasets by Language')\n",
+        "plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.\n",
+        "\n",
+        "# Save before displaying: under the inline notebook backend plt.show() closes the figure, so a later savefig writes a blank image\n",
+        "plt.savefig(\"plots/pie_chart.png\")\n",
+        "\n",
+        "plt.show()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "z2xf8FrHROxy"
+      },
+      "source": [
+        "# Time series plot"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "DuPFSZKUhyQj"
+      },
+      "outputs": [],
+      "source": [
+        "# Prepare data for plotting\n",
+        "\n",
+        "df = pd.DataFrame(creation_dates_spanish, columns=[\"Date\"])\n",
+        "df[\"Count\"] = 1\n",
+        "# Ensure the 'Date' column is of type datetime\n",
+        "df['Date'] = pd.to_datetime(df['Date'])\n",
+        "# Group by month and calculate cumulative sum\n",
+        "df = df.groupby(pd.Grouper(key=\"Date\", freq=\"MS\")).sum().cumsum()\n",
+        "\n",
+        "# Plot the data\n",
+        "plt.figure(figsize=(10, 6))\n",
+        "plt.plot(\n",
+        "    df.index,\n",
+        "    df[\"Count\"],\n",
+        "    #marker=\"o\",\n",
+        "    color=\"g\"\n",
+        ")\n",
+        "plt.title(\"Evolución de bases de datos monolingües en español\")\n",
+        "plt.xlabel(\"Fecha\")\n",
+        "plt.ylabel(\"Número de bases de datos\")\n",
+        "plt.grid(True)\n",
+        "plt.xticks(rotation=45)\n",
+        "plt.tight_layout()\n",
+        "plt.show()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "-Vu3PIe2hITq"
+      },
+      "outputs": [],
+      "source": [
+        "import matplotlib.pyplot as plt\n",
+        "import pandas as pd\n",
+        "from collections import Counter\n",
+        "\n",
+        "# Sample data (replace with your actual data)\n",
+        "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
+        "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
+        "\n",
+        "# Extract months from the creation dates\n",
+        "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
+        "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
+        "\n",
+        "# Count the occurrences of each month\n",
+        "english_counts = Counter(months_english)\n",
+        "spanish_counts = Counter(months_spanish)\n",
+        "\n",
+        "# Create a DataFrame for English datasets\n",
+        "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
+        "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
+        "df_english = df_english.sort_index()\n",
+        "\n",
+        "# Create a DataFrame for Spanish datasets\n",
+        "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
+        "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
+        "df_spanish = df_spanish.sort_index()\n",
+        "\n",
+        "# Merge the DataFrames\n",
+        "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
+        "\n",
+        "# Convert index to datetime\n",
+        "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
+        "\n",
+        "# Calculate cumulative sum\n",
+        "df_cumulative = df.cumsum()\n",
+        "\n",
+        "# Plotting the cumulative chart\n",
+        "plt.figure(figsize=(10, 6))\n",
+        "plt.plot(df_cumulative.index, df_cumulative['English'], label='English', color='blue')\n",
+        "plt.plot(df_cumulative.index, df_cumulative['Spanish'], label='Spanish', color='orange')\n",
+        "\n",
+        "# Adding labels and title\n",
+        "plt.xlabel('Date')\n",
+        "plt.ylabel('Cumulative Number of Datasets')\n",
+        "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
+        "\n",
+        "# Display the plot\n",
+        "plt.xticks(rotation=45)\n",
+        "plt.legend(loc='upper left')\n",
+        "plt.grid(True)\n",
+        "plt.tight_layout()\n",
+        "plt.show()\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "KG__of2IfdHu"
+      },
+      "outputs": [],
+      "source": [
+        "import matplotlib.pyplot as plt\n",
+        "import pandas as pd\n",
+        "from collections import Counter\n",
+        "\n",
+        "# Creation dates of the monolingual English and Spanish datasets\n",
+        "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
+        "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
+        "\n",
+        "# Extract years from the creation dates\n",
+        "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
+        "english_counts = Counter(date.year for date in creation_dates_english)\n",
+        "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
+        "\n",
+        "# Prepare data for plotting\n",
+        "english_series = pd.Series([english_counts[year] for year in years], index=years)\n",
+        "spanish_series = pd.Series([spanish_counts[year] for year in years], index=years)\n",
+        "\n",
+        "# Plotting the time series\n",
+        "plt.figure(figsize=(10, 6))\n",
+        "plt.plot(english_series.index, english_series.values, label='English', color='blue')\n",
+        "plt.plot(spanish_series.index, spanish_series.values, label='Spanish', color='orange')\n",
+        "\n",
+        "# Adding labels and title\n",
+        "plt.title('Evolution of English and Spanish Datasets Over Time')\n",
+        "plt.xlabel('Year')\n",
+        "plt.ylabel('Number of Datasets')\n",
+        "plt.legend()\n",
+        "plt.grid(True)\n",
+        "plt.xticks(rotation=45)\n",
+        "plt.tight_layout()\n",
+        "plt.show()\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "T4",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python",
+      "version": "3.11.6"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}

numero_datasets_hub_output.ipynb ADDED Viewed

	@@ -0,0 +1,918 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9b494ecb",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "bCPvBCk_VLoi",
+    "outputId": "48174b27-072f-4cf9-bfcc-2a7cb12f60ba",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "!pip install huggingface_hub"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d736660e",
+   "metadata": {
+    "id": "NbQeXxudVJW9",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from datetime import datetime\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import pandas as pd\n",
+    "from huggingface_hub import HfApi\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8dc1a8d8",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "ogyTHBYJVZ8I",
+    "outputId": "f23a554a-7328-4e50-d87c-90368294467d",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "hf_api = HfApi()\n",
+    "\n",
+    "all_datasets = hf_api.list_datasets(full=True)\n",
+    "\n",
+    "total_count = len(list(all_datasets))\n",
+    "print(total_count)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "299e6d56",
+   "metadata": {
+    "id": "GXDMUU-4XmaI",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# language_filter = filter(lambda dataset: 'language:es' in dataset.tags, all_datasets) # 882\n",
+    "\n",
+    "# spanish_filter = filter(lambda d: \"language:es\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:es\" for tag in d.tags), all_datasets) # 317\n",
+    "\n",
+    "#filtered_datasets_2 = filter(lambda dataset: \"es\" in dataset.card_data.language, all_datasets) # 882\n",
+    "\n",
+    "#filtered_datasets_3 = filter(lambda dataset: dataset.card_data.language == [\"es\"], all_datasets) #\n",
+    "\n",
+    "#for dataset in spanish_only_datasets:\n",
+    "#    print(dataset)\n",
+    "#    break"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "691d8f3a",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "pjCvHVq_hChx",
+    "outputId": "d279d0bc-a3c6-4994-f23c-a7274b1f4ee8",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "hf_api = HfApi()\n",
+    "\n",
+    "all_datasets = hf_api.list_datasets(full=True)\n",
+    "\n",
+    "spanish_filter = filter(lambda d: \"language:es\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:es\" for tag in d.tags), all_datasets) # 317\n",
+    "spanish_datasets = list(spanish_filter)\n",
+    "spanish_count = len(list(spanish_datasets))\n",
+    "print(spanish_count)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c9676c89",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "WANGkTpGRw8t",
+    "outputId": "da8931bf-7ae2-438d-8188-20190f568193",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "hf_api = HfApi()\n",
+    "\n",
+    "all_datasets = hf_api.list_datasets(full=True)\n",
+    "\n",
+    "english_filter = filter(lambda d: \"language:en\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:en\" for tag in d.tags), all_datasets)\n",
+    "english_datasets = list(english_filter)\n",
+    "english_count = len(list(english_datasets))\n",
+    "print(english_count)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bf300ce6",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "yPtF0G7SWS53",
+    "outputId": "a2a51160-c803-4e7f-a6dc-8879eea1dd69",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "hf_api = HfApi()\n",
+    "\n",
+    "all_datasets = hf_api.list_datasets(full=True)\n",
+    "\n",
+    "chinese_filter = filter(lambda d: \"language:zh\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:zh\" for tag in d.tags), all_datasets)\n",
+    "chinese_datasets = list(chinese_filter)\n",
+    "chinese_count = len(list(chinese_datasets))\n",
+    "print(chinese_count)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "407c46fc",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "RlxAlOOsW7p9",
+    "outputId": "f1c12edd-5502-4018-b9a7-149f9fc29322",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "hf_api = HfApi()\n",
+    "\n",
+    "all_datasets = hf_api.list_datasets(full=True)\n",
+    "\n",
+    "french_filter = filter(lambda d: \"language:fr\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:fr\" for tag in d.tags), all_datasets)\n",
+    "french_datasets = list(french_filter)\n",
+    "french_count = len(list(french_datasets))\n",
+    "print(french_count)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a7d82d5d",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "OMQfBXjUYBPz",
+    "outputId": "8cd3fdb9-0bc8-4d82-d25b-fb9eef7118ed",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "hf_api = HfApi()\n",
+    "\n",
+    "all_datasets = hf_api.list_datasets(full=True)\n",
+    "\n",
+    "mono_filter = filter(lambda dataset: sum(tag.startswith('language:') for tag in dataset.tags) == 1, all_datasets)\n",
+    "mono_datasets = list(mono_filter)\n",
+    "mono_count = len(list(mono_datasets))\n",
+    "print(mono_count)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6dc0ac68",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 180
+    },
+    "id": "sTPechkdWmYS",
+    "outputId": "bb49f9f4-150b-4a29-d58e-faff4f88cce3",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Extract creation date\n",
+    "\n",
+    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
+    "assert len(creation_dates_spanish) == 318\n",
+    "\n",
+    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
+    "assert len(creation_dates_english) == 8336"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "57d206ec",
+   "metadata": {
+    "id": "hefZVynDSjjE",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "print(creation_dates_spanish[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b80e411d",
+   "metadata": {
+    "id": "aFaEBlkkSbrs",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "## Bar Chart\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "96652421",
+   "metadata": {
+    "id": "dYJ2zd4dShYh",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "from collections import Counter\n",
+    "\n",
+    "# Creation dates of the monolingual English and Spanish datasets\n",
+    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
+    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
+    "\n",
+    "# Extract years from the creation dates\n",
+    "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
+    "english_counts = Counter(date.year for date in creation_dates_english)\n",
+    "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
+    "\n",
+    "# Plotting the bar chart\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "plt.bar(years, [english_counts[year] for year in years], width=0.4, label='English Datasets', color='blue')\n",
+    "plt.bar(years, [spanish_counts[year] for year in years], width=0.4, label='Spanish Datasets', color='orange', bottom=[english_counts[year] for year in years])\n",
+    "\n",
+    "# Adding labels and title\n",
+    "plt.xlabel('Year')\n",
+    "plt.ylabel('Number of Datasets')\n",
+    "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
+    "plt.xticks(years)\n",
+    "plt.legend()\n",
+    "\n",
+    "# Display the plot\n",
+    "plt.grid(True)\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
+    "plt.savefig(\"plots/bar_stack.png\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2d1ae015",
+   "metadata": {
+    "id": "wViEE4wCUVgs",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "from collections import Counter\n",
+    "\n",
+    "# Creation dates of the monolingual English and Spanish datasets\n",
+    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
+    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
+    "\n",
+    "# Extract years from the creation dates\n",
+    "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
+    "english_counts = Counter(date.year for date in creation_dates_english)\n",
+    "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
+    "\n",
+    "# Define the width of each bar\n",
+    "bar_width = 0.4\n",
+    "\n",
+    "# Define the x-coordinates for the bars\n",
+    "years_index = np.arange(len(years))\n",
+    "\n",
+    "# Plotting the side-by-side bar chart\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "plt.bar(years_index - bar_width/2, [english_counts[year] for year in years], width=bar_width, label='English Datasets', color='blue')\n",
+    "plt.bar(years_index + bar_width/2, [spanish_counts[year] for year in years], width=bar_width, label='Spanish Datasets', color='orange')\n",
+    "\n",
+    "# Adding labels and title\n",
+    "plt.xlabel('Year')\n",
+    "plt.ylabel('Number of Datasets')\n",
+    "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
+    "plt.xticks(years_index, years)\n",
+    "plt.legend()\n",
+    "\n",
+    "# Display the plot\n",
+    "plt.grid(True)\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
+    "plt.savefig(\"plots/bar_width.png\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cddf7237",
+   "metadata": {
+    "id": "Hp8vNA6LUA1E",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "## Stacked Area Chart\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "68255399",
+   "metadata": {
+    "id": "CWgCunzGUCot",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "from collections import Counter\n",
+    "\n",
+    "# Creation dates of the monolingual English and Spanish datasets\n",
+    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
+    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
+    "\n",
+    "# Extract years from the creation dates\n",
+    "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
+    "english_counts = Counter(date.year for date in creation_dates_english)\n",
+    "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
+    "\n",
+    "# Calculate cumulative counts\n",
+    "english_datasets_cumulative = [english_counts[year] for year in years]\n",
+    "spanish_datasets_cumulative = [spanish_counts[year] for year in years]\n",
+    "for i in range(1, len(years)):\n",
+    "    english_datasets_cumulative[i] += english_datasets_cumulative[i-1]\n",
+    "    spanish_datasets_cumulative[i] += spanish_datasets_cumulative[i-1]\n",
+    "\n",
+    "# Plotting the stacked area chart\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "plt.stackplot(years, english_datasets_cumulative, spanish_datasets_cumulative, labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
+    "\n",
+    "# Adding labels and title\n",
+    "plt.xlabel('Year')\n",
+    "plt.ylabel('Cumulative Number of Datasets')\n",
+    "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
+    "plt.xticks(years)\n",
+    "plt.legend(loc='upper left')\n",
+    "\n",
+    "# Display the plot\n",
+    "plt.grid(True)\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
+    "\n",
+    "plt.savefig(\"plots/stack_area_1.png\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4ba74cf5",
+   "metadata": {
+    "id": "GwRpZwYWhau3",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import pandas as pd\n",
+    "from collections import Counter\n",
+    "\n",
+    "# Creation dates of the monolingual English and Spanish datasets\n",
+    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
+    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
+    "\n",
+    "# Extract months from the creation dates\n",
+    "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
+    "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
+    "\n",
+    "# Count the occurrences of each month\n",
+    "english_counts = Counter(months_english)\n",
+    "spanish_counts = Counter(months_spanish)\n",
+    "\n",
+    "# Create a DataFrame for English datasets\n",
+    "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
+    "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
+    "df_english = df_english.sort_index()\n",
+    "\n",
+    "# Create a DataFrame for Spanish datasets\n",
+    "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
+    "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
+    "df_spanish = df_spanish.sort_index()\n",
+    "\n",
+    "# Merge the DataFrames\n",
+    "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
+    "\n",
+    "# Convert index to datetime\n",
+    "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
+    "\n",
+    "# Calculate cumulative sum\n",
+    "df_cumulative = df.cumsum()\n",
+    "\n",
+    "# Plotting the stacked area chart\n",
+    "plt.figure(figsize=(8, 5))\n",
+    "plt.stackplot(df_cumulative.index, df_cumulative['English'], df_cumulative['Spanish'], labels=['English', 'Spanish'], colors=['orange', 'blue'])\n",
+    "\n",
+    "# Adding labels and title\n",
+    "plt.xlabel('Creation date')\n",
+    "plt.ylabel('Cumulative number of monolingual datasets')\n",
+    "plt.title('Cumulative growth of monolingual English and Spanish datasets in the Hugging Face Hub over time')\n",
+    "\n",
+    "# Display the plot\n",
+    "plt.xticks(rotation=45)\n",
+    "plt.legend(loc='upper left')\n",
+    "plt.grid(False)\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
+    "\n",
+    "plt.savefig(\"plots/stack_area_2.png\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d96225ce",
+   "metadata": {
+    "id": "kJQ0OgRtglOQ",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import pandas as pd\n",
+    "from collections import Counter\n",
+    "\n",
+    "# Creation dates of the monolingual English and Spanish datasets\n",
+    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
+    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
+    "\n",
+    "# Extract months from the creation dates\n",
+    "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
+    "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
+    "\n",
+    "# Count the occurrences of each month\n",
+    "english_counts = Counter(months_english)\n",
+    "spanish_counts = Counter(months_spanish)\n",
+    "\n",
+    "# Create a DataFrame for English datasets\n",
+    "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
+    "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
+    "df_english = df_english.sort_index()\n",
+    "\n",
+    "# Create a DataFrame for Spanish datasets\n",
+    "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
+    "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
+    "df_spanish = df_spanish.sort_index()\n",
+    "\n",
+    "# Merge the DataFrames\n",
+    "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
+    "\n",
+    "# Convert index to datetime\n",
+    "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
+    "\n",
+    "# Plotting the stacked area chart\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "plt.stackplot(df.index, df['English'], df['Spanish'], labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
+    "\n",
+    "# Adding labels and title\n",
+    "plt.xlabel('Date')\n",
+    "plt.ylabel('Cumulative Number of Datasets')\n",
+    "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
+    "\n",
+    "# Display the plot\n",
+    "plt.xticks(rotation=45)\n",
+    "plt.legend(loc='upper left')\n",
+    "plt.grid(True)\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
+    "\n",
+    "plt.savefig(\"plots/stack_area_3.png\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7bbec0ac",
+   "metadata": {
+    "id": "IAnFHiPlgnRE",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "## Pie Chart"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c3dd684",
+   "metadata": {
+    "id": "8tKR1x-kVeZT",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "from collections import Counter\n",
+    "\n",
+    "# Calculate the count of \"other\" datasets\n",
+    "other_count = mono_count - (english_count + spanish_count + chinese_count + french_count)\n",
+    "\n",
+    "# Pie chart data\n",
+    "labels = ['English', 'Chinese', 'French', 'Spanish', 'Other']\n",
+    "sizes = [english_count, chinese_count, french_count, spanish_count, other_count]\n",
+    "\n",
+    "# Plotting the pie chart\n",
+    "plt.figure(figsize=(8, 8))\n",
+    "plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=180, colors=['blue', 'red', 'green', 'orange', 'purple'])\n",
+    "plt.title('Distribution of Monolingual Datasets by Language')\n",
+    "plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.\n",
+    "\n",
+    "# Display the plot\n",
+    "plt.show()\n",
+    "\n",
+    "plt.savefig(\"plots/pie_chart.png\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "11c1c9c8",
+   "metadata": {
+    "id": "z2xf8FrHROxy",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "## Time Series Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1bb6a676",
+   "metadata": {
+    "id": "DuPFSZKUhyQj",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Prepare data for plotting\n",
+    "\n",
+    "df = pd.DataFrame(creation_dates_spanish, columns=[\"Date\"])\n",
+    "df[\"Count\"] = 1\n",
+    "# Ensure the 'Date' column is of type datetime\n",
+    "df['Date'] = pd.to_datetime(df['Date'])\n",
+    "# Group by month and calculate cumulative sum\n",
+    "df = df.groupby(pd.Grouper(key=\"Date\", freq=\"MS\")).sum().cumsum()\n",
+    "\n",
+    "# Plot the data\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "plt.plot(\n",
+    "    df.index,\n",
+    "    df[\"Count\"],\n",
+    "    #marker=\"o\",\n",
+    "    color=\"g\"\n",
+    ")\n",
+    "plt.title(\"Evolución de bases de datos monolingües en español\")\n",
+    "plt.xlabel(\"Fecha\")\n",
+    "plt.ylabel(\"Número de bases de datos\")\n",
+    "plt.grid(True)\n",
+    "plt.xticks(rotation=45)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2fc77d7f",
+   "metadata": {
+    "id": "-Vu3PIe2hITq",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import pandas as pd\n",
+    "from collections import Counter\n",
+    "\n",
+    "# Creation dates of the monolingual English and Spanish datasets\n",
+    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
+    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
+    "\n",
+    "# Extract months from the creation dates\n",
+    "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
+    "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
+    "\n",
+    "# Count the occurrences of each month\n",
+    "english_counts = Counter(months_english)\n",
+    "spanish_counts = Counter(months_spanish)\n",
+    "\n",
+    "# Create a DataFrame for English datasets\n",
+    "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
+    "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
+    "df_english = df_english.sort_index()\n",
+    "\n",
+    "# Create a DataFrame for Spanish datasets\n",
+    "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
+    "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
+    "df_spanish = df_spanish.sort_index()\n",
+    "\n",
+    "# Merge the DataFrames\n",
+    "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
+    "\n",
+    "# Convert index to datetime\n",
+    "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
+    "\n",
+    "# Calculate cumulative sum\n",
+    "df_cumulative = df.cumsum()\n",
+    "\n",
+    "# Plotting the cumulative chart\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "plt.plot(df_cumulative.index, df_cumulative['English'], label='English', color='blue')\n",
+    "plt.plot(df_cumulative.index, df_cumulative['Spanish'], label='Spanish', color='orange')\n",
+    "\n",
+    "# Adding labels and title\n",
+    "plt.xlabel('Date')\n",
+    "plt.ylabel('Cumulative Number of Datasets')\n",
+    "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
+    "\n",
+    "# Display the plot\n",
+    "plt.xticks(rotation=45)\n",
+    "plt.legend(loc='upper left')\n",
+    "plt.grid(True)\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6c0d23ac",
+   "metadata": {
+    "id": "KG__of2IfdHu",
+    "papermill": {
+     "duration": null,
+     "end_time": null,
+     "exception": null,
+     "start_time": null,
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import pandas as pd\n",
+    "from collections import Counter\n",
+    "\n",
+    "# Creation dates of the monolingual English and Spanish datasets\n",
+    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
+    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
+    "\n",
+    "# Extract years from the creation dates\n",
+    "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
+    "english_counts = Counter(date.year for date in creation_dates_english)\n",
+    "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
+    "\n",
+    "# Prepare data for plotting\n",
+    "english_series = pd.Series([english_counts[year] for year in years], index=years)\n",
+    "spanish_series = pd.Series([spanish_counts[year] for year in years], index=years)\n",
+    "\n",
+    "# Plotting the time series\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "plt.plot(english_series.index, english_series.values, label='English', color='blue')\n",
+    "plt.plot(spanish_series.index, spanish_series.values, label='Spanish', color='orange')\n",
+    "\n",
+    "# Adding labels and title\n",
+    "plt.title('Evolution of English and Spanish Datasets Over Time')\n",
+    "plt.xlabel('Year')\n",
+    "plt.ylabel('Number of Datasets')\n",
+    "plt.legend()\n",
+    "plt.grid(True)\n",
+    "plt.xticks(rotation=45)\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "gpuType": "T4",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11.6"
+  },
+  "papermill": {
+   "default_parameters": {},
+   "duration": 0.047858,
+   "end_time": "2024-05-15T09:04:29.634379",
+   "environment_variables": {},
+   "exception": null,
+   "input_path": "numero_datasets_hub.ipynb",
+   "output_path": "numero_datasets_hub_output.ipynb",
+   "parameters": {},
+   "start_time": "2024-05-15T09:04:29.586521",
+   "version": "2.6.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

plots/datasets_hub.png ADDED Viewed

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+gradio==4.31.0
+nbconvert
+papermill