mariagrandury committed on
Commit
1f70be8
·
1 Parent(s): 6f2c797

first commit

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ venv
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Language Gap In The Hub
3
  emoji: 📊
4
  colorFrom: pink
5
  colorTo: purple
@@ -9,5 +9,3 @@ app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
  ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Language Gap In The Hugging Face Hub
3
  emoji: 📊
4
  colorFrom: pink
5
  colorTo: purple
 
9
  pinned: false
10
  license: apache-2.0
11
  ---
 
 
app.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import papermill as pm
3
+
4
+
5
def run_notebook():
    """Execute ``numero_datasets_hub.ipynb`` with papermill and report the outcome.

    Returns:
        A status string: a fixed success message when execution completes, or
        a failure message that embeds the text of the raised exception.
    """
    source_notebook = "numero_datasets_hub.ipynb"
    # The executed copy is saved under this second path.
    executed_notebook = "numero_datasets_hub_output.ipynb"

    try:
        pm.execute_notebook(source_notebook, executed_notebook)
    except Exception as error:
        # Failures are reported to the caller as text rather than propagated.
        return f"Failed to execute notebook: {error!s}"

    return "Notebook executed successfully!"
15
+
16
+
17
def create_app():
    """Assemble the Gradio Blocks interface and return it without launching.

    The layout contains, in order: a title, a button wired to
    ``run_notebook`` with a label showing its result, a two-column grid of
    four images, a short list of image descriptions, and an accordion holding
    citation text.

    Returns:
        The ``gr.Blocks`` instance that was built.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# The language gap in the Hugging Face Hub")

        # Notebook trigger: the label receives the status string that
        # run_notebook returns.
        trigger = gr.Button("Run Notebook")
        status = gr.Label()
        trigger.click(run_notebook, outputs=status)

        # Two columns with two images each form the 2x2 grid.
        # NOTE(review): image 1 reads from "plots/" while images 2-4 use a
        # bare file name -- confirm which location is intended.
        with gr.Row():
            with gr.Column():
                gr.Image(
                    value="plots/datasets_hub.png",
                    label="Image 1",
                    show_label=True,
                    show_download_button=True,
                    show_share_button=True,
                )
                gr.Image(value="datasets_hub.png", label="Image 2")
            with gr.Column():
                gr.Image(value="datasets_hub.png", label="Image 3")
                gr.Image(value="datasets_hub.png", label="Image 4")

        # One description line per image, under a shared heading.
        gr.Markdown("### Image Descriptions")
        for image_number in range(1, 5):
            gr.Markdown(f"Description for Image {image_number}")

        # Collapsible citation section.
        # NOTE(review): the citation body looks like placeholder text.
        with gr.Accordion("Citation Information"):
            gr.Markdown(
                """
                If you use the images or code please cite:

                ```
                fjdlsafd
                ```
                """
            )

    return demo
72
+
73
+
74
# Build the interface at import time so `app` stays available as a module
# attribute.
app = create_app()

# Start the server only when this file is executed as a script; importing the
# module no longer launches it as a side effect.
if __name__ == "__main__":
    app.launch()
numero_datasets_hub.ipynb ADDED
@@ -0,0 +1,781 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {
7
+ "colab": {
8
+ "base_uri": "https://localhost:8080/"
9
+ },
10
+ "id": "bCPvBCk_VLoi",
11
+ "outputId": "48174b27-072f-4cf9-bfcc-2a7cb12f60ba"
12
+ },
13
+ "outputs": [
14
+ {
15
+ "name": "stdout",
16
+ "output_type": "stream",
17
+ "text": [
18
+ "Requirement already satisfied: huggingface_hub in /usr/local/lib/python3.10/dist-packages (0.20.3)\n",
19
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (3.14.0)\n",
20
+ "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (2023.6.0)\n",
21
+ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (2.31.0)\n",
22
+ "Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (4.66.4)\n",
23
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (6.0.1)\n",
24
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (4.11.0)\n",
25
+ "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (24.0)\n",
26
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub) (3.3.2)\n",
27
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub) (3.7)\n",
28
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub) (2.0.7)\n",
29
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub) (2024.2.2)\n"
30
+ ]
31
+ }
32
+ ],
33
+ "source": [
34
+ "!pip install huggingface_hub"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 2,
40
+ "metadata": {
41
+ "id": "NbQeXxudVJW9"
42
+ },
43
+ "outputs": [],
44
+ "source": [
45
+ "from datetime import datetime\n",
46
+ "\n",
47
+ "import matplotlib.pyplot as plt\n",
48
+ "import pandas as pd\n",
49
+ "from huggingface_hub import HfApi\n"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 3,
55
+ "metadata": {
56
+ "colab": {
57
+ "base_uri": "https://localhost:8080/"
58
+ },
59
+ "id": "ogyTHBYJVZ8I",
60
+ "outputId": "f23a554a-7328-4e50-d87c-90368294467d"
61
+ },
62
+ "outputs": [
63
+ {
64
+ "name": "stderr",
65
+ "output_type": "stream",
66
+ "text": [
67
+ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n",
68
+ "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
69
+ "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
70
+ "You will be able to reuse this secret in all of your notebooks.\n",
71
+ "Please note that authentication is recommended but still optional to access public models or datasets.\n",
72
+ " warnings.warn(\n"
73
+ ]
74
+ },
75
+ {
76
+ "name": "stdout",
77
+ "output_type": "stream",
78
+ "text": [
79
+ "145101\n"
80
+ ]
81
+ }
82
+ ],
83
+ "source": [
84
+ "hf_api = HfApi()\n",
85
+ "\n",
86
+ "all_datasets = hf_api.list_datasets(full=True)\n",
87
+ "\n",
88
+ "total_count = len(list(all_datasets))\n",
89
+ "print(total_count)"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": 4,
95
+ "metadata": {
96
+ "id": "GXDMUU-4XmaI"
97
+ },
98
+ "outputs": [],
99
+ "source": [
100
+ "# language_filter = filter(lambda dataset: 'language:es' in dataset.tags, all_datasets) # 882\n",
101
+ "\n",
102
+ "# spanish_filter = filter(lambda d: \"language:es\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:es\" for tag in d.tags), all_datasets) # 317\n",
103
+ "\n",
104
+ "#filtered_datasets_2 = filter(lambda dataset: \"es\" in dataset.card_data.language, all_datasets) # 882\n",
105
+ "\n",
106
+ "#filtered_datasets_3 = filter(lambda dataset: dataset.card_data.language == [\"es\"], all_datasets) #\n",
107
+ "\n",
108
+ "#for dataset in spanish_only_datasets:\n",
109
+ "# print(dataset)\n",
110
+ "# break"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 5,
116
+ "metadata": {
117
+ "colab": {
118
+ "base_uri": "https://localhost:8080/"
119
+ },
120
+ "id": "pjCvHVq_hChx",
121
+ "outputId": "d279d0bc-a3c6-4994-f23c-a7274b1f4ee8"
122
+ },
123
+ "outputs": [
124
+ {
125
+ "name": "stdout",
126
+ "output_type": "stream",
127
+ "text": [
128
+ "318\n"
129
+ ]
130
+ }
131
+ ],
132
+ "source": [
133
+ "hf_api = HfApi()\n",
134
+ "\n",
135
+ "all_datasets = hf_api.list_datasets(full=True)\n",
136
+ "\n",
137
+ "spanish_filter = filter(lambda d: \"language:es\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:es\" for tag in d.tags), all_datasets) # 317\n",
138
+ "spanish_datasets = list(spanish_filter)\n",
139
+ "spanish_count = len(list(spanish_datasets))\n",
140
+ "print(spanish_count)\n"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": 6,
146
+ "metadata": {
147
+ "colab": {
148
+ "base_uri": "https://localhost:8080/"
149
+ },
150
+ "id": "WANGkTpGRw8t",
151
+ "outputId": "da8931bf-7ae2-438d-8188-20190f568193"
152
+ },
153
+ "outputs": [
154
+ {
155
+ "name": "stdout",
156
+ "output_type": "stream",
157
+ "text": [
158
+ "8357\n"
159
+ ]
160
+ }
161
+ ],
162
+ "source": [
163
+ "hf_api = HfApi()\n",
164
+ "\n",
165
+ "all_datasets = hf_api.list_datasets(full=True)\n",
166
+ "\n",
167
+ "english_filter = filter(lambda d: \"language:en\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:en\" for tag in d.tags), all_datasets)\n",
168
+ "english_datasets = list(english_filter)\n",
169
+ "english_count = len(list(english_datasets))\n",
170
+ "print(english_count)"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": 7,
176
+ "metadata": {
177
+ "colab": {
178
+ "base_uri": "https://localhost:8080/"
179
+ },
180
+ "id": "yPtF0G7SWS53",
181
+ "outputId": "a2a51160-c803-4e7f-a6dc-8879eea1dd69"
182
+ },
183
+ "outputs": [
184
+ {
185
+ "name": "stdout",
186
+ "output_type": "stream",
187
+ "text": [
188
+ "568\n"
189
+ ]
190
+ }
191
+ ],
192
+ "source": [
193
+ "hf_api = HfApi()\n",
194
+ "\n",
195
+ "all_datasets = hf_api.list_datasets(full=True)\n",
196
+ "\n",
197
+ "chinese_filter = filter(lambda d: \"language:zh\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:zh\" for tag in d.tags), all_datasets)\n",
198
+ "chinese_datasets = list(chinese_filter)\n",
199
+ "chinese_count = len(list(chinese_datasets))\n",
200
+ "print(chinese_count)"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "code",
205
+ "execution_count": 8,
206
+ "metadata": {
207
+ "colab": {
208
+ "base_uri": "https://localhost:8080/"
209
+ },
210
+ "id": "RlxAlOOsW7p9",
211
+ "outputId": "f1c12edd-5502-4018-b9a7-149f9fc29322"
212
+ },
213
+ "outputs": [
214
+ {
215
+ "name": "stdout",
216
+ "output_type": "stream",
217
+ "text": [
218
+ "436\n"
219
+ ]
220
+ }
221
+ ],
222
+ "source": [
223
+ "hf_api = HfApi()\n",
224
+ "\n",
225
+ "all_datasets = hf_api.list_datasets(full=True)\n",
226
+ "\n",
227
+ "french_filter = filter(lambda d: \"language:fr\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:fr\" for tag in d.tags), all_datasets)\n",
228
+ "french_datasets = list(french_filter)\n",
229
+ "french_count = len(list(french_datasets))\n",
230
+ "print(french_count)"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": 9,
236
+ "metadata": {
237
+ "colab": {
238
+ "base_uri": "https://localhost:8080/"
239
+ },
240
+ "id": "OMQfBXjUYBPz",
241
+ "outputId": "8cd3fdb9-0bc8-4d82-d25b-fb9eef7118ed"
242
+ },
243
+ "outputs": [
244
+ {
245
+ "name": "stdout",
246
+ "output_type": "stream",
247
+ "text": [
248
+ "13886\n"
249
+ ]
250
+ }
251
+ ],
252
+ "source": [
253
+ "hf_api = HfApi()\n",
254
+ "\n",
255
+ "all_datasets = hf_api.list_datasets(full=True)\n",
256
+ "\n",
257
+ "mono_filter = filter(lambda dataset: sum(tag.startswith('language:') for tag in dataset.tags) == 1, all_datasets)\n",
258
+ "mono_datasets = list(mono_filter)\n",
259
+ "mono_count = len(list(mono_datasets))\n",
260
+ "print(mono_count)"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "code",
265
+ "execution_count": 10,
266
+ "metadata": {
267
+ "colab": {
268
+ "base_uri": "https://localhost:8080/",
269
+ "height": 180
270
+ },
271
+ "id": "sTPechkdWmYS",
272
+ "outputId": "bb49f9f4-150b-4a29-d58e-faff4f88cce3"
273
+ },
274
+ "outputs": [
275
+ {
276
+ "ename": "AssertionError",
277
+ "evalue": "",
278
+ "output_type": "error",
279
+ "traceback": [
280
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
281
+ "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)",
282
+ "\u001b[0;32m<ipython-input-10-da38b5a6b412>\u001b[0m in \u001b[0;36m<cell line: 7>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mcreation_dates_english\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0md\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreated_at\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0md\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menglish_datasets\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcreation_dates_english\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m8336\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
283
+ "\u001b[0;31mAssertionError\u001b[0m: "
284
+ ]
285
+ }
286
+ ],
287
+ "source": [
288
+ "# Extract creation date\n",
289
+ "\n",
290
+ "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
291
+ "assert len(creation_dates_spanish) == 318\n",
292
+ "\n",
293
+ "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
294
+ "assert len(creation_dates_english) == 8336"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "execution_count": null,
300
+ "metadata": {
301
+ "id": "hefZVynDSjjE"
302
+ },
303
+ "outputs": [],
304
+ "source": [
305
+ "print(creation_dates_spanish[0])"
306
+ ]
307
+ },
308
+ {
309
+ "cell_type": "markdown",
310
+ "metadata": {
311
+ "id": "aFaEBlkkSbrs"
312
+ },
313
+ "source": [
314
+ "## Bar Chart\n",
315
+ "\n"
316
+ ]
317
+ },
318
+ {
319
+ "cell_type": "code",
320
+ "execution_count": null,
321
+ "metadata": {
322
+ "id": "dYJ2zd4dShYh"
323
+ },
324
+ "outputs": [],
325
+ "source": [
326
+ "import matplotlib.pyplot as plt\n",
327
+ "from collections import Counter\n",
328
+ "\n",
329
+ "# Sample data (replace with your actual data)\n",
330
+ "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
331
+ "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
332
+ "\n",
333
+ "# Extract years from the creation dates\n",
334
+ "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
335
+ "english_counts = Counter(date.year for date in creation_dates_english)\n",
336
+ "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
337
+ "\n",
338
+ "# Plotting the bar chart\n",
339
+ "plt.figure(figsize=(10, 6))\n",
340
+ "plt.bar(years, [english_counts[year] for year in years], width=0.4, label='English Datasets', color='blue')\n",
341
+ "plt.bar(years, [spanish_counts[year] for year in years], width=0.4, label='Spanish Datasets', color='orange', bottom=[english_counts[year] for year in years])\n",
342
+ "\n",
343
+ "# Adding labels and title\n",
344
+ "plt.xlabel('Year')\n",
345
+ "plt.ylabel('Number of Datasets')\n",
346
+ "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
347
+ "plt.xticks(years)\n",
348
+ "plt.legend()\n",
349
+ "\n",
350
+ "# Display the plot\n",
351
+ "plt.grid(True)\n",
352
+ "plt.tight_layout()\n",
353
+ "plt.show()\n",
354
+ "plt.savefig(\"plots/bar_stack.png\")\n"
355
+ ]
356
+ },
357
+ {
358
+ "cell_type": "code",
359
+ "execution_count": null,
360
+ "metadata": {
361
+ "id": "wViEE4wCUVgs"
362
+ },
363
+ "outputs": [],
364
+ "source": [
365
+ "import matplotlib.pyplot as plt\n",
366
+ "import numpy as np\n",
367
+ "from collections import Counter\n",
368
+ "\n",
369
+ "# Sample data (replace with your actual data)\n",
370
+ "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
371
+ "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
372
+ "\n",
373
+ "# Extract years from the creation dates\n",
374
+ "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
375
+ "english_counts = Counter(date.year for date in creation_dates_english)\n",
376
+ "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
377
+ "\n",
378
+ "# Define the width of each bar\n",
379
+ "bar_width = 0.4\n",
380
+ "\n",
381
+ "# Define the x-coordinates for the bars\n",
382
+ "years_index = np.arange(len(years))\n",
383
+ "\n",
384
+ "# Plotting the side-by-side bar chart\n",
385
+ "plt.figure(figsize=(10, 6))\n",
386
+ "plt.bar(years_index - bar_width/2, [english_counts[year] for year in years], width=bar_width, label='English Datasets', color='blue')\n",
387
+ "plt.bar(years_index + bar_width/2, [spanish_counts[year] for year in years], width=bar_width, label='Spanish Datasets', color='orange')\n",
388
+ "\n",
389
+ "# Adding labels and title\n",
390
+ "plt.xlabel('Year')\n",
391
+ "plt.ylabel('Number of Datasets')\n",
392
+ "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
393
+ "plt.xticks(years_index, years)\n",
394
+ "plt.legend()\n",
395
+ "\n",
396
+ "# Display the plot\n",
397
+ "plt.grid(True)\n",
398
+ "plt.tight_layout()\n",
399
+ "plt.show()\n",
400
+ "plt.savefig(\"plots/bar_width.png\")"
401
+ ]
402
+ },
403
+ {
404
+ "cell_type": "markdown",
405
+ "metadata": {
406
+ "id": "Hp8vNA6LUA1E"
407
+ },
408
+ "source": [
409
+ "# Stacked Area Chart\n"
410
+ ]
411
+ },
412
+ {
413
+ "cell_type": "code",
414
+ "execution_count": null,
415
+ "metadata": {
416
+ "id": "CWgCunzGUCot"
417
+ },
418
+ "outputs": [],
419
+ "source": [
420
+ "import matplotlib.pyplot as plt\n",
421
+ "from collections import Counter\n",
422
+ "\n",
423
+ "# Sample data (replace with your actual data)\n",
424
+ "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
425
+ "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
426
+ "\n",
427
+ "# Extract years from the creation dates\n",
428
+ "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
429
+ "english_counts = Counter(date.year for date in creation_dates_english)\n",
430
+ "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
431
+ "\n",
432
+ "# Calculate cumulative counts\n",
433
+ "english_datasets_cumulative = [english_counts[year] for year in years]\n",
434
+ "spanish_datasets_cumulative = [spanish_counts[year] for year in years]\n",
435
+ "for i in range(1, len(years)):\n",
436
+ " english_datasets_cumulative[i] += english_datasets_cumulative[i-1]\n",
437
+ " spanish_datasets_cumulative[i] += spanish_datasets_cumulative[i-1]\n",
438
+ "\n",
439
+ "# Plotting the stacked area chart\n",
440
+ "plt.figure(figsize=(10, 6))\n",
441
+ "plt.stackplot(years, english_datasets_cumulative, spanish_datasets_cumulative, labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
442
+ "\n",
443
+ "# Adding labels and title\n",
444
+ "plt.xlabel('Year')\n",
445
+ "plt.ylabel('Cumulative Number of Datasets')\n",
446
+ "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
447
+ "plt.xticks(years)\n",
448
+ "plt.legend(loc='upper left')\n",
449
+ "\n",
450
+ "# Display the plot\n",
451
+ "plt.grid(True)\n",
452
+ "plt.tight_layout()\n",
453
+ "plt.show()\n",
454
+ "\n",
455
+ "plt.savefig(\"plots/stack_area_1.png\")"
456
+ ]
457
+ },
458
+ {
459
+ "cell_type": "code",
460
+ "execution_count": null,
461
+ "metadata": {
462
+ "id": "GwRpZwYWhau3"
463
+ },
464
+ "outputs": [],
465
+ "source": [
466
+ "import matplotlib.pyplot as plt\n",
467
+ "import pandas as pd\n",
468
+ "from collections import Counter\n",
469
+ "\n",
470
+ "# Sample data (replace with your actual data)\n",
471
+ "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
472
+ "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
473
+ "\n",
474
+ "# Extract months from the creation dates\n",
475
+ "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
476
+ "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
477
+ "\n",
478
+ "# Count the occurrences of each month\n",
479
+ "english_counts = Counter(months_english)\n",
480
+ "spanish_counts = Counter(months_spanish)\n",
481
+ "\n",
482
+ "# Create a DataFrame for English datasets\n",
483
+ "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
484
+ "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
485
+ "df_english = df_english.sort_index()\n",
486
+ "\n",
487
+ "# Create a DataFrame for Spanish datasets\n",
488
+ "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
489
+ "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
490
+ "df_spanish = df_spanish.sort_index()\n",
491
+ "\n",
492
+ "# Merge the DataFrames\n",
493
+ "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
494
+ "\n",
495
+ "# Convert index to datetime\n",
496
+ "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
497
+ "\n",
498
+ "# Calculate cumulative sum\n",
499
+ "df_cumulative = df.cumsum()\n",
500
+ "\n",
501
+ "# Plotting the stacked area chart\n",
502
+ "plt.figure(figsize=(8, 5))\n",
503
+ "plt.stackplot(df_cumulative.index, df_cumulative['English'], df_cumulative['Spanish'], labels=['English', 'Spanish'], colors=['orange', 'blue'])\n",
504
+ "\n",
505
+ "# Adding labels and title\n",
506
+ "plt.xlabel('Creation date')\n",
507
+ "plt.ylabel('Cumulative number of monolingual datasets')\n",
508
+ "plt.title('Cumulative growth of monolingual English and Spanish datasets in the Hugging Face Hub over time')\n",
509
+ "\n",
510
+ "# Display the plot\n",
511
+ "plt.xticks(rotation=45)\n",
512
+ "plt.legend(loc='upper left')\n",
513
+ "plt.grid(False)\n",
514
+ "plt.tight_layout()\n",
515
+ "plt.show()\n",
516
+ "\n",
517
+ "plt.savefig(\"plots/stack_area_2.png\")"
518
+ ]
519
+ },
520
+ {
521
+ "cell_type": "code",
522
+ "execution_count": null,
523
+ "metadata": {
524
+ "id": "kJQ0OgRtglOQ"
525
+ },
526
+ "outputs": [],
527
+ "source": [
528
+ "import matplotlib.pyplot as plt\n",
529
+ "import pandas as pd\n",
530
+ "from collections import Counter\n",
531
+ "\n",
532
+ "# Sample data (replace with your actual data)\n",
533
+ "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
534
+ "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
535
+ "\n",
536
+ "# Extract months from the creation dates\n",
537
+ "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
538
+ "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
539
+ "\n",
540
+ "# Count the occurrences of each month\n",
541
+ "english_counts = Counter(months_english)\n",
542
+ "spanish_counts = Counter(months_spanish)\n",
543
+ "\n",
544
+ "# Create a DataFrame for English datasets\n",
545
+ "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
546
+ "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
547
+ "df_english = df_english.sort_index()\n",
548
+ "\n",
549
+ "# Create a DataFrame for Spanish datasets\n",
550
+ "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
551
+ "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
552
+ "df_spanish = df_spanish.sort_index()\n",
553
+ "\n",
554
+ "# Merge the DataFrames\n",
555
+ "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
556
+ "\n",
557
+ "# Convert index to datetime\n",
558
+ "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
559
+ "\n",
560
+ "# Plotting the stacked area chart\n",
561
+ "plt.figure(figsize=(10, 6))\n",
562
+ "plt.stackplot(df.index, df['English'], df['Spanish'], labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
563
+ "\n",
564
+ "# Adding labels and title\n",
565
+ "plt.xlabel('Date')\n",
566
+ "plt.ylabel('Cumulative Number of Datasets')\n",
567
+ "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
568
+ "\n",
569
+ "# Display the plot\n",
570
+ "plt.xticks(rotation=45)\n",
571
+ "plt.legend(loc='upper left')\n",
572
+ "plt.grid(True)\n",
573
+ "plt.tight_layout()\n",
574
+ "plt.show()\n",
575
+ "\n",
576
+ "plt.savefig(\"plots/stack_area_3.png\")"
577
+ ]
578
+ },
579
+ {
580
+ "cell_type": "markdown",
581
+ "metadata": {
582
+ "id": "IAnFHiPlgnRE"
583
+ },
584
+ "source": [
585
+ "## Pie Chart"
586
+ ]
587
+ },
588
+ {
589
+ "cell_type": "code",
590
+ "execution_count": null,
591
+ "metadata": {
592
+ "id": "8tKR1x-kVeZT"
593
+ },
594
+ "outputs": [],
595
+ "source": [
596
+ "import matplotlib.pyplot as plt\n",
597
+ "from collections import Counter\n",
598
+ "\n",
599
+ "# Calculate the count of \"other\" datasets\n",
600
+ "other_count = mono_count - (english_count + spanish_count + chinese_count + french_count)\n",
601
+ "\n",
602
+ "# Pie chart data\n",
603
+ "labels = ['English', 'Chinese', 'French', 'Spanish', 'Other']\n",
604
+ "sizes = [english_count, chinese_count, french_count, spanish_count, other_count]\n",
605
+ "\n",
606
+ "# Plotting the pie chart\n",
607
+ "plt.figure(figsize=(8, 8))\n",
608
+ "plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=180, colors=['blue', 'red', 'green', 'orange', 'purple'])\n",
609
+ "plt.title('Distribution of Monolingual Datasets by Language')\n",
610
+ "plt.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.\n",
611
+ "\n",
612
+ "# Display the plot\n",
613
+ "plt.show()\n",
614
+ "\n",
615
+ "plt.savefig(\"plots/pie_chart.png\")"
616
+ ]
617
+ },
618
+ {
619
+ "cell_type": "markdown",
620
+ "metadata": {
621
+ "id": "z2xf8FrHROxy"
622
+ },
623
+ "source": [
624
+ "# Time series plot"
625
+ ]
626
+ },
627
+ {
628
+ "cell_type": "code",
629
+ "execution_count": null,
630
+ "metadata": {
631
+ "id": "DuPFSZKUhyQj"
632
+ },
633
+ "outputs": [],
634
+ "source": [
635
+ "# Prepare data for plotting\n",
636
+ "\n",
637
+ "df = pd.DataFrame(creation_dates_spanish, columns=[\"Date\"])\n",
638
+ "df[\"Count\"] = 1\n",
639
+ "# Ensure the 'Date' column is of type datetime\n",
640
+ "df['Date'] = pd.to_datetime(df['Date'])\n",
641
+ "# Group by month and calculate cumulative sum\n",
642
+ "df = df.groupby(pd.Grouper(key=\"Date\", freq=\"MS\")).sum().cumsum()\n",
643
+ "\n",
644
+ "# Plot the data\n",
645
+ "plt.figure(figsize=(10, 6))\n",
646
+ "plt.plot(\n",
647
+ " df.index,\n",
648
+ " df[\"Count\"],\n",
649
+ " #marker=\"o\",\n",
650
+ " color=\"g\"\n",
651
+ ")\n",
652
+ "plt.title(\"Evolución de bases de datos monolingües en español\")\n",
653
+ "plt.xlabel(\"Fecha\")\n",
654
+ "plt.ylabel(\"Número de bases de datos\")\n",
655
+ "plt.grid(True)\n",
656
+ "plt.xticks(rotation=45)\n",
657
+ "plt.tight_layout()\n",
658
+ "plt.show()"
659
+ ]
660
+ },
661
+ {
662
+ "cell_type": "code",
663
+ "execution_count": null,
664
+ "metadata": {
665
+ "id": "-Vu3PIe2hITq"
666
+ },
667
+ "outputs": [],
668
+ "source": [
669
+ "import matplotlib.pyplot as plt\n",
670
+ "import pandas as pd\n",
671
+ "from collections import Counter\n",
672
+ "\n",
673
+ "# Sample data (replace with your actual data)\n",
674
+ "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
675
+ "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
676
+ "\n",
677
+ "# Extract months from the creation dates\n",
678
+ "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
679
+ "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
680
+ "\n",
681
+ "# Count the occurrences of each month\n",
682
+ "english_counts = Counter(months_english)\n",
683
+ "spanish_counts = Counter(months_spanish)\n",
684
+ "\n",
685
+ "# Create a DataFrame for English datasets\n",
686
+ "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
687
+ "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
688
+ "df_english = df_english.sort_index()\n",
689
+ "\n",
690
+ "# Create a DataFrame for Spanish datasets\n",
691
+ "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
692
+ "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
693
+ "df_spanish = df_spanish.sort_index()\n",
694
+ "\n",
695
+ "# Merge the DataFrames\n",
696
+ "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
697
+ "\n",
698
+ "# Convert index to datetime\n",
699
+ "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
700
+ "\n",
701
+ "# Calculate cumulative sum\n",
702
+ "df_cumulative = df.cumsum()\n",
703
+ "\n",
704
+ "# Plotting the cumulative chart\n",
705
+ "plt.figure(figsize=(10, 6))\n",
706
+ "plt.plot(df_cumulative.index, df_cumulative['English'], label='English', color='blue')\n",
707
+ "plt.plot(df_cumulative.index, df_cumulative['Spanish'], label='Spanish', color='orange')\n",
708
+ "\n",
709
+ "# Adding labels and title\n",
710
+ "plt.xlabel('Date')\n",
711
+ "plt.ylabel('Cumulative Number of Datasets')\n",
712
+ "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
713
+ "\n",
714
+ "# Display the plot\n",
715
+ "plt.xticks(rotation=45)\n",
716
+ "plt.legend(loc='upper left')\n",
717
+ "plt.grid(True)\n",
718
+ "plt.tight_layout()\n",
719
+ "plt.show()\n"
720
+ ]
721
+ },
722
+ {
723
+ "cell_type": "code",
724
+ "execution_count": null,
725
+ "metadata": {
726
+ "id": "KG__of2IfdHu"
727
+ },
728
+ "outputs": [],
729
+ "source": [
730
+ "import matplotlib.pyplot as plt\n",
731
+ "import pandas as pd\n",
732
+ "from collections import Counter\n",
733
+ "\n",
734
+ "# Sample data (replace with your actual data)\n",
735
+ "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
736
+ "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
737
+ "\n",
738
+ "# Extract years from the creation dates\n",
739
+ "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
740
+ "english_counts = Counter(date.year for date in creation_dates_english)\n",
741
+ "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
742
+ "\n",
743
+ "# Prepare data for plotting\n",
744
+ "english_series = pd.Series([english_counts[year] for year in years], index=years)\n",
745
+ "spanish_series = pd.Series([spanish_counts[year] for year in years], index=years)\n",
746
+ "\n",
747
+ "# Plotting the time series\n",
748
+ "plt.figure(figsize=(10, 6))\n",
749
+ "plt.plot(english_series.index, english_series.values, label='English', color='blue')\n",
750
+ "plt.plot(spanish_series.index, spanish_series.values, label='Spanish', color='orange')\n",
751
+ "\n",
752
+ "# Adding labels and title\n",
753
+ "plt.title('Evolution of English and Spanish Datasets Over Time')\n",
754
+ "plt.xlabel('Year')\n",
755
+ "plt.ylabel('Number of Datasets')\n",
756
+ "plt.legend()\n",
757
+ "plt.grid(True)\n",
758
+ "plt.xticks(rotation=45)\n",
759
+ "plt.tight_layout()\n",
760
+ "plt.show()\n"
761
+ ]
762
+ }
763
+ ],
764
+ "metadata": {
765
+ "accelerator": "GPU",
766
+ "colab": {
767
+ "gpuType": "T4",
768
+ "provenance": []
769
+ },
770
+ "kernelspec": {
771
+ "display_name": "Python 3",
772
+ "name": "python3"
773
+ },
774
+ "language_info": {
775
+ "name": "python",
776
+ "version": "3.11.6"
777
+ }
778
+ },
779
+ "nbformat": 4,
780
+ "nbformat_minor": 0
781
+ }
numero_datasets_hub_output.ipynb ADDED
@@ -0,0 +1,918 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "9b494ecb",
7
+ "metadata": {
8
+ "colab": {
9
+ "base_uri": "https://localhost:8080/"
10
+ },
11
+ "id": "bCPvBCk_VLoi",
12
+ "outputId": "48174b27-072f-4cf9-bfcc-2a7cb12f60ba",
13
+ "papermill": {
14
+ "duration": null,
15
+ "end_time": null,
16
+ "exception": null,
17
+ "start_time": null,
18
+ "status": "completed"
19
+ },
20
+ "tags": []
21
+ },
22
+ "outputs": [],
23
+ "source": [
24
+ "!pip install huggingface_hub"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": null,
30
+ "id": "d736660e",
31
+ "metadata": {
32
+ "id": "NbQeXxudVJW9",
33
+ "papermill": {
34
+ "duration": null,
35
+ "end_time": null,
36
+ "exception": null,
37
+ "start_time": null,
38
+ "status": "completed"
39
+ },
40
+ "tags": []
41
+ },
42
+ "outputs": [],
43
+ "source": [
44
+ "from datetime import datetime\n",
45
+ "\n",
46
+ "import matplotlib.pyplot as plt\n",
47
+ "import pandas as pd\n",
48
+ "from huggingface_hub import HfApi\n"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": null,
54
+ "id": "8dc1a8d8",
55
+ "metadata": {
56
+ "colab": {
57
+ "base_uri": "https://localhost:8080/"
58
+ },
59
+ "id": "ogyTHBYJVZ8I",
60
+ "outputId": "f23a554a-7328-4e50-d87c-90368294467d",
61
+ "papermill": {
62
+ "duration": null,
63
+ "end_time": null,
64
+ "exception": null,
65
+ "start_time": null,
66
+ "status": "completed"
67
+ },
68
+ "tags": []
69
+ },
70
+ "outputs": [],
71
+ "source": [
72
+ "hf_api = HfApi()\n",
73
+ "\n",
74
+ "all_datasets = hf_api.list_datasets(full=True)\n",
75
+ "\n",
76
+ "total_count = len(list(all_datasets))\n",
77
+ "print(total_count)"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": null,
83
+ "id": "299e6d56",
84
+ "metadata": {
85
+ "id": "GXDMUU-4XmaI",
86
+ "papermill": {
87
+ "duration": null,
88
+ "end_time": null,
89
+ "exception": null,
90
+ "start_time": null,
91
+ "status": "completed"
92
+ },
93
+ "tags": []
94
+ },
95
+ "outputs": [],
96
+ "source": [
97
+ "# language_filter = filter(lambda dataset: 'language:es' in dataset.tags, all_datasets) # 882\n",
98
+ "\n",
99
+ "# spanish_filter = filter(lambda d: \"language:es\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:es\" for tag in d.tags), all_datasets) # 317\n",
100
+ "\n",
101
+ "#filtered_datasets_2 = filter(lambda dataset: \"es\" in dataset.card_data.language, all_datasets) # 882\n",
102
+ "\n",
103
+ "#filtered_datasets_3 = filter(lambda dataset: dataset.card_data.language == [\"es\"], all_datasets) #\n",
104
+ "\n",
105
+ "#for dataset in spanish_only_datasets:\n",
106
+ "# print(dataset)\n",
107
+ "# break"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": null,
113
+ "id": "691d8f3a",
114
+ "metadata": {
115
+ "colab": {
116
+ "base_uri": "https://localhost:8080/"
117
+ },
118
+ "id": "pjCvHVq_hChx",
119
+ "outputId": "d279d0bc-a3c6-4994-f23c-a7274b1f4ee8",
120
+ "papermill": {
121
+ "duration": null,
122
+ "end_time": null,
123
+ "exception": null,
124
+ "start_time": null,
125
+ "status": "completed"
126
+ },
127
+ "tags": []
128
+ },
129
+ "outputs": [],
130
+ "source": [
131
+ "hf_api = HfApi()\n",
132
+ "\n",
133
+ "all_datasets = hf_api.list_datasets(full=True)\n",
134
+ "\n",
135
+ "spanish_filter = filter(lambda d: \"language:es\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:es\" for tag in d.tags), all_datasets) # 317\n",
136
+ "spanish_datasets = list(spanish_filter)\n",
137
+ "spanish_count = len(list(spanish_datasets))\n",
138
+ "print(spanish_count)\n"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": null,
144
+ "id": "c9676c89",
145
+ "metadata": {
146
+ "colab": {
147
+ "base_uri": "https://localhost:8080/"
148
+ },
149
+ "id": "WANGkTpGRw8t",
150
+ "outputId": "da8931bf-7ae2-438d-8188-20190f568193",
151
+ "papermill": {
152
+ "duration": null,
153
+ "end_time": null,
154
+ "exception": null,
155
+ "start_time": null,
156
+ "status": "completed"
157
+ },
158
+ "tags": []
159
+ },
160
+ "outputs": [],
161
+ "source": [
162
+ "hf_api = HfApi()\n",
163
+ "\n",
164
+ "all_datasets = hf_api.list_datasets(full=True)\n",
165
+ "\n",
166
+ "english_filter = filter(lambda d: \"language:en\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:en\" for tag in d.tags), all_datasets)\n",
167
+ "english_datasets = list(english_filter)\n",
168
+ "english_count = len(list(english_datasets))\n",
169
+ "print(english_count)"
170
+ ]
171
+ },
172
+ {
173
+ "cell_type": "code",
174
+ "execution_count": null,
175
+ "id": "bf300ce6",
176
+ "metadata": {
177
+ "colab": {
178
+ "base_uri": "https://localhost:8080/"
179
+ },
180
+ "id": "yPtF0G7SWS53",
181
+ "outputId": "a2a51160-c803-4e7f-a6dc-8879eea1dd69",
182
+ "papermill": {
183
+ "duration": null,
184
+ "end_time": null,
185
+ "exception": null,
186
+ "start_time": null,
187
+ "status": "completed"
188
+ },
189
+ "tags": []
190
+ },
191
+ "outputs": [],
192
+ "source": [
193
+ "hf_api = HfApi()\n",
194
+ "\n",
195
+ "all_datasets = hf_api.list_datasets(full=True)\n",
196
+ "\n",
197
+ "chinese_filter = filter(lambda d: \"language:zh\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:zh\" for tag in d.tags), all_datasets)\n",
198
+ "chinese_datasets = list(chinese_filter)\n",
199
+ "chinese_count = len(list(chinese_datasets))\n",
200
+ "print(chinese_count)"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "code",
205
+ "execution_count": null,
206
+ "id": "407c46fc",
207
+ "metadata": {
208
+ "colab": {
209
+ "base_uri": "https://localhost:8080/"
210
+ },
211
+ "id": "RlxAlOOsW7p9",
212
+ "outputId": "f1c12edd-5502-4018-b9a7-149f9fc29322",
213
+ "papermill": {
214
+ "duration": null,
215
+ "end_time": null,
216
+ "exception": null,
217
+ "start_time": null,
218
+ "status": "completed"
219
+ },
220
+ "tags": []
221
+ },
222
+ "outputs": [],
223
+ "source": [
224
+ "hf_api = HfApi()\n",
225
+ "\n",
226
+ "all_datasets = hf_api.list_datasets(full=True)\n",
227
+ "\n",
228
+ "french_filter = filter(lambda d: \"language:fr\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:fr\" for tag in d.tags), all_datasets)\n",
229
+ "french_datasets = list(french_filter)\n",
230
+ "french_count = len(list(french_datasets))\n",
231
+ "print(french_count)"
232
+ ]
233
+ },
234
+ {
235
+ "cell_type": "code",
236
+ "execution_count": null,
237
+ "id": "a7d82d5d",
238
+ "metadata": {
239
+ "colab": {
240
+ "base_uri": "https://localhost:8080/"
241
+ },
242
+ "id": "OMQfBXjUYBPz",
243
+ "outputId": "8cd3fdb9-0bc8-4d82-d25b-fb9eef7118ed",
244
+ "papermill": {
245
+ "duration": null,
246
+ "end_time": null,
247
+ "exception": null,
248
+ "start_time": null,
249
+ "status": "completed"
250
+ },
251
+ "tags": []
252
+ },
253
+ "outputs": [],
254
+ "source": [
255
+ "hf_api = HfApi()\n",
256
+ "\n",
257
+ "all_datasets = hf_api.list_datasets(full=True)\n",
258
+ "\n",
259
+ "mono_filter = filter(lambda dataset: sum(tag.startswith('language:') for tag in dataset.tags) == 1, all_datasets)\n",
260
+ "mono_datasets = list(mono_filter)\n",
261
+ "mono_count = len(list(mono_datasets))\n",
262
+ "print(mono_count)"
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "execution_count": null,
268
+ "id": "6dc0ac68",
269
+ "metadata": {
270
+ "colab": {
271
+ "base_uri": "https://localhost:8080/",
272
+ "height": 180
273
+ },
274
+ "id": "sTPechkdWmYS",
275
+ "outputId": "bb49f9f4-150b-4a29-d58e-faff4f88cce3",
276
+ "papermill": {
277
+ "duration": null,
278
+ "end_time": null,
279
+ "exception": null,
280
+ "start_time": null,
281
+ "status": "completed"
282
+ },
283
+ "tags": []
284
+ },
285
+ "outputs": [],
286
+ "source": [
287
+ "# Extract creation date\n",
288
+ "\n",
289
+ "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
290
+ "assert len(creation_dates_spanish) == 318\n",
291
+ "\n",
292
+ "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
293
+ "assert len(creation_dates_english) == 8336"
294
+ ]
295
+ },
296
+ {
297
+ "cell_type": "code",
298
+ "execution_count": null,
299
+ "id": "57d206ec",
300
+ "metadata": {
301
+ "id": "hefZVynDSjjE",
302
+ "papermill": {
303
+ "duration": null,
304
+ "end_time": null,
305
+ "exception": null,
306
+ "start_time": null,
307
+ "status": "completed"
308
+ },
309
+ "tags": []
310
+ },
311
+ "outputs": [],
312
+ "source": [
313
+ "print(creation_dates_spanish[0])"
314
+ ]
315
+ },
316
+ {
317
+ "cell_type": "markdown",
318
+ "id": "b80e411d",
319
+ "metadata": {
320
+ "id": "aFaEBlkkSbrs",
321
+ "papermill": {
322
+ "duration": null,
323
+ "end_time": null,
324
+ "exception": null,
325
+ "start_time": null,
326
+ "status": "completed"
327
+ },
328
+ "tags": []
329
+ },
330
+ "source": [
331
+ "## Bar Chart\n",
332
+ "\n"
333
+ ]
334
+ },
335
+ {
336
+ "cell_type": "code",
337
+ "execution_count": null,
338
+ "id": "96652421",
339
+ "metadata": {
340
+ "id": "dYJ2zd4dShYh",
341
+ "papermill": {
342
+ "duration": null,
343
+ "end_time": null,
344
+ "exception": null,
345
+ "start_time": null,
346
+ "status": "completed"
347
+ },
348
+ "tags": []
349
+ },
350
+ "outputs": [],
351
+ "source": [
352
+ "import matplotlib.pyplot as plt\n",
353
+ "from collections import Counter\n",
354
+ "\n",
355
+ "# Creation dates of the English and Spanish datasets fetched from the Hub\n",
356
+ "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
357
+ "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
358
+ "\n",
359
+ "# Extract years from the creation dates\n",
360
+ "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
361
+ "english_counts = Counter(date.year for date in creation_dates_english)\n",
362
+ "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
363
+ "\n",
364
+ "# Plotting the bar chart\n",
365
+ "plt.figure(figsize=(10, 6))\n",
366
+ "plt.bar(years, [english_counts[year] for year in years], width=0.4, label='English Datasets', color='blue')\n",
367
+ "plt.bar(years, [spanish_counts[year] for year in years], width=0.4, label='Spanish Datasets', color='orange', bottom=[english_counts[year] for year in years])\n",
368
+ "\n",
369
+ "# Adding labels and title\n",
370
+ "plt.xlabel('Year')\n",
371
+ "plt.ylabel('Number of Datasets')\n",
372
+ "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
373
+ "plt.xticks(years)\n",
374
+ "plt.legend()\n",
375
+ "\n",
376
+ "# Display the plot\n",
377
+ "plt.grid(True)\n",
378
+ "plt.tight_layout()\n",
379
+ "plt.show()\n",
380
+ "plt.savefig(\"plots/bar_stack.png\")\n"
381
+ ]
382
+ },
383
+ {
384
+ "cell_type": "code",
385
+ "execution_count": null,
386
+ "id": "2d1ae015",
387
+ "metadata": {
388
+ "id": "wViEE4wCUVgs",
389
+ "papermill": {
390
+ "duration": null,
391
+ "end_time": null,
392
+ "exception": null,
393
+ "start_time": null,
394
+ "status": "completed"
395
+ },
396
+ "tags": []
397
+ },
398
+ "outputs": [],
399
+ "source": [
400
+ "import matplotlib.pyplot as plt\n",
401
+ "import numpy as np\n",
402
+ "from collections import Counter\n",
403
+ "\n",
404
+ "# Creation dates of the English and Spanish datasets fetched from the Hub\n",
405
+ "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
406
+ "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
407
+ "\n",
408
+ "# Extract years from the creation dates\n",
409
+ "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
410
+ "english_counts = Counter(date.year for date in creation_dates_english)\n",
411
+ "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
412
+ "\n",
413
+ "# Define the width of each bar\n",
414
+ "bar_width = 0.4\n",
415
+ "\n",
416
+ "# Define the x-coordinates for the bars\n",
417
+ "years_index = np.arange(len(years))\n",
418
+ "\n",
419
+ "# Plotting the side-by-side bar chart\n",
420
+ "plt.figure(figsize=(10, 6))\n",
421
+ "plt.bar(years_index - bar_width/2, [english_counts[year] for year in years], width=bar_width, label='English Datasets', color='blue')\n",
422
+ "plt.bar(years_index + bar_width/2, [spanish_counts[year] for year in years], width=bar_width, label='Spanish Datasets', color='orange')\n",
423
+ "\n",
424
+ "# Adding labels and title\n",
425
+ "plt.xlabel('Year')\n",
426
+ "plt.ylabel('Number of Datasets')\n",
427
+ "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
428
+ "plt.xticks(years_index, years)\n",
429
+ "plt.legend()\n",
430
+ "\n",
431
+ "# Display the plot\n",
432
+ "plt.grid(True)\n",
433
+ "plt.tight_layout()\n",
434
+ "plt.show()\n",
435
+ "plt.savefig(\"plots/bar_width.png\")"
436
+ ]
437
+ },
438
+ {
439
+ "cell_type": "markdown",
440
+ "id": "cddf7237",
441
+ "metadata": {
442
+ "id": "Hp8vNA6LUA1E",
443
+ "papermill": {
444
+ "duration": null,
445
+ "end_time": null,
446
+ "exception": null,
447
+ "start_time": null,
448
+ "status": "completed"
449
+ },
450
+ "tags": []
451
+ },
452
+ "source": [
453
+ "## Stacked Area Chart\n"
454
+ ]
455
+ },
456
+ {
457
+ "cell_type": "code",
458
+ "execution_count": null,
459
+ "id": "68255399",
460
+ "metadata": {
461
+ "id": "CWgCunzGUCot",
462
+ "papermill": {
463
+ "duration": null,
464
+ "end_time": null,
465
+ "exception": null,
466
+ "start_time": null,
467
+ "status": "completed"
468
+ },
469
+ "tags": []
470
+ },
471
+ "outputs": [],
472
+ "source": [
473
+ "import matplotlib.pyplot as plt\n",
474
+ "from collections import Counter\n",
475
+ "\n",
476
+ "# Creation dates of the English and Spanish datasets fetched from the Hub\n",
477
+ "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
478
+ "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
479
+ "\n",
480
+ "# Extract years from the creation dates\n",
481
+ "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
482
+ "english_counts = Counter(date.year for date in creation_dates_english)\n",
483
+ "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
484
+ "\n",
485
+ "# Calculate cumulative counts\n",
486
+ "english_datasets_cumulative = [english_counts[year] for year in years]\n",
487
+ "spanish_datasets_cumulative = [spanish_counts[year] for year in years]\n",
488
+ "for i in range(1, len(years)):\n",
489
+ " english_datasets_cumulative[i] += english_datasets_cumulative[i-1]\n",
490
+ " spanish_datasets_cumulative[i] += spanish_datasets_cumulative[i-1]\n",
491
+ "\n",
492
+ "# Plotting the stacked area chart\n",
493
+ "plt.figure(figsize=(10, 6))\n",
494
+ "plt.stackplot(years, english_datasets_cumulative, spanish_datasets_cumulative, labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
495
+ "\n",
496
+ "# Adding labels and title\n",
497
+ "plt.xlabel('Year')\n",
498
+ "plt.ylabel('Cumulative Number of Datasets')\n",
499
+ "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
500
+ "plt.xticks(years)\n",
501
+ "plt.legend(loc='upper left')\n",
502
+ "\n",
503
+ "# Display the plot\n",
504
+ "plt.grid(True)\n",
505
+ "plt.tight_layout()\n",
506
+ "plt.show()\n",
507
+ "\n",
508
+ "plt.savefig(\"plots/stack_area_1.png\")"
509
+ ]
510
+ },
511
+ {
512
+ "cell_type": "code",
513
+ "execution_count": null,
514
+ "id": "4ba74cf5",
515
+ "metadata": {
516
+ "id": "GwRpZwYWhau3",
517
+ "papermill": {
518
+ "duration": null,
519
+ "end_time": null,
520
+ "exception": null,
521
+ "start_time": null,
522
+ "status": "completed"
523
+ },
524
+ "tags": []
525
+ },
526
+ "outputs": [],
527
+ "source": [
528
+ "import matplotlib.pyplot as plt\n",
529
+ "import pandas as pd\n",
530
+ "from collections import Counter\n",
531
+ "\n",
532
+ "# Creation dates of the English and Spanish datasets fetched from the Hub\n",
533
+ "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
534
+ "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
535
+ "\n",
536
+ "# Extract (year, month) pairs from the creation dates\n",
537
+ "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
538
+ "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
539
+ "\n",
540
+ "# Count the occurrences of each month\n",
541
+ "english_counts = Counter(months_english)\n",
542
+ "spanish_counts = Counter(months_spanish)\n",
543
+ "\n",
544
+ "# Create a DataFrame for English datasets\n",
545
+ "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
546
+ "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
547
+ "df_english = df_english.sort_index()\n",
548
+ "\n",
549
+ "# Create a DataFrame for Spanish datasets\n",
550
+ "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
551
+ "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
552
+ "df_spanish = df_spanish.sort_index()\n",
553
+ "\n",
554
+ "# Merge the DataFrames\n",
555
+ "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
556
+ "\n",
557
+ "# Convert index to datetime\n",
558
+ "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
559
+ "\n",
560
+ "# Calculate cumulative sum\n",
561
+ "df_cumulative = df.cumsum()\n",
562
+ "\n",
563
+ "# Plotting the stacked area chart\n",
564
+ "plt.figure(figsize=(8, 5))\n",
565
+ "plt.stackplot(df_cumulative.index, df_cumulative['English'], df_cumulative['Spanish'], labels=['English', 'Spanish'], colors=['orange', 'blue'])\n",
566
+ "\n",
567
+ "# Adding labels and title\n",
568
+ "plt.xlabel('Creation date')\n",
569
+ "plt.ylabel('Cumulative number of monolingual datasets')\n",
570
+ "plt.title('Cumulative growth of monolingual English and Spanish datasets in the Hugging Face Hub over time')\n",
571
+ "\n",
572
+ "# Display the plot\n",
573
+ "plt.xticks(rotation=45)\n",
574
+ "plt.legend(loc='upper left')\n",
575
+ "plt.grid(False)\n",
576
+ "plt.tight_layout()\n",
577
+ "plt.show()\n",
578
+ "\n",
579
+ "plt.savefig(\"plots/stack_area_2.png\")"
580
+ ]
581
+ },
582
+ {
583
+ "cell_type": "code",
584
+ "execution_count": null,
585
+ "id": "d96225ce",
586
+ "metadata": {
587
+ "id": "kJQ0OgRtglOQ",
588
+ "papermill": {
589
+ "duration": null,
590
+ "end_time": null,
591
+ "exception": null,
592
+ "start_time": null,
593
+ "status": "completed"
594
+ },
595
+ "tags": []
596
+ },
597
+ "outputs": [],
598
+ "source": [
599
+ "import matplotlib.pyplot as plt\n",
600
+ "import pandas as pd\n",
601
+ "from collections import Counter\n",
602
+ "\n",
603
+ "# Creation dates of the English and Spanish datasets fetched from the Hub\n",
604
+ "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
605
+ "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
606
+ "\n",
607
+ "# Extract (year, month) pairs from the creation dates\n",
608
+ "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
609
+ "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
610
+ "\n",
611
+ "# Count the occurrences of each month\n",
612
+ "english_counts = Counter(months_english)\n",
613
+ "spanish_counts = Counter(months_spanish)\n",
614
+ "\n",
615
+ "# Create a DataFrame for English datasets\n",
616
+ "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
617
+ "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
618
+ "df_english = df_english.sort_index()\n",
619
+ "\n",
620
+ "# Create a DataFrame for Spanish datasets\n",
621
+ "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
622
+ "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
623
+ "df_spanish = df_spanish.sort_index()\n",
624
+ "\n",
625
+ "# Merge the DataFrames\n",
626
+ "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
627
+ "\n",
628
+ "# Convert index to datetime\n",
629
+ "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
630
+ "\n",
631
+ "# Plotting the stacked area chart\n",
632
+ "plt.figure(figsize=(10, 6))\n",
633
+ "plt.stackplot(df.index, df['English'], df['Spanish'], labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
634
+ "\n",
635
+ "# Adding labels and title\n",
636
+ "plt.xlabel('Date')\n",
637
+ "plt.ylabel('Cumulative Number of Datasets')\n",
638
+ "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
639
+ "\n",
640
+ "# Display the plot\n",
641
+ "plt.xticks(rotation=45)\n",
642
+ "plt.legend(loc='upper left')\n",
643
+ "plt.grid(True)\n",
644
+ "plt.tight_layout()\n",
645
+ "plt.show()\n",
646
+ "\n",
647
+ "plt.savefig(\"plots/stack_area_3.png\")"
648
+ ]
649
+ },
650
+ {
651
+ "cell_type": "markdown",
652
+ "id": "7bbec0ac",
653
+ "metadata": {
654
+ "id": "IAnFHiPlgnRE",
655
+ "papermill": {
656
+ "duration": null,
657
+ "end_time": null,
658
+ "exception": null,
659
+ "start_time": null,
660
+ "status": "completed"
661
+ },
662
+ "tags": []
663
+ },
664
+ "source": [
665
+ "## Pie Chart"
666
+ ]
667
+ },
668
+ {
669
+ "cell_type": "code",
670
+ "execution_count": null,
671
+ "id": "7c3dd684",
672
+ "metadata": {
673
+ "id": "8tKR1x-kVeZT",
674
+ "papermill": {
675
+ "duration": null,
676
+ "end_time": null,
677
+ "exception": null,
678
+ "start_time": null,
679
+ "status": "completed"
680
+ },
681
+ "tags": []
682
+ },
683
+ "outputs": [],
684
+ "source": [
685
+ "import matplotlib.pyplot as plt\n",
686
+ "from collections import Counter\n",
687
+ "\n",
688
+ "# Calculate the count of \"other\" datasets\n",
689
+ "other_count = mono_count - (english_count + spanish_count + chinese_count + french_count)\n",
690
+ "\n",
691
+ "# Pie chart data\n",
692
+ "labels = ['English', 'Chinese', 'French', 'Spanish', 'Other']\n",
693
+ "sizes = [english_count, chinese_count, french_count, spanish_count, other_count]\n",
694
+ "\n",
695
+ "# Plotting the pie chart\n",
696
+ "plt.figure(figsize=(8, 8))\n",
697
+ "plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=180, colors=['blue', 'red', 'green', 'orange', 'purple'])\n",
698
+ "plt.title('Distribution of Monolingual Datasets by Language')\n",
699
+ "plt.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.\n",
700
+ "\n",
701
+ "# Display the plot\n",
702
+ "plt.show()\n",
703
+ "\n",
704
+ "plt.savefig(\"plots/pie_chart.png\")"
705
+ ]
706
+ },
707
+ {
708
+ "cell_type": "markdown",
709
+ "id": "11c1c9c8",
710
+ "metadata": {
711
+ "id": "z2xf8FrHROxy",
712
+ "papermill": {
713
+ "duration": null,
714
+ "end_time": null,
715
+ "exception": null,
716
+ "start_time": null,
717
+ "status": "completed"
718
+ },
719
+ "tags": []
720
+ },
721
+ "source": [
722
+ "## Time Series Plot"
723
+ ]
724
+ },
725
+ {
726
+ "cell_type": "code",
727
+ "execution_count": null,
728
+ "id": "1bb6a676",
729
+ "metadata": {
730
+ "id": "DuPFSZKUhyQj",
731
+ "papermill": {
732
+ "duration": null,
733
+ "end_time": null,
734
+ "exception": null,
735
+ "start_time": null,
736
+ "status": "completed"
737
+ },
738
+ "tags": []
739
+ },
740
+ "outputs": [],
741
+ "source": [
742
+ "# Prepare data for plotting\n",
743
+ "\n",
744
+ "df = pd.DataFrame(creation_dates_spanish, columns=[\"Date\"])\n",
745
+ "df[\"Count\"] = 1\n",
746
+ "# Ensure the 'Date' column is of type datetime\n",
747
+ "df['Date'] = pd.to_datetime(df['Date'])\n",
748
+ "# Group by month and calculate cumulative sum\n",
749
+ "df = df.groupby(pd.Grouper(key=\"Date\", freq=\"MS\")).sum().cumsum()\n",
750
+ "\n",
751
+ "# Plot the data\n",
752
+ "plt.figure(figsize=(10, 6))\n",
753
+ "plt.plot(\n",
754
+ " df.index,\n",
755
+ " df[\"Count\"],\n",
756
+ " #marker=\"o\",\n",
757
+ " color=\"g\"\n",
758
+ ")\n",
759
+ "plt.title(\"Evolución de bases de datos monolingües en español\")\n",
760
+ "plt.xlabel(\"Fecha\")\n",
761
+ "plt.ylabel(\"Número de bases de datos\")\n",
762
+ "plt.grid(True)\n",
763
+ "plt.xticks(rotation=45)\n",
764
+ "plt.tight_layout()\n",
765
+ "plt.show()"
766
+ ]
767
+ },
768
+ {
769
+ "cell_type": "code",
770
+ "execution_count": null,
771
+ "id": "2fc77d7f",
772
+ "metadata": {
773
+ "id": "-Vu3PIe2hITq",
774
+ "papermill": {
775
+ "duration": null,
776
+ "end_time": null,
777
+ "exception": null,
778
+ "start_time": null,
779
+ "status": "completed"
780
+ },
781
+ "tags": []
782
+ },
783
+ "outputs": [],
784
+ "source": [
785
+ "import matplotlib.pyplot as plt\n",
786
+ "import pandas as pd\n",
787
+ "from collections import Counter\n",
788
+ "\n",
789
+ "# Creation dates of the English and Spanish datasets fetched from the Hub\n",
790
+ "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
791
+ "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
792
+ "\n",
793
+ "# Extract (year, month) pairs from the creation dates\n",
794
+ "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
795
+ "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
796
+ "\n",
797
+ "# Count the occurrences of each month\n",
798
+ "english_counts = Counter(months_english)\n",
799
+ "spanish_counts = Counter(months_spanish)\n",
800
+ "\n",
801
+ "# Create a DataFrame for English datasets\n",
802
+ "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
803
+ "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
804
+ "df_english = df_english.sort_index()\n",
805
+ "\n",
806
+ "# Create a DataFrame for Spanish datasets\n",
807
+ "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
808
+ "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
809
+ "df_spanish = df_spanish.sort_index()\n",
810
+ "\n",
811
+ "# Merge the DataFrames\n",
812
+ "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
813
+ "\n",
814
+ "# Convert index to datetime\n",
815
+ "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
816
+ "\n",
817
+ "# Calculate cumulative sum\n",
818
+ "df_cumulative = df.cumsum()\n",
819
+ "\n",
820
+ "# Plotting the cumulative chart\n",
821
+ "plt.figure(figsize=(10, 6))\n",
822
+ "plt.plot(df_cumulative.index, df_cumulative['English'], label='English', color='blue')\n",
823
+ "plt.plot(df_cumulative.index, df_cumulative['Spanish'], label='Spanish', color='orange')\n",
824
+ "\n",
825
+ "# Adding labels and title\n",
826
+ "plt.xlabel('Date')\n",
827
+ "plt.ylabel('Cumulative Number of Datasets')\n",
828
+ "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
829
+ "\n",
830
+ "# Display the plot\n",
831
+ "plt.xticks(rotation=45)\n",
832
+ "plt.legend(loc='upper left')\n",
833
+ "plt.grid(True)\n",
834
+ "plt.tight_layout()\n",
835
+ "plt.show()\n"
836
+ ]
837
+ },
838
+ {
839
+ "cell_type": "code",
840
+ "execution_count": null,
841
+ "id": "6c0d23ac",
842
+ "metadata": {
843
+ "id": "KG__of2IfdHu",
844
+ "papermill": {
845
+ "duration": null,
846
+ "end_time": null,
847
+ "exception": null,
848
+ "start_time": null,
849
+ "status": "completed"
850
+ },
851
+ "tags": []
852
+ },
853
+ "outputs": [],
854
+ "source": [
855
+ "import matplotlib.pyplot as plt\n",
856
+ "import pandas as pd\n",
857
+ "from collections import Counter\n",
858
+ "\n",
859
+ "# Creation dates of the English and Spanish datasets fetched from the Hub\n",
860
+ "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
861
+ "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
862
+ "\n",
863
+ "# Extract years from the creation dates\n",
864
+ "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
865
+ "english_counts = Counter(date.year for date in creation_dates_english)\n",
866
+ "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
867
+ "\n",
868
+ "# Prepare data for plotting\n",
869
+ "english_series = pd.Series([english_counts[year] for year in years], index=years)\n",
870
+ "spanish_series = pd.Series([spanish_counts[year] for year in years], index=years)\n",
871
+ "\n",
872
+ "# Plotting the time series\n",
873
+ "plt.figure(figsize=(10, 6))\n",
874
+ "plt.plot(english_series.index, english_series.values, label='English', color='blue')\n",
875
+ "plt.plot(spanish_series.index, spanish_series.values, label='Spanish', color='orange')\n",
876
+ "\n",
877
+ "# Adding labels and title\n",
878
+ "plt.title('Evolution of English and Spanish Datasets Over Time')\n",
879
+ "plt.xlabel('Year')\n",
880
+ "plt.ylabel('Number of Datasets')\n",
881
+ "plt.legend()\n",
882
+ "plt.grid(True)\n",
883
+ "plt.xticks(rotation=45)\n",
884
+ "plt.tight_layout()\n",
885
+ "plt.show()\n"
886
+ ]
887
+ }
888
+ ],
889
+ "metadata": {
890
+ "accelerator": "GPU",
891
+ "colab": {
892
+ "gpuType": "T4",
893
+ "provenance": []
894
+ },
895
+ "kernelspec": {
896
+ "display_name": "Python 3",
897
+ "name": "python3"
898
+ },
899
+ "language_info": {
900
+ "name": "python",
901
+ "version": "3.11.6"
902
+ },
903
+ "papermill": {
904
+ "default_parameters": {},
905
+ "duration": 0.047858,
906
+ "end_time": "2024-05-15T09:04:29.634379",
907
+ "environment_variables": {},
908
+ "exception": null,
909
+ "input_path": "numero_datasets_hub.ipynb",
910
+ "output_path": "numero_datasets_hub_output.ipynb",
911
+ "parameters": {},
912
+ "start_time": "2024-05-15T09:04:29.586521",
913
+ "version": "2.6.0"
914
+ }
915
+ },
916
+ "nbformat": 4,
917
+ "nbformat_minor": 5
918
+ }
plots/datasets_hub.png ADDED
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio==4.31.0
2
+ nbconvert
3
+ papermill