Spaces:
Build error
Build error
Commit
·
e2c631c
1
Parent(s):
ff1302a
Added cost and analysis
Browse files
media/arabic-rag-embeddings-cost.png
ADDED
media/arabic-rag-embeddings-metrics.png
ADDED
notebooks/04_get_embeddings.ipynb
CHANGED
@@ -43,7 +43,7 @@
|
|
43 |
},
|
44 |
{
|
45 |
"cell_type": "code",
|
46 |
-
"execution_count":
|
47 |
"id": "7e873652-8257-4aae-92bc-94e1bac54b73",
|
48 |
"metadata": {
|
49 |
"tags": []
|
@@ -78,7 +78,7 @@
|
|
78 |
},
|
79 |
{
|
80 |
"cell_type": "code",
|
81 |
-
"execution_count":
|
82 |
"id": "52edfc97-5b6f-44f9-8d89-8578cf79fae9",
|
83 |
"metadata": {
|
84 |
"tags": []
|
@@ -105,16 +105,16 @@
|
|
105 |
"metadata": {},
|
106 |
"source": [
|
107 |
"## Start TEI with Inference Endpoints\n",
|
108 |
-
"Another option is to run TEI on Inference Endpoints. Its cheap and fast. It took me less than 5 minutes to get it up and running!\n",
|
109 |
"\n",
|
110 |
-
"Check here for a [guide](https://huggingface.co/blog/inference-endpoints-embeddings#3-deploy-embedding-model-as-inference-endpoint). Make sure to set these options
|
111 |
-
"1. Model Repository = transformers/paraphrase-multilingual-minilm-l12-v2
|
112 |
"1. Name your endpoint\n",
|
113 |
-
"1. Choose a GPU
|
114 |
"1. Advanced Configuration\n",
|
115 |
-
" 1. Task = Sentence Embeddings
|
116 |
-
" 1. Revision (based on [this pull request for safetensors](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/discussions/8) = a21e6630
|
117 |
-
" 1. Container Type = Text Embeddings Inference
|
118 |
" \n",
|
119 |
"Set the other options as you prefer."
|
120 |
]
|
@@ -129,7 +129,7 @@
|
|
129 |
},
|
130 |
{
|
131 |
"cell_type": "code",
|
132 |
-
"execution_count":
|
133 |
"id": "a69e2ee1-67f2-4f0a-b496-02f5415a52ca",
|
134 |
"metadata": {
|
135 |
"tags": []
|
@@ -139,20 +139,20 @@
|
|
139 |
"name": "stdin",
|
140 |
"output_type": "stream",
|
141 |
"text": [
|
142 |
-
"What is your
|
143 |
-
"What is your
|
144 |
]
|
145 |
}
|
146 |
],
|
147 |
"source": [
|
148 |
"import getpass\n",
|
149 |
-
"
|
150 |
-
"
|
151 |
]
|
152 |
},
|
153 |
{
|
154 |
"cell_type": "code",
|
155 |
-
"execution_count":
|
156 |
"id": "949d6bf8-804f-496b-a59a-834483cc7073",
|
157 |
"metadata": {
|
158 |
"tags": []
|
@@ -169,7 +169,7 @@
|
|
169 |
},
|
170 |
{
|
171 |
"cell_type": "code",
|
172 |
-
"execution_count":
|
173 |
"id": "d00b4af1-8fbc-4f7a-8a78-e1c52dd77a66",
|
174 |
"metadata": {
|
175 |
"tags": []
|
@@ -207,7 +207,7 @@
|
|
207 |
},
|
208 |
{
|
209 |
"cell_type": "code",
|
210 |
-
"execution_count":
|
211 |
"id": "abb5186b-ee67-4e1e-882d-3d8d5b4575d4",
|
212 |
"metadata": {
|
213 |
"tags": []
|
@@ -226,7 +226,7 @@
|
|
226 |
},
|
227 |
{
|
228 |
"cell_type": "code",
|
229 |
-
"execution_count":
|
230 |
"id": "c4b82ea2-8b30-4c2e-99f0-9a30f2f1bfb7",
|
231 |
"metadata": {
|
232 |
"tags": []
|
@@ -255,7 +255,7 @@
|
|
255 |
},
|
256 |
{
|
257 |
"cell_type": "code",
|
258 |
-
"execution_count":
|
259 |
"id": "f6f74545-54a7-4f41-9f02-96964e1417f0",
|
260 |
"metadata": {
|
261 |
"tags": []
|
@@ -263,7 +263,8 @@
|
|
263 |
"outputs": [],
|
264 |
"source": [
|
265 |
"files_in = list((proj_dir / 'data/processed/').glob('*.ndjson'))\n",
|
266 |
-
"folder_out = proj_dir / 'data/embedded/'"
|
|
|
267 |
]
|
268 |
},
|
269 |
{
|
@@ -288,7 +289,7 @@
|
|
288 |
},
|
289 |
{
|
290 |
"cell_type": "code",
|
291 |
-
"execution_count":
|
292 |
"id": "e455dd52-aad3-4313-8738-03141ee5152a",
|
293 |
"metadata": {
|
294 |
"tags": []
|
@@ -328,7 +329,7 @@
|
|
328 |
},
|
329 |
{
|
330 |
"cell_type": "code",
|
331 |
-
"execution_count":
|
332 |
"id": "f0d17264-72dc-40be-aa46-17cde38c8189",
|
333 |
"metadata": {
|
334 |
"tags": []
|
@@ -337,7 +338,7 @@
|
|
337 |
{
|
338 |
"data": {
|
339 |
"application/vnd.jupyter.widget-view+json": {
|
340 |
-
"model_id": "
|
341 |
"version_major": 2,
|
342 |
"version_minor": 0
|
343 |
},
|
@@ -351,7 +352,7 @@
|
|
351 |
{
|
352 |
"data": {
|
353 |
"application/vnd.jupyter.widget-view+json": {
|
354 |
-
"model_id": "
|
355 |
"version_major": 2,
|
356 |
"version_minor": 0
|
357 |
},
|
@@ -372,7 +373,7 @@
|
|
372 |
{
|
373 |
"data": {
|
374 |
"application/vnd.jupyter.widget-view+json": {
|
375 |
-
"model_id": "
|
376 |
"version_major": 2,
|
377 |
"version_minor": 0
|
378 |
},
|
@@ -393,7 +394,7 @@
|
|
393 |
{
|
394 |
"data": {
|
395 |
"application/vnd.jupyter.widget-view+json": {
|
396 |
-
"model_id": "
|
397 |
"version_major": 2,
|
398 |
"version_minor": 0
|
399 |
},
|
@@ -414,7 +415,7 @@
|
|
414 |
{
|
415 |
"data": {
|
416 |
"application/vnd.jupyter.widget-view+json": {
|
417 |
-
"model_id": "
|
418 |
"version_major": 2,
|
419 |
"version_minor": 0
|
420 |
},
|
@@ -435,7 +436,7 @@
|
|
435 |
{
|
436 |
"data": {
|
437 |
"application/vnd.jupyter.widget-view+json": {
|
438 |
-
"model_id": "
|
439 |
"version_major": 2,
|
440 |
"version_minor": 0
|
441 |
},
|
@@ -456,7 +457,7 @@
|
|
456 |
{
|
457 |
"data": {
|
458 |
"application/vnd.jupyter.widget-view+json": {
|
459 |
-
"model_id": "
|
460 |
"version_major": 2,
|
461 |
"version_minor": 0
|
462 |
},
|
@@ -477,7 +478,7 @@
|
|
477 |
{
|
478 |
"data": {
|
479 |
"application/vnd.jupyter.widget-view+json": {
|
480 |
-
"model_id": "
|
481 |
"version_major": 2,
|
482 |
"version_minor": 0
|
483 |
},
|
@@ -498,7 +499,7 @@
|
|
498 |
{
|
499 |
"data": {
|
500 |
"application/vnd.jupyter.widget-view+json": {
|
501 |
-
"model_id": "
|
502 |
"version_major": 2,
|
503 |
"version_minor": 0
|
504 |
},
|
@@ -519,7 +520,7 @@
|
|
519 |
{
|
520 |
"data": {
|
521 |
"application/vnd.jupyter.widget-view+json": {
|
522 |
-
"model_id": "
|
523 |
"version_major": 2,
|
524 |
"version_minor": 0
|
525 |
},
|
@@ -540,7 +541,7 @@
|
|
540 |
{
|
541 |
"data": {
|
542 |
"application/vnd.jupyter.widget-view+json": {
|
543 |
-
"model_id": "
|
544 |
"version_major": 2,
|
545 |
"version_minor": 0
|
546 |
},
|
@@ -561,7 +562,7 @@
|
|
561 |
{
|
562 |
"data": {
|
563 |
"application/vnd.jupyter.widget-view+json": {
|
564 |
-
"model_id": "
|
565 |
"version_major": 2,
|
566 |
"version_minor": 0
|
567 |
},
|
@@ -582,7 +583,7 @@
|
|
582 |
{
|
583 |
"data": {
|
584 |
"application/vnd.jupyter.widget-view+json": {
|
585 |
-
"model_id": "
|
586 |
"version_major": 2,
|
587 |
"version_minor": 0
|
588 |
},
|
@@ -603,7 +604,7 @@
|
|
603 |
{
|
604 |
"data": {
|
605 |
"application/vnd.jupyter.widget-view+json": {
|
606 |
-
"model_id": "
|
607 |
"version_major": 2,
|
608 |
"version_minor": 0
|
609 |
},
|
@@ -624,7 +625,7 @@
|
|
624 |
{
|
625 |
"data": {
|
626 |
"application/vnd.jupyter.widget-view+json": {
|
627 |
-
"model_id": "
|
628 |
"version_major": 2,
|
629 |
"version_minor": 0
|
630 |
},
|
@@ -645,7 +646,7 @@
|
|
645 |
{
|
646 |
"data": {
|
647 |
"application/vnd.jupyter.widget-view+json": {
|
648 |
-
"model_id": "
|
649 |
"version_major": 2,
|
650 |
"version_minor": 0
|
651 |
},
|
@@ -666,7 +667,7 @@
|
|
666 |
{
|
667 |
"data": {
|
668 |
"application/vnd.jupyter.widget-view+json": {
|
669 |
-
"model_id": "
|
670 |
"version_major": 2,
|
671 |
"version_minor": 0
|
672 |
},
|
@@ -687,7 +688,7 @@
|
|
687 |
{
|
688 |
"data": {
|
689 |
"application/vnd.jupyter.widget-view+json": {
|
690 |
-
"model_id": "
|
691 |
"version_major": 2,
|
692 |
"version_minor": 0
|
693 |
},
|
@@ -708,7 +709,7 @@
|
|
708 |
{
|
709 |
"data": {
|
710 |
"application/vnd.jupyter.widget-view+json": {
|
711 |
-
"model_id": "
|
712 |
"version_major": 2,
|
713 |
"version_minor": 0
|
714 |
},
|
@@ -729,7 +730,7 @@
|
|
729 |
{
|
730 |
"data": {
|
731 |
"application/vnd.jupyter.widget-view+json": {
|
732 |
-
"model_id": "
|
733 |
"version_major": 2,
|
734 |
"version_minor": 0
|
735 |
},
|
@@ -750,7 +751,7 @@
|
|
750 |
{
|
751 |
"data": {
|
752 |
"application/vnd.jupyter.widget-view+json": {
|
753 |
-
"model_id": "
|
754 |
"version_major": 2,
|
755 |
"version_minor": 0
|
756 |
},
|
@@ -771,7 +772,7 @@
|
|
771 |
{
|
772 |
"data": {
|
773 |
"application/vnd.jupyter.widget-view+json": {
|
774 |
-
"model_id": "
|
775 |
"version_major": 2,
|
776 |
"version_minor": 0
|
777 |
},
|
@@ -792,7 +793,7 @@
|
|
792 |
{
|
793 |
"data": {
|
794 |
"application/vnd.jupyter.widget-view+json": {
|
795 |
-
"model_id": "
|
796 |
"version_major": 2,
|
797 |
"version_minor": 0
|
798 |
},
|
@@ -813,7 +814,7 @@
|
|
813 |
{
|
814 |
"data": {
|
815 |
"application/vnd.jupyter.widget-view+json": {
|
816 |
-
"model_id": "
|
817 |
"version_major": 2,
|
818 |
"version_minor": 0
|
819 |
},
|
@@ -829,7 +830,7 @@
|
|
829 |
"output_type": "stream",
|
830 |
"text": [
|
831 |
"Batch 23: Embeddings = 70322 documents = 70322\n",
|
832 |
-
"
|
833 |
]
|
834 |
}
|
835 |
],
|
@@ -855,39 +856,477 @@
|
|
855 |
" for document in documents:\n",
|
856 |
" json_str = json.dumps(document, ensure_ascii=False)\n",
|
857 |
" f.write(json_str + '\\n')\n",
|
858 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
859 |
]
|
860 |
},
|
861 |
{
|
862 |
"cell_type": "code",
|
863 |
-
"execution_count":
|
864 |
-
"id": "
|
865 |
"metadata": {
|
866 |
"tags": []
|
867 |
},
|
868 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
869 |
"source": [
|
870 |
-
"
|
871 |
]
|
872 |
},
|
873 |
{
|
874 |
-
"cell_type": "
|
875 |
-
"
|
876 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
877 |
"source": [
|
878 |
-
"
|
879 |
]
|
880 |
},
|
881 |
{
|
882 |
"cell_type": "markdown",
|
883 |
-
"id": "
|
884 |
-
"metadata": {
|
885 |
-
"tags": []
|
886 |
-
},
|
887 |
"source": [
|
888 |
-
"
|
889 |
-
"
|
890 |
-
"
|
|
|
|
|
|
|
|
|
|
|
891 |
]
|
892 |
},
|
893 |
{
|
|
|
43 |
},
|
44 |
{
|
45 |
"cell_type": "code",
|
46 |
+
"execution_count": 1,
|
47 |
"id": "7e873652-8257-4aae-92bc-94e1bac54b73",
|
48 |
"metadata": {
|
49 |
"tags": []
|
|
|
78 |
},
|
79 |
{
|
80 |
"cell_type": "code",
|
81 |
+
"execution_count": 2,
|
82 |
"id": "52edfc97-5b6f-44f9-8d89-8578cf79fae9",
|
83 |
"metadata": {
|
84 |
"tags": []
|
|
|
105 |
"metadata": {},
|
106 |
"source": [
|
107 |
"## Start TEI with Inference Endpoints\n",
|
108 |
+
"Another option is to run TEI on [Inference Endpoints](https://huggingface.co/inference-endpoints). It's cheap and fast. It took me less than 5 minutes to get it up and running!\n",
|
109 |
"\n",
|
110 |
+
"Check here for a [comprehensive guide](https://huggingface.co/blog/inference-endpoints-embeddings#3-deploy-embedding-model-as-inference-endpoint). Make sure to set these options **IN ORDER**:\n",
|
111 |
+
"1. Model Repository = `transformers/paraphrase-multilingual-minilm-l12-v2`\n",
|
112 |
"1. Name your endpoint\n",
|
113 |
+
"1. Choose a GPU. I chose `Nvidia A10G`, which is **$1.3/hr**.\n",
|
114 |
"1. Advanced Configuration\n",
|
115 |
+
" 1. Task = `Sentence Embeddings`\n",
|
116 |
+
" 1. Revision (based on [this pull request for safetensors](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/discussions/8)) = `a21e6630`\n",
|
117 |
+
" 1. Container Type = `Text Embeddings Inference`\n",
|
118 |
" \n",
|
119 |
"Set the other options as you prefer."
|
120 |
]
|
|
|
129 |
},
|
130 |
{
|
131 |
"cell_type": "code",
|
132 |
+
"execution_count": 3,
|
133 |
"id": "a69e2ee1-67f2-4f0a-b496-02f5415a52ca",
|
134 |
"metadata": {
|
135 |
"tags": []
|
|
|
139 |
"name": "stdin",
|
140 |
"output_type": "stream",
|
141 |
"text": [
|
142 |
+
"What is your API_URL? ········\n",
|
143 |
+
"What is your BEARER TOKEN? Check your endpoint. ········\n"
|
144 |
]
|
145 |
}
|
146 |
],
|
147 |
"source": [
|
148 |
"import getpass\n",
|
149 |
+
"API_URL = getpass.getpass(prompt='What is your API_URL?')\n",
|
150 |
+
"bearer_token = getpass.getpass(prompt='What is your BEARER TOKEN? Check your endpoint.')"
|
151 |
]
|
152 |
},
|
153 |
{
|
154 |
"cell_type": "code",
|
155 |
+
"execution_count": 4,
|
156 |
"id": "949d6bf8-804f-496b-a59a-834483cc7073",
|
157 |
"metadata": {
|
158 |
"tags": []
|
|
|
169 |
},
|
170 |
{
|
171 |
"cell_type": "code",
|
172 |
+
"execution_count": 5,
|
173 |
"id": "d00b4af1-8fbc-4f7a-8a78-e1c52dd77a66",
|
174 |
"metadata": {
|
175 |
"tags": []
|
|
|
207 |
},
|
208 |
{
|
209 |
"cell_type": "code",
|
210 |
+
"execution_count": 6,
|
211 |
"id": "abb5186b-ee67-4e1e-882d-3d8d5b4575d4",
|
212 |
"metadata": {
|
213 |
"tags": []
|
|
|
226 |
},
|
227 |
{
|
228 |
"cell_type": "code",
|
229 |
+
"execution_count": 7,
|
230 |
"id": "c4b82ea2-8b30-4c2e-99f0-9a30f2f1bfb7",
|
231 |
"metadata": {
|
232 |
"tags": []
|
|
|
255 |
},
|
256 |
{
|
257 |
"cell_type": "code",
|
258 |
+
"execution_count": 8,
|
259 |
"id": "f6f74545-54a7-4f41-9f02-96964e1417f0",
|
260 |
"metadata": {
|
261 |
"tags": []
|
|
|
263 |
"outputs": [],
|
264 |
"source": [
|
265 |
"files_in = list((proj_dir / 'data/processed/').glob('*.ndjson'))\n",
|
266 |
+
"folder_out = proj_dir / 'data/embedded/'\n",
|
267 |
+
"folder_out_str = str(folder_out)"
|
268 |
]
|
269 |
},
|
270 |
{
|
|
|
289 |
},
|
290 |
{
|
291 |
"cell_type": "code",
|
292 |
+
"execution_count": 9,
|
293 |
"id": "e455dd52-aad3-4313-8738-03141ee5152a",
|
294 |
"metadata": {
|
295 |
"tags": []
|
|
|
329 |
},
|
330 |
{
|
331 |
"cell_type": "code",
|
332 |
+
"execution_count": 10,
|
333 |
"id": "f0d17264-72dc-40be-aa46-17cde38c8189",
|
334 |
"metadata": {
|
335 |
"tags": []
|
|
|
338 |
{
|
339 |
"data": {
|
340 |
"application/vnd.jupyter.widget-view+json": {
|
341 |
+
"model_id": "c4b7384336ad4c39a417a54a5a00a4ad",
|
342 |
"version_major": 2,
|
343 |
"version_minor": 0
|
344 |
},
|
|
|
352 |
{
|
353 |
"data": {
|
354 |
"application/vnd.jupyter.widget-view+json": {
|
355 |
+
"model_id": "0b034dc636df440594550f56dc152c8b",
|
356 |
"version_major": 2,
|
357 |
"version_minor": 0
|
358 |
},
|
|
|
373 |
{
|
374 |
"data": {
|
375 |
"application/vnd.jupyter.widget-view+json": {
|
376 |
+
"model_id": "0203531009644b75abb22725a38b3ace",
|
377 |
"version_major": 2,
|
378 |
"version_minor": 0
|
379 |
},
|
|
|
394 |
{
|
395 |
"data": {
|
396 |
"application/vnd.jupyter.widget-view+json": {
|
397 |
+
"model_id": "a4c781089c42466ba380b0b598b2f9e6",
|
398 |
"version_major": 2,
|
399 |
"version_minor": 0
|
400 |
},
|
|
|
415 |
{
|
416 |
"data": {
|
417 |
"application/vnd.jupyter.widget-view+json": {
|
418 |
+
"model_id": "66a0feea106145a0aadeb64fab48b6f8",
|
419 |
"version_major": 2,
|
420 |
"version_minor": 0
|
421 |
},
|
|
|
436 |
{
|
437 |
"data": {
|
438 |
"application/vnd.jupyter.widget-view+json": {
|
439 |
+
"model_id": "c6976832a78e48c5be335c5fef14bb5d",
|
440 |
"version_major": 2,
|
441 |
"version_minor": 0
|
442 |
},
|
|
|
457 |
{
|
458 |
"data": {
|
459 |
"application/vnd.jupyter.widget-view+json": {
|
460 |
+
"model_id": "764ceac837b040a39c2541074386e1f6",
|
461 |
"version_major": 2,
|
462 |
"version_minor": 0
|
463 |
},
|
|
|
478 |
{
|
479 |
"data": {
|
480 |
"application/vnd.jupyter.widget-view+json": {
|
481 |
+
"model_id": "6d268907dd9844cd8f81f48f8568f576",
|
482 |
"version_major": 2,
|
483 |
"version_minor": 0
|
484 |
},
|
|
|
499 |
{
|
500 |
"data": {
|
501 |
"application/vnd.jupyter.widget-view+json": {
|
502 |
+
"model_id": "53bb70b332774c3d867cdb1cb3c48958",
|
503 |
"version_major": 2,
|
504 |
"version_minor": 0
|
505 |
},
|
|
|
520 |
{
|
521 |
"data": {
|
522 |
"application/vnd.jupyter.widget-view+json": {
|
523 |
+
"model_id": "525ae3cf63af47b2acad508cb3c3efb7",
|
524 |
"version_major": 2,
|
525 |
"version_minor": 0
|
526 |
},
|
|
|
541 |
{
|
542 |
"data": {
|
543 |
"application/vnd.jupyter.widget-view+json": {
|
544 |
+
"model_id": "1a4c5103a1184ca1999b452c716131be",
|
545 |
"version_major": 2,
|
546 |
"version_minor": 0
|
547 |
},
|
|
|
562 |
{
|
563 |
"data": {
|
564 |
"application/vnd.jupyter.widget-view+json": {
|
565 |
+
"model_id": "9a06e2e21c6d4d12a04a55ca746594a4",
|
566 |
"version_major": 2,
|
567 |
"version_minor": 0
|
568 |
},
|
|
|
583 |
{
|
584 |
"data": {
|
585 |
"application/vnd.jupyter.widget-view+json": {
|
586 |
+
"model_id": "933d457b2f4f4f1fa3d20b469dc22d75",
|
587 |
"version_major": 2,
|
588 |
"version_minor": 0
|
589 |
},
|
|
|
604 |
{
|
605 |
"data": {
|
606 |
"application/vnd.jupyter.widget-view+json": {
|
607 |
+
"model_id": "0e4ce5ea591f431ca1ba6497ccf82b84",
|
608 |
"version_major": 2,
|
609 |
"version_minor": 0
|
610 |
},
|
|
|
625 |
{
|
626 |
"data": {
|
627 |
"application/vnd.jupyter.widget-view+json": {
|
628 |
+
"model_id": "1d267ec29d864694b9f89fbf15e3e34a",
|
629 |
"version_major": 2,
|
630 |
"version_minor": 0
|
631 |
},
|
|
|
646 |
{
|
647 |
"data": {
|
648 |
"application/vnd.jupyter.widget-view+json": {
|
649 |
+
"model_id": "bfb1ecea3b2143c1916beb446201fe7f",
|
650 |
"version_major": 2,
|
651 |
"version_minor": 0
|
652 |
},
|
|
|
667 |
{
|
668 |
"data": {
|
669 |
"application/vnd.jupyter.widget-view+json": {
|
670 |
+
"model_id": "cdb83413a46e4ba984eb261994d05cd3",
|
671 |
"version_major": 2,
|
672 |
"version_minor": 0
|
673 |
},
|
|
|
688 |
{
|
689 |
"data": {
|
690 |
"application/vnd.jupyter.widget-view+json": {
|
691 |
+
"model_id": "a6cb8095952f4db3ab6d31219c21087e",
|
692 |
"version_major": 2,
|
693 |
"version_minor": 0
|
694 |
},
|
|
|
709 |
{
|
710 |
"data": {
|
711 |
"application/vnd.jupyter.widget-view+json": {
|
712 |
+
"model_id": "0699bd10530c4a34aaaf9e88523ad5e6",
|
713 |
"version_major": 2,
|
714 |
"version_minor": 0
|
715 |
},
|
|
|
730 |
{
|
731 |
"data": {
|
732 |
"application/vnd.jupyter.widget-view+json": {
|
733 |
+
"model_id": "736895c24eb84a8a9f514d99c628bdc7",
|
734 |
"version_major": 2,
|
735 |
"version_minor": 0
|
736 |
},
|
|
|
751 |
{
|
752 |
"data": {
|
753 |
"application/vnd.jupyter.widget-view+json": {
|
754 |
+
"model_id": "632c1bfc4370488ab977bedc8c31d404",
|
755 |
"version_major": 2,
|
756 |
"version_minor": 0
|
757 |
},
|
|
|
772 |
{
|
773 |
"data": {
|
774 |
"application/vnd.jupyter.widget-view+json": {
|
775 |
+
"model_id": "d2c550797b5f4444b91b954c3f3958b1",
|
776 |
"version_major": 2,
|
777 |
"version_minor": 0
|
778 |
},
|
|
|
793 |
{
|
794 |
"data": {
|
795 |
"application/vnd.jupyter.widget-view+json": {
|
796 |
+
"model_id": "8918701c109f4ecdbb1d73e5fe97d6b5",
|
797 |
"version_major": 2,
|
798 |
"version_minor": 0
|
799 |
},
|
|
|
814 |
{
|
815 |
"data": {
|
816 |
"application/vnd.jupyter.widget-view+json": {
|
817 |
+
"model_id": "c69c13c3a1354f5c900d268500ffcb00",
|
818 |
"version_major": 2,
|
819 |
"version_minor": 0
|
820 |
},
|
|
|
830 |
"output_type": "stream",
|
831 |
"text": [
|
832 |
"Batch 23: Embeddings = 70322 documents = 70322\n",
|
833 |
+
"104 min 32.33 sec\n"
|
834 |
]
|
835 |
}
|
836 |
],
|
|
|
856 |
" for document in documents:\n",
|
857 |
" json_str = json.dumps(document, ensure_ascii=False)\n",
|
858 |
" f.write(json_str + '\\n')\n",
|
859 |
+
" \n",
|
860 |
+
"# Print elapsed time\n",
|
861 |
+
"elapsed_time = time.perf_counter() - start\n",
|
862 |
+
"minutes, seconds = divmod(elapsed_time, 60)\n",
|
863 |
+
"print(f\"{int(minutes)} min {seconds:.2f} sec\")"
|
864 |
+
]
|
865 |
+
},
|
866 |
+
{
|
867 |
+
"cell_type": "markdown",
|
868 |
+
"id": "3f0d9e6d-68f2-4086-9bcc-ffb27971fd63",
|
869 |
+
"metadata": {},
|
870 |
+
"source": [
|
871 |
+
"Let's make sure that we still have all our documents:"
|
872 |
]
|
873 |
},
|
874 |
{
|
875 |
"cell_type": "code",
|
876 |
+
"execution_count": 11,
|
877 |
+
"id": "abc6dccc-0e5c-45e2-a269-b9f02cff2d05",
|
878 |
"metadata": {
|
879 |
"tags": []
|
880 |
},
|
881 |
+
"outputs": [
|
882 |
+
{
|
883 |
+
"name": "stdout",
|
884 |
+
"output_type": "stream",
|
885 |
+
"text": [
|
886 |
+
"/home/ec2-user/arabic-wiki/data/embedded\n",
|
887 |
+
"2094596\n"
|
888 |
+
]
|
889 |
+
}
|
890 |
+
],
|
891 |
"source": [
|
892 |
+
"!echo \"$folder_out_str\" && cat \"$folder_out_str\"/*.ndjson | wc -l"
|
893 |
]
|
894 |
},
|
895 |
{
|
896 |
+
"cell_type": "code",
|
897 |
+
"execution_count": 12,
|
898 |
+
"id": "cdee2b1c-0493-4b3e-8ecb-9d79109c756e",
|
899 |
+
"metadata": {
|
900 |
+
"collapsed": true,
|
901 |
+
"jupyter": {
|
902 |
+
"outputs_hidden": true
|
903 |
+
},
|
904 |
+
"tags": []
|
905 |
+
},
|
906 |
+
"outputs": [
|
907 |
+
{
|
908 |
+
"data": {
|
909 |
+
"text/plain": [
|
910 |
+
"{'content': '毓卮丕亍 賰丕乇賷賳 賴賵 賲賳 爻賱爻賱丞 賲胤丕毓賲 丕爻鬲乇丕賱賷丞 賷賴丿賮 毓賲丿丕賸 毓賳 鬲噩乇亘丞 鬲賳丕賵賱 胤毓丕賲 睾賷乇 爻丕乇賻丞 賵賷鬲賲 鬲賵噩賷賴 丕賱賲賵馗賮賷賳 賱廿賴丕賳丞 丕賱毓賲賱丕亍 胤賵丕賱 賵噩亘丕鬲賴賲.\\n丕賯鬲亘爻 丕爻賲 丕賱賲胤毓賲 賲賳 丕賱賲氐胤賱丨 丕賱毓丕賲賷 毓賱賶 丕賱廿賳鬲乇賳鬲 (賰丕乇賷賳) 賵丕賱匕賷 賷爻鬲禺丿賲 賱賵氐賮 丕賲乇兀丞 亘賷囟丕亍 賲爻賳丞 賵賯丨丞 亘卮賰賱 賳賲胤賷.\\n鬲丕乇賷禺 丕賱賲胤毓賲.\\n鬲賲 廿賳卮丕亍 丕賱爻賱爻賱丞 賮賷 兀爻鬲乇丕賱賷丕 (爻賷丿賳賷) 賮賷 毓丕賲 2021 賲賳 賯亘賱 廿賷丿賷賳 賱賷賮賳 賵噩賷賲爻 賮丕乇賷賱. 丕賱賲胤毓賲 匕賵 胤丕亘毓 禺丕氐 賷毓鬲賲丿 毓賱賶 禺丿賲丞 鬲噩乇亘丞 胤毓丕賲 睾賷乇 爻丕乇丞 丨賷孬 賷丿賮毓 丕賱毓賲賱丕亍 賱賱賲賵馗賮賷賳 賱廿賴丕賳鬲賴賲 賵賰丕賳 賲賳 丕賱賲賮鬲乇囟 丕賳 賷賰賵賳 丕賱賲胤毓賲 賲胤毓賲丕賸 賲賳亘孬賯丕賸 賱賲丿丞 爻鬲丞 兀卮賴乇 賮賷 賵賵乇賱丿 爻賰賵賷乇.\\n丕孬丕乇鬲 賮賰乇丞 丕賱賲胤毓賲 賮賷 丕賱亘丿丕賷丞 乇丿丕鬲 賮毓賱 賲鬲睾丕賷乇丞 賲賲丕 兀孬丕乇 丕賱禺賵賮 亘卮兀賳 賲丕 廿匕丕 賰丕賳鬲 丕賱廿賴丕賳丕鬲 丕賱賲鬲亘丕丿賱丞 賲賳 丕賱賲賲賰賳 丕賳 鬲毓乇囟 丕賱賲賵馗賮賷賳 賱爻賵亍 丕賱賲毓丕賲賱丞 賲賳 賯亘賱 丕賱毓賲賱丕亍.\\n丕爻賲 (賰丕乇賷賳) 賴賵 廿卮丕乇丞 廿賱賶 丕賱廿爻賲 丕賱賲爻鬲禺丿賲 賮賷 丕賱賲賷賲丕鬲 (丕賱賳賰鬲 丕賱鬲賷 鬲賳卮賴乇 亘爻乇毓丞 賮賷 賲賵丕賯毓 丕賱鬲賵丕氐賱) 賱賵氐賮 丕賲乇兀丞 亘賷囟丕亍 賮賷 賲賳鬲氐賮 丕賱毓賲乇 賵賵賯丨丞 亘卮賰賱 賳賲胤賷.\\n賷胤賱亘 賲賳 丕賱賲賵馗賮賷賳 丕乇鬲丿丕亍 卮禺氐賷丞 賵賯丨丞 賵丕賱爻禺乇賷丞 賲賳 丕賱毓賲賱丕亍 亘卮賰賱 賴夭賱賷 丕孬賳丕亍 鬲賳丕賵賱 賵噩亘丕鬲賴賲 賵賲賳 丕賱賲鬲賵賯毓 丕賳 賷毓賷丿 丕賱毓賲賱丕亍 賴匕丕 丕賱爻賱賵賰 賲賳 禺賱丕賱 丕賱鬲氐乇賮 亘賵賯丕丨丞 鬲噩丕賴 丕賱賲賵馗賮賷賳 賵賲毓 匕賱賰 賷購丨馗乇 毓賱賶 丕賱毓賲賱丕亍 賵丕賱賲賵馗賮賷賳 丕爻鬲禺丿丕賲 丕賱廿賴丕賳丕鬲 丕賱毓賳氐乇賷丞 兀賵 丕賱鬲丨賷夭 丕賱噩賳爻賷 兀賵 乇賴丕亘 丕賱賲孬賱賷丞 丕賱噩賳爻賷丞.\\n鬲鬲囟賲賳 丕賱毓丿賷丿 賲賳 賴匕賴 丕賱鬲亘丕丿賱丕鬲 賱睾丞 賳丕亘賷丞 賵賷噩亘 丕賳 賷賰賵賳 亘乇賮賯丞 丕賱丕卮禺丕氐 丕賱賱匕賷賳 賷賯賱賵賳 毓賳 16 毓丕賲丕賻賻 亘丕賱睾賷賳.\\n賰賲丕 賷賲賰賳 賱賲丕賱賰賷 亘胤丕賯丞 賴賵賷丞 鬲馗賴乇 丕賳 丕爻賲賴賲 賰丕乇賷賳 丕賱丨氐賵賱 毓賱賶 賲卮乇賵亘 賲噩丕賳賷.\\n',\n",
|
911 |
+
" 'content_type': 'text',\n",
|
912 |
+
" 'score': None,\n",
|
913 |
+
" 'meta': {'id': '8974231',\n",
|
914 |
+
" 'revid': '593870',\n",
|
915 |
+
" 'url': 'https://ar.wikipedia.org/wiki?curid=8974231',\n",
|
916 |
+
" 'title': 'مطعم عشاء كارين',\n",
|
917 |
+
" '_split_id': 0,\n",
|
918 |
+
" '_split_overlap': [{'doc_id': '288196225044b53e6ff86f2485257a0a',\n",
|
919 |
+
" 'range': [790, 1225]}]},\n",
|
920 |
+
" 'id_hash_keys': ['content'],\n",
|
921 |
+
" 'embedding': [0.053985596,\n",
|
922 |
+
" -0.06933594,\n",
|
923 |
+
" -0.046417236,\n",
|
924 |
+
" 0.07788086,\n",
|
925 |
+
" -0.06768799,\n",
|
926 |
+
" -0.01789856,\n",
|
927 |
+
" 0.03265381,\n",
|
928 |
+
" -0.1060791,\n",
|
929 |
+
" 0.046325684,\n",
|
930 |
+
" 0.00022745132,\n",
|
931 |
+
" 0.01524353,\n",
|
932 |
+
" 0.021408081,\n",
|
933 |
+
" -0.039398193,\n",
|
934 |
+
" -0.076049805,\n",
|
935 |
+
" -0.058380127,\n",
|
936 |
+
" -0.027786255,\n",
|
937 |
+
" 0.067993164,\n",
|
938 |
+
" -0.00894928,\n",
|
939 |
+
" 0.11645508,\n",
|
940 |
+
" 0.09039307,\n",
|
941 |
+
" 0.032684326,\n",
|
942 |
+
" -0.023635864,\n",
|
943 |
+
" 0.00970459,\n",
|
944 |
+
" 0.08312988,\n",
|
945 |
+
" 0.01638794,\n",
|
946 |
+
" -0.058380127,\n",
|
947 |
+
" 0.0501709,\n",
|
948 |
+
" 0.035583496,\n",
|
949 |
+
" -0.042297363,\n",
|
950 |
+
" -0.011688232,\n",
|
951 |
+
" 0.010314941,\n",
|
952 |
+
" 0.026626587,\n",
|
953 |
+
" 0.039978027,\n",
|
954 |
+
" 0.04095459,\n",
|
955 |
+
" 0.039398193,\n",
|
956 |
+
" -0.029754639,\n",
|
957 |
+
" 0.01360321,\n",
|
958 |
+
" -0.0015621185,\n",
|
959 |
+
" -0.009384155,\n",
|
960 |
+
" 0.043151855,\n",
|
961 |
+
" 0.09710693,\n",
|
962 |
+
" -0.0044937134,\n",
|
963 |
+
" 0.0059928894,\n",
|
964 |
+
" -0.07293701,\n",
|
965 |
+
" -0.012710571,\n",
|
966 |
+
" 0.013046265,\n",
|
967 |
+
" -0.08099365,\n",
|
968 |
+
" 0.015457153,\n",
|
969 |
+
" -0.037963867,\n",
|
970 |
+
" -0.016601562,\n",
|
971 |
+
" -0.09564209,\n",
|
972 |
+
" 0.038238525,\n",
|
973 |
+
" -0.020751953,\n",
|
974 |
+
" -0.016494751,\n",
|
975 |
+
" 0.05529785,\n",
|
976 |
+
" -0.024490356,\n",
|
977 |
+
" 0.011192322,\n",
|
978 |
+
" 0.021087646,\n",
|
979 |
+
" -0.1274414,\n",
|
980 |
+
" 0.04840088,\n",
|
981 |
+
" 0.068603516,\n",
|
982 |
+
" -0.026535034,\n",
|
983 |
+
" 0.010772705,\n",
|
984 |
+
" -0.03778076,\n",
|
985 |
+
" -0.046447754,\n",
|
986 |
+
" 0.012870789,\n",
|
987 |
+
" -0.008674622,\n",
|
988 |
+
" 0.062561035,\n",
|
989 |
+
" -0.06188965,\n",
|
990 |
+
" -0.05831909,\n",
|
991 |
+
" -0.05557251,\n",
|
992 |
+
" 0.018417358,\n",
|
993 |
+
" -0.028656006,\n",
|
994 |
+
" 0.015487671,\n",
|
995 |
+
" 0.0146865845,\n",
|
996 |
+
" -0.010131836,\n",
|
997 |
+
" 0.06652832,\n",
|
998 |
+
" -0.09710693,\n",
|
999 |
+
" -0.012542725,\n",
|
1000 |
+
" 0.04815674,\n",
|
1001 |
+
" 0.019470215,\n",
|
1002 |
+
" 0.0017337799,\n",
|
1003 |
+
" -0.0069770813,\n",
|
1004 |
+
" -0.027648926,\n",
|
1005 |
+
" -0.103149414,\n",
|
1006 |
+
" -0.019058228,\n",
|
1007 |
+
" -0.049926758,\n",
|
1008 |
+
" -0.05307007,\n",
|
1009 |
+
" 0.13562012,\n",
|
1010 |
+
" 0.016311646,\n",
|
1011 |
+
" -0.10913086,\n",
|
1012 |
+
" -0.04837036,\n",
|
1013 |
+
" 0.033996582,\n",
|
1014 |
+
" -0.042144775,\n",
|
1015 |
+
" 0.043060303,\n",
|
1016 |
+
" -0.10797119,\n",
|
1017 |
+
" -0.087402344,\n",
|
1018 |
+
" -0.051452637,\n",
|
1019 |
+
" 0.018203735,\n",
|
1020 |
+
" 0.15771484,\n",
|
1021 |
+
" -0.025131226,\n",
|
1022 |
+
" 0.074401855,\n",
|
1023 |
+
" -0.0033817291,\n",
|
1024 |
+
" -0.026138306,\n",
|
1025 |
+
" 0.032165527,\n",
|
1026 |
+
" -0.010025024,\n",
|
1027 |
+
" -0.016189575,\n",
|
1028 |
+
" -0.001121521,\n",
|
1029 |
+
" -0.049560547,\n",
|
1030 |
+
" -0.034057617,\n",
|
1031 |
+
" -0.04336548,\n",
|
1032 |
+
" 0.11694336,\n",
|
1033 |
+
" -0.035949707,\n",
|
1034 |
+
" -0.061767578,\n",
|
1035 |
+
" -0.101623535,\n",
|
1036 |
+
" 0.0051116943,\n",
|
1037 |
+
" 0.039245605,\n",
|
1038 |
+
" 0.0053977966,\n",
|
1039 |
+
" 0.016708374,\n",
|
1040 |
+
" -0.012908936,\n",
|
1041 |
+
" -0.022369385,\n",
|
1042 |
+
" 0.03475952,\n",
|
1043 |
+
" 0.035583496,\n",
|
1044 |
+
" 0.046417236,\n",
|
1045 |
+
" -0.06097412,\n",
|
1046 |
+
" 0.07409668,\n",
|
1047 |
+
" -0.07312012,\n",
|
1048 |
+
" -0.019683838,\n",
|
1049 |
+
" -0.07086182,\n",
|
1050 |
+
" -0.03967285,\n",
|
1051 |
+
" -0.016937256,\n",
|
1052 |
+
" -0.02658081,\n",
|
1053 |
+
" 0.105651855,\n",
|
1054 |
+
" 0.05831909,\n",
|
1055 |
+
" 0.03778076,\n",
|
1056 |
+
" -0.07543945,\n",
|
1057 |
+
" 0.055267334,\n",
|
1058 |
+
" 0.012268066,\n",
|
1059 |
+
" 0.07720947,\n",
|
1060 |
+
" -0.068359375,\n",
|
1061 |
+
" -0.011100769,\n",
|
1062 |
+
" -0.0072898865,\n",
|
1063 |
+
" 0.051635742,\n",
|
1064 |
+
" 0.027954102,\n",
|
1065 |
+
" 0.043121338,\n",
|
1066 |
+
" 0.032440186,\n",
|
1067 |
+
" 0.081604004,\n",
|
1068 |
+
" -0.058807373,\n",
|
1069 |
+
" -0.048706055,\n",
|
1070 |
+
" 0.07867432,\n",
|
1071 |
+
" 0.014404297,\n",
|
1072 |
+
" -0.0024490356,\n",
|
1073 |
+
" -0.008575439,\n",
|
1074 |
+
" 0.010345459,\n",
|
1075 |
+
" -0.013870239,\n",
|
1076 |
+
" -0.019424438,\n",
|
1077 |
+
" 0.020889282,\n",
|
1078 |
+
" 0.013290405,\n",
|
1079 |
+
" 0.012817383,\n",
|
1080 |
+
" 0.015930176,\n",
|
1081 |
+
" 0.03768921,\n",
|
1082 |
+
" -0.0012264252,\n",
|
1083 |
+
" 0.0010223389,\n",
|
1084 |
+
" -0.023483276,\n",
|
1085 |
+
" -0.005252838,\n",
|
1086 |
+
" -0.051574707,\n",
|
1087 |
+
" 0.034729004,\n",
|
1088 |
+
" -0.004081726,\n",
|
1089 |
+
" 0.0317688,\n",
|
1090 |
+
" 0.0087890625,\n",
|
1091 |
+
" 0.11047363,\n",
|
1092 |
+
" 0.05291748,\n",
|
1093 |
+
" -0.12841797,\n",
|
1094 |
+
" 0.031799316,\n",
|
1095 |
+
" -0.09881592,\n",
|
1096 |
+
" 0.07299805,\n",
|
1097 |
+
" 0.02859497,\n",
|
1098 |
+
" 0.024780273,\n",
|
1099 |
+
" 0.049316406,\n",
|
1100 |
+
" -0.07122803,\n",
|
1101 |
+
" 0.03930664,\n",
|
1102 |
+
" 0.012702942,\n",
|
1103 |
+
" 0.06915283,\n",
|
1104 |
+
" -0.03967285,\n",
|
1105 |
+
" 0.035949707,\n",
|
1106 |
+
" -0.045166016,\n",
|
1107 |
+
" 0.09625244,\n",
|
1108 |
+
" 0.001203537,\n",
|
1109 |
+
" 0.022750854,\n",
|
1110 |
+
" -0.03665161,\n",
|
1111 |
+
" -0.013633728,\n",
|
1112 |
+
" -0.018112183,\n",
|
1113 |
+
" 0.054107666,\n",
|
1114 |
+
" -0.007106781,\n",
|
1115 |
+
" 0.004924774,\n",
|
1116 |
+
" -0.014953613,\n",
|
1117 |
+
" 0.07147217,\n",
|
1118 |
+
" -0.013092041,\n",
|
1119 |
+
" 0.016845703,\n",
|
1120 |
+
" -0.002910614,\n",
|
1121 |
+
" -0.05593872,\n",
|
1122 |
+
" 0.027404785,\n",
|
1123 |
+
" 0.002696991,\n",
|
1124 |
+
" 0.05822754,\n",
|
1125 |
+
" 0.0066566467,\n",
|
1126 |
+
" -0.09729004,\n",
|
1127 |
+
" 0.040100098,\n",
|
1128 |
+
" -0.00868988,\n",
|
1129 |
+
" 0.10290527,\n",
|
1130 |
+
" 0.04144287,\n",
|
1131 |
+
" -0.012680054,\n",
|
1132 |
+
" 0.039215088,\n",
|
1133 |
+
" -0.14074707,\n",
|
1134 |
+
" 0.08215332,\n",
|
1135 |
+
" -0.05078125,\n",
|
1136 |
+
" -0.028549194,\n",
|
1137 |
+
" 0.011962891,\n",
|
1138 |
+
" 0.028900146,\n",
|
1139 |
+
" -0.02444458,\n",
|
1140 |
+
" 0.004207611,\n",
|
1141 |
+
" -0.00995636,\n",
|
1142 |
+
" 0.028717041,\n",
|
1143 |
+
" -0.08325195,\n",
|
1144 |
+
" -0.047424316,\n",
|
1145 |
+
" 0.032043457,\n",
|
1146 |
+
" -0.04675293,\n",
|
1147 |
+
" -0.064575195,\n",
|
1148 |
+
" -0.03857422,\n",
|
1149 |
+
" 0.0070266724,\n",
|
1150 |
+
" -0.12634277,\n",
|
1151 |
+
" -0.0803833,\n",
|
1152 |
+
" -0.05419922,\n",
|
1153 |
+
" 0.064331055,\n",
|
1154 |
+
" -0.004421234,\n",
|
1155 |
+
" -0.00844574,\n",
|
1156 |
+
" -0.05923462,\n",
|
1157 |
+
" 0.052490234,\n",
|
1158 |
+
" 0.032592773,\n",
|
1159 |
+
" 0.024230957,\n",
|
1160 |
+
" 0.075683594,\n",
|
1161 |
+
" 0.011390686,\n",
|
1162 |
+
" 0.013252258,\n",
|
1163 |
+
" -0.029403687,\n",
|
1164 |
+
" -0.03338623,\n",
|
1165 |
+
" -0.045928955,\n",
|
1166 |
+
" 0.015022278,\n",
|
1167 |
+
" -0.08343506,\n",
|
1168 |
+
" 0.060180664,\n",
|
1169 |
+
" 0.076171875,\n",
|
1170 |
+
" 0.058898926,\n",
|
1171 |
+
" 0.026184082,\n",
|
1172 |
+
" -0.04031372,\n",
|
1173 |
+
" -0.0847168,\n",
|
1174 |
+
" -0.06628418,\n",
|
1175 |
+
" -0.017974854,\n",
|
1176 |
+
" -0.09967041,\n",
|
1177 |
+
" 0.07952881,\n",
|
1178 |
+
" 0.012413025,\n",
|
1179 |
+
" 0.04006958,\n",
|
1180 |
+
" 0.07788086,\n",
|
1181 |
+
" 0.014640808,\n",
|
1182 |
+
" -0.029281616,\n",
|
1183 |
+
" -0.04949951,\n",
|
1184 |
+
" 0.012565613,\n",
|
1185 |
+
" 0.025466919,\n",
|
1186 |
+
" -0.018478394,\n",
|
1187 |
+
" -0.072753906,\n",
|
1188 |
+
" 0.08905029,\n",
|
1189 |
+
" 0.023849487,\n",
|
1190 |
+
" 0.012306213,\n",
|
1191 |
+
" -0.089538574,\n",
|
1192 |
+
" -0.05657959,\n",
|
1193 |
+
" 0.11804199,\n",
|
1194 |
+
" -0.035827637,\n",
|
1195 |
+
" 0.049194336,\n",
|
1196 |
+
" -0.008911133,\n",
|
1197 |
+
" 0.09680176,\n",
|
1198 |
+
" -0.03616333,\n",
|
1199 |
+
" -0.057525635,\n",
|
1200 |
+
" 0.03375244,\n",
|
1201 |
+
" -0.027435303,\n",
|
1202 |
+
" 0.0035476685,\n",
|
1203 |
+
" 0.010269165,\n",
|
1204 |
+
" 0.031921387,\n",
|
1205 |
+
" 0.0011024475,\n",
|
1206 |
+
" 0.045715332,\n",
|
1207 |
+
" -0.026885986,\n",
|
1208 |
+
" 0.033935547,\n",
|
1209 |
+
" 0.06341553,\n",
|
1210 |
+
" 0.019958496,\n",
|
1211 |
+
" 0.008239746,\n",
|
1212 |
+
" 0.015174866,\n",
|
1213 |
+
" -0.023071289,\n",
|
1214 |
+
" 0.0056762695,\n",
|
1215 |
+
" 0.064575195,\n",
|
1216 |
+
" 0.0042533875,\n",
|
1217 |
+
" -0.05718994,\n",
|
1218 |
+
" 0.04486084,\n",
|
1219 |
+
" 0.020614624,\n",
|
1220 |
+
" 0.01461792,\n",
|
1221 |
+
" -0.09283447,\n",
|
1222 |
+
" 0.019592285,\n",
|
1223 |
+
" -0.022644043,\n",
|
1224 |
+
" 0.011512756,\n",
|
1225 |
+
" -0.005874634,\n",
|
1226 |
+
" -0.018569946,\n",
|
1227 |
+
" 0.006614685,\n",
|
1228 |
+
" 0.009269714,\n",
|
1229 |
+
" -0.04296875,\n",
|
1230 |
+
" -0.052856445,\n",
|
1231 |
+
" 0.084106445,\n",
|
1232 |
+
" 0.0043563843,\n",
|
1233 |
+
" -0.020721436,\n",
|
1234 |
+
" 0.029022217,\n",
|
1235 |
+
" 0.03982544,\n",
|
1236 |
+
" -0.109436035,\n",
|
1237 |
+
" -0.036071777,\n",
|
1238 |
+
" 0.03253174,\n",
|
1239 |
+
" 0.011558533,\n",
|
1240 |
+
" -0.10650635,\n",
|
1241 |
+
" 0.034454346,\n",
|
1242 |
+
" -0.06951904,\n",
|
1243 |
+
" -0.025817871,\n",
|
1244 |
+
" 0.10668945,\n",
|
1245 |
+
" 0.010101318,\n",
|
1246 |
+
" -0.070739746,\n",
|
1247 |
+
" 0.049621582,\n",
|
1248 |
+
" -0.09057617,\n",
|
1249 |
+
" 0.037231445,\n",
|
1250 |
+
" -0.03152466,\n",
|
1251 |
+
" -0.043914795,\n",
|
1252 |
+
" 0.07507324,\n",
|
1253 |
+
" 0.061645508,\n",
|
1254 |
+
" 0.0085372925,\n",
|
1255 |
+
" 0.004142761,\n",
|
1256 |
+
" -0.051971436,\n",
|
1257 |
+
" -0.05480957,\n",
|
1258 |
+
" 0.0030975342,\n",
|
1259 |
+
" -0.046875,\n",
|
1260 |
+
" -0.039398193,\n",
|
1261 |
+
" 0.08782959,\n",
|
1262 |
+
" -0.012550354,\n",
|
1263 |
+
" -0.003955841,\n",
|
1264 |
+
" -0.07775879,\n",
|
1265 |
+
" -0.021133423,\n",
|
1266 |
+
" 0.0062713623,\n",
|
1267 |
+
" -0.02255249,\n",
|
1268 |
+
" 0.017868042,\n",
|
1269 |
+
" 0.049560547,\n",
|
1270 |
+
" 0.028121948,\n",
|
1271 |
+
" 0.031707764,\n",
|
1272 |
+
" 0.041168213,\n",
|
1273 |
+
" 0.009559631,\n",
|
1274 |
+
" 0.036956787,\n",
|
1275 |
+
" 0.008987427,\n",
|
1276 |
+
" 0.0024776459,\n",
|
1277 |
+
" -0.003440857,\n",
|
1278 |
+
" -0.0067749023,\n",
|
1279 |
+
" -0.06439209,\n",
|
1280 |
+
" -0.010902405,\n",
|
1281 |
+
" -0.07104492,\n",
|
1282 |
+
" 0.006214142,\n",
|
1283 |
+
" -0.06359863,\n",
|
1284 |
+
" 0.062316895,\n",
|
1285 |
+
" 0.005367279,\n",
|
1286 |
+
" 0.015197754,\n",
|
1287 |
+
" -0.043182373,\n",
|
1288 |
+
" 0.050933838,\n",
|
1289 |
+
" 0.0035800934,\n",
|
1290 |
+
" 0.0032138824,\n",
|
1291 |
+
" -0.017974854,\n",
|
1292 |
+
" 0.08972168,\n",
|
1293 |
+
" 0.011268616,\n",
|
1294 |
+
" 0.020477295,\n",
|
1295 |
+
" -0.05050659,\n",
|
1296 |
+
" -0.07232666,\n",
|
1297 |
+
" 0.07055664,\n",
|
1298 |
+
" -0.010002136,\n",
|
1299 |
+
" 0.11480713,\n",
|
1300 |
+
" 0.02130127,\n",
|
1301 |
+
" 0.039093018,\n",
|
1302 |
+
" 0.009597778,\n",
|
1303 |
+
" -0.0619812,\n",
|
1304 |
+
" -0.016952515],\n",
|
1305 |
+
" 'id': '1af84f3b4cc6a9f1018f2f80b4fd3ba7'}"
|
1306 |
+
]
|
1307 |
+
},
|
1308 |
+
"execution_count": 12,
|
1309 |
+
"metadata": {},
|
1310 |
+
"output_type": "execute_result"
|
1311 |
+
}
|
1312 |
+
],
|
1313 |
"source": [
|
1314 |
+
"documents[0]"
|
1315 |
]
|
1316 |
},
|
1317 |
{
|
1318 |
"cell_type": "markdown",
|
1319 |
+
"id": "93d6ab01-bd3b-479d-918d-2bdb30b00fac",
|
1320 |
+
"metadata": {},
|
|
|
|
|
1321 |
"source": [
|
1322 |
+
"# Performance and Cost Analysis\n",
|
1323 |
+
"You can see that we are quite cost effective!\n",
|
1324 |
+
"![Cost](../media/arabic-rag-embeddings-cost.png)\n",
|
1325 |
+
"Note that the performance is over just the last 30 min window.\n",
|
1326 |
+
"Observations:\n",
|
1327 |
+
"- We have a throughput of `~333/s`\n",
|
1328 |
+
"- Our median latency per request is `~50ms`\n",
|
1329 |
+
"![Metrics](../media/arabic-rag-embeddings-metrics.png)"
|
1330 |
]
|
1331 |
},
|
1332 |
{
|