--- base_model: aubmindlab/bert-base-arabertv02 library_name: sentence-transformers metrics: - pearson_cosine - spearman_cosine - pearson_manhattan - spearman_manhattan - pearson_euclidean - spearman_euclidean - pearson_dot - spearman_dot - pearson_max - spearman_max pipeline_tag: sentence-similarity tags: - sentence-transformers - sentence-similarity - feature-extraction - generated_from_trainer - loss:CosineSimilarityLoss - mteb model-index: - name: silma-ai/silma-embeddding-matryoshka-v0.1 results: - dataset: config: ar name: MTEB MassiveIntentClassification (ar) revision: 4672e20407010da34463acc759c162ca9734bca6 split: test type: mteb/amazon_massive_intent metrics: - type: accuracy value: 56.445864156018835 - type: f1 value: 53.58282538318122 - type: f1_weighted value: 56.821808211639315 - type: main_score value: 56.445864156018835 task: type: Classification - dataset: config: en name: MTEB MassiveIntentClassification (en) revision: 4672e20407010da34463acc759c162ca9734bca6 split: test type: mteb/amazon_massive_intent metrics: - type: accuracy value: 47.40080699394754 - type: f1 value: 44.729286773524755 - type: f1_weighted value: 47.83506683571795 - type: main_score value: 47.40080699394754 task: type: Classification - dataset: config: ar name: MTEB MassiveIntentClassification (ar) revision: 4672e20407010da34463acc759c162ca9734bca6 split: validation type: mteb/amazon_massive_intent metrics: - type: accuracy value: 56.97983275946876 - type: f1 value: 53.809263807080086 - type: f1_weighted value: 57.14993215193604 - type: main_score value: 56.97983275946876 task: type: Classification - dataset: config: en name: MTEB MassiveIntentClassification (en) revision: 4672e20407010da34463acc759c162ca9734bca6 split: validation type: mteb/amazon_massive_intent metrics: - type: accuracy value: 47.683226758485006 - type: f1 value: 44.905317333393775 - type: f1_weighted value: 48.051379514830195 - type: main_score value: 47.683226758485006 task: type: Classification - dataset: 
config: ar name: MTEB MassiveScenarioClassification (ar) revision: fad2c6e8459f9e1c45d9315f4953d921437d70f8 split: test type: mteb/amazon_massive_scenario metrics: - type: accuracy value: 63.31876260928042 - type: f1 value: 63.197056314678754 - type: f1_weighted value: 62.7166315473092 - type: main_score value: 63.31876260928042 task: type: Classification - dataset: config: en name: MTEB MassiveScenarioClassification (en) revision: fad2c6e8459f9e1c45d9315f4953d921437d70f8 split: test type: mteb/amazon_massive_scenario metrics: - type: accuracy value: 53.35574983187627 - type: f1 value: 50.35837223252574 - type: f1_weighted value: 54.11644042208904 - type: main_score value: 53.35574983187627 task: type: Classification - dataset: config: ar name: MTEB MassiveScenarioClassification (ar) revision: fad2c6e8459f9e1c45d9315f4953d921437d70f8 split: validation type: mteb/amazon_massive_scenario metrics: - type: accuracy value: 62.26758484997541 - type: f1 value: 62.477928166560325 - type: f1_weighted value: 61.92238394647396 - type: main_score value: 62.26758484997541 task: type: Classification - dataset: config: en name: MTEB MassiveScenarioClassification (en) revision: fad2c6e8459f9e1c45d9315f4953d921437d70f8 split: validation type: mteb/amazon_massive_scenario metrics: - type: accuracy value: 52.62174126906049 - type: f1 value: 50.470501485026716 - type: f1_weighted value: 53.16459392827557 - type: main_score value: 52.62174126906049 task: type: Classification - dataset: config: en-en name: MTEB STS17 (en-en) revision: faeb762787bd10488a50c8b5be4a3b82e411949c split: test type: mteb/sts17-crosslingual-sts metrics: - type: cosine_pearson value: 74.33941506827517 - type: cosine_spearman value: 74.42197838273297 - type: euclidean_pearson value: 75.33836191339782 - type: euclidean_spearman value: 74.37385193453852 - type: main_score value: 74.42197838273297 - type: manhattan_pearson value: 75.41881517194568 - type: manhattan_spearman value: 74.47237277057877 - type: pearson 
value: 74.33941645999855 - type: spearman value: 74.42197838273297 task: type: STS - dataset: config: nl-en name: MTEB STS17 (nl-en) revision: faeb762787bd10488a50c8b5be4a3b82e411949c split: test type: mteb/sts17-crosslingual-sts metrics: - type: cosine_pearson value: 31.84872826199112 - type: cosine_spearman value: 32.22496230755917 - type: euclidean_pearson value: 21.830860533929688 - type: euclidean_spearman value: 21.38205815348658 - type: main_score value: 32.22496230755917 - type: manhattan_pearson value: 21.852430479395576 - type: manhattan_spearman value: 21.37848326556159 - type: pearson value: 31.84872485436001 - type: spearman value: 32.22496230755917 task: type: STS - dataset: config: en-ar name: MTEB STS17 (en-ar) revision: faeb762787bd10488a50c8b5be4a3b82e411949c split: test type: mteb/sts17-crosslingual-sts metrics: - type: cosine_pearson value: 43.37529327788584 - type: cosine_spearman value: 42.763149514327225 - type: euclidean_pearson value: 39.625411905897394 - type: euclidean_spearman value: 39.26727199746294 - type: main_score value: 42.763149514327225 - type: manhattan_pearson value: 40.49857681486655 - type: manhattan_spearman value: 40.63669314166475 - type: pearson value: 43.37529078998193 - type: spearman value: 42.763149514327225 task: type: STS - dataset: config: en-tr name: MTEB STS17 (en-tr) revision: faeb762787bd10488a50c8b5be4a3b82e411949c split: test type: mteb/sts17-crosslingual-sts metrics: - type: cosine_pearson value: 17.16722415938186 - type: cosine_spearman value: 15.590330355526344 - type: euclidean_pearson value: 4.430499555984906 - type: euclidean_spearman value: 2.729050802084264 - type: main_score value: 15.590330355526344 - type: manhattan_pearson value: 2.805408490135879 - type: manhattan_spearman value: 1.5237347692119627 - type: pearson value: 17.167228709176676 - type: spearman value: 15.590330355526344 task: type: STS - dataset: config: fr-en name: MTEB STS17 (fr-en) revision: 
faeb762787bd10488a50c8b5be4a3b82e411949c split: test type: mteb/sts17-crosslingual-sts metrics: - type: cosine_pearson value: 36.093945717347395 - type: cosine_spearman value: 37.33997345407934 - type: euclidean_pearson value: 23.156103022485055 - type: euclidean_spearman value: 20.62925594786342 - type: main_score value: 37.33997345407934 - type: manhattan_pearson value: 22.035024322719813 - type: manhattan_spearman value: 19.147522562438795 - type: pearson value: 36.09395175426761 - type: spearman value: 37.33997345407934 task: type: STS - dataset: config: en-de name: MTEB STS17 (en-de) revision: faeb762787bd10488a50c8b5be4a3b82e411949c split: test type: mteb/sts17-crosslingual-sts metrics: - type: cosine_pearson value: 29.064411455563 - type: cosine_spearman value: 29.232781114344697 - type: euclidean_pearson value: 16.90458086330736 - type: euclidean_spearman value: 17.462020565289887 - type: main_score value: 29.232781114344697 - type: manhattan_pearson value: 16.882446230243286 - type: manhattan_spearman value: 17.06144091941576 - type: pearson value: 29.06441922605839 - type: spearman value: 29.232781114344697 task: type: STS - dataset: config: es-en name: MTEB STS17 (es-en) revision: faeb762787bd10488a50c8b5be4a3b82e411949c split: test type: mteb/sts17-crosslingual-sts metrics: - type: cosine_pearson value: 27.686316587339473 - type: cosine_spearman value: 28.650995973102205 - type: euclidean_pearson value: 12.954885279630565 - type: euclidean_spearman value: 11.970815927480198 - type: main_score value: 28.650995973102205 - type: manhattan_pearson value: 12.079730127474948 - type: manhattan_spearman value: 10.606967901984147 - type: pearson value: 27.68631836666537 - type: spearman value: 28.650995973102205 task: type: STS - dataset: config: ar-ar name: MTEB STS17 (ar-ar) revision: faeb762787bd10488a50c8b5be4a3b82e411949c split: test type: mteb/sts17-crosslingual-sts metrics: - type: cosine_pearson value: 84.12612492708037 - type: cosine_spearman value: 
84.24703763883515 - type: euclidean_pearson value: 81.38085140113648 - type: euclidean_spearman value: 83.17403450502965 - type: main_score value: 84.24703763883515 - type: manhattan_pearson value: 81.18466522597414 - type: manhattan_spearman value: 82.61184409962614 - type: pearson value: 84.12612546419625 - type: spearman value: 84.25077492152536 task: type: STS - dataset: config: it-en name: MTEB STS17 (it-en) revision: faeb762787bd10488a50c8b5be4a3b82e411949c split: test type: mteb/sts17-crosslingual-sts metrics: - type: cosine_pearson value: 27.697680546701868 - type: cosine_spearman value: 25.19277336255784 - type: euclidean_pearson value: 13.964798090314115 - type: euclidean_spearman value: 10.512169361528596 - type: main_score value: 25.19277336255784 - type: manhattan_pearson value: 13.537525485694433 - type: manhattan_spearman value: 10.334001560105834 - type: pearson value: 27.697681880242325 - type: spearman value: 25.19277336255784 task: type: STS - dataset: config: de-en name: MTEB STS22.v2 (de-en) revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd split: test type: mteb/sts22-crosslingual-sts metrics: - type: cosine_pearson value: 32.87548760760924 - type: cosine_spearman value: 30.69782036694315 - type: euclidean_pearson value: 29.925045225262142 - type: euclidean_spearman value: 34.076021250318334 - type: main_score value: 30.69782036694315 - type: manhattan_pearson value: 30.815090565180945 - type: manhattan_spearman value: 34.91615861045259 - type: pearson value: 32.8754813614174 - type: spearman value: 30.69782036694315 task: type: STS - dataset: config: zh-en name: MTEB STS22.v2 (zh-en) revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd split: test type: mteb/sts22-crosslingual-sts metrics: - type: cosine_pearson value: 23.93269292232737 - type: cosine_spearman value: 16.781461291066496 - type: euclidean_pearson value: 20.87679825681155 - type: euclidean_spearman value: 13.764510796592536 - type: main_score value: 16.781461291066496 - type: 
manhattan_pearson value: 23.416430850444588 - type: manhattan_spearman value: 17.10405713909058 - type: pearson value: 23.932682034899777 - type: spearman value: 16.781461291066496 task: type: STS - dataset: config: ar name: MTEB STS22.v2 (ar) revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd split: test type: mteb/sts22-crosslingual-sts metrics: - type: cosine_pearson value: 51.73784691362425 - type: cosine_spearman value: 60.01035490847343 - type: euclidean_pearson value: 52.717195602630305 - type: euclidean_spearman value: 60.22164097529916 - type: main_score value: 60.01035490847343 - type: manhattan_pearson value: 53.04979941729716 - type: manhattan_spearman value: 60.393100473647706 - type: pearson value: 51.73784381247053 - type: spearman value: 60.020906672817276 task: type: STS - dataset: config: es-en name: MTEB STS22.v2 (es-en) revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd split: test type: mteb/sts22-crosslingual-sts metrics: - type: cosine_pearson value: 47.917244237624864 - type: cosine_spearman value: 53.23173373821509 - type: euclidean_pearson value: 48.172861539004636 - type: euclidean_spearman value: 53.32970069145014 - type: main_score value: 53.23173373821509 - type: manhattan_pearson value: 48.163716825216646 - type: manhattan_spearman value: 53.77963871495307 - type: pearson value: 47.91724405724847 - type: spearman value: 53.23173373821509 task: type: STS - dataset: config: pl-en name: MTEB STS22.v2 (pl-en) revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd split: test type: mteb/sts22-crosslingual-sts metrics: - type: cosine_pearson value: 43.66748993183993 - type: cosine_spearman value: 38.518248671828594 - type: euclidean_pearson value: 50.475058499541134 - type: euclidean_spearman value: 44.76070858743843 - type: main_score value: 38.518248671828594 - type: manhattan_pearson value: 50.576185727010014 - type: manhattan_spearman value: 45.5306304403841 - type: pearson value: 43.66750472144702 - type: spearman value: 38.518248671828594 
task: type: STS - dataset: config: en name: MTEB STS22.v2 (en) revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd split: test type: mteb/sts22-crosslingual-sts metrics: - type: cosine_pearson value: 56.41373213565263 - type: cosine_spearman value: 59.03774516602592 - type: euclidean_pearson value: 54.173092638047294 - type: euclidean_spearman value: 59.130444355085885 - type: main_score value: 59.03774516602592 - type: manhattan_pearson value: 54.18950361517434 - type: manhattan_spearman value: 58.78927227383971 - type: pearson value: 56.413733329868045 - type: spearman value: 59.03774516602592 task: type: STS license: apache-2.0 language: - ar - en --- # SILMA Arabic Matryoshka Embedding Model 0.1 The **SILMA Arabic Matryoshka Embedding Model 0.1** is an advanced Arabic text embedding model designed to produce powerful, contextually rich representations of text, facilitating a wide range of applications, from semantic search to document classification. This model leverages the innovative **Matryoshka** Embedding technique which can be used in different dimensions to optimize the speed, storage, and accuracy trade-offs. ## Usage ### Direct Usage (Sentence Transformers) First, install the Sentence Transformers library: ```bash pip install -U sentence-transformers ``` Then load the model: ```python from sentence_transformers import SentenceTransformer from sentence_transformers.util import cos_sim import pandas as pd model_name = "silma-ai/silma-embeddding-matryoshka-0.1" model = SentenceTransformer(model_name) ``` ### Samples Using Matryoshka, you can specify the first `(n)` dimensions to represent each text. In the following samples, you can check how each dimension affects the `cosine similarity` between a query and the two inputs. You can notice that in most cases, even a very low dimension (e.g. 8) can produce acceptable semantic similarity scores. 
#### [+] Short Sentence Similarity ```python query = "الطقس اليوم مشمس" sentence_1 = "الجو اليوم كان مشمسًا ورائعًا" sentence_2 = "الطقس اليوم غائم" scores = [] for dim in [768, 256, 48, 16, 8]: query_embedding = model.encode(query)[:dim] sent1_score = cos_sim(query_embedding, model.encode(sentence_1)[:dim])[0][0].tolist() sent2_score = cos_sim(query_embedding, model.encode(sentence_2)[:dim])[0][0].tolist() scores.append({ "dim": dim, "valid_top": sent1_score > sent2_score, "sent1_score": sent1_score, "sent2_score": sent2_score, }) scores_df = pd.DataFrame(scores) print(scores_df.to_markdown(index=False)) # | dim | valid_top | sent1_score | sent2_score | # |------:|:------------|--------------:|--------------:| # | 768 | True | 0.479942 | 0.233572 | # | 256 | True | 0.509289 | 0.208452 | # | 48 | True | 0.598825 | 0.191677 | # | 16 | True | 0.917707 | 0.458854 | # | 8 | True | 0.948563 | 0.675662 | ``` #### [+] Long Sentence Similarity ```python query = "الكتاب يتحدث عن أهمية الذكاء الاصطناعي في تطوير المجتمعات الحديثة" sentence_1 = "في هذا الكتاب، يناقش الكاتب كيف يمكن للتكنولوجيا أن تغير العالم" sentence_2 = "الكاتب يتحدث عن أساليب الطبخ التقليدية في دول البحر الأبيض المتوسط" scores = [] for dim in [768, 256, 48, 16, 8]: query_embedding = model.encode(query)[:dim] sent1_score = cos_sim(query_embedding, model.encode(sentence_1)[:dim])[0][0].tolist() sent2_score = cos_sim(query_embedding, model.encode(sentence_2)[:dim])[0][0].tolist() scores.append({ "dim": dim, "valid_top": sent1_score > sent2_score, "sent1_score": sent1_score, "sent2_score": sent2_score, }) scores_df = pd.DataFrame(scores) print(scores_df.to_markdown(index=False)) # | dim | valid_top | sent1_score | sent2_score | # |------:|:------------|--------------:|--------------:| # | 768 | True | 0.637418 | 0.262693 | # | 256 | True | 0.614761 | 0.268267 | # | 48 | True | 0.758887 | 0.384649 | # | 16 | True | 0.885737 | 0.204213 | # | 8 | True | 0.918684 | 0.146478 | ``` #### [+] Question to Paragraph 
Matching ```python query = "ما هي فوائد ممارسة الرياضة؟" sentence_1 = "ممارسة الرياضة بشكل منتظم تساعد على تحسين الصحة العامة واللياقة البدنية" sentence_2 = "تعليم الأطفال في سن مبكرة يساعدهم على تطوير المهارات العقلية بسرعة" scores = [] for dim in [768, 256, 48, 16, 8]: query_embedding = model.encode(query)[:dim] sent1_score = cos_sim(query_embedding, model.encode(sentence_1)[:dim])[0][0].tolist() sent2_score = cos_sim(query_embedding, model.encode(sentence_2)[:dim])[0][0].tolist() scores.append({ "dim": dim, "valid_top": sent1_score > sent2_score, "sent1_score": sent1_score, "sent2_score": sent2_score, }) scores_df = pd.DataFrame(scores) print(scores_df.to_markdown(index=False)) # | dim | valid_top | sent1_score | sent2_score | # |------:|:------------|--------------:|--------------:| # | 768 | True | 0.520329 | 0.00295128 | # | 256 | True | 0.556088 | -0.017764 | # | 48 | True | 0.586194 | -0.110691 | # | 16 | True | 0.606462 | -0.331682 | # | 8 | True | 0.689649 | -0.359202 | ``` #### [+] Message to Intent-Name Mapping ```python query = "أرغب في حجز تذكرة طيران من دبي الى القاهرة يوم الثلاثاء القادم" sentence_1 = "حجز رحلة" sentence_2 = "إلغاء حجز" scores = [] for dim in [768, 256, 48, 16, 8]: query_embedding = model.encode(query)[:dim] sent1_score = cos_sim(query_embedding, model.encode(sentence_1)[:dim])[0][0].tolist() sent2_score = cos_sim(query_embedding, model.encode(sentence_2)[:dim])[0][0].tolist() scores.append({ "dim": dim, "valid_top": sent1_score > sent2_score, "sent1_score": sent1_score, "sent2_score": sent2_score, }) scores_df = pd.DataFrame(scores) print(scores_df.to_markdown(index=False)) # | dim | valid_top | sent1_score | sent2_score | # |------:|:------------|--------------:|--------------:| # | 768 | True | 0.476535 | 0.221451 | # | 256 | True | 0.392701 | 0.224967 | # | 48 | True | 0.316223 | 0.0210683 | # | 16 | False | -0.0242871 | 0.0250766 | # | 8 | True | -0.215241 | -0.258904 | ``` ## Training Details We curated a dataset 
[silma-ai/silma-arabic-triplets-dataset-v1.0](https://huggingface.co/datasets/silma-ai/silma-arabic-triplets-dataset-v1.0) which contains more than `2.25M` records of (anchor, positive and negative) Arabic/English samples. Only the first `600` samples were taken to be the `eval` dataset, while the rest were used for fine-tuning. This produced a fine-tuned `Matryoshka` model based on [aubmindlab/bert-base-arabertv02](https://huggingface.co/aubmindlab/bert-base-arabertv02) with the following hyperparameters: - `per_device_train_batch_size`: 250 - `per_device_eval_batch_size`: 10 - `learning_rate`: 1e-05 - `num_train_epochs`: 3 - `bf16`: True - `dataloader_drop_last`: True - `optim`: adamw_torch_fused - `batch_sampler`: no_duplicates **[training script](https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/matryoshka/matryoshka_sts.py)** ### Framework Versions - Python: 3.10.14 - Sentence Transformers: 3.2.0 - Transformers: 4.45.2 - PyTorch: 2.3.1 - Accelerate: 1.0.1 - Datasets: 3.0.1 - Tokenizers: 0.20.1 ### Full Model Architecture ``` SentenceTransformer( (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True}) ) ``` ### Citation: #### BibTeX: ```bibtex @misc{silma2024embedding, author = {Abu Bakr Soliman, Karim Ouda, SILMA AI}, title = {SILMA Embedding Matryoshka 0.1}, year = {2024}, publisher = {Hugging Face}, howpublished = {\url{https://huggingface.co/silma-ai/silma-embeddding-matryoshka-0.1}}, } ``` #### APA: ```apa Abu Bakr Soliman, Karim Ouda, SILMA AI. (2024). SILMA Embedding Matryoshka 0.1 [Model]. Hugging Face. 
https://huggingface.co/silma-ai/silma-embeddding-matryoshka-0.1 ``` #### Sentence Transformers ```bibtex @inproceedings{reimers-2019-sentence-bert, title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks", author = "Reimers, Nils and Gurevych, Iryna", booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing", month = "11", year = "2019", publisher = "Association for Computational Linguistics", url = "https://arxiv.org/abs/1908.10084", } ``` #### MatryoshkaLoss ```bibtex @misc{kusupati2024matryoshka, title={Matryoshka Representation Learning}, author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi}, year={2024}, eprint={2205.13147}, archivePrefix={arXiv}, primaryClass={cs.LG} } ``` #### MultipleNegativesRankingLoss ```bibtex @misc{henderson2017efficient, title={Efficient Natural Language Response Suggestion for Smart Reply}, author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil}, year={2017}, eprint={1705.00652}, archivePrefix={arXiv}, primaryClass={cs.CL} } ```