base_model: aubmindlab/bert-base-arabertv02
library_name: sentence-transformers
metrics:
- pearson_cosine
- spearman_cosine
- pearson_manhattan
- spearman_manhattan
- pearson_euclidean
- spearman_euclidean
- pearson_dot
- spearman_dot
- pearson_max
- spearman_max
pipeline_tag: sentence-similarity
tags:
- sentence-transformers
- sentence-similarity
- feature-extraction
- generated_from_trainer
- loss:CosineSimilarityLoss
- mteb
model-index:
- name: silma-ai/silma-embeddding-matryoshka-v0.1
results:
- dataset:
config: ar
name: MTEB MassiveIntentClassification (ar)
revision: 4672e20407010da34463acc759c162ca9734bca6
split: test
type: mteb/amazon_massive_intent
metrics:
- type: accuracy
value: 56.445864156018835
- type: f1
value: 53.58282538318122
- type: f1_weighted
value: 56.821808211639315
- type: main_score
value: 56.445864156018835
task:
type: Classification
- dataset:
config: en
name: MTEB MassiveIntentClassification (en)
revision: 4672e20407010da34463acc759c162ca9734bca6
split: test
type: mteb/amazon_massive_intent
metrics:
- type: accuracy
value: 47.40080699394754
- type: f1
value: 44.729286773524755
- type: f1_weighted
value: 47.83506683571795
- type: main_score
value: 47.40080699394754
task:
type: Classification
- dataset:
config: ar
name: MTEB MassiveIntentClassification (ar)
revision: 4672e20407010da34463acc759c162ca9734bca6
split: validation
type: mteb/amazon_massive_intent
metrics:
- type: accuracy
value: 56.97983275946876
- type: f1
value: 53.809263807080086
- type: f1_weighted
value: 57.14993215193604
- type: main_score
value: 56.97983275946876
task:
type: Classification
- dataset:
config: en
name: MTEB MassiveIntentClassification (en)
revision: 4672e20407010da34463acc759c162ca9734bca6
split: validation
type: mteb/amazon_massive_intent
metrics:
- type: accuracy
value: 47.683226758485006
- type: f1
value: 44.905317333393775
- type: f1_weighted
value: 48.051379514830195
- type: main_score
value: 47.683226758485006
task:
type: Classification
- dataset:
config: ar
name: MTEB MassiveScenarioClassification (ar)
revision: fad2c6e8459f9e1c45d9315f4953d921437d70f8
split: test
type: mteb/amazon_massive_scenario
metrics:
- type: accuracy
value: 63.31876260928042
- type: f1
value: 63.197056314678754
- type: f1_weighted
value: 62.7166315473092
- type: main_score
value: 63.31876260928042
task:
type: Classification
- dataset:
config: en
name: MTEB MassiveScenarioClassification (en)
revision: fad2c6e8459f9e1c45d9315f4953d921437d70f8
split: test
type: mteb/amazon_massive_scenario
metrics:
- type: accuracy
value: 53.35574983187627
- type: f1
value: 50.35837223252574
- type: f1_weighted
value: 54.11644042208904
- type: main_score
value: 53.35574983187627
task:
type: Classification
- dataset:
config: ar
name: MTEB MassiveScenarioClassification (ar)
revision: fad2c6e8459f9e1c45d9315f4953d921437d70f8
split: validation
type: mteb/amazon_massive_scenario
metrics:
- type: accuracy
value: 62.26758484997541
- type: f1
value: 62.477928166560325
- type: f1_weighted
value: 61.92238394647396
- type: main_score
value: 62.26758484997541
task:
type: Classification
- dataset:
config: en
name: MTEB MassiveScenarioClassification (en)
revision: fad2c6e8459f9e1c45d9315f4953d921437d70f8
split: validation
type: mteb/amazon_massive_scenario
metrics:
- type: accuracy
value: 52.62174126906049
- type: f1
value: 50.470501485026716
- type: f1_weighted
value: 53.16459392827557
- type: main_score
value: 52.62174126906049
task:
type: Classification
- dataset:
config: en-en
name: MTEB STS17 (en-en)
revision: faeb762787bd10488a50c8b5be4a3b82e411949c
split: test
type: mteb/sts17-crosslingual-sts
metrics:
- type: cosine_pearson
value: 74.33941506827517
- type: cosine_spearman
value: 74.42197838273297
- type: euclidean_pearson
value: 75.33836191339782
- type: euclidean_spearman
value: 74.37385193453852
- type: main_score
value: 74.42197838273297
- type: manhattan_pearson
value: 75.41881517194568
- type: manhattan_spearman
value: 74.47237277057877
- type: pearson
value: 74.33941645999855
- type: spearman
value: 74.42197838273297
task:
type: STS
- dataset:
config: nl-en
name: MTEB STS17 (nl-en)
revision: faeb762787bd10488a50c8b5be4a3b82e411949c
split: test
type: mteb/sts17-crosslingual-sts
metrics:
- type: cosine_pearson
value: 31.84872826199112
- type: cosine_spearman
value: 32.22496230755917
- type: euclidean_pearson
value: 21.830860533929688
- type: euclidean_spearman
value: 21.38205815348658
- type: main_score
value: 32.22496230755917
- type: manhattan_pearson
value: 21.852430479395576
- type: manhattan_spearman
value: 21.37848326556159
- type: pearson
value: 31.84872485436001
- type: spearman
value: 32.22496230755917
task:
type: STS
- dataset:
config: en-ar
name: MTEB STS17 (en-ar)
revision: faeb762787bd10488a50c8b5be4a3b82e411949c
split: test
type: mteb/sts17-crosslingual-sts
metrics:
- type: cosine_pearson
value: 43.37529327788584
- type: cosine_spearman
value: 42.763149514327225
- type: euclidean_pearson
value: 39.625411905897394
- type: euclidean_spearman
value: 39.26727199746294
- type: main_score
value: 42.763149514327225
- type: manhattan_pearson
value: 40.49857681486655
- type: manhattan_spearman
value: 40.63669314166475
- type: pearson
value: 43.37529078998193
- type: spearman
value: 42.763149514327225
task:
type: STS
- dataset:
config: en-tr
name: MTEB STS17 (en-tr)
revision: faeb762787bd10488a50c8b5be4a3b82e411949c
split: test
type: mteb/sts17-crosslingual-sts
metrics:
- type: cosine_pearson
value: 17.16722415938186
- type: cosine_spearman
value: 15.590330355526344
- type: euclidean_pearson
value: 4.430499555984906
- type: euclidean_spearman
value: 2.729050802084264
- type: main_score
value: 15.590330355526344
- type: manhattan_pearson
value: 2.805408490135879
- type: manhattan_spearman
value: 1.5237347692119627
- type: pearson
value: 17.167228709176676
- type: spearman
value: 15.590330355526344
task:
type: STS
- dataset:
config: fr-en
name: MTEB STS17 (fr-en)
revision: faeb762787bd10488a50c8b5be4a3b82e411949c
split: test
type: mteb/sts17-crosslingual-sts
metrics:
- type: cosine_pearson
value: 36.093945717347395
- type: cosine_spearman
value: 37.33997345407934
- type: euclidean_pearson
value: 23.156103022485055
- type: euclidean_spearman
value: 20.62925594786342
- type: main_score
value: 37.33997345407934
- type: manhattan_pearson
value: 22.035024322719813
- type: manhattan_spearman
value: 19.147522562438795
- type: pearson
value: 36.09395175426761
- type: spearman
value: 37.33997345407934
task:
type: STS
- dataset:
config: en-de
name: MTEB STS17 (en-de)
revision: faeb762787bd10488a50c8b5be4a3b82e411949c
split: test
type: mteb/sts17-crosslingual-sts
metrics:
- type: cosine_pearson
value: 29.064411455563
- type: cosine_spearman
value: 29.232781114344697
- type: euclidean_pearson
value: 16.90458086330736
- type: euclidean_spearman
value: 17.462020565289887
- type: main_score
value: 29.232781114344697
- type: manhattan_pearson
value: 16.882446230243286
- type: manhattan_spearman
value: 17.06144091941576
- type: pearson
value: 29.06441922605839
- type: spearman
value: 29.232781114344697
task:
type: STS
- dataset:
config: es-en
name: MTEB STS17 (es-en)
revision: faeb762787bd10488a50c8b5be4a3b82e411949c
split: test
type: mteb/sts17-crosslingual-sts
metrics:
- type: cosine_pearson
value: 27.686316587339473
- type: cosine_spearman
value: 28.650995973102205
- type: euclidean_pearson
value: 12.954885279630565
- type: euclidean_spearman
value: 11.970815927480198
- type: main_score
value: 28.650995973102205
- type: manhattan_pearson
value: 12.079730127474948
- type: manhattan_spearman
value: 10.606967901984147
- type: pearson
value: 27.68631836666537
- type: spearman
value: 28.650995973102205
task:
type: STS
- dataset:
config: ar-ar
name: MTEB STS17 (ar-ar)
revision: faeb762787bd10488a50c8b5be4a3b82e411949c
split: test
type: mteb/sts17-crosslingual-sts
metrics:
- type: cosine_pearson
value: 84.12612492708037
- type: cosine_spearman
value: 84.24703763883515
- type: euclidean_pearson
value: 81.38085140113648
- type: euclidean_spearman
value: 83.17403450502965
- type: main_score
value: 84.24703763883515
- type: manhattan_pearson
value: 81.18466522597414
- type: manhattan_spearman
value: 82.61184409962614
- type: pearson
value: 84.12612546419625
- type: spearman
value: 84.25077492152536
task:
type: STS
- dataset:
config: it-en
name: MTEB STS17 (it-en)
revision: faeb762787bd10488a50c8b5be4a3b82e411949c
split: test
type: mteb/sts17-crosslingual-sts
metrics:
- type: cosine_pearson
value: 27.697680546701868
- type: cosine_spearman
value: 25.19277336255784
- type: euclidean_pearson
value: 13.964798090314115
- type: euclidean_spearman
value: 10.512169361528596
- type: main_score
value: 25.19277336255784
- type: manhattan_pearson
value: 13.537525485694433
- type: manhattan_spearman
value: 10.334001560105834
- type: pearson
value: 27.697681880242325
- type: spearman
value: 25.19277336255784
task:
type: STS
- dataset:
config: de-en
name: MTEB STS22.v2 (de-en)
revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd
split: test
type: mteb/sts22-crosslingual-sts
metrics:
- type: cosine_pearson
value: 32.87548760760924
- type: cosine_spearman
value: 30.69782036694315
- type: euclidean_pearson
value: 29.925045225262142
- type: euclidean_spearman
value: 34.076021250318334
- type: main_score
value: 30.69782036694315
- type: manhattan_pearson
value: 30.815090565180945
- type: manhattan_spearman
value: 34.91615861045259
- type: pearson
value: 32.8754813614174
- type: spearman
value: 30.69782036694315
task:
type: STS
- dataset:
config: zh-en
name: MTEB STS22.v2 (zh-en)
revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd
split: test
type: mteb/sts22-crosslingual-sts
metrics:
- type: cosine_pearson
value: 23.93269292232737
- type: cosine_spearman
value: 16.781461291066496
- type: euclidean_pearson
value: 20.87679825681155
- type: euclidean_spearman
value: 13.764510796592536
- type: main_score
value: 16.781461291066496
- type: manhattan_pearson
value: 23.416430850444588
- type: manhattan_spearman
value: 17.10405713909058
- type: pearson
value: 23.932682034899777
- type: spearman
value: 16.781461291066496
task:
type: STS
- dataset:
config: ar
name: MTEB STS22.v2 (ar)
revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd
split: test
type: mteb/sts22-crosslingual-sts
metrics:
- type: cosine_pearson
value: 51.73784691362425
- type: cosine_spearman
value: 60.01035490847343
- type: euclidean_pearson
value: 52.717195602630305
- type: euclidean_spearman
value: 60.22164097529916
- type: main_score
value: 60.01035490847343
- type: manhattan_pearson
value: 53.04979941729716
- type: manhattan_spearman
value: 60.393100473647706
- type: pearson
value: 51.73784381247053
- type: spearman
value: 60.020906672817276
task:
type: STS
- dataset:
config: es-en
name: MTEB STS22.v2 (es-en)
revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd
split: test
type: mteb/sts22-crosslingual-sts
metrics:
- type: cosine_pearson
value: 47.917244237624864
- type: cosine_spearman
value: 53.23173373821509
- type: euclidean_pearson
value: 48.172861539004636
- type: euclidean_spearman
value: 53.32970069145014
- type: main_score
value: 53.23173373821509
- type: manhattan_pearson
value: 48.163716825216646
- type: manhattan_spearman
value: 53.77963871495307
- type: pearson
value: 47.91724405724847
- type: spearman
value: 53.23173373821509
task:
type: STS
- dataset:
config: pl-en
name: MTEB STS22.v2 (pl-en)
revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd
split: test
type: mteb/sts22-crosslingual-sts
metrics:
- type: cosine_pearson
value: 43.66748993183993
- type: cosine_spearman
value: 38.518248671828594
- type: euclidean_pearson
value: 50.475058499541134
- type: euclidean_spearman
value: 44.76070858743843
- type: main_score
value: 38.518248671828594
- type: manhattan_pearson
value: 50.576185727010014
- type: manhattan_spearman
value: 45.5306304403841
- type: pearson
value: 43.66750472144702
- type: spearman
value: 38.518248671828594
task:
type: STS
- dataset:
config: en
name: MTEB STS22.v2 (en)
revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd
split: test
type: mteb/sts22-crosslingual-sts
metrics:
- type: cosine_pearson
value: 56.41373213565263
- type: cosine_spearman
value: 59.03774516602592
- type: euclidean_pearson
value: 54.173092638047294
- type: euclidean_spearman
value: 59.130444355085885
- type: main_score
value: 59.03774516602592
- type: manhattan_pearson
value: 54.18950361517434
- type: manhattan_spearman
value: 58.78927227383971
- type: pearson
value: 56.413733329868045
- type: spearman
value: 59.03774516602592
task:
type: STS
license: apache-2.0
language:
- ar
- en
SILMA Arabic Matryoshka Embedding Model 0.1
The SILMA Arabic Matryoshka Embedding Model 0.1 is an advanced Arabic text embedding model designed to produce powerful, contextually rich representations of text, facilitating a wide range of applications, from semantic search to document classification.
This model leverages the innovative Matryoshka Embedding technique which can be used in different dimensions to optimize the speed, storage, and accuracy trade-offs.
Usage
Direct Usage (Sentence Transformers)
First, install the Sentence Transformers library:
pip install -U sentence-transformers
Then load the model
# Load the embedding model and the helpers used in the samples below.
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import pandas as pd

# NOTE(review): "embeddding" (triple "d") is how the published repo id is
# actually spelled — do not "correct" it, or the download will 404.
model_name = "silma-ai/silma-embeddding-matryoshka-0.1"
model = SentenceTransformer(model_name)
Samples
Using Matryoshka, you can specify the first `n` dimensions to represent each text.
In the following samples, you can check how each dimension affects the cosine similarity
between a query and the two inputs.
You can notice that, in most cases, even a very low dimension (e.g. 8) can produce acceptable semantic similarity scores.
[+] Short Sentence Similarity
query = "الطقس اليوم مشمس"
sentence_1 = "الجو اليوم كان مشمسًا ورائعًا"
sentence_2 = "الطقس اليوم غائم"

# Score the query against both sentences at several Matryoshka truncation sizes.
rows = []
for dims in [768, 256, 48, 16, 8]:
    truncated_query = model.encode(query)[:dims]
    score_1 = cos_sim(truncated_query, model.encode(sentence_1)[:dims])[0][0].tolist()
    score_2 = cos_sim(truncated_query, model.encode(sentence_2)[:dims])[0][0].tolist()
    rows.append({
        "dim": dims,
        # The semantically closer sentence should keep the higher score.
        "valid_top": score_1 > score_2,
        "sent1_score": score_1,
        "sent2_score": score_2,
    })

print(pd.DataFrame(rows).to_markdown(index=False))
# | dim | valid_top | sent1_score | sent2_score |
# |------:|:------------|--------------:|--------------:|
# | 768 | True | 0.479942 | 0.233572 |
# | 256 | True | 0.509289 | 0.208452 |
# | 48 | True | 0.598825 | 0.191677 |
# | 16 | True | 0.917707 | 0.458854 |
# | 8 | True | 0.948563 | 0.675662 |
[+] Long Sentence Similarity
query = "الكتاب يتحدث عن أهمية الذكاء الاصطناعي في تطوير المجتمعات الحديثة"
sentence_1 = "في هذا الكتاب، يناقش الكاتب كيف يمكن للتكنولوجيا أن تغير العالم"
sentence_2 = "الكاتب يتحدث عن أساليب الطبخ التقليدية في دول البحر الأبيض المتوسط"

# Score the query against both sentences at several Matryoshka truncation sizes.
rows = []
for dims in [768, 256, 48, 16, 8]:
    truncated_query = model.encode(query)[:dims]
    score_1 = cos_sim(truncated_query, model.encode(sentence_1)[:dims])[0][0].tolist()
    score_2 = cos_sim(truncated_query, model.encode(sentence_2)[:dims])[0][0].tolist()
    rows.append({
        "dim": dims,
        # The semantically closer sentence should keep the higher score.
        "valid_top": score_1 > score_2,
        "sent1_score": score_1,
        "sent2_score": score_2,
    })

print(pd.DataFrame(rows).to_markdown(index=False))
# | dim | valid_top | sent1_score | sent2_score |
# |------:|:------------|--------------:|--------------:|
# | 768 | True | 0.637418 | 0.262693 |
# | 256 | True | 0.614761 | 0.268267 |
# | 48 | True | 0.758887 | 0.384649 |
# | 16 | True | 0.885737 | 0.204213 |
# | 8 | True | 0.918684 | 0.146478 |
[+] Question to Paragraph Matching
query = "ما هي فوائد ممارسة الرياضة؟"
sentence_1 = "ممارسة الرياضة بشكل منتظم تساعد على تحسين الصحة العامة واللياقة البدنية"
sentence_2 = "تعليم الأطفال في سن مبكرة يساعدهم على تطوير المهارات العقلية بسرعة"

# Score the question against both paragraphs at several Matryoshka truncation sizes.
rows = []
for dims in [768, 256, 48, 16, 8]:
    truncated_query = model.encode(query)[:dims]
    score_1 = cos_sim(truncated_query, model.encode(sentence_1)[:dims])[0][0].tolist()
    score_2 = cos_sim(truncated_query, model.encode(sentence_2)[:dims])[0][0].tolist()
    rows.append({
        "dim": dims,
        # The relevant paragraph should keep the higher score.
        "valid_top": score_1 > score_2,
        "sent1_score": score_1,
        "sent2_score": score_2,
    })

print(pd.DataFrame(rows).to_markdown(index=False))
# | dim | valid_top | sent1_score | sent2_score |
# |------:|:------------|--------------:|--------------:|
# | 768 | True | 0.520329 | 0.00295128 |
# | 256 | True | 0.556088 | -0.017764 |
# | 48 | True | 0.586194 | -0.110691 |
# | 16 | True | 0.606462 | -0.331682 |
# | 8 | True | 0.689649 | -0.359202 |
[+] Message to Intent-Name Mapping
query = "أرغب في حجز تذكرة طيران من دبي الى القاهرة يوم الثلاثاء القادم"
sentence_1 = "حجز رحلة"
sentence_2 = "إلغاء حجز"

# Score the message against both intent names at several Matryoshka truncation sizes.
rows = []
for dims in [768, 256, 48, 16, 8]:
    truncated_query = model.encode(query)[:dims]
    score_1 = cos_sim(truncated_query, model.encode(sentence_1)[:dims])[0][0].tolist()
    score_2 = cos_sim(truncated_query, model.encode(sentence_2)[:dims])[0][0].tolist()
    rows.append({
        "dim": dims,
        # The matching intent should keep the higher score (fails here at dim=16).
        "valid_top": score_1 > score_2,
        "sent1_score": score_1,
        "sent2_score": score_2,
    })

print(pd.DataFrame(rows).to_markdown(index=False))
# | dim | valid_top | sent1_score | sent2_score |
# |------:|:------------|--------------:|--------------:|
# | 768 | True | 0.476535 | 0.221451 |
# | 256 | True | 0.392701 | 0.224967 |
# | 48 | True | 0.316223 | 0.0210683 |
# | 16 | False | -0.0242871 | 0.0250766 |
# | 8 | True | -0.215241 | -0.258904 |
Training Details
We curated the dataset silma-ai/silma-arabic-triplets-dataset-v1.0, which contains more than 2.25M records of (anchor, positive, negative) Arabic/English samples.
Only the first 600 samples were taken as the eval dataset, while the rest were used for fine-tuning.
This produced a finetuned Matryoshka
model based on aubmindlab/bert-base-arabertv02 with the following hyperparameters:
- per_device_train_batch_size: 250
- per_device_eval_batch_size: 10
- learning_rate: 1e-05
- num_train_epochs: 3
- bf16: True
- dataloader_drop_last: True
- optim: adamw_torch_fused
- batch_sampler: no_duplicates
Framework Versions
- Python: 3.10.14
- Sentence Transformers: 3.2.0
- Transformers: 4.45.2
- PyTorch: 2.3.1
- Accelerate: 1.0.1
- Datasets: 3.0.1
- Tokenizers: 0.20.1
Full Model Architecture
SentenceTransformer(
(0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel
(1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)
Citation:
BibTeX:
@misc{silma2024embedding,
author = {Abu Bakr Soliman and Karim Ouda and {SILMA AI}},
title = {SILMA Embedding Matryoshka 0.1},
year = {2024},
publisher = {Hugging Face},
howpublished = {\url{https://huggingface.co/silma-ai/silma-embeddding-matryoshka-0.1}},
}
APA:
Abu Bakr Soliman, Karim Ouda, SILMA AI. (2024). SILMA Embedding Matryoshka STS 0.1 [Model]. Hugging Face. https://huggingface.co/silma-ai/silma-embeddding-matryoshka-0.1
Sentence Transformers
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
MatryoshkaLoss
@misc{kusupati2024matryoshka,
title={Matryoshka Representation Learning},
author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
year={2024},
eprint={2205.13147},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
MultipleNegativesRankingLoss
@misc{henderson2017efficient,
title={Efficient Natural Language Response Suggestion for Smart Reply},
author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
year={2017},
eprint={1705.00652},
archivePrefix={arXiv},
primaryClass={cs.CL}
}