|
--- |
|
base_model: aubmindlab/bert-base-arabertv02 |
|
library_name: sentence-transformers |
|
metrics: |
|
- pearson_cosine |
|
- spearman_cosine |
|
- pearson_manhattan |
|
- spearman_manhattan |
|
- pearson_euclidean |
|
- spearman_euclidean |
|
- pearson_dot |
|
- spearman_dot |
|
- pearson_max |
|
- spearman_max |
|
pipeline_tag: sentence-similarity |
|
tags: |
|
- sentence-transformers |
|
- sentence-similarity |
|
- feature-extraction |
|
- generated_from_trainer |
|
- loss:CosineSimilarityLoss |
|
- mteb |
|
model-index: |
|
- name: silma-ai/silma-embeddding-matryoshka-v0.1 |
|
results: |
|
- dataset: |
|
config: ar |
|
name: MTEB MassiveIntentClassification (ar) |
|
revision: 4672e20407010da34463acc759c162ca9734bca6 |
|
split: test |
|
type: mteb/amazon_massive_intent |
|
metrics: |
|
- type: accuracy |
|
value: 56.445864156018835 |
|
- type: f1 |
|
value: 53.58282538318122 |
|
- type: f1_weighted |
|
value: 56.821808211639315 |
|
- type: main_score |
|
value: 56.445864156018835 |
|
task: |
|
type: Classification |
|
- dataset: |
|
config: en |
|
name: MTEB MassiveIntentClassification (en) |
|
revision: 4672e20407010da34463acc759c162ca9734bca6 |
|
split: test |
|
type: mteb/amazon_massive_intent |
|
metrics: |
|
- type: accuracy |
|
value: 47.40080699394754 |
|
- type: f1 |
|
value: 44.729286773524755 |
|
- type: f1_weighted |
|
value: 47.83506683571795 |
|
- type: main_score |
|
value: 47.40080699394754 |
|
task: |
|
type: Classification |
|
- dataset: |
|
config: ar |
|
name: MTEB MassiveIntentClassification (ar) |
|
revision: 4672e20407010da34463acc759c162ca9734bca6 |
|
split: validation |
|
type: mteb/amazon_massive_intent |
|
metrics: |
|
- type: accuracy |
|
value: 56.97983275946876 |
|
- type: f1 |
|
value: 53.809263807080086 |
|
- type: f1_weighted |
|
value: 57.14993215193604 |
|
- type: main_score |
|
value: 56.97983275946876 |
|
task: |
|
type: Classification |
|
- dataset: |
|
config: en |
|
name: MTEB MassiveIntentClassification (en) |
|
revision: 4672e20407010da34463acc759c162ca9734bca6 |
|
split: validation |
|
type: mteb/amazon_massive_intent |
|
metrics: |
|
- type: accuracy |
|
value: 47.683226758485006 |
|
- type: f1 |
|
value: 44.905317333393775 |
|
- type: f1_weighted |
|
value: 48.051379514830195 |
|
- type: main_score |
|
value: 47.683226758485006 |
|
task: |
|
type: Classification |
|
- dataset: |
|
config: ar |
|
name: MTEB MassiveScenarioClassification (ar) |
|
revision: fad2c6e8459f9e1c45d9315f4953d921437d70f8 |
|
split: test |
|
type: mteb/amazon_massive_scenario |
|
metrics: |
|
- type: accuracy |
|
value: 63.31876260928042 |
|
- type: f1 |
|
value: 63.197056314678754 |
|
- type: f1_weighted |
|
value: 62.7166315473092 |
|
- type: main_score |
|
value: 63.31876260928042 |
|
task: |
|
type: Classification |
|
- dataset: |
|
config: en |
|
name: MTEB MassiveScenarioClassification (en) |
|
revision: fad2c6e8459f9e1c45d9315f4953d921437d70f8 |
|
split: test |
|
type: mteb/amazon_massive_scenario |
|
metrics: |
|
- type: accuracy |
|
value: 53.35574983187627 |
|
- type: f1 |
|
value: 50.35837223252574 |
|
- type: f1_weighted |
|
value: 54.11644042208904 |
|
- type: main_score |
|
value: 53.35574983187627 |
|
task: |
|
type: Classification |
|
- dataset: |
|
config: ar |
|
name: MTEB MassiveScenarioClassification (ar) |
|
revision: fad2c6e8459f9e1c45d9315f4953d921437d70f8 |
|
split: validation |
|
type: mteb/amazon_massive_scenario |
|
metrics: |
|
- type: accuracy |
|
value: 62.26758484997541 |
|
- type: f1 |
|
value: 62.477928166560325 |
|
- type: f1_weighted |
|
value: 61.92238394647396 |
|
- type: main_score |
|
value: 62.26758484997541 |
|
task: |
|
type: Classification |
|
- dataset: |
|
config: en |
|
name: MTEB MassiveScenarioClassification (en) |
|
revision: fad2c6e8459f9e1c45d9315f4953d921437d70f8 |
|
split: validation |
|
type: mteb/amazon_massive_scenario |
|
metrics: |
|
- type: accuracy |
|
value: 52.62174126906049 |
|
- type: f1 |
|
value: 50.470501485026716 |
|
- type: f1_weighted |
|
value: 53.16459392827557 |
|
- type: main_score |
|
value: 52.62174126906049 |
|
task: |
|
type: Classification |
|
- dataset: |
|
config: en-en |
|
name: MTEB STS17 (en-en) |
|
revision: faeb762787bd10488a50c8b5be4a3b82e411949c |
|
split: test |
|
type: mteb/sts17-crosslingual-sts |
|
metrics: |
|
- type: cosine_pearson |
|
value: 74.33941506827517 |
|
- type: cosine_spearman |
|
value: 74.42197838273297 |
|
- type: euclidean_pearson |
|
value: 75.33836191339782 |
|
- type: euclidean_spearman |
|
value: 74.37385193453852 |
|
- type: main_score |
|
value: 74.42197838273297 |
|
- type: manhattan_pearson |
|
value: 75.41881517194568 |
|
- type: manhattan_spearman |
|
value: 74.47237277057877 |
|
- type: pearson |
|
value: 74.33941645999855 |
|
- type: spearman |
|
value: 74.42197838273297 |
|
task: |
|
type: STS |
|
- dataset: |
|
config: nl-en |
|
name: MTEB STS17 (nl-en) |
|
revision: faeb762787bd10488a50c8b5be4a3b82e411949c |
|
split: test |
|
type: mteb/sts17-crosslingual-sts |
|
metrics: |
|
- type: cosine_pearson |
|
value: 31.84872826199112 |
|
- type: cosine_spearman |
|
value: 32.22496230755917 |
|
- type: euclidean_pearson |
|
value: 21.830860533929688 |
|
- type: euclidean_spearman |
|
value: 21.38205815348658 |
|
- type: main_score |
|
value: 32.22496230755917 |
|
- type: manhattan_pearson |
|
value: 21.852430479395576 |
|
- type: manhattan_spearman |
|
value: 21.37848326556159 |
|
- type: pearson |
|
value: 31.84872485436001 |
|
- type: spearman |
|
value: 32.22496230755917 |
|
task: |
|
type: STS |
|
- dataset: |
|
config: en-ar |
|
name: MTEB STS17 (en-ar) |
|
revision: faeb762787bd10488a50c8b5be4a3b82e411949c |
|
split: test |
|
type: mteb/sts17-crosslingual-sts |
|
metrics: |
|
- type: cosine_pearson |
|
value: 43.37529327788584 |
|
- type: cosine_spearman |
|
value: 42.763149514327225 |
|
- type: euclidean_pearson |
|
value: 39.625411905897394 |
|
- type: euclidean_spearman |
|
value: 39.26727199746294 |
|
- type: main_score |
|
value: 42.763149514327225 |
|
- type: manhattan_pearson |
|
value: 40.49857681486655 |
|
- type: manhattan_spearman |
|
value: 40.63669314166475 |
|
- type: pearson |
|
value: 43.37529078998193 |
|
- type: spearman |
|
value: 42.763149514327225 |
|
task: |
|
type: STS |
|
- dataset: |
|
config: en-tr |
|
name: MTEB STS17 (en-tr) |
|
revision: faeb762787bd10488a50c8b5be4a3b82e411949c |
|
split: test |
|
type: mteb/sts17-crosslingual-sts |
|
metrics: |
|
- type: cosine_pearson |
|
value: 17.16722415938186 |
|
- type: cosine_spearman |
|
value: 15.590330355526344 |
|
- type: euclidean_pearson |
|
value: 4.430499555984906 |
|
- type: euclidean_spearman |
|
value: 2.729050802084264 |
|
- type: main_score |
|
value: 15.590330355526344 |
|
- type: manhattan_pearson |
|
value: 2.805408490135879 |
|
- type: manhattan_spearman |
|
value: 1.5237347692119627 |
|
- type: pearson |
|
value: 17.167228709176676 |
|
- type: spearman |
|
value: 15.590330355526344 |
|
task: |
|
type: STS |
|
- dataset: |
|
config: fr-en |
|
name: MTEB STS17 (fr-en) |
|
revision: faeb762787bd10488a50c8b5be4a3b82e411949c |
|
split: test |
|
type: mteb/sts17-crosslingual-sts |
|
metrics: |
|
- type: cosine_pearson |
|
value: 36.093945717347395 |
|
- type: cosine_spearman |
|
value: 37.33997345407934 |
|
- type: euclidean_pearson |
|
value: 23.156103022485055 |
|
- type: euclidean_spearman |
|
value: 20.62925594786342 |
|
- type: main_score |
|
value: 37.33997345407934 |
|
- type: manhattan_pearson |
|
value: 22.035024322719813 |
|
- type: manhattan_spearman |
|
value: 19.147522562438795 |
|
- type: pearson |
|
value: 36.09395175426761 |
|
- type: spearman |
|
value: 37.33997345407934 |
|
task: |
|
type: STS |
|
- dataset: |
|
config: en-de |
|
name: MTEB STS17 (en-de) |
|
revision: faeb762787bd10488a50c8b5be4a3b82e411949c |
|
split: test |
|
type: mteb/sts17-crosslingual-sts |
|
metrics: |
|
- type: cosine_pearson |
|
value: 29.064411455563 |
|
- type: cosine_spearman |
|
value: 29.232781114344697 |
|
- type: euclidean_pearson |
|
value: 16.90458086330736 |
|
- type: euclidean_spearman |
|
value: 17.462020565289887 |
|
- type: main_score |
|
value: 29.232781114344697 |
|
- type: manhattan_pearson |
|
value: 16.882446230243286 |
|
- type: manhattan_spearman |
|
value: 17.06144091941576 |
|
- type: pearson |
|
value: 29.06441922605839 |
|
- type: spearman |
|
value: 29.232781114344697 |
|
task: |
|
type: STS |
|
- dataset: |
|
config: es-en |
|
name: MTEB STS17 (es-en) |
|
revision: faeb762787bd10488a50c8b5be4a3b82e411949c |
|
split: test |
|
type: mteb/sts17-crosslingual-sts |
|
metrics: |
|
- type: cosine_pearson |
|
value: 27.686316587339473 |
|
- type: cosine_spearman |
|
value: 28.650995973102205 |
|
- type: euclidean_pearson |
|
value: 12.954885279630565 |
|
- type: euclidean_spearman |
|
value: 11.970815927480198 |
|
- type: main_score |
|
value: 28.650995973102205 |
|
- type: manhattan_pearson |
|
value: 12.079730127474948 |
|
- type: manhattan_spearman |
|
value: 10.606967901984147 |
|
- type: pearson |
|
value: 27.68631836666537 |
|
- type: spearman |
|
value: 28.650995973102205 |
|
task: |
|
type: STS |
|
- dataset: |
|
config: ar-ar |
|
name: MTEB STS17 (ar-ar) |
|
revision: faeb762787bd10488a50c8b5be4a3b82e411949c |
|
split: test |
|
type: mteb/sts17-crosslingual-sts |
|
metrics: |
|
- type: cosine_pearson |
|
value: 84.12612492708037 |
|
- type: cosine_spearman |
|
value: 84.24703763883515 |
|
- type: euclidean_pearson |
|
value: 81.38085140113648 |
|
- type: euclidean_spearman |
|
value: 83.17403450502965 |
|
- type: main_score |
|
value: 84.24703763883515 |
|
- type: manhattan_pearson |
|
value: 81.18466522597414 |
|
- type: manhattan_spearman |
|
value: 82.61184409962614 |
|
- type: pearson |
|
value: 84.12612546419625 |
|
- type: spearman |
|
value: 84.25077492152536 |
|
task: |
|
type: STS |
|
- dataset: |
|
config: it-en |
|
name: MTEB STS17 (it-en) |
|
revision: faeb762787bd10488a50c8b5be4a3b82e411949c |
|
split: test |
|
type: mteb/sts17-crosslingual-sts |
|
metrics: |
|
- type: cosine_pearson |
|
value: 27.697680546701868 |
|
- type: cosine_spearman |
|
value: 25.19277336255784 |
|
- type: euclidean_pearson |
|
value: 13.964798090314115 |
|
- type: euclidean_spearman |
|
value: 10.512169361528596 |
|
- type: main_score |
|
value: 25.19277336255784 |
|
- type: manhattan_pearson |
|
value: 13.537525485694433 |
|
- type: manhattan_spearman |
|
value: 10.334001560105834 |
|
- type: pearson |
|
value: 27.697681880242325 |
|
- type: spearman |
|
value: 25.19277336255784 |
|
task: |
|
type: STS |
|
- dataset: |
|
config: de-en |
|
name: MTEB STS22.v2 (de-en) |
|
revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd |
|
split: test |
|
type: mteb/sts22-crosslingual-sts |
|
metrics: |
|
- type: cosine_pearson |
|
value: 32.87548760760924 |
|
- type: cosine_spearman |
|
value: 30.69782036694315 |
|
- type: euclidean_pearson |
|
value: 29.925045225262142 |
|
- type: euclidean_spearman |
|
value: 34.076021250318334 |
|
- type: main_score |
|
value: 30.69782036694315 |
|
- type: manhattan_pearson |
|
value: 30.815090565180945 |
|
- type: manhattan_spearman |
|
value: 34.91615861045259 |
|
- type: pearson |
|
value: 32.8754813614174 |
|
- type: spearman |
|
value: 30.69782036694315 |
|
task: |
|
type: STS |
|
- dataset: |
|
config: zh-en |
|
name: MTEB STS22.v2 (zh-en) |
|
revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd |
|
split: test |
|
type: mteb/sts22-crosslingual-sts |
|
metrics: |
|
- type: cosine_pearson |
|
value: 23.93269292232737 |
|
- type: cosine_spearman |
|
value: 16.781461291066496 |
|
- type: euclidean_pearson |
|
value: 20.87679825681155 |
|
- type: euclidean_spearman |
|
value: 13.764510796592536 |
|
- type: main_score |
|
value: 16.781461291066496 |
|
- type: manhattan_pearson |
|
value: 23.416430850444588 |
|
- type: manhattan_spearman |
|
value: 17.10405713909058 |
|
- type: pearson |
|
value: 23.932682034899777 |
|
- type: spearman |
|
value: 16.781461291066496 |
|
task: |
|
type: STS |
|
- dataset: |
|
config: ar |
|
name: MTEB STS22.v2 (ar) |
|
revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd |
|
split: test |
|
type: mteb/sts22-crosslingual-sts |
|
metrics: |
|
- type: cosine_pearson |
|
value: 51.73784691362425 |
|
- type: cosine_spearman |
|
value: 60.01035490847343 |
|
- type: euclidean_pearson |
|
value: 52.717195602630305 |
|
- type: euclidean_spearman |
|
value: 60.22164097529916 |
|
- type: main_score |
|
value: 60.01035490847343 |
|
- type: manhattan_pearson |
|
value: 53.04979941729716 |
|
- type: manhattan_spearman |
|
value: 60.393100473647706 |
|
- type: pearson |
|
value: 51.73784381247053 |
|
- type: spearman |
|
value: 60.020906672817276 |
|
task: |
|
type: STS |
|
- dataset: |
|
config: es-en |
|
name: MTEB STS22.v2 (es-en) |
|
revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd |
|
split: test |
|
type: mteb/sts22-crosslingual-sts |
|
metrics: |
|
- type: cosine_pearson |
|
value: 47.917244237624864 |
|
- type: cosine_spearman |
|
value: 53.23173373821509 |
|
- type: euclidean_pearson |
|
value: 48.172861539004636 |
|
- type: euclidean_spearman |
|
value: 53.32970069145014 |
|
- type: main_score |
|
value: 53.23173373821509 |
|
- type: manhattan_pearson |
|
value: 48.163716825216646 |
|
- type: manhattan_spearman |
|
value: 53.77963871495307 |
|
- type: pearson |
|
value: 47.91724405724847 |
|
- type: spearman |
|
value: 53.23173373821509 |
|
task: |
|
type: STS |
|
- dataset: |
|
config: pl-en |
|
name: MTEB STS22.v2 (pl-en) |
|
revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd |
|
split: test |
|
type: mteb/sts22-crosslingual-sts |
|
metrics: |
|
- type: cosine_pearson |
|
value: 43.66748993183993 |
|
- type: cosine_spearman |
|
value: 38.518248671828594 |
|
- type: euclidean_pearson |
|
value: 50.475058499541134 |
|
- type: euclidean_spearman |
|
value: 44.76070858743843 |
|
- type: main_score |
|
value: 38.518248671828594 |
|
- type: manhattan_pearson |
|
value: 50.576185727010014 |
|
- type: manhattan_spearman |
|
value: 45.5306304403841 |
|
- type: pearson |
|
value: 43.66750472144702 |
|
- type: spearman |
|
value: 38.518248671828594 |
|
task: |
|
type: STS |
|
- dataset: |
|
config: en |
|
name: MTEB STS22.v2 (en) |
|
revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd |
|
split: test |
|
type: mteb/sts22-crosslingual-sts |
|
metrics: |
|
- type: cosine_pearson |
|
value: 56.41373213565263 |
|
- type: cosine_spearman |
|
value: 59.03774516602592 |
|
- type: euclidean_pearson |
|
value: 54.173092638047294 |
|
- type: euclidean_spearman |
|
value: 59.130444355085885 |
|
- type: main_score |
|
value: 59.03774516602592 |
|
- type: manhattan_pearson |
|
value: 54.18950361517434 |
|
- type: manhattan_spearman |
|
value: 58.78927227383971 |
|
- type: pearson |
|
value: 56.413733329868045 |
|
- type: spearman |
|
value: 59.03774516602592 |
|
task: |
|
type: STS |
|
license: apache-2.0 |
|
language: |
|
- ar |
|
- en |
|
--- |
|
|
|
# SILMA Arabic Matryoshka Embedding Model 0.1 |
|
|
|
The **SILMA Arabic Matryoshka Embedding Model 0.1** is an advanced Arabic text embedding model designed to produce powerful, contextually rich representations of text, |
|
facilitating a wide range of applications, from semantic search to document classification. |
|
|
|
This model leverages the innovative **Matryoshka** Embedding technique, which lets you truncate embeddings to different dimensions in order to balance the trade-offs between speed, storage, and accuracy. |
|
|
|
## Usage |
|
|
|
### Direct Usage (Sentence Transformers) |
|
|
|
First, install the Sentence Transformers library: |
|
|
|
```bash |
|
pip install -U sentence-transformers |
|
``` |
|
|
|
Then load the model: |
|
|
|
```python |
|
from sentence_transformers import SentenceTransformer |
|
from sentence_transformers.util import cos_sim |
|
import pandas as pd |
|
|
|
model_name = "silma-ai/silma-embeddding-matryoshka-v0.1" |
|
model = SentenceTransformer(model_name) |
|
``` |
|
|
|
### Samples |
|
|
|
Using Matryoshka, you can specify the first `(n)` dimensions to represent each text. |
|
|
|
In the following samples, you can check how the number of dimensions affects the `cosine similarity` between a query and the two inputs. |
|
|
|
You can notice that in most cases, even a very low dimension (e.g. 8) can produce acceptable semantic similarity scores. |
|
|
|
#### [+] Short Sentence Similarity |
|
|
|
```python |
|
query = "الطقس اليوم مشمس" |
|
sentence_1 = "الجو اليوم كان مشمسًا ورائعًا" |
|
sentence_2 = "الطقس اليوم غائم" |
|
|
|
scores = [] |
|
for dim in [768, 256, 48, 16, 8]: |
|
|
|
query_embedding = model.encode(query)[:dim] |
|
|
|
sent1_score = cos_sim(query_embedding, model.encode(sentence_1)[:dim])[0][0].tolist() |
|
sent2_score = cos_sim(query_embedding, model.encode(sentence_2)[:dim])[0][0].tolist() |
|
|
|
scores.append({ |
|
"dim": dim, |
|
"valid_top": sent1_score > sent2_score, |
|
"sent1_score": sent1_score, |
|
"sent2_score": sent2_score, |
|
}) |
|
|
|
scores_df = pd.DataFrame(scores) |
|
print(scores_df.to_markdown(index=False)) |
|
|
|
# | dim | valid_top | sent1_score | sent2_score | |
|
# |------:|:------------|--------------:|--------------:| |
|
# | 768 | True | 0.479942 | 0.233572 | |
|
# | 256 | True | 0.509289 | 0.208452 | |
|
# | 48 | True | 0.598825 | 0.191677 | |
|
# | 16 | True | 0.917707 | 0.458854 | |
|
# | 8 | True | 0.948563 | 0.675662 | |
|
|
|
``` |
|
|
|
#### [+] Long Sentence Similarity |
|
|
|
```python |
|
query = "الكتاب يتحدث عن أهمية الذكاء الاصطناعي في تطوير المجتمعات الحديثة" |
|
sentence_1 = "في هذا الكتاب، يناقش الكاتب كيف يمكن للتكنولوجيا أن تغير العالم" |
|
sentence_2 = "الكاتب يتحدث عن أساليب الطبخ التقليدية في دول البحر الأبيض المتوسط" |
|
|
|
scores = [] |
|
for dim in [768, 256, 48, 16, 8]: |
|
|
|
query_embedding = model.encode(query)[:dim] |
|
|
|
sent1_score = cos_sim(query_embedding, model.encode(sentence_1)[:dim])[0][0].tolist() |
|
sent2_score = cos_sim(query_embedding, model.encode(sentence_2)[:dim])[0][0].tolist() |
|
|
|
scores.append({ |
|
"dim": dim, |
|
"valid_top": sent1_score > sent2_score, |
|
"sent1_score": sent1_score, |
|
"sent2_score": sent2_score, |
|
}) |
|
|
|
scores_df = pd.DataFrame(scores) |
|
print(scores_df.to_markdown(index=False)) |
|
|
|
# | dim | valid_top | sent1_score | sent2_score | |
|
# |------:|:------------|--------------:|--------------:| |
|
# | 768 | True | 0.637418 | 0.262693 | |
|
# | 256 | True | 0.614761 | 0.268267 | |
|
# | 48 | True | 0.758887 | 0.384649 | |
|
# | 16 | True | 0.885737 | 0.204213 | |
|
# | 8 | True | 0.918684 | 0.146478 | |
|
``` |
|
|
|
#### [+] Question to Paragraph Matching |
|
|
|
```python |
|
query = "ما هي فوائد ممارسة الرياضة؟" |
|
sentence_1 = "ممارسة الرياضة بشكل منتظم تساعد على تحسين الصحة العامة واللياقة البدنية" |
|
sentence_2 = "تعليم الأطفال في سن مبكرة يساعدهم على تطوير المهارات العقلية بسرعة" |
|
|
|
scores = [] |
|
for dim in [768, 256, 48, 16, 8]: |
|
|
|
query_embedding = model.encode(query)[:dim] |
|
|
|
sent1_score = cos_sim(query_embedding, model.encode(sentence_1)[:dim])[0][0].tolist() |
|
sent2_score = cos_sim(query_embedding, model.encode(sentence_2)[:dim])[0][0].tolist() |
|
|
|
scores.append({ |
|
"dim": dim, |
|
"valid_top": sent1_score > sent2_score, |
|
"sent1_score": sent1_score, |
|
"sent2_score": sent2_score, |
|
}) |
|
|
|
scores_df = pd.DataFrame(scores) |
|
print(scores_df.to_markdown(index=False)) |
|
|
|
# | dim | valid_top | sent1_score | sent2_score | |
|
# |------:|:------------|--------------:|--------------:| |
|
# | 768 | True | 0.520329 | 0.00295128 | |
|
# | 256 | True | 0.556088 | -0.017764 | |
|
# | 48 | True | 0.586194 | -0.110691 | |
|
# | 16 | True | 0.606462 | -0.331682 | |
|
# | 8 | True | 0.689649 | -0.359202 | |
|
``` |
|
|
|
#### [+] Message to Intent-Name Mapping |
|
|
|
```python |
|
query = "أرغب في حجز تذكرة طيران من دبي الى القاهرة يوم الثلاثاء القادم" |
|
sentence_1 = "حجز رحلة" |
|
sentence_2 = "إلغاء حجز" |
|
|
|
scores = [] |
|
for dim in [768, 256, 48, 16, 8]: |
|
|
|
query_embedding = model.encode(query)[:dim] |
|
|
|
sent1_score = cos_sim(query_embedding, model.encode(sentence_1)[:dim])[0][0].tolist() |
|
sent2_score = cos_sim(query_embedding, model.encode(sentence_2)[:dim])[0][0].tolist() |
|
|
|
scores.append({ |
|
"dim": dim, |
|
"valid_top": sent1_score > sent2_score, |
|
"sent1_score": sent1_score, |
|
"sent2_score": sent2_score, |
|
}) |
|
|
|
scores_df = pd.DataFrame(scores) |
|
print(scores_df.to_markdown(index=False)) |
|
|
|
# | dim | valid_top | sent1_score | sent2_score | |
|
# |------:|:------------|--------------:|--------------:| |
|
# | 768 | True | 0.476535 | 0.221451 | |
|
# | 256 | True | 0.392701 | 0.224967 | |
|
# | 48 | True | 0.316223 | 0.0210683 | |
|
# | 16 | False | -0.0242871 | 0.0250766 | |
|
# | 8 | True | -0.215241 | -0.258904 | |
|
``` |
|
|
|
## Training Details |
|
|
|
We curated a dataset [silma-ai/silma-arabic-triplets-dataset-v1.0](https://huggingface.co/datasets/silma-ai/silma-arabic-triplets-dataset-v1.0) which |
|
contains more than `2.25M` records of (anchor, positive and negative) Arabic/English samples. |
|
Only the first `600` samples were taken to be the `eval` dataset, while the rest were used for fine-tuning. |
|
|
|
This produced a finetuned `Matryoshka` model based on [aubmindlab/bert-base-arabertv02](https://huggingface.co/aubmindlab/bert-base-arabertv02) with the following hyperparameters: |
|
|
|
- `per_device_train_batch_size`: 250 |
|
- `per_device_eval_batch_size`: 10 |
|
- `learning_rate`: 1e-05 |
|
- `num_train_epochs`: 3 |
|
- `bf16`: True |
|
- `dataloader_drop_last`: True |
|
- `optim`: adamw_torch_fused |
|
- `batch_sampler`: no_duplicates |
|
|
|
**[training script](https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/matryoshka/matryoshka_sts.py)** |
|
|
|
### Framework Versions |
|
- Python: 3.10.14 |
|
- Sentence Transformers: 3.2.0 |
|
- Transformers: 4.45.2 |
|
- PyTorch: 2.3.1 |
|
- Accelerate: 1.0.1 |
|
- Datasets: 3.0.1 |
|
- Tokenizers: 0.20.1 |
|
|
|
### Full Model Architecture |
|
|
|
``` |
|
SentenceTransformer( |
|
(0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel |
|
(1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True}) |
|
) |
|
``` |
|
|
|
### Citation: |
|
|
|
#### BibTeX: |
|
|
|
```bibtex |
|
@misc{silma2024embedding, |
|
author = {Abu Bakr Soliman and Karim Ouda and {SILMA AI}}, |
|
title = {SILMA Embedding Matryoshka 0.1}, |
|
year = {2024}, |
|
publisher = {Hugging Face}, |
|
howpublished = {\url{https://huggingface.co/silma-ai/silma-embeddding-matryoshka-0.1}}, |
|
} |
|
``` |
|
|
|
#### APA: |
|
|
|
```apa |
|
Abu Bakr Soliman, Karim Ouda, SILMA AI. (2024). SILMA Embedding Matryoshka 0.1 [Model]. Hugging Face. https://huggingface.co/silma-ai/silma-embeddding-matryoshka-0.1 |
|
``` |
|
|
|
#### Sentence Transformers |
|
```bibtex |
|
@inproceedings{reimers-2019-sentence-bert, |
|
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks", |
|
author = "Reimers, Nils and Gurevych, Iryna", |
|
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing", |
|
month = "11", |
|
year = "2019", |
|
publisher = "Association for Computational Linguistics", |
|
url = "https://arxiv.org/abs/1908.10084", |
|
} |
|
``` |
|
|
|
#### MatryoshkaLoss |
|
```bibtex |
|
@misc{kusupati2024matryoshka, |
|
title={Matryoshka Representation Learning}, |
|
author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi}, |
|
year={2024}, |
|
eprint={2205.13147}, |
|
archivePrefix={arXiv}, |
|
primaryClass={cs.LG} |
|
} |
|
``` |
|
|
|
#### MultipleNegativesRankingLoss |
|
```bibtex |
|
@misc{henderson2017efficient, |
|
title={Efficient Natural Language Response Suggestion for Smart Reply}, |
|
author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil}, |
|
year={2017}, |
|
eprint={1705.00652}, |
|
archivePrefix={arXiv}, |
|
primaryClass={cs.CL} |
|
} |
|
``` |
|
|
|
<!-- |
|
## Glossary |
|
|
|
*Clearly define terms in order to be accessible across audiences.* |
|
--> |
|
|
|
<!-- |
|
## Model Card Authors |
|
|
|
*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.* |
|
--> |
|
|
|
<!-- |
|
## Model Card Contact |
|
|
|
*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.* |
|
--> |