|
import gradio as gr |
|
import pandas as pd |
|
from css_html_js import custom_css |
|
|
|
# HTML heading for the leaderboard page (passed to gr.HTML when the UI is built).
TITLE = """<h1 align="center" id="space-title">🇹🇭 Thai Sentence Embedding Leaderboard</h1>"""
|
|
|
# Markdown shown below the title: what the leaderboard tracks, which datasets
# are evaluated, and which metric is reported for each task.
INTRODUCTION_TEXT = """
📐 The 🇹🇭 Thai Sentence Embedding Leaderboard aims to track, rank and evaluate open embedding models on Thai sentence embedding tasks. Source code for evaluation at https://github.com/mrpeerat/Thai-Sentence-Vector-Benchmark, feel free to submit your own score at https://huggingface.co/spaces/panuthept/thai_sentence_embedding_benchmark/discussions.

## Dataset

The evaluation is conducted on 4 tasks across 8 datasets:

1. Semantic Textual Similarity (STS)
   - Translated STS-B, contains 1,379 test samples, https://github.com/mrpeerat/Thai-Sentence-Vector-Benchmark
2. Text Classification
   - Wisesight, contains 2,671 test samples, https://huggingface.co/datasets/pythainlp/wisesight_sentiment
   - Wongnai, contains 6,203 test samples, https://huggingface.co/datasets/Wongnai/wongnai_reviews
   - Generated Review, contains 17,453 test samples, https://huggingface.co/datasets/airesearch/generated_reviews_enth
3. Pair Classification
   - XNLI (Thai only), contains 3,340 test samples, https://github.com/facebookresearch/XNLI
4. Retrieval
   - XQuAD (Thai only), contains 1,190 test samples, https://huggingface.co/datasets/google/xquad
   - MIRACL (Thai only), contains 733 test samples, https://huggingface.co/datasets/miracl/miracl
   - TyDiQA (Thai only), contains 763 test samples, https://huggingface.co/datasets/chompk/tydiqa-goldp-th

## Metrics

The evaluation metrics for each task are as follows:

1. STS: Spearman’s Rank Correlation
2. Text Classification: F1 Score
3. Pair Classification: Average Precision
4. Retrieval: MRR@10
"""
|
|
|
# Raw leaderboard rows, one dict per model.
#
# Every row carries the same eight columns. 'Model Name' is either a markdown
# link or a plain name; 'Model Size (Million Parameters)' is an int, or the
# string "N/A" for the rows where no size is given.
#
# NOTE: the 'Average (8 datasets)' literals below are overwritten later in this
# file, where the column is recomputed from the four task-level scores.
results = [
    {
        'Model Name': '[XLMR-base](https://huggingface.co/FacebookAI/xlm-roberta-base)',
        'Model Size (Million Parameters)': 279,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 37.95,
        'STS (1 datasets)': 44.48,
        'Classification (3 datasets)': 58.42,
        'PairClassification (1 datasets)': 57.62,
        'Retrieval (3 datasets)': 5.57,
    },
    {
        'Model Name': '[XLMR-large](https://huggingface.co/FacebookAI/xlm-roberta-large)',
        'Model Size (Million Parameters)': 561,
        'Embedding Dimensions': 1024,
        'Average (8 datasets)': 38.59,
        'STS (1 datasets)': 38.31,
        'Classification (3 datasets)': 59.51,
        'PairClassification (1 datasets)': 54.56,
        'Retrieval (3 datasets)': 11.80,
    },
    {
        'Model Name': '[WangchanBERTa](https://huggingface.co/airesearch/wangchanberta-base-att-spm-uncased)',
        'Model Size (Million Parameters)': 106,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 36.34,
        'STS (1 datasets)': 21.32,
        'Classification (3 datasets)': 55.46,
        'PairClassification (1 datasets)': 52.96,
        'Retrieval (3 datasets)': 19.49,
    },
    {
        'Model Name': '[PhayaThaiBERT](https://huggingface.co/clicknext/phayathaibert)',
        'Model Size (Million Parameters)': 278,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 55.38,
        'STS (1 datasets)': 51.56,
        'Classification (3 datasets)': 59.90,
        'PairClassification (1 datasets)': 59.67,
        'Retrieval (3 datasets)': 56.31,
    },
    {
        'Model Name': '[MPNet-multilingual](https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2)',
        'Model Size (Million Parameters)': 278,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 66.14,
        'STS (1 datasets)': 80.49,
        'Classification (3 datasets)': 56.89,
        'PairClassification (1 datasets)': 84.14,
        'Retrieval (3 datasets)': 64.13,
    },
    {
        'Model Name': '[DistilUSE-multilingual](https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2)',
        'Model Size (Million Parameters)': 135,
        'Embedding Dimensions': 512,
        'Average (8 datasets)': 51.45,
        'STS (1 datasets)': 65.37,
        'Classification (3 datasets)': 50.93,
        'PairClassification (1 datasets)': 65.94,
        'Retrieval (3 datasets)': 42.72,
    },
    {
        'Model Name': '[BGE-M3](https://huggingface.co/BAAI/bge-m3)',
        'Model Size (Million Parameters)': 570,
        'Embedding Dimensions': 1024,
        'Average (8 datasets)': 75.64,
        'STS (1 datasets)': 77.22,
        'Classification (3 datasets)': 59.95,
        'PairClassification (1 datasets)': 79.02,
        'Retrieval (3 datasets)': 91.42,
    },
    {
        'Model Name': '[SimCSE-XLMR-base](https://huggingface.co/kornwtp/simcse-model-XLMR)',
        'Model Size (Million Parameters)': 279,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 53.83,
        'STS (1 datasets)': 63.98,
        'Classification (3 datasets)': 49.44,
        'PairClassification (1 datasets)': 61.87,
        'Retrieval (3 datasets)': 54.17,
    },
    {
        'Model Name': '[SimCSE-WangchanBERTa](https://huggingface.co/kornwtp/simcse-model-wangchanberta)',
        'Model Size (Million Parameters)': 106,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 54.01,
        'STS (1 datasets)': 60.73,
        'Classification (3 datasets)': 56.71,
        'PairClassification (1 datasets)': 59.14,
        'Retrieval (3 datasets)': 51.05,
    },
    {
        'Model Name': '[SimCSE-PhayaThaiBERT](https://huggingface.co/kornwtp/simcse-model-phayathaibert)',
        'Model Size (Million Parameters)': 278,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 60.02,
        'STS (1 datasets)': 67.82,
        'Classification (3 datasets)': 53.50,
        'PairClassification (1 datasets)': 63.35,
        'Retrieval (3 datasets)': 66.05,
    },
    {
        'Model Name': '[SCT-XLMR-base](https://huggingface.co/kornwtp/SCT-model-XLMR)',
        'Model Size (Million Parameters)': 279,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 57.69,
        'STS (1 datasets)': 68.91,
        'Classification (3 datasets)': 55.93,
        'PairClassification (1 datasets)': 66.49,
        'Retrieval (3 datasets)': 54.90,
    },
    {
        'Model Name': '[SCT-WangchanBERTa](https://huggingface.co/kornwtp/SCT-model-wangchanberta)',
        'Model Size (Million Parameters)': 106,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 62.22,
        'STS (1 datasets)': 71.35,
        'Classification (3 datasets)': 59.19,
        'PairClassification (1 datasets)': 67.04,
        'Retrieval (3 datasets)': 63.83,
    },
    {
        'Model Name': '[SCT-PhayaThaiBERT](https://huggingface.co/kornwtp/SCT-model-phayathaibert)',
        'Model Size (Million Parameters)': 278,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 63.28,
        'STS (1 datasets)': 74.08,
        'Classification (3 datasets)': 58.77,
        'PairClassification (1 datasets)': 65.87,
        'Retrieval (3 datasets)': 66.20,
    },
    {
        'Model Name': '[SCT-KD-XLMR-base](https://huggingface.co/kornwtp/SCT-KD-model-XLMR)',
        'Model Size (Million Parameters)': 279,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 65.37,
        'STS (1 datasets)': 78.78,
        'Classification (3 datasets)': 56.87,
        'PairClassification (1 datasets)': 79.78,
        'Retrieval (3 datasets)': 65.02,
    },
    {
        'Model Name': '[SCT-KD-WangchanBERTa](https://huggingface.co/kornwtp/SCT-KD-model-wangchanberta)',
        'Model Size (Million Parameters)': 106,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 63.55,
        'STS (1 datasets)': 77.77,
        'Classification (3 datasets)': 56.33,
        'PairClassification (1 datasets)': 77.04,
        'Retrieval (3 datasets)': 62.38,
    },
    {
        'Model Name': '[SCT-KD-PhayaThaiBERT](https://huggingface.co/kornwtp/SCT-KD-model-phayathaibert)',
        'Model Size (Million Parameters)': 278,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 66.00,
        'STS (1 datasets)': 77.80,
        'Classification (3 datasets)': 57.27,
        'PairClassification (1 datasets)': 77.84,
        'Retrieval (3 datasets)': 67.94,
    },
    {
        'Model Name': '[ConGen-XLMR-base](https://huggingface.co/kornwtp/ConGen-model-XLMR)',
        'Model Size (Million Parameters)': 279,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 66.84,
        'STS (1 datasets)': 79.69,
        'Classification (3 datasets)': 56.90,
        'PairClassification (1 datasets)': 81.47,
        'Retrieval (3 datasets)': 68.03,
    },
    {
        'Model Name': '[ConGen-WangchanBERTa](https://huggingface.co/kornwtp/ConGen-model-wangchanberta)',
        'Model Size (Million Parameters)': 106,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 67.17,
        'STS (1 datasets)': 78.78,
        'Classification (3 datasets)': 58.16,
        'PairClassification (1 datasets)': 82.43,
        'Retrieval (3 datasets)': 67.66,
    },
    {
        'Model Name': '[ConGen-PhayaThaiBERT](https://huggingface.co/kornwtp/ConGen-model-phayathaibert)',
        'Model Size (Million Parameters)': 278,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 66.94,
        'STS (1 datasets)': 78.90,
        'Classification (3 datasets)': 57.63,
        'PairClassification (1 datasets)': 81.01,
        'Retrieval (3 datasets)': 68.04,
    },
    {
        'Model Name': '[E5-Mistral-7B-Instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct)',
        'Model Size (Million Parameters)': 7110,
        'Embedding Dimensions': 4096,
        'Average (8 datasets)': 71.94,
        'STS (1 datasets)': 75.52,
        'Classification (3 datasets)': 60.46,
        'PairClassification (1 datasets)': 68.04,
        'Retrieval (3 datasets)': 86.80,
    },
    {
        'Model Name': '[gte-Qwen2-7B-Instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct)',
        'Model Size (Million Parameters)': 7610,
        'Embedding Dimensions': 3584,
        'Average (8 datasets)': 49.31,
        'STS (1 datasets)': 51.60,
        'Classification (3 datasets)': 57.55,
        'PairClassification (1 datasets)': 61.73,
        'Retrieval (3 datasets)': 38.31,
    },
    {
        'Model Name': '[GritLM-7B](https://huggingface.co/GritLM/GritLM-7B)',
        'Model Size (Million Parameters)': 7240,
        'Embedding Dimensions': 4096,
        'Average (8 datasets)': 42.38,
        'STS (1 datasets)': 45.50,
        'Classification (3 datasets)': 56.83,
        'PairClassification (1 datasets)': 56.40,
        'Retrieval (3 datasets)': 22.79,
    },
    {
        'Model Name': '[Llama3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)',
        'Model Size (Million Parameters)': 8030,
        'Embedding Dimensions': 4096,
        'Average (8 datasets)': 51.63,
        'STS (1 datasets)': 49.48,
        'Classification (3 datasets)': 58.54,
        'PairClassification (1 datasets)': 57.76,
        'Retrieval (3 datasets)': 47.93,
    },
    {
        'Model Name': '[Llama3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)',
        'Model Size (Million Parameters)': 8030,
        'Embedding Dimensions': 4096,
        'Average (8 datasets)': 52.81,
        'STS (1 datasets)': 50.63,
        'Classification (3 datasets)': 58.85,
        'PairClassification (1 datasets)': 58.04,
        'Retrieval (3 datasets)': 50.38,
    },
    {
        'Model Name': '[Llama3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B)',
        'Model Size (Million Parameters)': 8030,
        'Embedding Dimensions': 4096,
        'Average (8 datasets)': 50.36,
        'STS (1 datasets)': 49.98,
        'Classification (3 datasets)': 58.18,
        'PairClassification (1 datasets)': 58.12,
        'Retrieval (3 datasets)': 43.64,
    },
    {
        'Model Name': '[Llama3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)',
        'Model Size (Million Parameters)': 8030,
        'Embedding Dimensions': 4096,
        'Average (8 datasets)': 50.06,
        'STS (1 datasets)': 49.76,
        'Classification (3 datasets)': 57.90,
        'PairClassification (1 datasets)': 57.47,
        'Retrieval (3 datasets)': 43.63,
    },
    {
        'Model Name': '[Typhoon-8B-Instruct](https://huggingface.co/scb10x/llama-3-typhoon-v1.5-8b-instruct)',
        'Model Size (Million Parameters)': 8030,
        'Embedding Dimensions': 4096,
        'Average (8 datasets)': 53.51,
        'STS (1 datasets)': 51.46,
        'Classification (3 datasets)': 58.91,
        'PairClassification (1 datasets)': 58.05,
        'Retrieval (3 datasets)': 52.65,
    },
    {
        'Model Name': 'Cohere-embed-multilingual-v2.0',
        'Model Size (Million Parameters)': "N/A",
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 68.01,
        'STS (1 datasets)': 68.03,
        'Classification (3 datasets)': 57.31,
        'PairClassification (1 datasets)': 62.03,
        'Retrieval (3 datasets)': 85.23,
    },
    {
        'Model Name': 'Cohere-embed-multilingual-v3.0',
        'Model Size (Million Parameters)': "N/A",
        'Embedding Dimensions': 1024,
        'Average (8 datasets)': 74.86,
        'STS (1 datasets)': 77.87,
        'Classification (3 datasets)': 59.96,
        'PairClassification (1 datasets)': 73.28,
        'Retrieval (3 datasets)': 91.43,
    },
    {
        'Model Name': 'Openai-text-embedding-3-large',
        'Model Size (Million Parameters)': "N/A",
        'Embedding Dimensions': 3072,
        'Average (8 datasets)': 69.26,
        'STS (1 datasets)': 70.46,
        'Classification (3 datasets)': 58.79,
        'PairClassification (1 datasets)': 67.33,
        'Retrieval (3 datasets)': 83.87,
    },
]
|
|
|
|
|
# Task-level score columns that feed the displayed average.
_TASK_SCORE_COLUMNS = (
    'STS (1 datasets)',
    'Classification (3 datasets)',
    'PairClassification (1 datasets)',
    'Retrieval (3 datasets)',
)

# Overwrite each row's 'Average (8 datasets)' with the unweighted mean of its
# task-level scores, rounded to two decimals. Note this is a mean over the four
# task columns, not a dataset-weighted mean, despite the column name. A task
# score missing from a row counts as 0.
results = [
    {
        **result,
        'Average (8 datasets)': round(
            sum(result.get(column, 0) for column in _TASK_SCORE_COLUMNS)
            / len(_TASK_SCORE_COLUMNS),
            2,
        ),
    }
    for result in results
]

# Highest recomputed average first.
results = sorted(results, key=lambda row: row['Average (8 datasets)'], reverse=True)

# Tabular form of the ranked rows, handed to the UI table.
data = pd.DataFrame(results)
|
|
|
# Build the page: title, introduction text, then the leaderboard table.
with gr.Blocks(css=custom_css) as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    # The 'markdown' datatype is applied to the table; presumably so the
    # "[name](url)" model names render as links — confirm in the running app.
    gr.DataFrame(data, datatype='markdown')

# Start the app as soon as the script runs.
demo.launch()