Commit 28e88f2
Parent(s): 9d7aae7
implement evaluation and fix bugs

Changed files:
- app.py +20 -26
- pyproject.toml +1 -0
- src/envs.py +8 -6
- src/evaluator/evaluate.py +97 -53
- src/leaderboard/read_evals.py +2 -1
- src/submission/submit.py +128 -3
app.py
CHANGED

@@ -26,7 +26,9 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
-from src.evaluator.evaluate import
+from src.evaluator.evaluate import process_evaluation_queue
+import threading
+import time


 def restart_space():

@@ -49,6 +51,23 @@ except Exception:
     restart_space()


+# Start evaluator service in a separate thread
+def run_evaluator():
+    print("Starting evaluator service...")
+    while True:
+        try:
+            process_evaluation_queue()
+            print("Evaluation queue processed. Sleeping for 5 minutes...")
+            time.sleep(300)  # Sleep for 5 minutes
+        except Exception as e:
+            print(f"Error in evaluation process: {e}")
+            print("Retrying in 5 minutes...")
+            time.sleep(300)
+
+# Start evaluator in a separate thread
+evaluator_thread = threading.Thread(target=run_evaluator, daemon=True)
+evaluator_thread.start()
+
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)

 (

@@ -125,31 +144,6 @@ with demo:
            gr.Markdown(LLM_BENCHMARKS_TEXT)
            gr.Markdown(EVALUATION_QUEUE_TEXT)

-        with gr.TabItem("🚀 Evaluate Model", elem_id="evaluate-tab", id=3):
-            with gr.Row():
-                model_name = gr.Textbox(label="Model Name")
-                revision = gr.Textbox(label="Revision", value="main")
-            with gr.Row():
-                precision = gr.Dropdown(
-                    choices=[p.value for p in Precision],
-                    label="Precision",
-                    value="fp32"
-                )
-                weight_type = gr.Dropdown(
-                    choices=[w.value for w in WeightType],
-                    label="Weight Type",
-                    value="pytorch"
-                )
-            evaluate_button = gr.Button("Evaluate Model")
-            status_output = gr.Textbox(label="Evaluation Status")
-
-            evaluate_button.click(
-                fn=evaluate_and_update,
-                inputs=[model_name, revision, precision, weight_type],
-                outputs=[status_output]
-            )
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
            with gr.Column():
                with gr.Row():
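The evaluator wired into app.py above is a plain daemon-thread polling loop. Below is a minimal standalone sketch of that pattern, assuming nothing from this repo: process_queue is a hypothetical stand-in for process_evaluation_queue, and the short interval is only for demonstration.

import threading
import time


def process_queue():
    # Hypothetical stand-in for src.evaluator.evaluate.process_evaluation_queue.
    print("checking queue...")


def run_worker(interval_s: float = 5.0):
    # Poll forever; errors are logged and the loop keeps going, as in app.py.
    while True:
        try:
            process_queue()
        except Exception as e:
            print(f"worker error: {e}")
        time.sleep(interval_s)


worker = threading.Thread(target=run_worker, daemon=True)
worker.start()
time.sleep(12)  # stand-in for the Gradio app keeping the main thread alive

Because the thread is created with daemon=True it never keeps the process alive on its own; in the Space, the Gradio app is what keeps the main thread running.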
pyproject.toml
CHANGED

@@ -18,6 +18,7 @@ dependencies = [
     "numpy>=2.3.1",
     "pandas>=2.3.0",
     "python-dateutil>=2.9.0.post0",
+    "scikit-learn>=1.7.0",
     "sentencepiece>=0.2.0",
     "tokenizers>=0.15.0",
     "torch>=2.7.1",
src/envs.py
CHANGED

@@ -14,12 +14,14 @@ QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"

 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
-
 # Local caches
-EVAL_REQUESTS_PATH =
-EVAL_RESULTS_PATH =
-EVAL_REQUESTS_PATH_BACKEND =
-EVAL_RESULTS_PATH_BACKEND =
+EVAL_REQUESTS_PATH = "./eval-queue"
+EVAL_RESULTS_PATH = "./eval-results"
+EVAL_REQUESTS_PATH_BACKEND = "./eval-queue-bk"
+EVAL_RESULTS_PATH_BACKEND = "./eval-results-bk"
+
+# Create directories if they don't exist
+for path in [EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND]:
+    os.makedirs(path, exist_ok=True)

 API = HfApi(token=TOKEN)
src/evaluator/evaluate.py
CHANGED

@@ -3,7 +3,7 @@ import os
 from typing import Dict, Any
 from dataclasses import dataclass
 from enum import Enum
-
+from datetime import datetime
 import torch
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 from datasets import load_dataset

@@ -28,54 +28,63 @@ class EvaluationResult:

 def evaluate_tsac_sentiment(model, tokenizer, device):
     """Evaluate model on TSAC sentiment analysis task"""
-…
-    for batch in dataset:
-        inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
-        label = batch['label'].to(device)
-…
+    try:
+        dataset = load_dataset("fbougares/tsac", split="train")
+
+        def preprocess(examples):
+            return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+        dataset = dataset.map(preprocess, batched=True)
+        dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+
+        model.eval()
+        with torch.no_grad():
+            predictions = []
+            labels = []
+
+            for batch in dataset:
+                inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
+                label = batch['label'].to(device)
+
+                outputs = model(**inputs)
+                predictions.extend(outputs.logits.argmax(dim=-1).cpu().tolist())
+                labels.extend(label.cpu().tolist())
+
+        accuracy = sum(p == l for p, l in zip(predictions, labels)) / len(predictions)
+        return accuracy
+    except Exception as e:
+        print(f"Error in TSAC evaluation: {str(e)}")
+        return 0.0

 def evaluate_tunisian_corpus_coverage(model, tokenizer):
     """Evaluate model's coverage on Tunisian Dialect Corpus"""
-…
+    try:
+        dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="train")
+
+        def preprocess(examples):
+            return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+        dataset = dataset.map(preprocess, batched=True)
+
+        # Calculate coverage based on tokenization
+        total_tokens = 0
+        covered_tokens = 0
+
+        for example in dataset:
+            tokens = tokenizer.tokenize(example['text'])
+            total_tokens += len(tokens)
+            covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
+
+        coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
+        return coverage
+    except Exception as e:
+        print(f"Error in Tunisian Corpus evaluation: {str(e)}")
+        return 0.0

 def evaluate_model(model_name: str, revision: str, precision: str, weight_type: str) -> EvaluationResult:
     """Evaluate a single model on all tasks"""
     try:
+        print(f"------------ evaluation model {model_name}")
         # Load model and tokenizer
         device = "cuda" if torch.cuda.is_available() else "cpu"

@@ -119,18 +128,23 @@ def evaluate_model(model_name: str, revision: str, precision: str, weight_type:

 def process_evaluation_queue():
     """Process all pending evaluations in the queue"""
-    # Get all pending evaluations
+    # Get all pending evaluations (including nested directories)
     queue_dir = os.path.join(EVAL_REQUESTS_PATH)
-    pending_files = [
+    pending_files = []
+
+    # Walk through the directory tree
+    for root, dirs, files in os.walk(queue_dir):
+        pending_files.extend([os.path.join(root, f) for f in files if f.endswith('.json')])

-    for
-        file_path = os.path.join(queue_dir, file)
+    for file_path in pending_files:
         with open(file_path, 'r') as f:
             eval_request = json.load(f)

         if eval_request.get('status') != EvaluationStatus.PENDING.value:
             continue

+        print(f"Processing evaluation request: {file_path}")
+
         # Mark as running
         eval_request['status'] = EvaluationStatus.RUNNING.value
         with open(file_path, 'w') as f:

@@ -156,27 +170,57 @@ def process_evaluation_queue():
             json.dump(eval_request, f, indent=2)

         # Save to results dataset
-…
+        # Extract username from model path if it exists
+        username = result.model.split('/')[0] if '/' in result.model else ''
+        result_filename = f"{result.model.split('/')[-1]}_{result.precision}.json"
+
+        if username:
+            # Create user directory if it doesn't exist
+            user_dir = os.path.join(EVAL_RESULTS_PATH, username)
+            os.makedirs(user_dir, exist_ok=True)
+            result_file = os.path.join(user_dir, result_filename)
+        else:
+            result_file = os.path.join(EVAL_RESULTS_PATH, result_filename)
+
+        # First, update the request file with the results
+        request_file = os.path.join(os.path.dirname(file_path), os.path.basename(file_path))
+        with open(file_path, 'r') as f:
+            request_data = json.load(f)
+
+        # Update request file with results and status
+        request_data['results'] = result.results
+        request_data['status'] = EvaluationStatus.FINISHED.value
+
+        with open(file_path, 'w') as f:
+            json.dump(request_data, f, indent=2)
+
+        # Now create the results file
         with open(result_file, 'w') as f:
             json.dump({
                 'model': result.model,
                 'revision': result.revision,
                 'precision': result.precision,
                 'weight_type': result.weight_type,
-                'results': result.results
+                'results': result.results,
+                'config': {
+                    'model_name': result.model,
+                    'model_dtype': result.precision,
+                    'model_type': result.weight_type,
+                    'architecture': 'Unknown',
+                    'license': request_data.get('license', '?'),
+                    'likes': request_data.get('likes', 0),
+                    'num_params': request_data.get('params', 0),
+                    'date': request_data.get('submitted_time', datetime.now().strftime('%Y-%m-%d')),
+                    'still_on_hub': True
+                }
             }, f, indent=2)

         # Upload to Hugging Face
         API.upload_file(
             path_or_fileobj=result_file,
-            path_in_repo=os.path.
+            path_in_repo=result_filename if not username else os.path.join(username, result_filename),
             repo_id=f"{OWNER}/results",
             repo_type="dataset",
             commit_message=f"Add evaluation results for {result.model}"
         )

-def main():
-    process_evaluation_queue()
-
-if __name__ == "__main__":
-    main()
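evaluate_tunisian_corpus_coverage (and the duplicated logic in submit.py below) scores a model's tokenizer by the share of tokens that are not mapped to the UNK token. A minimal standalone sketch of that metric, assuming only a Hugging Face tokenizer; the checkpoint name is an arbitrary example.

from transformers import AutoTokenizer


def unk_coverage(texts, tokenizer):
    # Fraction of produced tokens that are not the tokenizer's UNK token.
    total = 0
    covered = 0
    for text in texts:
        tokens = tokenizer.tokenize(text)
        total += len(tokens)
        covered += sum(1 for t in tokens if t != tokenizer.unk_token)
    return covered / total if total > 0 else 0.0


if __name__ == "__main__":
    tok = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")  # arbitrary example
    print(unk_coverage(["برشا باهي", "hello from Tunis"], tok))

Byte-level tokenizers rarely emit UNK at all, so in practice the metric mainly separates vocabularies with and without Arabic-script coverage.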
src/leaderboard/read_evals.py
CHANGED

@@ -36,8 +36,9 @@ class EvalResult:
     def init_from_json_file(self, json_filepath):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
+            print(json_filepath)
             data = json.load(fp)
-
+            print(data)
         config = data.get("config")

         # Precision
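These prints fire as init_from_json_file loads the results files that evaluate.py now writes. As a reference point, here is a hedged sketch of what such a file contains, with invented values; only the key layout is taken from the evaluate.py hunk above, and the inner keys of "results" are whatever Tasks.*.value.benchmark resolves to.

# Invented example of a results file as written by process_evaluation_queue.
# Values are placeholders; the benchmark keys inside "results" depend on Tasks.
example_result = {
    "model": "someuser/some-tunisian-model",
    "revision": "main",
    "precision": "fp32",
    "weight_type": "pytorch",
    "results": {"tsac_sentiment": 0.81, "tunisian_corpus": 0.97},
    "config": {
        "model_name": "someuser/some-tunisian-model",
        "model_dtype": "fp32",
        "model_type": "pytorch",
        "architecture": "Unknown",
        "license": "?",
        "likes": 0,
        "num_params": 0,
        "date": "2024-01-01",
        "still_on_hub": True,
    },
}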
src/submission/submit.py
CHANGED

@@ -10,6 +10,12 @@ from src.submission.check_validity import (
     get_model_size,
     is_model_on_hub,
 )
+from src.evaluator.evaluate import evaluate_model, EvaluationStatus, EvaluationResult
+from src.display.utils import Tasks
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from datasets import load_dataset
+import time

 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None

@@ -114,6 +120,125 @@ def add_new_eval(
     # Remove the local file
     os.remove(out_path)

-…
+    # Run evaluation immediately
+    print(f"Evaluating model {model}...")
+    try:
+        # Load model and tokenizer
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        model_obj = AutoModelForSequenceClassification.from_pretrained(
+            model,
+            revision=revision,
+            torch_dtype=getattr(torch, precision),
+            trust_remote_code=True
+        ).to(device)
+
+        tokenizer = AutoTokenizer.from_pretrained(model, revision=revision)
+
+        # Evaluate on TSAC
+        print("Evaluating on TSAC sentiment analysis...")
+        tsac_dataset = load_dataset("fbougares/tsac", split="test")
+
+        def preprocess_tsac(examples):
+            return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+        tsac_dataset = tsac_dataset.map(preprocess_tsac, batched=True)
+        tsac_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+
+        model_obj.eval()
+        with torch.no_grad():
+            predictions = []
+            labels = []
+
+            for batch in tsac_dataset:
+                inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
+                label = batch['label'].to(device)
+
+                outputs = model_obj(**inputs)
+                predictions.extend(outputs.logits.argmax(dim=-1).cpu().tolist())
+                labels.extend(label.cpu().tolist())
+
+        tsac_accuracy = sum(p == l for p, l in zip(predictions, labels)) / len(predictions)
+
+        # Evaluate on ArabML
+        print("Evaluating on ArabML Tunisian Corpus...")
+        arabml_dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="test")
+
+        def preprocess_arabml(examples):
+            return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+        arabml_dataset = arabml_dataset.map(preprocess_arabml, batched=True)
+
+        total_tokens = 0
+        covered_tokens = 0
+
+        for example in arabml_dataset:
+            tokens = tokenizer.tokenize(example['text'])
+            total_tokens += len(tokens)
+            covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
+
+        arabml_coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
+
+        # Store results
+        eval_results = {
+            Tasks.tsac_sentiment.value.benchmark: tsac_accuracy,
+            Tasks.tunisian_corpus.value.benchmark: arabml_coverage
+        }
+
+        print(f"Evaluation results: {eval_results}")
+
+        # Update eval_entry with results
+        eval_entry["status"] = EvaluationStatus.FINISHED.value
+        eval_entry["results"] = eval_results
+
+        # Save to results dataset
+        results_file = os.path.join(EVAL_RESULTS_PATH, f"{model}_{revision}_{precision}_{weight_type}.json")
+        with open(results_file, 'w') as f:
+            json.dump({
+                'model': model,
+                'revision': revision,
+                'precision': precision,
+                'weight_type': weight_type,
+                'results': eval_results
+            }, f, indent=2)
+
+        # Upload results to Hugging Face
+        API.upload_file(
+            path_or_fileobj=results_file,
+            path_in_repo=os.path.basename(results_file),
+            repo_id=RESULTS_REPO,
+            repo_type="dataset",
+            commit_message=f"Add evaluation results for {model}"
+        )
+
+        # Remove the original eval request file
+        os.remove(out_path)
+
+        return styled_message(
+            f"Model evaluation completed!\n\n"
+            f"TSAC Sentiment Accuracy: {tsac_accuracy:.2%}\n"
+            f"ArabML Corpus Coverage: {arabml_coverage:.2%}"
+        )
+
+    except Exception as e:
+        print(f"Error during evaluation: {str(e)}")
+        eval_entry["status"] = EvaluationStatus.FAILED.value
+        eval_entry["error"] = str(e)
+
+        with open(out_path, "w") as f:
+            f.write(json.dumps(eval_entry))
+
+        API.upload_file(
+            path_or_fileobj=out_path,
+            path_in_repo=out_path.split("eval-queue/")[1],
+            repo_id=QUEUE_REPO,
+            repo_type="dataset",
+            commit_message=f"Add {model} evaluation error",
+        )
+
+        os.remove(out_path)
+
+        return styled_error(
+            f"Error during evaluation: {str(e)}\n\n"
+            "The evaluation will be retried automatically later."
+        )
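Both new code paths share the same request-file lifecycle: add_new_eval drops a JSON request under the eval queue, and process_evaluation_queue walks that tree, skips anything whose status is not PENDING, and rewrites the file as it moves through RUNNING to FINISHED or FAILED. A small self-contained sketch of the discovery step, with an invented request; the literal status strings behind EvaluationStatus are assumptions.

import json
import os
import tempfile

# Invented request mirroring a few of the fields the queue files carry; values are placeholders.
request = {
    "model": "someuser/some-tunisian-model",
    "revision": "main",
    "precision": "fp32",
    "weight_type": "pytorch",
    "status": "PENDING",  # assumed value of EvaluationStatus.PENDING.value
}

queue_dir = tempfile.mkdtemp()
request_path = os.path.join(queue_dir, "someuser", "eval_request.json")
os.makedirs(os.path.dirname(request_path), exist_ok=True)
with open(request_path, "w") as f:
    json.dump(request, f, indent=2)

# Same discovery logic as process_evaluation_queue: every *.json anywhere under the queue dir.
pending_files = [
    os.path.join(root, name)
    for root, _dirs, names in os.walk(queue_dir)
    for name in names
    if name.endswith(".json")
]

for path in pending_files:
    with open(path) as f:
        entry = json.load(f)
    if entry.get("status") != "PENDING":
        continue
    print(f"would evaluate {entry['model']} ({entry['precision']})")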