hamzabouajila committed on
Commit 28e88f2 · 1 Parent(s): 9d7aae7

implement evaluation and fix bugs

app.py CHANGED
@@ -26,7 +26,9 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
-from src.evaluator.evaluate import evaluate_model, EvaluationStatus, EvaluationResult, Tasks
+from src.evaluator.evaluate import process_evaluation_queue
+import threading
+import time
 
 
 def restart_space():
@@ -49,6 +51,23 @@ except Exception:
     restart_space()
 
 
+# Start evaluator service in a separate thread
+def run_evaluator():
+    print("Starting evaluator service...")
+    while True:
+        try:
+            process_evaluation_queue()
+            print("Evaluation queue processed. Sleeping for 5 minutes...")
+            time.sleep(300)  # Sleep for 5 minutes
+        except Exception as e:
+            print(f"Error in evaluation process: {e}")
+            print("Retrying in 5 minutes...")
+            time.sleep(300)
+
+# Start evaluator in a separate thread
+evaluator_thread = threading.Thread(target=run_evaluator, daemon=True)
+evaluator_thread.start()
+
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
 (
@@ -125,31 +144,6 @@ with demo:
             gr.Markdown(LLM_BENCHMARKS_TEXT)
             gr.Markdown(EVALUATION_QUEUE_TEXT)
 
-        with gr.TabItem("🚀 Evaluate Model", elem_id="evaluate-tab", id=3):
-            with gr.Row():
-                model_name = gr.Textbox(label="Model Name")
-                revision = gr.Textbox(label="Revision", value="main")
-            with gr.Row():
-                precision = gr.Dropdown(
-                    choices=[p.value for p in Precision],
-                    label="Precision",
-                    value="fp32"
-                )
-                weight_type = gr.Dropdown(
-                    choices=[w.value for w in WeightType],
-                    label="Weight Type",
-                    value="pytorch"
-                )
-            evaluate_button = gr.Button("Evaluate Model")
-            status_output = gr.Textbox(label="Evaluation Status")
-
-            evaluate_button.click(
-                fn=evaluate_and_update,
-                inputs=[model_name, revision, precision, weight_type],
-                outputs=[status_output]
-            )
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
                 with gr.Row():
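
The run_evaluator loop above polls the queue from a bare daemon thread, so it only stops when the process exits. Below is a minimal sketch of the same polling pattern with an explicit shutdown path, assuming a threading.Event is acceptable; run_evaluator_stoppable and interval_seconds are illustrative names, not part of this commit.

import threading

stop_event = threading.Event()

def run_evaluator_stoppable(process_queue, interval_seconds=300):
    """Poll the evaluation queue until stop_event is set."""
    while not stop_event.is_set():
        try:
            process_queue()
        except Exception as exc:
            # Keep the poller alive on errors, mirroring the loop in app.py
            print(f"Error in evaluation process: {exc}")
        # wait() sleeps like time.sleep() but returns early once stop_event is set
        stop_event.wait(interval_seconds)

# Usage sketch:
# thread = threading.Thread(target=run_evaluator_stoppable, args=(process_evaluation_queue,), daemon=True)
# thread.start()
# ...on shutdown: stop_event.set(); thread.join()
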
pyproject.toml CHANGED
@@ -18,6 +18,7 @@ dependencies = [
     "numpy>=2.3.1",
     "pandas>=2.3.0",
     "python-dateutil>=2.9.0.post0",
+    "scikit-learn>=1.7.0",
    "sentencepiece>=0.2.0",
     "tokenizers>=0.15.0",
     "torch>=2.7.1",
src/envs.py CHANGED
@@ -14,12 +14,14 @@ QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
-
 # Local caches
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
+EVAL_REQUESTS_PATH = "./eval-queue"
+EVAL_RESULTS_PATH = "./eval-results"
+EVAL_REQUESTS_PATH_BACKEND = "./eval-queue-bk"
+EVAL_RESULTS_PATH_BACKEND = "./eval-results-bk"
+
+# Create directories if they don't exist
+for path in [EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND]:
+    os.makedirs(path, exist_ok=True)
 
 API = HfApi(token=TOKEN)
src/evaluator/evaluate.py CHANGED
@@ -3,7 +3,7 @@ import os
 from typing import Dict, Any
 from dataclasses import dataclass
 from enum import Enum
-
+from datetime import datetime
 import torch
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 from datasets import load_dataset
@@ -28,54 +28,63 @@ class EvaluationResult:
 
 def evaluate_tsac_sentiment(model, tokenizer, device):
     """Evaluate model on TSAC sentiment analysis task"""
-    dataset = load_dataset("fbougares/tsac", split="test")
-
-    def preprocess(examples):
-        return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
-
-    dataset = dataset.map(preprocess, batched=True)
-    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
-
-    model.eval()
-    with torch.no_grad():
-        predictions = []
-        labels = []
-
-        for batch in dataset:
-            inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
-            label = batch['label'].to(device)
-
-            outputs = model(**inputs)
-            predictions.extend(outputs.logits.argmax(dim=-1).cpu().tolist())
-            labels.extend(label.cpu().tolist())
-
-    accuracy = sum(p == l for p, l in zip(predictions, labels)) / len(predictions)
-    return accuracy
+    try:
+        dataset = load_dataset("fbougares/tsac", split="train")
+
+        def preprocess(examples):
+            return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+        dataset = dataset.map(preprocess, batched=True)
+        dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+
+        model.eval()
+        with torch.no_grad():
+            predictions = []
+            labels = []
+
+            for batch in dataset:
+                inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
+                label = batch['label'].to(device)
+
+                outputs = model(**inputs)
+                predictions.extend(outputs.logits.argmax(dim=-1).cpu().tolist())
+                labels.extend(label.cpu().tolist())
+
+        accuracy = sum(p == l for p, l in zip(predictions, labels)) / len(predictions)
+        return accuracy
+    except Exception as e:
+        print(f"Error in TSAC evaluation: {str(e)}")
+        return 0.0
 
 def evaluate_tunisian_corpus_coverage(model, tokenizer):
     """Evaluate model's coverage on Tunisian Dialect Corpus"""
-    dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="test")
-
-    def preprocess(examples):
-        return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
-
-    dataset = dataset.map(preprocess, batched=True)
-
-    # Calculate coverage based on tokenization
-    total_tokens = 0
-    covered_tokens = 0
-
-    for example in dataset:
-        tokens = tokenizer.tokenize(example['text'])
-        total_tokens += len(tokens)
-        covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
-
-    coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
-    return coverage
+    try:
+        dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="train")
+
+        def preprocess(examples):
+            return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+        dataset = dataset.map(preprocess, batched=True)
+
+        # Calculate coverage based on tokenization
+        total_tokens = 0
+        covered_tokens = 0
+
+        for example in dataset:
+            tokens = tokenizer.tokenize(example['text'])
+            total_tokens += len(tokens)
+            covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
+
+        coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
+        return coverage
+    except Exception as e:
+        print(f"Error in Tunisian Corpus evaluation: {str(e)}")
+        return 0.0
 
 def evaluate_model(model_name: str, revision: str, precision: str, weight_type: str) -> EvaluationResult:
     """Evaluate a single model on all tasks"""
     try:
+        print(f"------------ evaluation model {model_name}")
         # Load model and tokenizer
        device = "cuda" if torch.cuda.is_available() else "cpu"
 
@@ -119,18 +128,23 @@ def evaluate_model(model_name: str, revision: str, precision: str, weight_type:
 
 def process_evaluation_queue():
     """Process all pending evaluations in the queue"""
-    # Get all pending evaluations
+    # Get all pending evaluations (including nested directories)
     queue_dir = os.path.join(EVAL_REQUESTS_PATH)
-    pending_files = [f for f in os.listdir(queue_dir) if f.endswith('.json')]
+    pending_files = []
+
+    # Walk through the directory tree
+    for root, dirs, files in os.walk(queue_dir):
+        pending_files.extend([os.path.join(root, f) for f in files if f.endswith('.json')])
 
-    for file in pending_files:
-        file_path = os.path.join(queue_dir, file)
+    for file_path in pending_files:
         with open(file_path, 'r') as f:
             eval_request = json.load(f)
 
         if eval_request.get('status') != EvaluationStatus.PENDING.value:
             continue
 
+        print(f"Processing evaluation request: {file_path}")
+
         # Mark as running
         eval_request['status'] = EvaluationStatus.RUNNING.value
         with open(file_path, 'w') as f:
@@ -156,27 +170,57 @@ def process_evaluation_queue():
             json.dump(eval_request, f, indent=2)
 
         # Save to results dataset
-        result_file = os.path.join(EVAL_RESULTS_PATH, f"{result.model}_{result.precision}.json")
+        # Extract username from model path if it exists
+        username = result.model.split('/')[0] if '/' in result.model else ''
+        result_filename = f"{result.model.split('/')[-1]}_{result.precision}.json"
+
+        if username:
+            # Create user directory if it doesn't exist
+            user_dir = os.path.join(EVAL_RESULTS_PATH, username)
+            os.makedirs(user_dir, exist_ok=True)
+            result_file = os.path.join(user_dir, result_filename)
+        else:
+            result_file = os.path.join(EVAL_RESULTS_PATH, result_filename)
+
+        # First, update the request file with the results
+        request_file = os.path.join(os.path.dirname(file_path), os.path.basename(file_path))
+        with open(file_path, 'r') as f:
+            request_data = json.load(f)
+
+        # Update request file with results and status
+        request_data['results'] = result.results
+        request_data['status'] = EvaluationStatus.FINISHED.value
+
+        with open(file_path, 'w') as f:
+            json.dump(request_data, f, indent=2)
+
+        # Now create the results file
         with open(result_file, 'w') as f:
             json.dump({
                 'model': result.model,
                 'revision': result.revision,
                 'precision': result.precision,
                 'weight_type': result.weight_type,
-                'results': result.results
+                'results': result.results,
+                'config': {
+                    'model_name': result.model,
+                    'model_dtype': result.precision,
+                    'model_type': result.weight_type,
+                    'architecture': 'Unknown',
+                    'license': request_data.get('license', '?'),
+                    'likes': request_data.get('likes', 0),
+                    'num_params': request_data.get('params', 0),
+                    'date': request_data.get('submitted_time', datetime.now().strftime('%Y-%m-%d')),
+                    'still_on_hub': True
+                }
             }, f, indent=2)
 
         # Upload to Hugging Face
         API.upload_file(
             path_or_fileobj=result_file,
-            path_in_repo=os.path.basename(result_file),
+            path_in_repo=result_filename if not username else os.path.join(username, result_filename),
             repo_id=f"{OWNER}/results",
             repo_type="dataset",
             commit_message=f"Add evaluation results for {result.model}"
         )
 
-def main():
-    process_evaluation_queue()
-
-if __name__ == "__main__":
-    main()
 
 
 
 
 
 
src/leaderboard/read_evals.py CHANGED
@@ -36,8 +36,9 @@ class EvalResult:
     def init_from_json_file(self, json_filepath):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
+            print(json_filepath)
             data = json.load(fp)
-
+            print(data)
         config = data.get("config")
 
         # Precision
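
The two print calls added here are debug output. A hypothetical alternative would route them through the standard logging module so they can be silenced without editing the code; load_result_file below is an illustrative helper, not code from the repository.

import json
import logging

logger = logging.getLogger(__name__)

def load_result_file(json_filepath):
    """Load a result JSON file, logging what is read at DEBUG level."""
    with open(json_filepath) as fp:
        logger.debug("Reading result file %s", json_filepath)
        data = json.load(fp)
    logger.debug("Loaded keys: %s", list(data))
    return data
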
src/submission/submit.py CHANGED
@@ -10,6 +10,12 @@ from src.submission.check_validity import (
     get_model_size,
     is_model_on_hub,
 )
+from src.evaluator.evaluate import evaluate_model, EvaluationStatus, EvaluationResult
+from src.display.utils import Tasks
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from datasets import load_dataset
+import time
 
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
@@ -114,6 +120,125 @@ def add_new_eval(
     # Remove the local file
     os.remove(out_path)
 
-    return styled_message(
-        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
-    )
+    # Run evaluation immediately
+    print(f"Evaluating model {model}...")
+    try:
+        # Load model and tokenizer
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        model_obj = AutoModelForSequenceClassification.from_pretrained(
+            model,
+            revision=revision,
+            torch_dtype=getattr(torch, precision),
+            trust_remote_code=True
+        ).to(device)
+
+        tokenizer = AutoTokenizer.from_pretrained(model, revision=revision)
+
+        # Evaluate on TSAC
+        print("Evaluating on TSAC sentiment analysis...")
+        tsac_dataset = load_dataset("fbougares/tsac", split="test")
+
+        def preprocess_tsac(examples):
+            return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+        tsac_dataset = tsac_dataset.map(preprocess_tsac, batched=True)
+        tsac_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+
+        model_obj.eval()
+        with torch.no_grad():
+            predictions = []
+            labels = []
+
+            for batch in tsac_dataset:
+                inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
+                label = batch['label'].to(device)
+
+                outputs = model_obj(**inputs)
+                predictions.extend(outputs.logits.argmax(dim=-1).cpu().tolist())
+                labels.extend(label.cpu().tolist())
+
+        tsac_accuracy = sum(p == l for p, l in zip(predictions, labels)) / len(predictions)
+
+        # Evaluate on ArabML
+        print("Evaluating on ArabML Tunisian Corpus...")
+        arabml_dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="test")
+
+        def preprocess_arabml(examples):
+            return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+        arabml_dataset = arabml_dataset.map(preprocess_arabml, batched=True)
+
+        total_tokens = 0
+        covered_tokens = 0
+
+        for example in arabml_dataset:
+            tokens = tokenizer.tokenize(example['text'])
+            total_tokens += len(tokens)
+            covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
+
+        arabml_coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
+
+        # Store results
+        eval_results = {
+            Tasks.tsac_sentiment.value.benchmark: tsac_accuracy,
+            Tasks.tunisian_corpus.value.benchmark: arabml_coverage
+        }
+
+        print(f"Evaluation results: {eval_results}")
+
+        # Update eval_entry with results
+        eval_entry["status"] = EvaluationStatus.FINISHED.value
+        eval_entry["results"] = eval_results
+
+        # Save to results dataset
+        results_file = os.path.join(EVAL_RESULTS_PATH, f"{model}_{revision}_{precision}_{weight_type}.json")
+        with open(results_file, 'w') as f:
+            json.dump({
+                'model': model,
+                'revision': revision,
+                'precision': precision,
+                'weight_type': weight_type,
+                'results': eval_results
+            }, f, indent=2)
+
+        # Upload results to Hugging Face
+        API.upload_file(
+            path_or_fileobj=results_file,
+            path_in_repo=os.path.basename(results_file),
+            repo_id=RESULTS_REPO,
+            repo_type="dataset",
+            commit_message=f"Add evaluation results for {model}"
+        )
+
+        # Remove the original eval request file
+        os.remove(out_path)
+
+        return styled_message(
+            f"Model evaluation completed!\n\n"
+            f"TSAC Sentiment Accuracy: {tsac_accuracy:.2%}\n"
+            f"ArabML Corpus Coverage: {arabml_coverage:.2%}"
+        )
+
+    except Exception as e:
+        print(f"Error during evaluation: {str(e)}")
+        eval_entry["status"] = EvaluationStatus.FAILED.value
+        eval_entry["error"] = str(e)
+
+        with open(out_path, "w") as f:
+            f.write(json.dumps(eval_entry))
+
+        API.upload_file(
+            path_or_fileobj=out_path,
+            path_in_repo=out_path.split("eval-queue/")[1],
+            repo_id=QUEUE_REPO,
+            repo_type="dataset",
+            commit_message=f"Add {model} evaluation error",
+        )
+
+        os.remove(out_path)
+
+        return styled_error(
+            f"Error during evaluation: {str(e)}\n\n"
+            "The evaluation will be retried automatically later."
+        )
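
add_new_eval now re-implements the TSAC and corpus-coverage loops inline even though it imports evaluate_model from src.evaluator.evaluate. A minimal sketch of delegating to that helper instead, assuming evaluate_model returns the EvaluationResult dataclass defined in src/evaluator/evaluate.py (fields model, revision, precision, weight_type, results); run_submission_evaluation is an illustrative name, not code from the commit.

from src.evaluator.evaluate import evaluate_model, EvaluationStatus

def run_submission_evaluation(model, revision, precision, weight_type, eval_entry):
    """Evaluate a submitted model via the shared helper and record the outcome on the request entry."""
    result = evaluate_model(model, revision, precision, weight_type)
    eval_entry["status"] = EvaluationStatus.FINISHED.value
    eval_entry["results"] = result.results
    return result
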