hamzabouajila committed on
Commit 28e88f2 · 1 Parent(s): 9d7aae7

implement evaluation and fix bugs

app.py CHANGED
@@ -26,7 +26,9 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
-from src.evaluator.evaluate import evaluate_model, EvaluationStatus, EvaluationResult, Tasks
+from src.evaluator.evaluate import process_evaluation_queue
+import threading
+import time
 
 
 def restart_space():
@@ -49,6 +51,23 @@ except Exception:
     restart_space()
 
 
+# Start evaluator service in a separate thread
+def run_evaluator():
+    print("Starting evaluator service...")
+    while True:
+        try:
+            process_evaluation_queue()
+            print("Evaluation queue processed. Sleeping for 5 minutes...")
+            time.sleep(300)  # Sleep for 5 minutes
+        except Exception as e:
+            print(f"Error in evaluation process: {e}")
+            print("Retrying in 5 minutes...")
+            time.sleep(300)
+
+# Start evaluator in a separate thread
+evaluator_thread = threading.Thread(target=run_evaluator, daemon=True)
+evaluator_thread.start()
+
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
 (
@@ -125,31 +144,6 @@ with demo:
             gr.Markdown(LLM_BENCHMARKS_TEXT)
             gr.Markdown(EVALUATION_QUEUE_TEXT)
 
-        with gr.TabItem("🚀 Evaluate Model", elem_id="evaluate-tab", id=3):
-            with gr.Row():
-                model_name = gr.Textbox(label="Model Name")
-                revision = gr.Textbox(label="Revision", value="main")
-            with gr.Row():
-                precision = gr.Dropdown(
-                    choices=[p.value for p in Precision],
-                    label="Precision",
-                    value="fp32"
-                )
-                weight_type = gr.Dropdown(
-                    choices=[w.value for w in WeightType],
-                    label="Weight Type",
-                    value="pytorch"
-                )
-            evaluate_button = gr.Button("Evaluate Model")
-            status_output = gr.Textbox(label="Evaluation Status")
-
-            evaluate_button.click(
-                fn=evaluate_and_update,
-                inputs=[model_name, revision, precision, weight_type],
-                outputs=[status_output]
-            )
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
                 with gr.Row():
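
The run_evaluator loop above polls the queue from a bare daemon thread, so it only stops when the process exits. Below is a minimal sketch of the same polling pattern with an explicit shutdown path, assuming a threading.Event is acceptable; run_evaluator_stoppable and interval_seconds are illustrative names, not part of this commit.

import threading

stop_event = threading.Event()

def run_evaluator_stoppable(process_queue, interval_seconds=300):
    """Poll the evaluation queue until stop_event is set."""
    while not stop_event.is_set():
        try:
            process_queue()
        except Exception as exc:
            # Keep the poller alive on errors, mirroring the loop in app.py
            print(f"Error in evaluation process: {exc}")
        # wait() sleeps like time.sleep() but returns early once stop_event is set
        stop_event.wait(interval_seconds)

# Usage sketch:
# thread = threading.Thread(target=run_evaluator_stoppable, args=(process_evaluation_queue,), daemon=True)
# thread.start()
# ...on shutdown: stop_event.set(); thread.join()
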
pyproject.toml CHANGED
@@ -18,6 +18,7 @@ dependencies = [
     "numpy>=2.3.1",
     "pandas>=2.3.0",
     "python-dateutil>=2.9.0.post0",
+    "scikit-learn>=1.7.0",
    "sentencepiece>=0.2.0",
     "tokenizers>=0.15.0",
     "torch>=2.7.1",
src/envs.py CHANGED
@@ -14,12 +14,14 @@ QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
-
 # Local caches
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
+EVAL_REQUESTS_PATH = "./eval-queue"
+EVAL_RESULTS_PATH = "./eval-results"
+EVAL_REQUESTS_PATH_BACKEND = "./eval-queue-bk"
+EVAL_RESULTS_PATH_BACKEND = "./eval-results-bk"
+
+# Create directories if they don't exist
+for path in [EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND]:
+    os.makedirs(path, exist_ok=True)
 
 API = HfApi(token=TOKEN)
src/evaluator/evaluate.py CHANGED
@@ -3,7 +3,7 @@ import os
 from typing import Dict, Any
 from dataclasses import dataclass
 from enum import Enum
-
+from datetime import datetime
 import torch
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 from datasets import load_dataset
@@ -28,54 +28,63 @@ class EvaluationResult:
 
 def evaluate_tsac_sentiment(model, tokenizer, device):
     """Evaluate model on TSAC sentiment analysis task"""
-    dataset = load_dataset("fbougares/tsac", split="test")
-
-    def preprocess(examples):
-        return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
-
-    dataset = dataset.map(preprocess, batched=True)
-    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
-
-    model.eval()
-    with torch.no_grad():
-        predictions = []
-        labels = []
-
-        for batch in dataset:
-            inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
-            label = batch['label'].to(device)
-
-            outputs = model(**inputs)
-            predictions.extend(outputs.logits.argmax(dim=-1).cpu().tolist())
-            labels.extend(label.cpu().tolist())
-
-    accuracy = sum(p == l for p, l in zip(predictions, labels)) / len(predictions)
-    return accuracy
+    try:
+        dataset = load_dataset("fbougares/tsac", split="train")
+
+        def preprocess(examples):
+            return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+        dataset = dataset.map(preprocess, batched=True)
+        dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+
+        model.eval()
+        with torch.no_grad():
+            predictions = []
+            labels = []
+
+            for batch in dataset:
+                inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
+                label = batch['label'].to(device)
+
+                outputs = model(**inputs)
+                predictions.extend(outputs.logits.argmax(dim=-1).cpu().tolist())
+                labels.extend(label.cpu().tolist())
+
+        accuracy = sum(p == l for p, l in zip(predictions, labels)) / len(predictions)
+        return accuracy
+    except Exception as e:
+        print(f"Error in TSAC evaluation: {str(e)}")
+        return 0.0
 
 def evaluate_tunisian_corpus_coverage(model, tokenizer):
     """Evaluate model's coverage on Tunisian Dialect Corpus"""
-    dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="test")
-
-    def preprocess(examples):
-        return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
-
-    dataset = dataset.map(preprocess, batched=True)
-
-    # Calculate coverage based on tokenization
-    total_tokens = 0
-    covered_tokens = 0
-
-    for example in dataset:
-        tokens = tokenizer.tokenize(example['text'])
-        total_tokens += len(tokens)
-        covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
-
-    coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
-    return coverage
+    try:
+        dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="train")
+
+        def preprocess(examples):
+            return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+        dataset = dataset.map(preprocess, batched=True)
+
+        # Calculate coverage based on tokenization
+        total_tokens = 0
+        covered_tokens = 0
+
+        for example in dataset:
+            tokens = tokenizer.tokenize(example['text'])
+            total_tokens += len(tokens)
+            covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
+
+        coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
+        return coverage
+    except Exception as e:
+        print(f"Error in Tunisian Corpus evaluation: {str(e)}")
+        return 0.0
 
 def evaluate_model(model_name: str, revision: str, precision: str, weight_type: str) -> EvaluationResult:
     """Evaluate a single model on all tasks"""
     try:
+        print(f"------------ evaluation model {model_name}")
         # Load model and tokenizer
        device = "cuda" if torch.cuda.is_available() else "cpu"
 
@@ -119,18 +128,23 @@ def evaluate_model(model_name: str, revision: str, precision: str, weight_type:
 
 def process_evaluation_queue():
     """Process all pending evaluations in the queue"""
-    # Get all pending evaluations
+    # Get all pending evaluations (including nested directories)
     queue_dir = os.path.join(EVAL_REQUESTS_PATH)
-    pending_files = [f for f in os.listdir(queue_dir) if f.endswith('.json')]
+    pending_files = []
+
+    # Walk through the directory tree
+    for root, dirs, files in os.walk(queue_dir):
+        pending_files.extend([os.path.join(root, f) for f in files if f.endswith('.json')])
 
-    for file in pending_files:
-        file_path = os.path.join(queue_dir, file)
+    for file_path in pending_files:
         with open(file_path, 'r') as f:
             eval_request = json.load(f)
 
         if eval_request.get('status') != EvaluationStatus.PENDING.value:
             continue
 
+        print(f"Processing evaluation request: {file_path}")
+
         # Mark as running
         eval_request['status'] = EvaluationStatus.RUNNING.value
         with open(file_path, 'w') as f:
@@ -156,27 +170,57 @@ def process_evaluation_queue():
             json.dump(eval_request, f, indent=2)
 
         # Save to results dataset
-        result_file = os.path.join(EVAL_RESULTS_PATH, f"{result.model}_{result.precision}.json")
+        # Extract username from model path if it exists
+        username = result.model.split('/')[0] if '/' in result.model else ''
+        result_filename = f"{result.model.split('/')[-1]}_{result.precision}.json"
+
+        if username:
+            # Create user directory if it doesn't exist
+            user_dir = os.path.join(EVAL_RESULTS_PATH, username)
+            os.makedirs(user_dir, exist_ok=True)
+            result_file = os.path.join(user_dir, result_filename)
+        else:
+            result_file = os.path.join(EVAL_RESULTS_PATH, result_filename)
+
+        # First, update the request file with the results
+        request_file = os.path.join(os.path.dirname(file_path), os.path.basename(file_path))
+        with open(file_path, 'r') as f:
+            request_data = json.load(f)
+
+        # Update request file with results and status
+        request_data['results'] = result.results
+        request_data['status'] = EvaluationStatus.FINISHED.value
+
+        with open(file_path, 'w') as f:
+            json.dump(request_data, f, indent=2)
+
+        # Now create the results file
         with open(result_file, 'w') as f:
             json.dump({
                 'model': result.model,
                 'revision': result.revision,
                 'precision': result.precision,
                 'weight_type': result.weight_type,
-                'results': result.results
+                'results': result.results,
+                'config': {
+                    'model_name': result.model,
+                    'model_dtype': result.precision,
+                    'model_type': result.weight_type,
+                    'architecture': 'Unknown',
+                    'license': request_data.get('license', '?'),
+                    'likes': request_data.get('likes', 0),
+                    'num_params': request_data.get('params', 0),
+                    'date': request_data.get('submitted_time', datetime.now().strftime('%Y-%m-%d')),
+                    'still_on_hub': True
+                }
             }, f, indent=2)
 
         # Upload to Hugging Face
         API.upload_file(
             path_or_fileobj=result_file,
-            path_in_repo=os.path.basename(result_file),
+            path_in_repo=result_filename if not username else os.path.join(username, result_filename),
             repo_id=f"{OWNER}/results",
             repo_type="dataset",
             commit_message=f"Add evaluation results for {result.model}"
         )
 
-def main():
-    process_evaluation_queue()
-
-if __name__ == "__main__":
-    main()
 
 
 
 
 
 
src/leaderboard/read_evals.py CHANGED
@@ -36,8 +36,9 @@ class EvalResult:
     def init_from_json_file(self, json_filepath):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
+            print(json_filepath)
             data = json.load(fp)
-
+            print(data)
         config = data.get("config")
 
         # Precision
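
The two print calls added here are debug output. A hypothetical alternative would route them through the standard logging module so they can be silenced without editing the code; load_result_file below is an illustrative helper, not code from the repository.

import json
import logging

logger = logging.getLogger(__name__)

def load_result_file(json_filepath):
    """Load a result JSON file, logging what is read at DEBUG level."""
    with open(json_filepath) as fp:
        logger.debug("Reading result file %s", json_filepath)
        data = json.load(fp)
    logger.debug("Loaded keys: %s", list(data))
    return data
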
src/submission/submit.py CHANGED
@@ -10,6 +10,12 @@ from src.submission.check_validity import (
     get_model_size,
     is_model_on_hub,
 )
+from src.evaluator.evaluate import evaluate_model, EvaluationStatus, EvaluationResult
+from src.display.utils import Tasks
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from datasets import load_dataset
+import time
 
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
@@ -114,6 +120,125 @@ def add_new_eval(
     # Remove the local file
     os.remove(out_path)
 
-    return styled_message(
-        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
-    )
+    # Run evaluation immediately
+    print(f"Evaluating model {model}...")
+    try:
+        # Load model and tokenizer
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        model_obj = AutoModelForSequenceClassification.from_pretrained(
+            model,
+            revision=revision,
+            torch_dtype=getattr(torch, precision),
+            trust_remote_code=True
+        ).to(device)
+
+        tokenizer = AutoTokenizer.from_pretrained(model, revision=revision)
+
+        # Evaluate on TSAC
+        print("Evaluating on TSAC sentiment analysis...")
+        tsac_dataset = load_dataset("fbougares/tsac", split="test")
+
+        def preprocess_tsac(examples):
+            return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+        tsac_dataset = tsac_dataset.map(preprocess_tsac, batched=True)
+        tsac_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+
+        model_obj.eval()
+        with torch.no_grad():
+            predictions = []
+            labels = []
+
+            for batch in tsac_dataset:
+                inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
+                label = batch['label'].to(device)
+
+                outputs = model_obj(**inputs)
+                predictions.extend(outputs.logits.argmax(dim=-1).cpu().tolist())
+                labels.extend(label.cpu().tolist())
+
+        tsac_accuracy = sum(p == l for p, l in zip(predictions, labels)) / len(predictions)
+
+        # Evaluate on ArabML
+        print("Evaluating on ArabML Tunisian Corpus...")
+        arabml_dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="test")
+
+        def preprocess_arabml(examples):
+            return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+        arabml_dataset = arabml_dataset.map(preprocess_arabml, batched=True)
+
+        total_tokens = 0
+        covered_tokens = 0
+
+        for example in arabml_dataset:
+            tokens = tokenizer.tokenize(example['text'])
+            total_tokens += len(tokens)
+            covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
+
+        arabml_coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
+
+        # Store results
+        eval_results = {
+            Tasks.tsac_sentiment.value.benchmark: tsac_accuracy,
+            Tasks.tunisian_corpus.value.benchmark: arabml_coverage
+        }
+
+        print(f"Evaluation results: {eval_results}")
+
+        # Update eval_entry with results
+        eval_entry["status"] = EvaluationStatus.FINISHED.value
+        eval_entry["results"] = eval_results
+
+        # Save to results dataset
+        results_file = os.path.join(EVAL_RESULTS_PATH, f"{model}_{revision}_{precision}_{weight_type}.json")
+        with open(results_file, 'w') as f:
+            json.dump({
+                'model': model,
+                'revision': revision,
+                'precision': precision,
+                'weight_type': weight_type,
+                'results': eval_results
+            }, f, indent=2)
+
+        # Upload results to Hugging Face
+        API.upload_file(
+            path_or_fileobj=results_file,
+            path_in_repo=os.path.basename(results_file),
+            repo_id=RESULTS_REPO,
+            repo_type="dataset",
+            commit_message=f"Add evaluation results for {model}"
+        )
+
+        # Remove the original eval request file
+        os.remove(out_path)
+
+        return styled_message(
+            f"Model evaluation completed!\n\n"
+            f"TSAC Sentiment Accuracy: {tsac_accuracy:.2%}\n"
+            f"ArabML Corpus Coverage: {arabml_coverage:.2%}"
+        )
+
+    except Exception as e:
+        print(f"Error during evaluation: {str(e)}")
+        eval_entry["status"] = EvaluationStatus.FAILED.value
+        eval_entry["error"] = str(e)
+
+        with open(out_path, "w") as f:
+            f.write(json.dumps(eval_entry))
+
+        API.upload_file(
+            path_or_fileobj=out_path,
+            path_in_repo=out_path.split("eval-queue/")[1],
+            repo_id=QUEUE_REPO,
+            repo_type="dataset",
+            commit_message=f"Add {model} evaluation error",
+        )
+
+        os.remove(out_path)
+
+        return styled_error(
+            f"Error during evaluation: {str(e)}\n\n"
+            "The evaluation will be retried automatically later."
+        )
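
add_new_eval now re-implements the TSAC and corpus-coverage loops inline even though it imports evaluate_model from src.evaluator.evaluate. A minimal sketch of delegating to that helper instead, assuming evaluate_model returns the EvaluationResult dataclass defined in src/evaluator/evaluate.py (fields model, revision, precision, weight_type, results); run_submission_evaluation is an illustrative name, not code from the commit.

from src.evaluator.evaluate import evaluate_model, EvaluationStatus

def run_submission_evaluation(model, revision, precision, weight_type, eval_entry):
    """Evaluate a submitted model via the shared helper and record the outcome on the request entry."""
    result = evaluate_model(model, revision, precision, weight_type)
    eval_entry["status"] = EvaluationStatus.FINISHED.value
    eval_entry["results"] = result.results
    return result
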