Commit 2f1e30c
Parent(s): 34052ff

feat: enhance evaluation queue reliability and add stale job recovery

- Add timeout mechanism for stalled RUNNING evaluations (see the sketch after the file list)
- Implement automatic reset of stale RUNNING jobs to PENDING status
- Increase sleep interval between evaluation cycles to 3 minutes
- Add pydantic dependency for better data validation
- Improve error handling and logging in submission process
- Update repository references to use TunisianLLMLeaderBoard
- Add debug logging for file uploads in submission flow
- Fix timezone handling for evaluation timestamps

Files changed:
- app.py +9 -43
- pyproject.toml +1 -0
- src/envs.py +1 -1
- src/evaluator/evaluate.py +36 -24
- src/evaluator/run_evaluator.py +4 -4
- src/submission/submit.py +4 -0
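
The stale-job recovery described above works, in outline, like this: on every evaluator cycle the queue is rescanned, and any request that has sat in RUNNING longer than a timeout is flipped back to PENDING so it can be picked up again. A minimal sketch of that per-cycle check (names and threshold are illustrative, not the exact code from the diff below):

```python
from datetime import datetime, timedelta, timezone

STALE_AFTER = timedelta(minutes=30)  # illustrative; the real function takes a timeout_interval argument

def recover_stale_entries(entries: list[dict]) -> list[dict]:
    """Reset RUNNING queue entries that have exceeded the timeout back to PENDING."""
    now = datetime.now(timezone.utc)
    reset = []
    for entry in entries:
        if entry.get("status") != "RUNNING":
            continue
        started = datetime.fromisoformat(entry["submitted_time"])
        if now - started > STALE_AFTER:
            entry["status"] = "PENDING"
            entry["submitted_time"] = now.isoformat()
            reset.append(entry)
    return reset
```

In the commit itself this logic lives in `reset_stale_running_eval` in `src/evaluator/evaluate.py`, which additionally rewrites the request JSON on disk and re-uploads it to the queue dataset.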
app.py (CHANGED)

@@ -1,5 +1,5 @@
 from dotenv import load_dotenv
-
+import sys
 load_dotenv()

 import gradio as gr
@@ -30,14 +30,12 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
-from src.evaluator.evaluate import process_evaluation_queue
-import threading
-import time


 def restart_space():
     try:
-
+        print("Restarting space...")
+        API.restart_space(repo_id=REPO_ID,token=TOKEN)
     except Exception as e:
         print(f"Error restarting space: {str(e)}")
     try:
@@ -53,21 +51,6 @@ def restart_space():



-
-
-def run_evaluator():
-    print("Starting evaluator service...")
-    while True:
-        try:
-            process_evaluation_queue()
-            print("Evaluation queue processed. Sleeping for 5 minutes...")
-            time.sleep(10) # Sleep for 5 minutes
-        except Exception as e:
-            print(f"Error in evaluation process: {e}")
-            print("Retrying in 5 minutes...")
-            time.sleep(10)
-
-
 def init_leaderboard(dataframe):
     if dataframe is None:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -93,27 +76,14 @@ def init_leaderboard(dataframe):



-
-
-
-        add_new_eval(
-            model_name=model_name,
-            revision=revision,
-            precision=precision,
-            weight_type=weight_type,
-            model_type="LLM",
-        )
-        get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-        return "Evaluation request added to queue! Check the leaderboard for updates."
-    except Exception as e:
-        print(f"Error in evaluate_and_update: {str(e)}")
-        print(f"Full traceback: {traceback.format_exc()}")
-        return f"Error adding evaluation request: {str(e)}"
-
+# API.delete_files(repo_id=QUEUE_REPO, token=TOKEN,delete_patterns=["*"],commit_message="Clearing queue",repo_type="dataset")
+# API.delete_files(repo_id=RESULTS_REPO, token=TOKEN,delete_patterns=["*"],commit_message="Clearing results",repo_type="dataset")
+# sys.exit(0)

 ### Space initialisation
 try:
     print(f"\n=== Starting space initialization ===")
+
     print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")
     print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
     print(f"QUEUE_REPO: {QUEUE_REPO}")
@@ -144,12 +114,8 @@ try:
 except Exception as e:
     print(f"\n=== Error during space initialization ===")
     print(f"Error: {str(e)}")
-    restart_space()
-
-
+    # restart_space()

-evaluator_thread = threading.Thread(target=run_evaluator, daemon=True)
-evaluator_thread.start()

 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
@@ -270,6 +236,6 @@ with demo:
 )

 scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=
+scheduler.add_job(restart_space, "interval", seconds=300)
 scheduler.start()
 demo.queue(default_concurrency_limit=40).launch()
pyproject.toml (CHANGED)

@@ -17,6 +17,7 @@ dependencies = [
     "matplotlib>=3.10.3",
     "numpy>=2.3.1",
     "pandas>=2.3.0",
+    "pydantic>=2.11.7",
     "python-dateutil>=2.9.0.post0",
     "python-dotenv>=1.1.1",
     "scikit-learn>=1.7.0",
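
The new `pydantic` pin is only added to the dependency list here; none of the files touched by this commit import it yet. Purely as an illustration of the "better data validation" the commit message mentions (this model and its defaults are hypothetical, loosely mirroring the request JSON fields used elsewhere in the repo), a queue entry could be validated like this:

```python
from datetime import datetime
from pydantic import BaseModel, field_validator

class EvalRequest(BaseModel):
    model: str
    revision: str = "main"
    precision: str
    weight_type: str
    status: str
    submitted_time: datetime  # pydantic parses ISO-8601 strings into datetimes

    @field_validator("status")
    @classmethod
    def _known_status(cls, v: str) -> str:
        # PENDING/RUNNING appear in the EvaluationStatus enum; the others are assumed here
        if v not in {"PENDING", "RUNNING", "FINISHED", "FAILED"}:
            raise ValueError(f"unknown status: {v}")
        return v

# e.g. EvalRequest.model_validate(entry_dict) would reject malformed queue
# entries before they ever reach the evaluator loop.
```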
src/envs.py (CHANGED)

@@ -9,7 +9,7 @@ TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 OWNER = "hamzabouajila" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------

-REPO_ID = f"{OWNER}/
+REPO_ID = f"{OWNER}/TunisianLLMLeaderBoard"
 QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"

src/evaluator/evaluate.py (CHANGED)

@@ -1,19 +1,16 @@
 import json
 import os
-import
-from typing import Dict
+from datetime import datetime,timedelta,timezone
+from typing import Dict
 from dataclasses import dataclass
 from enum import Enum
-from datetime import datetime
 import torch
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 import traceback

-
-from src.envs import API, OWNER, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, RESULTS_REPO, QUEUE_REPO
+from src.envs import API, OWNER, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, RESULTS_REPO, QUEUE_REPO,TOKEN
 from src.evaluator.tunisian_corpus_coverage import evaluate_tunisian_corpus_coverage
 from src.evaluator.tsac import evaluate_tsac_sentiment
-from huggingface_hub import snapshot_download


 class EvaluationStatus(Enum):
@@ -121,7 +118,34 @@ def evaluate_model(model_name: str, revision: str, precision: str, weight_type:
         error=error_msg
     )

+def reset_stale_running_eval(eval_entry, root, file_path, filename, timeout_interval=10):
+    submission = eval_entry.get("submitted_time")
+    try:
+        started = datetime.fromisoformat(submission) # aware datetime
+    except Exception as e:
+        print("Invalid submitted_time format:", submission, e)
+
+    now_utc = datetime.now(timezone.utc)

+    if now_utc - started > timedelta(seconds=timeout_interval):
+        print(f"Timeout detected — resetting {eval_entry['model']} to PENDING")
+        eval_entry["status"] = EvaluationStatus.PENDING.value
+        eval_entry["submitted_time"] = now_utc.isoformat()
+        with open(file_path, 'w') as f:
+            json.dump(eval_entry, f, indent=2)
+        API.upload_file(
+            path_or_fileobj=file_path,
+            path_in_repo=os.path.join(os.path.basename(root), filename),
+            repo_id=QUEUE_REPO,
+            repo_type="dataset",
+            commit_message=f"Update status to PENDING for {eval_entry['model']} (timeout)",
+            token=TOKEN
+        )
+        return
+
+
+
+
 def process_evaluation_queue():
     """
     Processes all pending evaluations in the queue.
@@ -130,22 +154,6 @@ def process_evaluation_queue():
     """
     print(f"\n=== Starting evaluation queue processing ===")
     print(f"Current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-
-    # --- NEW STEP: Download the latest queue from Hugging Face Hub ---
-    try:
-        print(f"Downloading evaluation requests from: {QUEUE_REPO}")
-        snapshot_download(
-            repo_id=QUEUE_REPO,
-            repo_type="dataset",
-            local_dir=EVAL_REQUESTS_PATH,
-            local_dir_use_symlinks=False,
-            token=API.token
-        )
-        print("Successfully downloaded evaluation queue.")
-    except Exception as e:
-        print(f"Error downloading evaluation queue: {str(e)}")
-        print(f"Full traceback: {traceback.format_exc()}")
-        return

     print(f"Looking for evaluation requests in: {EVAL_REQUESTS_PATH}")

@@ -163,7 +171,8 @@
             with open(file_path, 'r') as f:
                 eval_entry = json.load(f)

-            status = eval_entry.get('status', '')
+            status = eval_entry.get('status', '')
+

             if status == EvaluationStatus.PENDING.value:
                 print(f"Found pending evaluation for model: {eval_entry['model']}")
@@ -194,6 +203,7 @@
                     precision=eval_entry['precision'],
                     weight_type=eval_entry['weight_type']
                 )
+
                 print("\n=== Evaluation completed ===")

                 # --- Step 3: Update file with final status and results locally ---
@@ -238,7 +248,9 @@
                     print(f"Final status for {eval_entry['model']} updated in the queue repository.")
                 except Exception as status_update_error:
                     print(f"Error updating status in queue: {str(status_update_error)}")
-
+            elif status == EvaluationStatus.RUNNING.value:
+                print("Found Running evaluation for model: ", eval_entry['model'])
+                reset_stale_running_eval(eval_entry, root, file_path, filename)
             else:
                 print(f"Skipping file with status: {status}")
         except Exception as e:
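
One caveat with the staleness check above: `datetime.fromisoformat(...)` only yields an aware datetime when the stored string carries a UTC offset, and subtracting a naive datetime from `datetime.now(timezone.utc)` raises `TypeError`. A small defensive parse (not part of the commit, just a sketch) that tolerates both forms:

```python
from datetime import datetime, timezone

def parse_submitted_time(value: str) -> datetime:
    """Parse an ISO-8601 timestamp, assuming UTC when no offset is present."""
    dt = datetime.fromisoformat(value)
    return dt if dt.tzinfo is not None else dt.replace(tzinfo=timezone.utc)

# Both of these can now be compared against datetime.now(timezone.utc):
parse_submitted_time("2025-07-01T12:00:00")        # naive input, coerced to UTC
parse_submitted_time("2025-07-01T12:00:00+00:00")  # already aware
```

Since the reset path rewrites `submitted_time` with `now_utc.isoformat()`, re-queued entries already carry an offset; the guard matters for entries whose original timestamp was stored without one.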
src/evaluator/run_evaluator.py (CHANGED)

@@ -16,12 +16,12 @@ def main():
     while True:
         try:
             process_evaluation_queue()
-            print("Evaluation queue processed. Sleeping for
-            time.sleep(
+            print("Evaluation queue processed. Sleeping for 3 minutes...")
+            time.sleep(180) # Sleep for 3 minutes
         except Exception as e:
             print(f"Error in evaluation process: {e}")
-            print("Retrying in
-            time.sleep(
+            print("Retrying in 3 minutes...")
+            time.sleep(180)

 if __name__ == "__main__":
     main()
src/submission/submit.py (CHANGED)

@@ -61,11 +61,14 @@ def _create_eval_request(
     # Use a try-finally block to ensure the local file is always removed
     try:
         with open(local_path, 'w') as f:
+            print(request_data)
             json.dump(request_data, f, indent=2)

         # Upload the request file to the Hugging Face queue repository
         print(f"Uploading evaluation request to {QUEUE_REPO}")
         path_in_repo = os.path.join(user_name, request_filename)
+        print(path_in_repo)
+        print(local_path)
         API.upload_file(
             path_or_fileobj=local_path,
             path_in_repo=path_in_repo,
@@ -127,6 +130,7 @@ def add_new_eval(model: str, base_model: str, revision: str, precision: str, wei
         print(f"Error reading queue file: {e}")
         print(f"Full traceback:\n{traceback.format_exc()}")
         return styled_warning("Error checking model status. Please try again later.")
+
     print(f"No existing submission found for key: {model_key} or previous submission had a FAILED status.")

     # --- Step 2: Validate model type and existence on the Hub ---