Commit 742dfc3
Parent(s): 28e88f2
implement scripts for checking, add logging, update submission, and integrate evaluation
Files changed:
- app.py +39 -9
- pyproject.toml +1 -0
- scripts/check_model.py +27 -0
- scripts/explore_arabml.py +24 -0
- scripts/explore_dataset.py +24 -0
- scripts/explore_tsac.py +24 -0
- src/display/utils.py +22 -0
- src/evaluator/evaluate.py +321 -138
- src/leaderboard/read_evals.py +49 -9
- src/submission/check_validity.py +8 -6
- src/submission/submit.py +316 -177
app.py
CHANGED
@@ -1,3 +1,9 @@
+import os
+from dotenv import load_dotenv
+
+# Load environment variables from .env file
+load_dotenv()
+
 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -32,7 +38,32 @@ import time
 
 def restart_space():
+    try:
+        # Restart the space
+        API.restart_space(repo_id=REPO_ID)
+    except Exception as e:
+        print(f"Error restarting space: {str(e)}")
+        # If restart fails, try to download the datasets again
+        try:
+            print("Attempting to download datasets again...")
+            snapshot_download(
+                repo_id=QUEUE_REPO,
+                local_dir=EVAL_REQUESTS_PATH,
+                repo_type="dataset",
+                tqdm_class=None,
+                etag_timeout=30,
+                token=TOKEN
+            )
+            snapshot_download(
+                repo_id=RESULTS_REPO,
+                local_dir=EVAL_RESULTS_PATH,
+                repo_type="dataset",
+                tqdm_class=None,
+                etag_timeout=30,
+                token=TOKEN
+            )
+        except Exception as download_error:
+            print(f"Error downloading datasets: {str(download_error)}")
 
 ### Space initialisation
 try:
@@ -109,25 +140,24 @@ def init_leaderboard(dataframe)
 # Add model evaluation functionality
 def evaluate_and_update(model_name, revision, precision, weight_type):
-    """
+    """Add a model evaluation request to the queue"""
     try:
-        #
-        eval_result = evaluate_model(model_name, revision, precision, weight_type)
-        # Add evaluation to queue
+        # Add evaluation request to queue
         add_new_eval(
             model_name=model_name,
             revision=revision,
             precision=precision,
             weight_type=weight_type,
+            model_type="LLM",  # Add appropriate model type
         )
 
         # Update leaderboard
         LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-        return "Evaluation
+        return "Evaluation request added to queue! Check the leaderboard for updates."
     except Exception as e:
+        print(f"Error in evaluate_and_update: {str(e)}")
+        print(f"Full traceback: {traceback.format_exc()}")
+        return f"Error adding evaluation request: {str(e)}"
 
 
 demo = gr.Blocks(css=custom_css)
pyproject.toml
CHANGED
@@ -18,6 +18,7 @@ dependencies = [
     "numpy>=2.3.1",
     "pandas>=2.3.0",
     "python-dateutil>=2.9.0.post0",
+    "python-dotenv>=1.1.1",
     "scikit-learn>=1.7.0",
     "sentencepiece>=0.2.0",
     "tokenizers>=0.15.0",
scripts/check_model.py
ADDED
@@ -0,0 +1,27 @@
+from transformers import AutoConfig, AutoModelForSequenceClassification
+import torch
+
+def check_model(model_name):
+    try:
+        # Try to load the model configuration
+        config = AutoConfig.from_pretrained(model_name)
+        print("\nModel Configuration:")
+        print(config)
+
+        # Check if model_type is present
+        print("\nModel Type:", config.model_type if hasattr(config, 'model_type') else 'Not specified')
+
+        # Try to load the model
+        print("\nAttempting to load model...")
+        model = AutoModelForSequenceClassification.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16,
+            trust_remote_code=True
+        )
+        print("\nSuccessfully loaded model!")
+
+    except Exception as e:
+        print(f"\nError: {str(e)}")
+
+if __name__ == "__main__":
+    check_model("HabibBelguith44/Llama3-Tunisian-Dialect")
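The check can also be reused from other code; a minimal sketch, assuming the repository root is on PYTHONPATH (the model ID is simply the one hard-coded in the script's __main__ block):

# sketch only: calling the new script's helper directly
from scripts.check_model import check_model

# prints the configuration and model_type, then attempts to load the model as a sequence classifier
check_model("HabibBelguith44/Llama3-Tunisian-Dialect")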
scripts/explore_arabml.py
ADDED
@@ -0,0 +1,24 @@
+from datasets import load_dataset
+
+def explore_arabml():
+    # Load the ArabML dataset
+    dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="test")
+
+    # Print dataset info
+    print("\nDataset Info:")
+    print(dataset.info)
+
+    # Print first example
+    print("\nFirst Example:")
+    print(dataset[0])
+
+    # Print all column names
+    print("\nColumn Names:")
+    print(dataset.column_names)
+
+    # Print first few rows
+    print("\nFirst few rows:")
+    print(dataset[:3])
+
+if __name__ == "__main__":
+    explore_arabml()
scripts/explore_dataset.py
ADDED
@@ -0,0 +1,24 @@
+from datasets import load_dataset
+
+def explore_dataset():
+    # Load the dataset
+    dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="train")
+
+    # Print dataset info
+    print("\nDataset Info:")
+    print(dataset.info)
+
+    # Print first example
+    print("\nFirst Example:")
+    print(dataset[0])
+
+    # Print all column names
+    print("\nColumn Names:")
+    print(dataset.column_names)
+
+    # Print first few rows
+    print("\nFirst few rows:")
+    print(dataset[:3])
+
+if __name__ == "__main__":
+    explore_dataset()
scripts/explore_tsac.py
ADDED
@@ -0,0 +1,24 @@
+from datasets import load_dataset
+
+def explore_tsac():
+    # Load the TSAC dataset
+    dataset = load_dataset("fbougares/tsac", split="train", trust_remote_code=True)
+
+    # Print dataset info
+    print("\nDataset Info:")
+    print(dataset.info)
+
+    # Print first example
+    print("\nFirst Example:")
+    print(dataset[0])
+
+    # Print all column names
+    print("\nColumn Names:")
+    print(dataset.column_names)
+
+    # Print first few rows
+    print("\nFirst few rows:")
+    print(dataset[:3])
+
+if __name__ == "__main__":
+    explore_tsac()
src/display/utils.py
CHANGED
@@ -86,6 +86,28 @@ class WeightType(Enum):
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+    @staticmethod
+    def from_str(weight_type):
+        """Convert string representation to WeightType enum value.
+
+        Args:
+            weight_type (str): The string representation of the weight type
+
+        Returns:
+            WeightType: The corresponding enum value
+
+        Raises:
+            ValueError: If the weight type is not recognized
+        """
+        weight_type = str(weight_type).lower()
+        if weight_type == "adapter":
+            return WeightType.Adapter
+        elif weight_type == "original":
+            return WeightType.Original
+        elif weight_type == "delta":
+            return WeightType.Delta
+        raise ValueError(f"Unknown weight type: {weight_type}")
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
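A quick usage sketch of the new helper (illustrative only; from_str lower-cases its argument, so matching is case-insensitive):

from src.display.utils import WeightType

assert WeightType.from_str("Adapter") is WeightType.Adapter
assert WeightType.from_str("original") is WeightType.Original

try:
    WeightType.from_str("merged")  # not one of adapter / original / delta
except ValueError as err:
    print(err)  # Unknown weight type: merged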
src/evaluator/evaluate.py
CHANGED
@@ -5,7 +5,7 @@ from dataclasses import dataclass
 from enum import Enum
 from datetime import datetime
 import torch
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
 from datasets import load_dataset
 
 from src.envs import API, OWNER, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
@@ -29,94 +29,252 @@ class EvaluationResult:
 def evaluate_tsac_sentiment(model, tokenizer, device):
     """Evaluate model on TSAC sentiment analysis task"""
     try:
+        print("\n=== Starting TSAC sentiment evaluation ===")
+        print(f"Current device: {device}")
+
+        # Load and preprocess dataset
+        print("\nLoading and preprocessing TSAC dataset...")
+        dataset = load_dataset("fbougares/tsac", split="test", trust_remote_code=True)
+        print(f"Dataset size: {len(dataset)} examples")
 
         def preprocess(examples):
+            print(f"\nProcessing batch of {len(examples['sentence'])} examples")
+            # Use 'sentence' field as per dataset structure
+            return tokenizer(
+                examples['sentence'],
+                padding=True,
+                truncation=True,
+                max_length=512,
+                return_tensors='pt'
+            )
 
         dataset = dataset.map(preprocess, batched=True)
-        dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', '
+        dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'target'])
+
+        # Check first example
+        first_example = dataset[0]
+        print("\nFirst example details:")
+        print(f"Input IDs shape: {first_example['input_ids'].shape}")
+        print(f"Attention mask shape: {first_example['attention_mask'].shape}")
+        print(f"Target: {first_example['target']}")
 
         model.eval()
+        print(f"\nModel class: {model.__class__.__name__}")
+        print(f"Model device: {next(model.parameters()).device}")
+
         with torch.no_grad():
             predictions = []
+            targets = []
 
-            for batch in dataset:
+            for i, batch in enumerate(dataset):
+                if i == 0:
+                    print("\nProcessing first batch...")
+                    print(f"Batch keys: {list(batch.keys())}")
+                    print(f"Target shape: {batch['target'].shape}")
+
+                inputs = {k: v.to(device) for k, v in batch.items() if k != 'target'}
+                target = batch['target'].to(device)
 
                 outputs = model(**inputs)
+                print(f"\nBatch {i} output type: {type(outputs)}")
+
+                # Handle different model output formats
+                if isinstance(outputs, dict):
+                    print(f"Output keys: {list(outputs.keys())}")
+                    if 'logits' in outputs:
+                        logits = outputs['logits']
+                    elif 'prediction_logits' in outputs:
+                        logits = outputs['prediction_logits']
+                    else:
+                        raise ValueError(f"Unknown output format. Available keys: {list(outputs.keys())}")
+                elif isinstance(outputs, tuple):
+                    print(f"Output tuple length: {len(outputs)}")
+                    logits = outputs[0]
+                else:
+                    logits = outputs
+
+                print(f"Logits shape: {logits.shape}")
+
+                # For sequence classification, we typically use the [CLS] token's prediction
+                if len(logits.shape) == 3:  # [batch_size, sequence_length, num_classes]
+                    logits = logits[:, 0, :]  # Take the [CLS] token prediction
+
+                print(f"Final logits shape: {logits.shape}")
+
+                batch_predictions = logits.argmax(dim=-1).cpu().tolist()
+                batch_targets = target.cpu().tolist()
+
+                predictions.extend(batch_predictions)
+                targets.extend(batch_targets)
+
+                if i == 0:
+                    print("\nFirst batch predictions:")
+                    print(f"Predictions: {batch_predictions[:5]}")
+                    print(f"Targets: {batch_targets[:5]}")
+
+        print(f"\nTotal predictions: {len(predictions)}")
+        print(f"Total targets: {len(targets)}")
+
+        # Calculate accuracy
+        correct = sum(p == t for p, t in zip(predictions, targets))
+        total = len(predictions)
+        accuracy = correct / total if total > 0 else 0.0
+
+        print(f"\nEvaluation results:")
+        print(f"Correct predictions: {correct}")
+        print(f"Total predictions: {total}")
+        print(f"Accuracy: {accuracy:.4f}")
+
+        return {"accuracy": accuracy}
     except Exception as e:
-        print(f"Error in TSAC evaluation: {str(e)}")
+        print(f"\n=== Error in TSAC evaluation: {str(e)} ===")
+        print(f"Full traceback: {traceback.format_exc()}")
+        raise e
 
-def evaluate_tunisian_corpus_coverage(model, tokenizer):
+def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
     """Evaluate model's coverage on Tunisian Dialect Corpus"""
     try:
         dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="train")
 
         def preprocess(examples):
+            print("Tunisian Corpus preprocess exemples -------------", examples)
+            # Use 'Tweet' field as per dataset structure
+            return tokenizer(examples['Tweet'], padding=True, truncation=True, max_length=512)
 
         dataset = dataset.map(preprocess, batched=True)
 
-        # Calculate coverage
+        # Calculate token coverage
        total_tokens = 0
         covered_tokens = 0
 
         for example in dataset:
-            tokens = tokenizer.tokenize(example['
+            tokens = tokenizer.tokenize(example['Tweet'])
             total_tokens += len(tokens)
             covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
 
         coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
+        print(f"Tunisian Corpus Coverage: {coverage:.2%}")
+        return {"coverage": coverage}
     except Exception as e:
         print(f"Error in Tunisian Corpus evaluation: {str(e)}")
+        raise e  # Raise the error instead of returning 0.0
 
 def evaluate_model(model_name: str, revision: str, precision: str, weight_type: str) -> EvaluationResult:
     """Evaluate a single model on all tasks"""
     try:
-        print(f"
-        model = AutoModelForSequenceClassification.from_pretrained(
-            model_name,
-            revision=revision,
-            torch_dtype=getattr(torch, precision),
-            trust_remote_code=True
-        ).to(device)
-
-        tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision)
-
-        #
-        # TSAC Sentiment
-        tsac_result = evaluate_tsac_sentiment(model, tokenizer, device)
-        results[Tasks.tsac_sentiment.value.benchmark] = tsac_result
-
-        # Tunisian Corpus Coverage
-        corpus_result = evaluate_tunisian_corpus_coverage(model, tokenizer)
-        results[Tasks.tunisian_corpus.value.benchmark] = corpus_result
-
-        return EvaluationResult(
-            model=model_name,
-            revision=revision,
-            precision=precision,
-            weight_type=weight_type,
-            results=results
-        )
+        print(f"\nStarting evaluation for model: {model_name} (revision: {revision}, precision: {precision}, weight_type: {weight_type})")
+        print(f"Current working directory: {os.getcwd()}")
+        print(f"Evaluation requests path: {EVAL_REQUESTS_PATH}")
+        print(f"Evaluation results path: {EVAL_RESULTS_PATH}")
+
+        # Initialize device
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        print(f"Using device: {device}")
+
+        # Load model and tokenizer with enhanced error handling
+        try:
+            print(f"\nLoading model: {model_name}")
+            print(f"Model path exists: {os.path.exists(model_name)}")
+
+            # First try to load the config to check model type
+            try:
+                config = AutoConfig.from_pretrained(model_name, revision=revision)
+                print(f"Model type from config: {config.model_type}")
+            except Exception as config_error:
+                print(f"Error loading config: {str(config_error)}")
+
+            # Try loading with trust_remote_code=True first
+            try:
+                print("\nAttempting to load with trust_remote_code=True...")
+                model = AutoModelForSequenceClassification.from_pretrained(
+                    model_name,
+                    revision=revision,
+                    torch_dtype=getattr(torch, precision),
+                    trust_remote_code=True
+                ).to(device)
+                print(f"Successfully loaded model {model_name} with trust_remote_code=True")
+                print(f"Model class: {model.__class__.__name__}")
+            except Exception as e1:
+                print(f"Error loading with trust_remote_code=True: {str(e1)}")
+                print(f"Error type: {type(e1).__name__}")
+
+                # If it's a model type error, try with llama as model type
+                if "Unrecognized model" in str(e1) and "llama" in model_name.lower():
+                    print("\nAttempting to load as llama model...")
+                    try:
+                        model = AutoModelForSequenceClassification.from_pretrained(
+                            model_name,
+                            revision=revision,
+                            torch_dtype=getattr(torch, precision),
+                            trust_remote_code=True,
+                            model_type="llama"
+                        ).to(device)
+                        print(f"Successfully loaded model {model_name} as llama model")
+                        print(f"Model class: {model.__class__.__name__}")
+                    except Exception as e2:
+                        print(f"Error loading as llama model: {str(e2)}")
+                        print(f"Error type: {type(e2).__name__}")
+                        raise Exception(f"Failed to load model with both methods: {str(e1)}, {str(e2)}")
+                else:
+                    raise e1
+
+            print(f"\nLoading tokenizer: {model_name}")
+            try:
+                tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision)
+                print(f"Successfully loaded tokenizer for {model_name}")
+                print(f"Tokenizer class: {tokenizer.__class__.__name__}")
+            except Exception as e:
+                print(f"Error loading tokenizer: {str(e)}")
+                print(f"Error type: {type(e).__name__}")
+                raise Exception(f"Failed to load tokenizer: {str(e)}")
+
+            # Run evaluations
+            print("\nStarting TSAC sentiment evaluation...")
+            try:
+                tsac_results = evaluate_tsac_sentiment(model, tokenizer, device)
+                print(f"TSAC results: {tsac_results}")
+            except Exception as e:
+                print(f"Error in TSAC evaluation for {model_name}: {str(e)}")
+                print(f"Error type: {type(e).__name__}")
+                tsac_results = {"accuracy": None}
+
+            print("\nStarting Tunisian Corpus evaluation...")
+            try:
+                tunisian_results = evaluate_tunisian_corpus_coverage(model, tokenizer, device)
+                print(f"Tunisian Corpus results: {tunisian_results}")
+            except Exception as e:
+                print(f"Error in Tunisian Corpus evaluation for {model_name}: {str(e)}")
+                print(f"Error type: {type(e).__name__}")
+                tunisian_results = {"coverage": None}
+
+            print("\nEvaluation completed successfully!")
+            print(f"Final results: {tsac_results} | {tunisian_results}")
+            return EvaluationResult(
+                model=model_name,
+                revision=revision,
+                precision=precision,
+                weight_type=weight_type,
+                results={
+                    **tsac_results,
+                    **tunisian_results
+                }
+            )
+        except Exception as e:
+            print(f"\nError loading model {model_name}: {str(e)}")
+            print(f"Error type: {type(e).__name__}")
+            print(f"Full traceback: {traceback.format_exc()}")
+            return EvaluationResult(
+                model=model_name,
+                revision=revision,
+                precision=precision,
+                weight_type=weight_type,
+                results={},
+                error=str(e)
+            )
     except Exception as e:
+        print(f"\nError evaluating model {model_name}: {str(e)}")
+        print(f"Error type: {type(e).__name__}")
+        print(f"Full traceback: {traceback.format_exc()}")
         return EvaluationResult(
             model=model_name,
             revision=revision,
@@ -128,99 +286,124 @@ def evaluate_model(model_name: str, revision: str, precision: str, weight_type:
 
 def process_evaluation_queue():
     """Process all pending evaluations in the queue"""
+    print(f"\n=== Starting evaluation queue processing ===")
+    print(f"Current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print(f"Looking for evaluation requests in: {EVAL_REQUESTS_PATH}")
+
+    # Get all pending evaluations
+    if not os.path.exists(EVAL_REQUESTS_PATH):
+        print(f"Evaluation requests path does not exist: {EVAL_REQUESTS_PATH}")
+        return
+
     pending_files = []
-    for
+    for file in os.listdir(EVAL_REQUESTS_PATH):
+        if file.endswith('.json'):
+            pending_files.append(os.path.join(EVAL_REQUESTS_PATH, file))
 
+    print(f"Found {len(pending_files)} pending evaluation requests")
+    for file_path in pending_files:
+        print(f"  - {file_path}")
+
+    if not pending_files:
+        print("No pending evaluation requests found")
+        return
 
     for file_path in pending_files:
-        eval_request
-        eval_request
-            user_dir = os.path.join(EVAL_RESULTS_PATH, username)
-            os.makedirs(user_dir, exist_ok=True)
-            result_file = os.path.join(user_dir, result_filename)
-        else:
-            result_file = os.path.join(EVAL_RESULTS_PATH, result_filename)
-
-        # First, update the request file with the results
-        request_file = os.path.join(os.path.dirname(file_path), os.path.basename(file_path))
-        with open(file_path, 'r') as f:
-            request_data = json.load(f)
-
-        # Update request file with results and status
-        request_data['results'] = result.results
-        request_data['status'] = EvaluationStatus.FINISHED.value
-
-        with open(file_path, 'w') as f:
-            json.dump(request_data, f, indent=2)
-
-        # Now create the results file
-        with open(result_file, 'w') as f:
-            json.dump({
-                'model': result.model,
-                'revision': result.revision,
-                'precision': result.precision,
-                'weight_type': result.weight_type,
-                'results': result.results,
-                'config': {
-                    'model_name': result.model,
-                    'model_dtype': result.precision,
-                    'model_type': result.weight_type,
-                    'architecture': 'Unknown',
-                    'license': request_data.get('license', '?'),
-                    'likes': request_data.get('likes', 0),
-                    'num_params': request_data.get('params', 0),
-                    'date': request_data.get('submitted_time', datetime.now().strftime('%Y-%m-%d')),
-                    'still_on_hub': True
-                }
-            }, f, indent=2)
+        try:
+            print(f"\n=== Processing evaluation request: {file_path} ===")
+
+            # Read the file atomically
+            try:
+                with open(file_path, 'r') as f:
+                    eval_request = json.load(f)
+                print(f"Loaded evaluation request: {json.dumps(eval_request, indent=2)}")
+            except Exception as e:
+                print(f"Error reading evaluation request: {str(e)}")
+                continue
+
+            # Skip non-pending evaluations
+            status = eval_request.get('status', 'UNKNOWN')
+            if status != EvaluationStatus.PENDING.value:
+                print(f"Skipping non-pending evaluation (status: {status})")
+                continue
+
+            # Update status to RUNNING
+            eval_request['status'] = EvaluationStatus.RUNNING.value
+            print(f"Updating status to RUNNING for {eval_request['model']}")
+
+            # Write the update atomically
+            try:
+                with open(file_path, 'w') as f:
+                    json.dump(eval_request, f, indent=2)
+                print("Successfully updated status to RUNNING")
+            except Exception as e:
+                print(f"Error updating status: {str(e)}")
+                continue
+
+            # Get model info from request
+            model_name = eval_request.get('model', '')
+            revision = eval_request.get('revision', '')
+            precision = eval_request.get('precision', '')
+            weight_type = eval_request.get('weight_type', '')
+
+            if not model_name:
+                print("Error: Missing model name in evaluation request")
+                continue
+
+            print(f"\n=== Evaluating model: {model_name} ===")
+            print(f"Revision: {revision}")
+            print(f"Precision: {precision}")
+            print(f"Weight type: {weight_type}")
 
+            result = evaluate_model(model_name, revision, precision, weight_type)
+
+            # Update status and save results
+            if result.error:
+                print(f"\n=== Evaluation failed ===")
+                print(f"Error: {result.error}")
+                eval_request['status'] = EvaluationStatus.FAILED.value
+                eval_request['error'] = result.error
+            else:
+                print(f"\n=== Evaluation completed successfully ===")
+                print(f"Results: {result.results}")
+                eval_request['status'] = EvaluationStatus.FINISHED.value
+                eval_request['results'] = result.results
+
+            # Write the final update atomically
+            try:
+                with open(file_path, 'w') as f:
+                    json.dump(eval_request, f, indent=2)
+                print("Successfully saved evaluation results")
+            except Exception as e:
+                print(f"Error saving evaluation results: {str(e)}")
+                continue
+
+            # Move successful evaluations to results directory
+            if eval_request['status'] == EvaluationStatus.FINISHED.value:
+                try:
+                    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
+                    result_file = os.path.join(EVAL_RESULTS_PATH, os.path.basename(file_path))
+                    os.rename(file_path, result_file)
+                    print(f"Moved evaluation results to: {result_file}")
+                except Exception as e:
+                    print(f"Error moving results file: {str(e)}")
+
+        except Exception as e:
+            print(f"\n=== Error processing evaluation: {str(e)} ===")
+            print(f"Full traceback: {traceback.format_exc()}")
+            continue
+
     # Upload to Hugging Face
+    try:
+        if 'result_file' in locals():
+            API.upload_file(
+                path_or_fileobj=result_file,
+                path_in_repo=result_filename if not username else os.path.join(username, result_filename),
+                repo_id=f"{OWNER}/results",
+                repo_type="dataset",
+                commit_message=f"Add evaluation results for {result.model}"
+            )
+            print("Successfully uploaded results to Hugging Face")
+    except Exception as e:
+        print(f"Error uploading results to Hugging Face: {str(e)}")
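process_evaluation_queue() is a plain function and nothing shown in this commit schedules it; a minimal sketch of one way to run it periodically with the BackgroundScheduler that app.py already imports (the 15-minute interval is an assumption, not part of this commit):

# sketch only: polling the request queue in the background
from apscheduler.schedulers.background import BackgroundScheduler

from src.evaluator.evaluate import process_evaluation_queue

scheduler = BackgroundScheduler()
scheduler.add_job(process_evaluation_queue, "interval", minutes=15)  # interval is an assumption
scheduler.start()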
src/leaderboard/read_evals.py
CHANGED
@@ -35,11 +35,37 @@ class EvalResult:
     @classmethod
     def init_from_json_file(self, json_filepath):
         """Inits the result from the specific model result file"""
+        try:
+            with open(json_filepath) as fp:
+                data = json.load(fp)
+
+            # Get model info
+            model_name = data.get('model')
+            org_and_model = model_name.split("/", 1)
+            org = org_and_model[0]
+            model = org_and_model[1]
+
+            # Get results
+            results = data.get('results', {})
+            precision = Precision.from_str(data.get('precision', 'Unknown'))
+
+            # Create EvalResult
+            return EvalResult(
+                eval_name=f"{org}_{model}_{precision.value}",
+                full_model=model_name,
+                org=org,
+                model=model,
+                revision=data.get('revision', ''),
+                results=results,
+                precision=precision,
+                model_type=ModelType.from_str(data.get('model_type', 'Unknown')),
+                weight_type=WeightType.from_str(data.get('weight_type', 'Original')),
+                date=data.get('submitted_at', ''),
+                still_on_hub=is_model_on_hub(model_name)
+            )
+        except Exception as e:
+            print(f"Error reading evaluation file {json_filepath}: {str(e)}")
+            return None
 
         # Precision
         precision = Precision.from_str(config.get("model_dtype"))
@@ -71,7 +97,7 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
-
+
             # We average all scores of a given metric (not all metrics are present in all files)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
@@ -167,9 +193,23 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
 
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
+        try:
+            # Creation of result
+            eval_result = EvalResult.init_from_json_file(model_result_filepath)
+            if eval_result is None:
+                print(f"Skipping invalid evaluation file: {model_result_filepath}")
+                continue
+
+            eval_result.update_with_request_file(requests_path)
+
+            # Store results of same eval together
+            if eval_result.eval_name not in eval_results:
+                eval_results[eval_result.eval_name] = []
+            eval_results[eval_result.eval_name].append(eval_result)
+
+        except Exception as e:
+            print(f"Error processing evaluation file {model_result_filepath}: {str(e)}")
+            continue
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
src/submission/check_validity.py
CHANGED
@@ -74,10 +74,10 @@ def get_model_arch(model_info: ModelInfo):
     """Gets the model architecture from the configuration"""
     return model_info.config.get("architectures", "Unknown")
 
-def already_submitted_models(requested_models_dir: str) ->
-    """Gather a
+def already_submitted_models(requested_models_dir: str) -> dict:
+    """Gather a mapping of submitted models to their queue files to avoid duplicates"""
     depth = 1
+    requested_models = {}
     users_to_submission_dates = defaultdict(list)
 
     for root, _, files in os.walk(requested_models_dir):
@@ -86,9 +86,11 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
         for file in files:
             if not file.endswith(".json"):
                 continue
+            queue_file = os.path.join(root, file)
+            with open(queue_file, "r") as f:
                 info = json.load(f)
+            model_key = f"{info['model']}_{info['revision']}_{info['precision']}"
+            requested_models[model_key] = queue_file
 
             # Select organisation
             if info["model"].count("/") == 0 or "submitted_time" not in info:
@@ -96,4 +98,4 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
             organisation, _ = info["model"].split("/")
             users_to_submission_dates[organisation].append(info["submitted_time"])
 
-    return
+    return requested_models, users_to_submission_dates
CHANGED
@@ -20,6 +20,58 @@ import time
|
|
20 |
REQUESTED_MODELS = None
|
21 |
USERS_TO_SUBMISSION_DATES = None
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
def add_new_eval(
|
24 |
model: str,
|
25 |
base_model: str,
|
@@ -28,144 +80,293 @@ def add_new_eval(
|
|
28 |
weight_type: str,
|
29 |
model_type: str,
|
30 |
):
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
|
|
|
|
|
35 |
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
|
|
|
|
41 |
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
-
|
46 |
-
|
|
|
|
|
47 |
|
48 |
-
|
49 |
-
|
50 |
-
|
|
|
51 |
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
|
|
|
|
|
|
68 |
|
69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
"
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
tokenizer = AutoTokenizer.from_pretrained(model, revision=revision)
|
137 |
-
|
138 |
-
# Evaluate on TSAC
|
139 |
-
print("Evaluating on TSAC sentiment analysis...")
|
140 |
-
tsac_dataset = load_dataset("fbougares/tsac", split="test")
|
141 |
-
|
142 |
-
def preprocess_tsac(examples):
|
143 |
-
return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
|
144 |
|
145 |
-
|
146 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
|
148 |
model_obj.eval()
|
149 |
with torch.no_grad():
|
150 |
predictions = []
|
151 |
-
|
152 |
|
153 |
-
for batch in
|
154 |
-
inputs = {k: v.to(device) for k, v in batch.items() if k != '
|
155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
|
157 |
outputs = model_obj(**inputs)
|
158 |
-
|
159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
160 |
|
161 |
-
|
162 |
|
|
|
|
|
|
|
|
|
163 |
# Evaluate on ArabML
|
164 |
print("Evaluating on ArabML Tunisian Corpus...")
|
165 |
-
arabml_dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="
|
166 |
|
167 |
def preprocess_arabml(examples):
|
168 |
-
return tokenizer(examples['
|
169 |
|
170 |
arabml_dataset = arabml_dataset.map(preprocess_arabml, batched=True)
|
171 |
|
@@ -173,72 +374,10 @@ def add_new_eval(
|
|
173 |
covered_tokens = 0
|
174 |
|
175 |
for example in arabml_dataset:
|
176 |
-
tokens = tokenizer.tokenize(example['
|
177 |
total_tokens += len(tokens)
|
178 |
covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
|
179 |
|
180 |
arabml_coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
|
181 |
|
182 |
# Store results
|
183 |
-
eval_results = {
|
184 |
-
Tasks.tsac_sentiment.value.benchmark: tsac_accuracy,
|
185 |
-
Tasks.tunisian_corpus.value.benchmark: arabml_coverage
|
186 |
-
}
|
187 |
-
|
188 |
-
print(f"Evaluation results: {eval_results}")
|
189 |
-
|
190 |
-
# Update eval_entry with results
|
191 |
-
eval_entry["status"] = EvaluationStatus.FINISHED.value
|
192 |
-
eval_entry["results"] = eval_results
|
193 |
-
|
194 |
-
# Save to results dataset
|
195 |
-
results_file = os.path.join(EVAL_RESULTS_PATH, f"{model}_{revision}_{precision}_{weight_type}.json")
|
196 |
-
with open(results_file, 'w') as f:
|
197 |
-
json.dump({
|
198 |
-
'model': model,
|
199 |
-
'revision': revision,
|
200 |
-
'precision': precision,
|
201 |
-
'weight_type': weight_type,
|
202 |
-
'results': eval_results
|
203 |
-
}, f, indent=2)
|
204 |
-
|
205 |
-
# Upload results to Hugging Face
|
206 |
-
API.upload_file(
|
207 |
-
path_or_fileobj=results_file,
|
208 |
-
path_in_repo=os.path.basename(results_file),
|
209 |
-
repo_id=RESULTS_REPO,
|
210 |
-
repo_type="dataset",
|
211 |
-
commit_message=f"Add evaluation results for {model}"
|
212 |
-
)
|
213 |
-
|
214 |
-
# Remove the original eval request file
|
215 |
-
os.remove(out_path)
|
216 |
-
|
217 |
-
return styled_message(
|
218 |
-
f"Model evaluation completed!\n\n"
|
219 |
-
f"TSAC Sentiment Accuracy: {tsac_accuracy:.2%}\n"
|
220 |
-
f"ArabML Corpus Coverage: {arabml_coverage:.2%}"
|
221 |
-
)
|
222 |
-
|
223 |
-
except Exception as e:
|
224 |
-
print(f"Error during evaluation: {str(e)}")
|
225 |
-
eval_entry["status"] = EvaluationStatus.FAILED.value
|
226 |
-
eval_entry["error"] = str(e)
|
227 |
-
|
228 |
-
with open(out_path, "w") as f:
|
229 |
-
f.write(json.dumps(eval_entry))
|
230 |
-
|
231 |
-
API.upload_file(
|
232 |
-
path_or_fileobj=out_path,
|
233 |
-
path_in_repo=out_path.split("eval-queue/")[1],
|
234 |
-
repo_id=QUEUE_REPO,
|
235 |
-
repo_type="dataset",
|
236 |
-
commit_message=f"Add {model} evaluation error",
|
237 |
-
)
|
238 |
-
|
239 |
-
os.remove(out_path)
|
240 |
-
|
241 |
-
return styled_error(
|
242 |
-
f"Error during evaluation: {str(e)}\n\n"
|
243 |
-
"The evaluation will be retried automatically later."
|
244 |
-
)
|
|
|
20 |
REQUESTED_MODELS = None
|
21 |
USERS_TO_SUBMISSION_DATES = None
|
22 |
|
23 |
+
def create_eval_request(
|
24 |
+
model: str,
|
25 |
+
base_model: str,
|
26 |
+
revision: str,
|
27 |
+
precision: str,
|
28 |
+
weight_type: str,
|
29 |
+
model_type: str,
|
30 |
+
):
|
31 |
+
"""Create and upload an evaluation request"""
|
32 |
+
try:
|
33 |
+
# Create evaluation request file
|
34 |
+
request_data = {
|
35 |
+
'model': model,
|
36 |
+
'base_model': base_model,
|
37 |
+
'revision': revision,
|
38 |
+
'precision': precision,
|
39 |
+
'weight_type': weight_type,
|
40 |
+
'model_type': model_type,
|
41 |
+
'status': EvaluationStatus.PENDING.value,
|
42 |
+
'submitted_time': datetime.now(timezone.utc).isoformat()
|
43 |
+
}
|
44 |
+
|
45 |
+
# Create filename
|
46 |
+
username = model.split('/')[0] if '/' in model else None
|
47 |
+
request_filename = f"{username or 'unknown'}_{model.replace('/', '_')}_eval_request_{revision}_{precision}_{weight_type}.json"
|
48 |
+
request_path = os.path.join(EVAL_REQUESTS_PATH, request_filename)
|
49 |
+
|
50 |
+
# Write request file
|
51 |
+
with open(request_path, 'w') as f:
|
52 |
+
json.dump(request_data, f, indent=2)
|
53 |
+
|
54 |
+
print(f"Created evaluation request: {request_filename}")
|
55 |
+
|
56 |
+
# Upload to Hugging Face
|
57 |
+
API.upload_file(
|
58 |
+
path_or_fileobj=request_path,
|
59 |
+
path_in_repo=request_filename if not username else os.path.join(username, request_filename),
|
60 |
+
repo_id=QUEUE_REPO,
|
61 |
+
repo_type="dataset",
|
62 |
+
commit_message=f"Add evaluation request for {model}",
|
63 |
+
token=TOKEN
|
64 |
+
)
|
65 |
+
|
66 |
+
print(f"Uploaded evaluation request to {QUEUE_REPO}")
|
67 |
+
|
68 |
+
return styled_message(
|
69 |
+
"Evaluation request created! Please wait for the evaluation to complete."
|
70 |
+
)
|
71 |
+
except Exception as e:
|
72 |
+
print(f"Error creating evaluation request: {str(e)}")
|
73 |
+
return styled_error(f"Failed to create evaluation request: {str(e)}")
|
74 |
+
|
75 |
def add_new_eval(
|
76 |
model: str,
|
77 |
base_model: str,
|
|
|
80 |
weight_type: str,
|
81 |
model_type: str,
|
82 |
):
|
83 |
+
"""Validate model and create evaluation request"""
|
84 |
+
try:
|
85 |
+
print("\n=== Starting evaluation submission ===")
|
86 |
+
print(f"Submission time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC")
|
87 |
+
print(f"Model: {model}")
|
88 |
+
print(f"Base model: {base_model}")
|
89 |
+
print(f"Revision: {revision}")
|
90 |
+
print(f"Precision: {precision}")
|
91 |
+
print(f"Weight type: {weight_type}")
|
92 |
+
print(f"Model type: {model_type}")
|
93 |
+
print(f"Evaluation requests path: {EVAL_REQUESTS_PATH}")
|
94 |
+
print(f"Queue repo: {QUEUE_REPO}")
|
95 |
+
|
96 |
+
# Always refresh the cache before checking for duplicates
|
97 |
+
print("\n=== Checking for duplicate submissions ===")
|
98 |
+
global REQUESTED_MODELS
|
99 |
+
global USERS_TO_SUBMISSION_DATES
|
100 |
+
start_time = time.time()
|
101 |
REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
|
102 |
+
print(f"Cache refresh completed in {time.time() - start_time:.2f} seconds")
|
103 |
+
print(f"Found {len(REQUESTED_MODELS)} existing submissions")
|
104 |
|
105 |
+
user_name = ""
|
106 |
+
model_path = model
|
107 |
+
if "/" in model:
|
108 |
+
user_name = model.split("/")[0]
|
109 |
+
model_path = model.split("/")[1]
|
110 |
+
print(f"\nUser name: {user_name}")
|
111 |
+
print(f"Model path: {model_path}")
|
112 |
|
113 |
+
precision = precision.split(" ")[0]
|
114 |
+
if revision == "":
|
115 |
+
revision = "main"
|
116 |
+
print("Using default revision: main")
|
117 |
+
|
118 |
+
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
119 |
+
|
120 |
+
# Check if model is already submitted
|
121 |
+
print("\n=== Checking for existing submission ===")
|
122 |
+
model_key = f"{model}_{revision}_{precision}"
|
123 |
+
if model_key in REQUESTED_MODELS:
|
124 |
+
print(f"Found existing submission with key: {model_key}")
|
125 |
+
# Get the status from the queue file
|
126 |
+
queue_file = REQUESTED_MODELS[model_key]
|
127 |
+
try:
|
128 |
+
with open(queue_file, 'r') as f:
|
129 |
+
queue_entry = json.load(f)
|
130 |
+
status = queue_entry.get('status')
|
131 |
+
print(f"Found existing submission with status: {status}")
|
132 |
+
if status is None:
|
133 |
+
print(f"Warning: No status found in queue file {queue_file}")
|
134 |
+
return styled_warning("Error checking model status. Please try again later.")
|
135 |
+
|
136 |
+
if status != EvaluationStatus.FAILED.value:
|
137 |
+
print(f"Model already submitted and in {status} status")
|
138 |
+
return styled_warning(f"This model has been already submitted and is in {status} status.")
|
139 |
+
except Exception as e:
|
140 |
+
print(f"Error reading queue file: {e}")
|
141 |
+
print(f"Full traceback: {traceback.format_exc()}")
|
142 |
+
return styled_warning("Error checking model status. Please try again later.")
|
143 |
+
except Exception as e:
|
144 |
+
print(f"Error during evaluation: {str(e)}")
|
145 |
+
raise
|
146 |
|
147 |
+
print("\n=== Validating model type ===")
|
148 |
+
if model_type is None or model_type == "":
|
149 |
+
print("Error: Model type is missing")
|
150 |
+
return styled_error("Please select a model type.")
|
151 |
|
152 |
+
print("\n=== Validating model existence ===")
|
153 |
+
if revision == "":
|
154 |
+
revision = "main"
|
155 |
+
print("Using default revision: main")
|
156 |
|
157 |
+
print("\n=== Validating model on Hugging Face ===")
|
158 |
+
try:
|
159 |
+
if weight_type in ["Delta", "Adapter"]:
|
160 |
+
print(f"Checking base model {base_model} on Hugging Face...")
|
161 |
+
base_model_on_hub, error, _ = is_model_on_hub(
|
162 |
+
model_name=base_model,
|
163 |
+
revision=revision,
|
164 |
+
token=TOKEN,
|
165 |
+
test_tokenizer=True
|
166 |
+
)
|
167 |
+
print(f"Base model check result: {base_model_on_hub}")
|
168 |
+
if not base_model_on_hub:
|
169 |
+
print(f"Error: Base model not found: {error}")
|
170 |
+
return styled_error(f'Base model "{base_model}" {error}')
|
171 |
|
172 |
+
if not weight_type == "Adapter":
|
173 |
+
print(f"Checking model {model} on Hugging Face...")
|
174 |
+
model_on_hub, error, _ = is_model_on_hub(
|
175 |
+
model_name=model,
|
176 |
+
revision=revision,
|
177 |
+
token=TOKEN,
|
178 |
+
test_tokenizer=True
|
179 |
+
)
|
180 |
+
print(f"Model check result: {model_on_hub}")
|
181 |
+
if not model_on_hub:
|
182 |
+
print(f"Error: Model not found: {error}")
|
183 |
+
return styled_error(f'Model "{model}" {error}')
|
184 |
+
except Exception as e:
|
185 |
+
print(f"Error checking model on Hugging Face: {e}")
|
186 |
+
print(f"Full traceback: {traceback.format_exc()}")
|
187 |
+
return styled_error(f"Failed to validate model on Hugging Face: {str(e)}")
|
188 |
|
189 |
+
print("\n=== Getting model info ===")
|
190 |
+
try:
|
191 |
+
model_info = API.model_info(repo_id=model, revision=revision)
|
192 |
+
print(f"Successfully retrieved model info for {model}")
|
193 |
+
except Exception as e:
|
194 |
+
print(f"Error getting model info: {e}")
|
195 |
+
print(f"Full traceback: {traceback.format_exc()}")
|
196 |
+
return styled_error("Could not get your model information. Please fill it up properly.")
|
197 |
|
198 |
+
print("\n=== Getting model size ===")
|
199 |
+
try:
|
200 |
+
model_size = get_model_size(model_info=model_info, precision=precision)
|
201 |
+
print(f"Model size: {model_size}")
|
202 |
+
except Exception as e:
|
203 |
+
print(f"Error getting model size: {e}")
|
204 |
+
print(f"Full traceback: {traceback.format_exc()}")
|
205 |
+
model_size = "?"
|
206 |
|
207 |
+
print("\n=== Validating model card and license ===")
|
208 |
+
try:
|
209 |
+
license = model_info.cardData["license"]
|
210 |
+
print(f"Model license: {license}")
|
211 |
+
except Exception as e:
|
212 |
+
print(f"Error getting model license: {e}")
|
213 |
+
print(f"Full traceback: {traceback.format_exc()}")
|
214 |
+
return styled_error("Please select a license for your model")
|
215 |
+
|
216 |
+
print("\n=== Checking model card ===")
|
217 |
+
try:
|
218 |
+
modelcard_OK, error_msg = check_model_card(model)
|
219 |
+
print(f"Model card check result: {modelcard_OK}")
|
220 |
+
if not modelcard_OK:
|
221 |
+
print(f"Model card error: {error_msg}")
|
222 |
+
return styled_error(error_msg)
|
223 |
+
except Exception as e:
|
224 |
+
print(f"Error checking model card: {e}")
|
225 |
+
print(f"Full traceback: {traceback.format_exc()}")
|
226 |
+
return styled_error("Failed to validate model card")
|
227 |
+
|
228 |
+
print("\n=== Creating evaluation entry ===")
|
229 |
+
eval_entry = {
|
230 |
+
"model": model,
|
231 |
+
"base_model": base_model,
|
232 |
+
"revision": revision,
|
233 |
+
"precision": precision,
|
234 |
+
"weight_type": weight_type,
|
235 |
+
"status": "PENDING",
|
236 |
+
"submitted_time": current_time,
|
237 |
+
"model_type": model_type,
|
238 |
+
"likes": model_info.likes,
|
239 |
+
"params": model_size,
|
240 |
+
"license": license,
|
241 |
+
"private": False,
|
242 |
+
}
|
243 |
+
print(f"\nEvaluation entry created: {json.dumps(eval_entry, indent=2)}")
|
244 |
+
|
245 |
+
print("\n=== Checking for duplicate submission ===")
|
246 |
+
model_key = f"{model}_{revision}_{precision}"
|
247 |
+
if model_key in REQUESTED_MODELS:
|
248 |
+
print(f"Found existing submission with key: {model_key}")
|
249 |
+
# Get the status from the queue file
|
250 |
+
queue_file = REQUESTED_MODELS[model_key]
|
251 |
+
try:
|
252 |
+
with open(queue_file, 'r') as f:
|
253 |
+
queue_entry = json.load(f)
|
254 |
+
status = queue_entry.get('status')
|
255 |
+
print(f"Found existing submission with status: {status}")
|
256 |
+
if status is None:
|
257 |
+
print(f"Warning: No status found in queue file {queue_file}")
|
258 |
+
return styled_warning("Error checking model status. Please try again later.")
|
259 |
+
|
260 |
+
if status != EvaluationStatus.FAILED.value:
|
261 |
+
print(f"Model already submitted and in {status} status")
|
262 |
+
return styled_warning(f"This model has been already submitted and is in {status} status.")
|
263 |
+
except Exception as e:
|
264 |
+
print(f"Error reading queue file: {e}")
|
265 |
+
print(f"Full traceback: {traceback.format_exc()}")
|
266 |
+
return styled_warning("Error checking model status. Please try again later.")
|
267 |
+
|
268 |
+
print("\n=== Creating evaluation file ===")
|
269 |
+
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
|
270 |
+
print(f"Creating output directory: {OUT_DIR}")
|
271 |
+
os.makedirs(OUT_DIR, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
272 |
|
273 |
+
out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
|
274 |
+
print(f"Output file path: {out_path}")
|
275 |
+
|
276 |
+
# Write evaluation entry to file
|
277 |
+
try:
|
278 |
+
with open(out_path, "w") as f:
|
279 |
+
f.write(json.dumps(eval_entry))
|
280 |
+
print("\nEvaluation file created successfully")
|
281 |
+
|
282 |
+
# Upload to Hugging Face
|
283 |
+
print("\n=== Uploading evaluation file ===")
|
284 |
+
API.upload_file(
|
285 |
+
path_or_fileobj=out_path,
|
286 |
+
path_in_repo=out_path.split("eval-queue/")[1],
|
287 |
+
repo_id=QUEUE_REPO,
|
288 |
+
repo_type="dataset",
|
289 |
+
commit_message=f"Add evaluation request for {model}",
|
290 |
+
token=TOKEN
|
291 |
+
)
|
292 |
+
print(f"\nEvaluation request uploaded successfully to {QUEUE_REPO}")
|
293 |
+
|
294 |
+
# Clean up local file
|
295 |
+
os.remove(out_path)
|
296 |
+
print("\nLocal evaluation file removed")
|
297 |
+
|
298 |
+
return styled_message(
|
299 |
+
"Evaluation request created successfully! Please wait for the evaluation to complete."
|
300 |
+
)
|
301 |
+
except Exception as e:
|
302 |
+
print(f"Error during file operations: {str(e)}")
|
303 |
+
print(f"Full traceback: {traceback.format_exc()}")
|
304 |
+
return styled_error(f"Failed to create evaluation request: {str(e)}")
|
305 |
+
|
306 |
+
|
307 |
+
|
308 |
+
dataloader = DataLoader(tsac_dataset, batch_size=32, shuffle=False)
|
309 |
|
310 |
model_obj.eval()
|
311 |
with torch.no_grad():
|
312 |
predictions = []
|
313 |
+
targets = []
|
314 |
|
315 |
+
for batch in dataloader:
|
316 |
+
inputs = {k: v.to(device) for k, v in batch.items() if k != 'target'}
|
317 |
+
target = batch['target'].to(device)
|
318 |
+
|
319 |
+
# Log the first batch details
|
320 |
+
if len(predictions) == 0: # Only log for the first batch
|
321 |
+
print(f"\nFirst batch example:")
|
322 |
+
print(f"Input keys: {list(inputs.keys())}")
|
323 |
+
print(f"Target shape: {target.shape}")
|
324 |
|
325 |
outputs = model_obj(**inputs)
|
326 |
+
print(f"\nModel output type: {type(outputs)}")
|
327 |
+
|
328 |
+
# Try to get logits from different possible formats
|
329 |
+
if isinstance(outputs, dict):
|
330 |
+
print(f"Output keys: {list(outputs.keys())}")
|
331 |
+
# Try different common keys
|
332 |
+
if 'logits' in outputs:
|
333 |
+
logits = outputs['logits']
|
334 |
+
elif 'prediction_logits' in outputs:
|
335 |
+
logits = outputs['prediction_logits']
|
336 |
+
else:
|
337 |
+
raise ValueError(f"Unknown output format. Available keys: {list(outputs.keys())}")
|
338 |
+
elif isinstance(outputs, tuple):
|
339 |
+
print(f"Output tuple length: {len(outputs)}")
|
340 |
+
# Try different positions in the tuple
|
341 |
+
if len(outputs) > 0:
|
342 |
+
logits = outputs[0]
|
343 |
+
else:
|
344 |
+
raise ValueError("Empty output tuple")
|
345 |
+
else:
|
346 |
+
# If it's a single tensor, assume it's the logits
|
347 |
+
logits = outputs
|
348 |
+
|
349 |
+
print(f"Logits shape: {logits.shape}")
|
350 |
+
# For sequence classification, we typically use the [CLS] token's prediction
|
351 |
+
# Get the first token's prediction (CLS token)
|
352 |
+
cls_logits = logits[:, 0, :] # Shape: [batch_size, num_classes]
|
353 |
+
predictions.extend(cls_logits.argmax(dim=-1).cpu().tolist())
|
354 |
+
targets.extend(target.cpu().tolist())
|
355 |
+
|
356 |
+
accuracy = sum(p == t for p, t in zip(predictions, targets)) / len(predictions)
|
357 |
|
358 |
+
eval_entry['results'] = {'accuracy': accuracy}
|
359 |
|
360 |
+
# Update the queue file with results
|
361 |
+
with open(out_path, "w") as f:
|
362 |
+
f.write(json.dumps(eval_entry))
|
363 |
+
|
364 |
# Evaluate on ArabML
|
365 |
print("Evaluating on ArabML Tunisian Corpus...")
|
366 |
+
arabml_dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="train", trust_remote_code=True)
|
367 |
|
368 |
def preprocess_arabml(examples):
|
369 |
+
return tokenizer(examples['Tweet'], padding=True, truncation=True, max_length=512)
|
370 |
|
371 |
arabml_dataset = arabml_dataset.map(preprocess_arabml, batched=True)
|
372 |
|
|
|
374 |
covered_tokens = 0
|
375 |
|
376 |
for example in arabml_dataset:
|
377 |
+
tokens = tokenizer.tokenize(example['Tweet'])
|
378 |
total_tokens += len(tokens)
|
379 |
covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
|
380 |
|
381 |
arabml_coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
|
382 |
|
383 |
# Store results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|