import gradio as gr
import os
import base64
import itertools
from collections import defaultdict

import pandas as pd
from datasets import (
    Dataset,
    load_dataset,
)

TOKEN = os.environ['TOKEN']

MASKED_LM_MODELS = [
    "BounharAbdelaziz/XLM-RoBERTa-Morocco",
    "SI2M-Lab/DarijaBERT",
    "BounharAbdelaziz/ModernBERT-Morocco",
    "google-bert/bert-base-multilingual-cased",
    "FacebookAI/xlm-roberta-large",
    "aubmindlab/bert-base-arabertv02",
]

CAUSAL_LM_MODELS = [
    "BounharAbdelaziz/Al-Atlas-LLM-0.5B",
    "Qwen/Qwen2.5-0.5B",
    "tiiuae/Falcon3-1B-Base",
    "MBZUAI-Paris/Atlas-Chat-2B",
]


def encode_image_to_base64(image_path):
    """Encode an image or GIF file to base64."""
    with open(image_path, "rb") as file:
        encoded_string = base64.b64encode(file.read()).decode()
    return encoded_string


def create_html_media(media_path, is_gif=False):
    """Create HTML for displaying an image or GIF."""
    media_base64 = encode_image_to_base64(media_path)
    media_type = "gif" if is_gif else "jpeg"
    html_string = f"""
    <div style="display: flex; justify-content: center; align-items: center;">
        <img src="data:image/{media_type};base64,{media_base64}"
             alt="Displayed Media"
             style="max-width: 100%; height: auto;">
    </div>
""" return html_string class LMBattleArena: def __init__(self, dataset_path, saving_freq=25): """Initialize battle arena with dataset""" self.df = pd.read_csv(dataset_path) self.current_index = 0 self.saving_freq = saving_freq # save the results in csv/push to hub every saving_freq evaluations self.evaluation_results_masked = [] self.evaluation_results_causal = [] self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0}) # Generate all possible model pairs self.masked_model_pairs = list(itertools.combinations(MASKED_LM_MODELS, 2)) self.causal_model_pairs = list(itertools.combinations(CAUSAL_LM_MODELS, 2)) # Pair indices to track which pair is being evaluated self.masked_pair_idx = 0 self.causal_pair_idx = 0 # To track which rows have been evaluated for which model pairs self.row_model_pairs_evaluated = set() # Using a simple set def get_next_battle_pair(self, is_causal): """Retrieve next pair of summaries for comparison ensuring all pairs are evaluated""" if self.current_index >= len(self.df): # Reset index to go through dataset again with remaining model pairs self.current_index = 0 # If we've gone through all model pairs for all rows, we're done if is_causal and self.causal_pair_idx >= len(self.causal_model_pairs): return None elif not is_causal and self.masked_pair_idx >= len(self.masked_model_pairs): return None row = self.df.iloc[self.current_index] # Get the current model pair to evaluate if is_causal: # Check if we've evaluated all causal model pairs if self.causal_pair_idx >= len(self.causal_model_pairs): # Move to next row and reset pair index self.current_index += 1 self.causal_pair_idx = 0 # Try again with the next row return self.get_next_battle_pair(is_causal) model_pair = self.causal_model_pairs[self.causal_pair_idx] pair_key = f"{self.current_index}_causal_{self.causal_pair_idx}" # Check if this row-pair combination has been evaluated if pair_key in self.row_model_pairs_evaluated: # Move to next pair self.causal_pair_idx += 1 return self.get_next_battle_pair(is_causal) # Mark this row-pair combination as evaluated self.row_model_pairs_evaluated.add(pair_key) # Move to next pair for next evaluation self.causal_pair_idx += 1 # Check if we've gone through all pairs for this row if self.causal_pair_idx >= len(self.causal_model_pairs): # Reset pair index and move to next row for next evaluation self.causal_pair_idx = 0 self.current_index += 1 else: # Similar logic for masked models if self.masked_pair_idx >= len(self.masked_model_pairs): self.current_index += 1 self.masked_pair_idx = 0 return self.get_next_battle_pair(is_causal) model_pair = self.masked_model_pairs[self.masked_pair_idx] pair_key = f"{self.current_index}_masked_{self.masked_pair_idx}" if pair_key in self.row_model_pairs_evaluated: self.masked_pair_idx += 1 return self.get_next_battle_pair(is_causal) self.row_model_pairs_evaluated.add(pair_key) self.masked_pair_idx += 1 if self.masked_pair_idx >= len(self.masked_model_pairs): self.masked_pair_idx = 0 self.current_index += 1 # Prepare the battle data with the selected model pair battle_data = { 'prompt': row['masked_sentence'] if not is_causal else row['causal_sentence'], 'model_1': row[model_pair[0]], 'model_2': row[model_pair[1]], 'model1_name': model_pair[0], 'model2_name': model_pair[1] } return battle_data def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name, is_causal): """Record user's model preference and update scores""" self.model_scores[model1_name]['total_comparisons'] += 1 
        self.model_scores[model2_name]['total_comparisons'] += 1

        if preferred_models == "Both Good":
            self.model_scores[model1_name]['wins'] += 1
            self.model_scores[model2_name]['wins'] += 1
        elif preferred_models == "Model A":  # Maps to first model
            self.model_scores[model1_name]['wins'] += 1
        elif preferred_models == "Model B":  # Maps to second model
            self.model_scores[model2_name]['wins'] += 1
        # "Both Bad" case - no wins recorded

        evaluation = {
            'input_text': input_text,
            'output1': output1,
            'output2': output2,
            'model1_name': model1_name,
            'model2_name': model2_name,
            'preferred_models': preferred_models,
        }
        if is_causal:
            self.evaluation_results_causal.append(evaluation)
        else:
            self.evaluation_results_masked.append(evaluation)

        # Calculate the total number of evaluations
        total_evaluations = len(self.evaluation_results_causal) + len(self.evaluation_results_masked)

        # Save results periodically
        if total_evaluations % self.saving_freq == 0:
            self.save_results()

        return self.get_model_scores_df(is_causal)

    def save_results(self):
        """Save the evaluation results to Hub and CSV"""
        results_df = self.get_model_scores_df(is_causal=True)  # Get the latest scores (causal leaderboard)
        results_dataset = Dataset.from_pandas(results_df)
        results_dataset.push_to_hub('atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas', private=True, token=TOKEN)
        results_df.to_csv('human_eval_results.csv')

        # Also save the raw evaluation results
        masked_df = pd.DataFrame(self.evaluation_results_masked)
        causal_df = pd.DataFrame(self.evaluation_results_causal)
        if not masked_df.empty:
            masked_df.to_csv('masked_evaluations.csv')
        if not causal_df.empty:
            causal_df.to_csv('causal_evaluations.csv')

    def get_model_scores_df(self, is_causal):
        """Convert model scores to DataFrame"""
        scores_data = []
        for model, stats in self.model_scores.items():
            if is_causal:
                if model not in CAUSAL_LM_MODELS:
                    continue
            else:
                if model not in MASKED_LM_MODELS:
                    continue
            win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0
            scores_data.append({
                'Model': model,
                'Wins': stats['wins'],
                'Total Comparisons': stats['total_comparisons'],
                'Win Rate (%)': round(win_rate, 2),
            })
        results_df = pd.DataFrame(scores_data)
        print("Generated DataFrame:\n", results_df)  # Debugging print
        # if 'Win Rate (%)' not in results_df.columns:
        #     raise ValueError("Win Rate (%) column is missing from DataFrame!")
        return results_df


def create_battle_arena(dataset_path, is_gif, is_causal):
    arena = LMBattleArena(dataset_path)

    def battle_round(is_causal):
        battle_data = arena.get_next_battle_pair(is_causal)
        if battle_data is None:
            return "All model pairs have been evaluated for all examples!", "", "", "", "", gr.DataFrame(visible=False)
        return (
            battle_data['prompt'],
            battle_data['model_1'],
            battle_data['model_2'],
            battle_data['model1_name'],
            battle_data['model2_name'],
            gr.DataFrame(visible=True),
        )

    def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal):
        scores_df = arena.record_evaluation(
            preferred_models, input_text, output_1, output_2, model1_name, model2_name, is_causal
        )
        next_battle = battle_round(is_causal)
        return (*next_battle[:-1], scores_df)

    with gr.Blocks(css="footer{display:none !important}") as demo:
        base_path = os.path.dirname(__file__)
        local_image_path = os.path.join(base_path, 'battle_leaderboard.gif')
        gr.HTML(create_html_media(local_image_path, is_gif=is_gif))

        with gr.Tabs():
            with gr.Tab("Masked LM Battle Arena"):
                gr.Markdown("# 🤖 Pretrained SmolLMs Battle Arena")

                # Use gr.State to store the boolean value without displaying it
                is_causal = gr.State(value=False)

                input_text = gr.Textbox(
                    label="Input prompt",
                    interactive=False,
                )
                with gr.Row():
                    output_1 = gr.Textbox(
                        label="Model A",
                        interactive=False,
                    )
                    model1_name = gr.State()  # Hidden state for model1 name
                with gr.Row():
                    output_2 = gr.Textbox(
                        label="Model B",
                        interactive=False,
                    )
                    model2_name = gr.State()  # Hidden state for model2 name

                preferred_models = gr.Radio(
                    label="Which model is better?",
                    choices=["Model A", "Model B", "Both Good", "Both Bad"],
                )
                submit_btn = gr.Button("Vote", variant="primary")

                scores_table = gr.DataFrame(
                    headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
                    label="🏆 Leaderboard",
                )

                submit_btn.click(
                    submit_preference,
                    inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table],
                )

                demo.load(
                    battle_round,
                    inputs=[is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table],
                )

            with gr.Tab("Causal LM Battle Arena"):
                gr.Markdown("# 🤖 Pretrained SmolLMs Battle Arena")

                # Use gr.State to store the boolean value without displaying it
                is_causal = gr.State(value=True)

                input_text = gr.Textbox(
                    label="Input prompt",
                    interactive=False,
                )
                with gr.Row():
                    output_1 = gr.Textbox(
                        label="Model A",
                        interactive=False,
                    )
                    model1_name = gr.State()  # Hidden state for model1 name
                with gr.Row():
                    output_2 = gr.Textbox(
                        label="Model B",
                        interactive=False,
                    )
                    model2_name = gr.State()  # Hidden state for model2 name

                preferred_models = gr.Radio(
                    label="Which model is better?",
                    choices=["Model A", "Model B", "Both Good", "Both Bad"],
                )
                submit_btn = gr.Button("Vote", variant="primary")

                scores_table = gr.DataFrame(
                    headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
                    label="🏆 Leaderboard",
                )

                submit_btn.click(
                    submit_preference,
                    inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table],
                )

                demo.load(
                    battle_round,
                    inputs=[is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table],
                )

    return demo


if __name__ == "__main__":
    # inference device
    device = "cpu"

    dataset_path = 'human_eval_dataset.csv'
    is_gif = True

    # load the existing dataset that contains outputs of the LMs
    human_eval_dataset = load_dataset("atlasia/LM-Moroccan-Darija-Bench", split='test', token=TOKEN).to_csv(dataset_path)
    # atlasia/Moroccan-Darija-LLM-Battle-Al-Atlas

    demo = create_battle_arena(dataset_path, is_gif, is_causal=False)
    demo.launch(debug=True)