import gradio as gr

import os
import base64
import itertools
from collections import defaultdict

import pandas as pd
from datasets import (
    Dataset,
    load_dataset,
)

# Hugging Face token used to read the benchmark dataset and push evaluation results.
TOKEN = os.environ['TOKEN']

MASKED_LM_MODELS = [
    "BounharAbdelaziz/XLM-RoBERTa-Morocco",
    "SI2M-Lab/DarijaBERT",
    "BounharAbdelaziz/ModernBERT-Morocco",
    "google-bert/bert-base-multilingual-cased",
    "FacebookAI/xlm-roberta-large",
    "aubmindlab/bert-base-arabertv02",
]

CAUSAL_LM_MODELS = [
    "BounharAbdelaziz/Al-Atlas-LLM-0.5B",
    "Qwen/Qwen2.5-0.5B",
    "tiiuae/Falcon3-1B-Base",
    "MBZUAI-Paris/Atlas-Chat-2B",
]
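
# Note: with 6 masked-LM and 4 causal-LM models, itertools.combinations(models, 2)
# yields 15 and 6 head-to-head pairs per dataset row, respectively.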


def encode_image_to_base64(image_path):
    """Encode an image or GIF file to base64."""
    with open(image_path, "rb") as file:
        encoded_string = base64.b64encode(file.read()).decode()
    return encoded_string


def create_html_media(media_path, is_gif=False):
    """Create HTML for displaying an image or GIF."""
    media_base64 = encode_image_to_base64(media_path)
    media_type = "gif" if is_gif else "jpeg"

    html_string = f"""
    <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
        <div style="max-width: 450px; margin: auto;">
            <img src="data:image/{media_type};base64,{media_base64}"
                 style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
                 alt="Displayed Media">
        </div>
    </div>
    """
    return html_string
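
# Usage sketch (illustrative; the demo below does exactly this with a local GIF):
#   gr.HTML(create_html_media("battle_leaderboard.gif", is_gif=True))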


class LMBattleArena:
    def __init__(self, dataset_path, saving_freq=25):
        """Initialize the battle arena with the evaluation dataset."""
        self.df = pd.read_csv(dataset_path)
        self.current_index = 0
        self.saving_freq = saving_freq
        self.evaluation_results_masked = []
        self.evaluation_results_causal = []
        self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})

        # All head-to-head model pairs for each track.
        self.masked_model_pairs = list(itertools.combinations(MASKED_LM_MODELS, 2))
        self.causal_model_pairs = list(itertools.combinations(CAUSAL_LM_MODELS, 2))

        # Index of the pair currently being served for each track.
        self.masked_pair_idx = 0
        self.causal_pair_idx = 0

        # Keys of (row, track, pair) combinations that have already been shown.
        self.row_model_pairs_evaluated = set()

    def get_next_battle_pair(self, is_causal):
        """Retrieve the next pair of model outputs to compare, ensuring every pair is evaluated."""
        # Wrap back to the first row once every row has been served.
        if self.current_index >= len(self.df):
            self.current_index = 0

        # All pairs for the requested track are exhausted: nothing left to evaluate.
        if is_causal and self.causal_pair_idx >= len(self.causal_model_pairs):
            return None
        elif not is_causal and self.masked_pair_idx >= len(self.masked_model_pairs):
            return None

        row = self.df.iloc[self.current_index]

        if is_causal:
            if self.causal_pair_idx >= len(self.causal_model_pairs):
                # This row's pairs are done; move to the next row and retry.
                self.current_index += 1
                self.causal_pair_idx = 0
                return self.get_next_battle_pair(is_causal)

            model_pair = self.causal_model_pairs[self.causal_pair_idx]
            pair_key = f"{self.current_index}_causal_{self.causal_pair_idx}"

            # Skip pairs that were already shown for this row.
            if pair_key in self.row_model_pairs_evaluated:
                self.causal_pair_idx += 1
                return self.get_next_battle_pair(is_causal)

            self.row_model_pairs_evaluated.add(pair_key)
            self.causal_pair_idx += 1

            # Advance to the next row once this row's pairs are exhausted.
            if self.causal_pair_idx >= len(self.causal_model_pairs):
                self.causal_pair_idx = 0
                self.current_index += 1
        else:
            if self.masked_pair_idx >= len(self.masked_model_pairs):
                self.current_index += 1
                self.masked_pair_idx = 0
                return self.get_next_battle_pair(is_causal)

            model_pair = self.masked_model_pairs[self.masked_pair_idx]
            pair_key = f"{self.current_index}_masked_{self.masked_pair_idx}"

            if pair_key in self.row_model_pairs_evaluated:
                self.masked_pair_idx += 1
                return self.get_next_battle_pair(is_causal)

            self.row_model_pairs_evaluated.add(pair_key)
            self.masked_pair_idx += 1

            if self.masked_pair_idx >= len(self.masked_model_pairs):
                self.masked_pair_idx = 0
                self.current_index += 1

        battle_data = {
            'prompt': row['masked_sentence'] if not is_causal else row['causal_sentence'],
            'model_1': row[model_pair[0]],
            'model_2': row[model_pair[1]],
            'model1_name': model_pair[0],
            'model2_name': model_pair[1]
        }

        return battle_data

    def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name, is_causal):
        """Record the user's model preference and update scores."""
        self.model_scores[model1_name]['total_comparisons'] += 1
        self.model_scores[model2_name]['total_comparisons'] += 1

        if preferred_models == "Both Good":
            self.model_scores[model1_name]['wins'] += 1
            self.model_scores[model2_name]['wins'] += 1
        elif preferred_models == "Model A":
            self.model_scores[model1_name]['wins'] += 1
        elif preferred_models == "Model B":
            self.model_scores[model2_name]['wins'] += 1

        evaluation = {
            'input_text': input_text,
            'output1': output1,
            'output2': output2,
            'model1_name': model1_name,
            'model2_name': model2_name,
            'preferred_models': preferred_models
        }
        if is_causal:
            self.evaluation_results_causal.append(evaluation)
        else:
            self.evaluation_results_masked.append(evaluation)

        # Periodically persist results to the Hub and to local CSV files.
        total_evaluations = len(self.evaluation_results_causal) + len(self.evaluation_results_masked)
        if total_evaluations % self.saving_freq == 0:
            self.save_results()

        return self.get_model_scores_df(is_causal)

    def save_results(self):
        """Save the evaluation results to the Hub and to CSV files."""
        # Note: only the causal-LM leaderboard is pushed to the Hub here.
        results_df = self.get_model_scores_df(is_causal=True)
        results_dataset = Dataset.from_pandas(results_df)
        results_dataset.push_to_hub('atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas', private=True, token=TOKEN)
        results_df.to_csv('human_eval_results.csv')

        # Dump the raw per-vote records for each track.
        masked_df = pd.DataFrame(self.evaluation_results_masked)
        causal_df = pd.DataFrame(self.evaluation_results_causal)

        if not masked_df.empty:
            masked_df.to_csv('masked_evaluations.csv')
        if not causal_df.empty:
            causal_df.to_csv('causal_evaluations.csv')

    def get_model_scores_df(self, is_causal):
        """Convert model scores to a DataFrame."""
        scores_data = []
        for model, stats in self.model_scores.items():
            # Only report models belonging to the requested track.
            if is_causal:
                if model not in CAUSAL_LM_MODELS:
                    continue
            else:
                if model not in MASKED_LM_MODELS:
                    continue
            win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0
            scores_data.append({
                'Model': model,
                'Wins': stats['wins'],
                'Total Comparisons': stats['total_comparisons'],
                'Win Rate (%)': round(win_rate, 2)
            })

        results_df = pd.DataFrame(scores_data)
        print("Generated DataFrame:\n", results_df)
        return results_df
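
# Minimal usage sketch (illustrative only; the Gradio app below drives the arena):
#   arena = LMBattleArena('human_eval_dataset.csv')
#   battle = arena.get_next_battle_pair(is_causal=False)
#   scores = arena.record_evaluation(
#       "Model A", battle['prompt'], battle['model_1'], battle['model_2'],
#       battle['model1_name'], battle['model2_name'], is_causal=False,
#   )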


def create_battle_arena(dataset_path, is_gif, is_causal):
    arena = LMBattleArena(dataset_path)

    def battle_round(is_causal):
        battle_data = arena.get_next_battle_pair(is_causal)

        if battle_data is None:
            return "All model pairs have been evaluated for all examples!", "", "", "", "", gr.DataFrame(visible=False)

        return (
            battle_data['prompt'],
            battle_data['model_1'],
            battle_data['model_2'],
            battle_data['model1_name'],
            battle_data['model2_name'],
            gr.DataFrame(visible=True)
        )

    def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal):
        scores_df = arena.record_evaluation(
            preferred_models, input_text, output_1, output_2, model1_name, model2_name, is_causal
        )
        # Serve the next battle and swap in the updated leaderboard as the last output.
        next_battle = battle_round(is_causal)
        return (*next_battle[:-1], scores_df)

    with gr.Blocks(css="footer{display:none !important}") as demo:
        base_path = os.path.dirname(__file__)
        local_image_path = os.path.join(base_path, 'battle_leaderboard.gif')
        gr.HTML(create_html_media(local_image_path, is_gif=is_gif))

        with gr.Tabs():
            with gr.Tab("Masked LM Battle Arena"):
                gr.Markdown("# 🤖 Pretrained SmolLMs Battle Arena")

                is_causal = gr.State(value=False)

                input_text = gr.Textbox(
                    label="Input prompt",
                    interactive=False,
                )

                with gr.Row():
                    output_1 = gr.Textbox(
                        label="Model A",
                        interactive=False
                    )
                    model1_name = gr.State()

                with gr.Row():
                    output_2 = gr.Textbox(
                        label="Model B",
                        interactive=False
                    )
                    model2_name = gr.State()

                preferred_models = gr.Radio(
                    label="Which model is better?",
                    choices=["Model A", "Model B", "Both Good", "Both Bad"]
                )
                submit_btn = gr.Button("Vote", variant="primary")

                scores_table = gr.DataFrame(
                    headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
                    label="🏆 Leaderboard"
                )

                submit_btn.click(
                    submit_preference,
                    inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                )

                demo.load(
                    battle_round,
                    inputs=[is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                )

            with gr.Tab("Causal LM Battle Arena"):
                gr.Markdown("# 🤖 Pretrained SmolLMs Battle Arena")

                is_causal = gr.State(value=True)

                input_text = gr.Textbox(
                    label="Input prompt",
                    interactive=False,
                )

                with gr.Row():
                    output_1 = gr.Textbox(
                        label="Model A",
                        interactive=False
                    )
                    model1_name = gr.State()

                with gr.Row():
                    output_2 = gr.Textbox(
                        label="Model B",
                        interactive=False
                    )
                    model2_name = gr.State()

                preferred_models = gr.Radio(
                    label="Which model is better?",
                    choices=["Model A", "Model B", "Both Good", "Both Bad"]
                )
                submit_btn = gr.Button("Vote", variant="primary")

                scores_table = gr.DataFrame(
                    headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
                    label="🏆 Leaderboard"
                )

                submit_btn.click(
                    submit_preference,
                    inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                )

                demo.load(
                    battle_round,
                    inputs=[is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                )

    return demo


if __name__ == "__main__":
    device = "cpu"
    dataset_path = 'human_eval_dataset.csv'
    is_gif = True

    # Export the benchmark split to a local CSV that the arena reads from.
    human_eval_dataset = load_dataset("atlasia/LM-Moroccan-Darija-Bench", split='test', token=TOKEN).to_csv(dataset_path)

    demo = create_battle_arena(dataset_path, is_gif, is_causal=False)
    demo.launch(debug=True)