Update app.py
app.py CHANGED
@@ -13,7 +13,7 @@ token = os.environ.get("HG_TOKEN")
 if token:
     login(token)
 
-
+
 try:
     dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"]
     references = {row["id"]: row["text"] for row in dataset}
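For readers skimming the diff, the try/except above is the usual `datasets` loading pattern; a minimal standalone sketch (the dataset id and field names come from the diff, the rest is illustrative):

```python
from datasets import load_dataset

# Build an id -> reference-transcript map from the eval split; fall back to
# an empty dict so the app can still start if the Hub is unreachable.
try:
    dataset = load_dataset(
        "sudoping01/bambara-speech-recognition-benchmark", name="default"
    )["eval"]
    references = {row["id"]: row["text"] for row in dataset}
except Exception as e:
    print(f"Error loading dataset: {e}")
    references = {}
```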
@@ -22,10 +22,10 @@ except Exception as e:
     print(f"Error loading dataset: {str(e)}")
     references = {}
 
-
+
 leaderboard_file = "leaderboard.csv"
 if not os.path.exists(leaderboard_file):
-
+
     sample_data = []
     # ["MALIBA-AI/bambara-asr-v1", 0.2264, 0.1094, 0.1922, "2025-03-15 10:30:45"],
     # ["whisper-large-v3-bambara", 0.3120, 0.1870, 0.2745, "2025-02-20 14:22:33"]
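The first-run branch above seeds an empty leaderboard CSV; a sketch of that bootstrap, assuming the column set used elsewhere in the diff (the `COLUMNS` constant is mine, not the app's):

```python
import os
import pandas as pd

LEADERBOARD_FILE = "leaderboard.csv"
# Schema inferred from the new_entry columns later in this diff.
COLUMNS = ["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]

if not os.path.exists(LEADERBOARD_FILE):
    # First run: write an empty CSV with the expected schema.
    pd.DataFrame(columns=COLUMNS).to_csv(LEADERBOARD_FILE, index=False)
else:
    leaderboard_df = pd.read_csv(LEADERBOARD_FILE)
```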
@@ -36,7 +36,7 @@ if not os.path.exists(leaderboard_file):
 else:
     leaderboard_df = pd.read_csv(leaderboard_file)
 
-
+
     if "Combined_Score" not in leaderboard_df.columns:
         leaderboard_df["Combined_Score"] = leaderboard_df["WER"] * 0.7 + leaderboard_df["CER"] * 0.3
         leaderboard_df.to_csv(leaderboard_file, index=False)
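This backfill applies the same 70/30 weighting used at submission time; a quick worked example of the formula:

```python
# Combined_Score = 0.7 * WER + 0.3 * CER (lower is better).
avg_wer, avg_cer = 0.30, 0.15
combined_score = avg_wer * 0.7 + avg_cer * 0.3  # 0.21 + 0.045 = 0.255
```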
@@ -78,7 +78,6 @@ def calculate_metrics(predictions_df):
             sample_wer = wer(reference, hypothesis)
             sample_cer = cer(reference, hypothesis)
 
-            # Cap extreme values to prevent outliers from skewing results
             sample_wer = min(sample_wer, 2.0)
             sample_cer = min(sample_cer, 2.0)
 
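The cap matters because jiwer error rates are unbounded above: a hypothesis with many insertions can score well over 100%. An illustrative sketch (the example strings are made up):

```python
from jiwer import wer, cer

reference = "i ni ce"                   # 3-word reference (illustrative)
hypothesis = "i ni ce i ni ce i ni ce"  # same text plus 6 inserted words

raw_wer = wer(reference, hypothesis)    # 6 insertions / 3 words = 2.0
sample_wer = min(raw_wer, 2.0)          # cap extreme values at 2.0
sample_cer = min(cer(reference, hypothesis), 2.0)
```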
@@ -104,7 +103,7 @@ def calculate_metrics(predictions_df):
     avg_wer = sum(item["wer"] for item in results) / len(results)
     avg_cer = sum(item["cer"] for item in results) / len(results)
 
-
+
     weighted_wer = sum(item["wer"] * item["ref_word_count"] for item in results) / total_ref_words
     weighted_cer = sum(item["cer"] * item["ref_char_count"] for item in results) / total_ref_chars
 
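The weighted variants let long references count proportionally more than short ones; a toy example with hypothetical per-sample results:

```python
# Two invented samples: one long reference, one short.
results = [
    {"wer": 0.2, "cer": 0.1, "ref_word_count": 10, "ref_char_count": 50},
    {"wer": 0.5, "cer": 0.3, "ref_word_count": 2,  "ref_char_count": 8},
]
total_ref_words = sum(r["ref_word_count"] for r in results)
total_ref_chars = sum(r["ref_char_count"] for r in results)

# Weighted WER = (0.2*10 + 0.5*2) / 12 = 0.25, vs. a simple mean of 0.35.
weighted_wer = sum(r["wer"] * r["ref_word_count"] for r in results) / total_ref_words
weighted_cer = sum(r["cer"] * r["ref_char_count"] for r in results) / total_ref_chars
```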
@@ -119,22 +118,19 @@ def prepare_leaderboard_for_display(df, sort_by="Combined_Score"):
     if df is None or len(df) == 0:
         return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])
 
-
+
     display_df = df.copy()
 
-
+
     display_df = display_df.sort_values(sort_by)
 
-    # Add ranking column
     display_df.insert(0, "Rank", range(1, len(display_df) + 1))
 
-    # Format numeric columns as percentages
     for col in ["WER", "CER", "Combined_Score"]:
         if col in display_df.columns:
             display_df[f"{col} (%)"] = display_df[col].apply(lambda x: f"{x * 100:.2f}")
 
-
-    # This allows for proper sorting while showing formatted values
+
 
     return display_df
 
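To see what this function produces, here is a hypothetical one-row leaderboard run through it (values invented; assumes `prepare_leaderboard_for_display` from this app.py is in scope):

```python
import pandas as pd

df = pd.DataFrame(
    [["example/bambara-asr", 0.25, 0.10, 0.205, "2025-03-15 10:30:45"]],
    columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"],
)
display_df = prepare_leaderboard_for_display(df)
print(display_df[["Rank", "Model_Name", "WER (%)", "Combined_Score (%)"]])
# Rank 1, WER (%) "25.00", Combined_Score (%) "20.50"
```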
@@ -198,16 +194,13 @@ def process_submission(model_name, csv_file):
     except Exception as e:
         return f"Error calculating metrics: {str(e)}", None
 
-
+
     leaderboard = pd.read_csv(leaderboard_file)
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 
-    # Calculate combined score (70% WER, 30% CER)
     combined_score = avg_wer * 0.7 + avg_cer * 0.3
 
-    # Check if model already exists
     if model_name in leaderboard["Model_Name"].values:
-        # Update existing entry
         idx = leaderboard[leaderboard["Model_Name"] == model_name].index
         leaderboard.loc[idx, "WER"] = avg_wer
         leaderboard.loc[idx, "CER"] = avg_cer
@@ -215,18 +208,15 @@ def process_submission(model_name, csv_file):
         leaderboard.loc[idx, "timestamp"] = timestamp
         updated_leaderboard = leaderboard
     else:
-        # Add new entry
         new_entry = pd.DataFrame(
             [[model_name, avg_wer, avg_cer, combined_score, timestamp]],
             columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]
         )
         updated_leaderboard = pd.concat([leaderboard, new_entry])
 
-    # Sort and save updated leaderboard
     updated_leaderboard = updated_leaderboard.sort_values("Combined_Score")
     updated_leaderboard.to_csv(leaderboard_file, index=False)
 
-    # Prepare for display
     display_leaderboard = prepare_leaderboard_for_display(updated_leaderboard)
 
     return f"Submission processed successfully! WER: {format_as_percentage(avg_wer)}, CER: {format_as_percentage(avg_cer)}, Combined Score: {format_as_percentage(combined_score)}", display_leaderboard
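Condensed, the submission path in these two hunks is a classic update-or-append on the CSV; a sketch under the same schema (model name and scores invented):

```python
import pandas as pd
from datetime import datetime

leaderboard = pd.read_csv("leaderboard.csv")
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
model_name, avg_wer, avg_cer = "my-model", 0.30, 0.15  # illustrative
combined_score = avg_wer * 0.7 + avg_cer * 0.3

if model_name in leaderboard["Model_Name"].values:
    # Resubmission: overwrite the existing row in place.
    mask = leaderboard["Model_Name"] == model_name
    leaderboard.loc[mask, ["WER", "CER", "Combined_Score", "timestamp"]] = [
        avg_wer, avg_cer, combined_score, timestamp,
    ]
else:
    # First submission: append a new row.
    new_entry = pd.DataFrame(
        [[model_name, avg_wer, avg_cer, combined_score, timestamp]],
        columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"],
    )
    leaderboard = pd.concat([leaderboard, new_entry], ignore_index=True)

# Persist sorted by Combined_Score so rank 1 is the best model.
leaderboard.sort_values("Combined_Score").to_csv("leaderboard.csv", index=False)
```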
@@ -270,7 +260,7 @@ with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
 
     current_data = get_current_leaderboard()
 
-
+
     if len(current_data) > 0:
         best_model = current_data.sort_values("Combined_Score").iloc[0]
         gr.Markdown(f"""
@@ -282,7 +272,7 @@ with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
 
     with gr.Tabs() as tabs:
         with gr.TabItem("🏆 Model Rankings"):
-
+
             initial_leaderboard = create_leaderboard_table()
 
             ranking_method = gr.Radio(
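For orientation, the UI around this hunk follows the standard gr.Blocks/Tabs layout; a minimal runnable skeleton (labels, choices, and the Dataframe component are illustrative, not the app's exact ones):

```python
import gradio as gr

with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
    with gr.Tabs():
        with gr.TabItem("🏆 Model Rankings"):
            # Table first, then a radio to re-rank by a chosen metric.
            leaderboard_table = gr.Dataframe(label="Leaderboard")
            ranking_method = gr.Radio(
                choices=["Combined_Score", "WER", "CER"],
                value="Combined_Score",
                label="Rank by",
            )

demo.launch()
```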
@@ -373,12 +363,12 @@ with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
         """
         ## About the Benchmark Dataset
 
-        This leaderboard uses the **[sudoping01/bambara-speech-recognition-benchmark](https://huggingface.co/datasets/
+        This leaderboard uses the **[sudoping01/bambara-speech-recognition-benchmark](https://huggingface.co/datasets/MALIBA-AI/bambara-speech-recognition-leaderboard)** dataset:
 
         * Contains diverse Bambara speech samples
         * Includes various speakers, accents, and dialects
         * Covers different speech styles and recording conditions
-        *
+        * Transcribed and validated
 
         ### How to Generate Predictions
 
@@ -394,6 +384,8 @@ with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
         * Text is normalized (lowercase, punctuation removed) before metrics calculation
         * Extreme outliers are capped to prevent skewing results
         * All submissions are validated for format and completeness
+
+        NB: This work is a collaboration between MALIBA-AI, RobotsMali AI4D-LAB and Djelia
         """
     )
 
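The normalization bullet above (lowercase, punctuation removed) could be implemented as below; a sketch only, since the exact rules live in app.py and may differ:

```python
import re
import string

def normalize_text(text: str) -> str:
    """Lowercase, strip ASCII punctuation, and collapse whitespace."""
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    return re.sub(r"\s+", " ", text).strip()

assert normalize_text("I ni ce!  ") == "i ni ce"
```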