sudoping01 committed on
Commit e445644 · verified · 1 Parent(s): 53036ab

Update app.py

Files changed (1)
  1. app.py +15 -23
app.py CHANGED
@@ -13,7 +13,7 @@ token = os.environ.get("HG_TOKEN")
 if token:
     login(token)
 
-# Load reference dataset
+
 try:
     dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"]
     references = {row["id"]: row["text"] for row in dataset}
@@ -22,10 +22,10 @@ except Exception as e:
     print(f"Error loading dataset: {str(e)}")
     references = {}
 
-# Initialize or load the leaderboard file
+
 leaderboard_file = "leaderboard.csv"
 if not os.path.exists(leaderboard_file):
-    # Create a new leaderboard with sample data for testing
+
     sample_data = []
     # ["MALIBA-AI/bambara-asr-v1", 0.2264, 0.1094, 0.1922, "2025-03-15 10:30:45"],
     # ["whisper-large-v3-bambara", 0.3120, 0.1870, 0.2745, "2025-02-20 14:22:33"]
@@ -36,7 +36,7 @@ if not os.path.exists(leaderboard_file):
 else:
     leaderboard_df = pd.read_csv(leaderboard_file)
 
-# Ensure the Combined_Score column exists
+
 if "Combined_Score" not in leaderboard_df.columns:
     leaderboard_df["Combined_Score"] = leaderboard_df["WER"] * 0.7 + leaderboard_df["CER"] * 0.3
     leaderboard_df.to_csv(leaderboard_file, index=False)
@@ -78,7 +78,6 @@ def calculate_metrics(predictions_df):
         sample_wer = wer(reference, hypothesis)
         sample_cer = cer(reference, hypothesis)
 
-        # Cap extreme values to prevent outliers from skewing results
         sample_wer = min(sample_wer, 2.0)
         sample_cer = min(sample_cer, 2.0)
 
@@ -104,7 +103,7 @@ def calculate_metrics(predictions_df):
     avg_wer = sum(item["wer"] for item in results) / len(results)
     avg_cer = sum(item["cer"] for item in results) / len(results)
 
-    # Calculate weighted average metrics based on reference length
+
     weighted_wer = sum(item["wer"] * item["ref_word_count"] for item in results) / total_ref_words
     weighted_cer = sum(item["cer"] * item["ref_char_count"] for item in results) / total_ref_chars
 
@@ -119,22 +118,19 @@ def prepare_leaderboard_for_display(df, sort_by="Combined_Score"):
     if df is None or len(df) == 0:
         return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])
 
-    # Make a copy to avoid modifying the original
+
     display_df = df.copy()
 
-    # Sort by the selected metric (lower is better)
+
     display_df = display_df.sort_values(sort_by)
 
-    # Add ranking column
     display_df.insert(0, "Rank", range(1, len(display_df) + 1))
 
-    # Format numeric columns as percentages
     for col in ["WER", "CER", "Combined_Score"]:
         if col in display_df.columns:
             display_df[f"{col} (%)"] = display_df[col].apply(lambda x: f"{x * 100:.2f}")
 
-    # Keep both the raw values and percentage displays
-    # This allows for proper sorting while showing formatted values
+
 
     return display_df
 
@@ -198,16 +194,13 @@ def process_submission(model_name, csv_file):
     except Exception as e:
         return f"Error calculating metrics: {str(e)}", None
 
-    # Load existing leaderboard
+
     leaderboard = pd.read_csv(leaderboard_file)
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 
-    # Calculate combined score (70% WER, 30% CER)
     combined_score = avg_wer * 0.7 + avg_cer * 0.3
 
-    # Check if model already exists
     if model_name in leaderboard["Model_Name"].values:
-        # Update existing entry
         idx = leaderboard[leaderboard["Model_Name"] == model_name].index
         leaderboard.loc[idx, "WER"] = avg_wer
         leaderboard.loc[idx, "CER"] = avg_cer
@@ -215,18 +208,15 @@ def process_submission(model_name, csv_file):
         leaderboard.loc[idx, "timestamp"] = timestamp
         updated_leaderboard = leaderboard
     else:
-        # Add new entry
         new_entry = pd.DataFrame(
             [[model_name, avg_wer, avg_cer, combined_score, timestamp]],
             columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]
         )
         updated_leaderboard = pd.concat([leaderboard, new_entry])
 
-    # Sort and save updated leaderboard
     updated_leaderboard = updated_leaderboard.sort_values("Combined_Score")
     updated_leaderboard.to_csv(leaderboard_file, index=False)
 
-    # Prepare for display
     display_leaderboard = prepare_leaderboard_for_display(updated_leaderboard)
 
     return f"Submission processed successfully! WER: {format_as_percentage(avg_wer)}, CER: {format_as_percentage(avg_cer)}, Combined Score: {format_as_percentage(combined_score)}", display_leaderboard
@@ -270,7 +260,7 @@ with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
 
     current_data = get_current_leaderboard()
 
-    # Highlight top-performing model
+
     if len(current_data) > 0:
         best_model = current_data.sort_values("Combined_Score").iloc[0]
         gr.Markdown(f"""
@@ -282,7 +272,7 @@ with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
 
     with gr.Tabs() as tabs:
         with gr.TabItem("🏅 Model Rankings"):
-            # Pre-load the leaderboard data
+
             initial_leaderboard = create_leaderboard_table()
 
             ranking_method = gr.Radio(
@@ -373,12 +363,12 @@ with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
     """
     ## About the Benchmark Dataset
 
-    This leaderboard uses the **[sudoping01/bambara-speech-recognition-benchmark](https://huggingface.co/datasets/sudoping01/bambara-speech-recognition-benchmark)** dataset:
+    This leaderboard uses the **[sudoping01/bambara-speech-recognition-benchmark](https://huggingface.co/datasets/MALIBA-AI/bambara-speech-recognition-leaderboard)** dataset:
 
     * Contains diverse Bambara speech samples
     * Includes various speakers, accents, and dialects
     * Covers different speech styles and recording conditions
-    * Professionally transcribed and validated
+    * Transcribed and validated
 
     ### How to Generate Predictions
 
@@ -394,6 +384,8 @@ with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
     * Text is normalized (lowercase, punctuation removed) before metrics calculation
    * Extreme outliers are capped to prevent skewing results
     * All submissions are validated for format and completeness
+
+    NB: This work is a collaboration between MALIBA-AI, RobotsMali AI4D-LAB and Djelia
     """
     )
391