Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Drop common voice and update rtfx (#17)
Browse files- update docs (0cdd6b0519ca27dcd59d34915a2155a4961a65bc)
Co-authored-by: Sanchit Gandhi <[email protected]>
- README.md +1 -1
- app.py +5 -5
- constants.py +34 -17
- init.py +1 -2
- utils_display.py +1 -2
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: 🏆
|
|
4 |
colorFrom: red
|
5 |
colorTo: blue
|
6 |
sdk: gradio
|
7 |
-
sdk_version:
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
tags:
|
|
|
4 |
colorFrom: red
|
5 |
colorTo: blue
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 4.41.0
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
tags:
|
app.py
CHANGED
@@ -6,12 +6,12 @@ from init import is_model_on_hub, upload_file, load_all_info_from_dataset_hub
|
|
6 |
from utils_display import AutoEvalColumn, fields, make_clickable_model, styled_error, styled_message
|
7 |
from datetime import datetime, timezone
|
8 |
|
9 |
-
LAST_UPDATED = "
|
10 |
|
11 |
column_names = {
|
12 |
"MODEL": "Model",
|
13 |
"Avg. WER": "Average WER ⬇️",
|
14 |
-
"
|
15 |
"AMI WER": "AMI",
|
16 |
"Earnings22 WER": "Earnings22",
|
17 |
"Gigaspeech WER": "Gigaspeech",
|
@@ -20,7 +20,7 @@ column_names = {
|
|
20 |
"SPGISpeech WER": "SPGISpeech",
|
21 |
"Tedlium WER": "Tedlium",
|
22 |
"Voxpopuli WER": "Voxpopuli",
|
23 |
-
|
24 |
|
25 |
eval_queue_repo, requested_models, csv_results = load_all_info_from_dataset_hub()
|
26 |
|
@@ -111,7 +111,6 @@ with gr.Blocks() as demo:
|
|
111 |
leaderboard_table = gr.components.Dataframe(
|
112 |
value=original_df,
|
113 |
datatype=TYPES,
|
114 |
-
max_rows=None,
|
115 |
elem_id="leaderboard-table",
|
116 |
interactive=False,
|
117 |
visible=True,
|
@@ -143,6 +142,7 @@ with gr.Blocks() as demo:
|
|
143 |
value=CITATION_TEXT, lines=7,
|
144 |
label="Copy the BibTeX snippet to cite this source",
|
145 |
elem_id="citation-button",
|
146 |
-
|
|
|
147 |
|
148 |
demo.launch()
|
|
|
6 |
from utils_display import AutoEvalColumn, fields, make_clickable_model, styled_error, styled_message
|
7 |
from datetime import datetime, timezone
|
8 |
|
9 |
+
LAST_UPDATED = "Aug 12th 2024"
|
10 |
|
11 |
column_names = {
|
12 |
"MODEL": "Model",
|
13 |
"Avg. WER": "Average WER ⬇️",
|
14 |
+
"Avg. RTFx": "RTFx ⬆️️",
|
15 |
"AMI WER": "AMI",
|
16 |
"Earnings22 WER": "Earnings22",
|
17 |
"Gigaspeech WER": "Gigaspeech",
|
|
|
20 |
"SPGISpeech WER": "SPGISpeech",
|
21 |
"Tedlium WER": "Tedlium",
|
22 |
"Voxpopuli WER": "Voxpopuli",
|
23 |
+
}
|
24 |
|
25 |
eval_queue_repo, requested_models, csv_results = load_all_info_from_dataset_hub()
|
26 |
|
|
|
111 |
leaderboard_table = gr.components.Dataframe(
|
112 |
value=original_df,
|
113 |
datatype=TYPES,
|
|
|
114 |
elem_id="leaderboard-table",
|
115 |
interactive=False,
|
116 |
visible=True,
|
|
|
142 |
value=CITATION_TEXT, lines=7,
|
143 |
label="Copy the BibTeX snippet to cite this source",
|
144 |
elem_id="citation-button",
|
145 |
+
show_copy_button=True,
|
146 |
+
)
|
147 |
|
148 |
demo.launch()
|
constants.py
CHANGED
@@ -15,7 +15,7 @@ TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body>
|
|
15 |
|
16 |
INTRODUCTION_TEXT = "📐 The 🤗 Open ASR Leaderboard ranks and evaluates speech recognition models \
|
17 |
on the Hugging Face Hub. \
|
18 |
-
\nWe report the Average [WER](https://huggingface.co/spaces/evaluate-metric/wer) (⬇️) and [
|
19 |
\nIf you want results for a model that is not listed here, you can submit a request for it to be included ✉️✨. \
|
20 |
\nThe leaderboard currently focuses on English speech recognition, and will be expanded to multilingual evaluation in later versions."
|
21 |
|
@@ -33,34 +33,52 @@ Here you will find details about the speech recognition metrics and datasets rep
|
|
33 |
|
34 |
## Metrics
|
35 |
|
36 |
-
|
37 |
-
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
### Word Error Rate (WER)
|
41 |
|
42 |
Word Error Rate is used to measure the **accuracy** of automatic speech recognition systems. It calculates the percentage
|
43 |
of words in the system's output that differ from the reference (correct) transcript. **A lower WER value indicates higher accuracy**.
|
44 |
|
45 |
-
|
46 |
-
Example: If the reference transcript is "I really love cats," and the ASR system outputs "I don't love dogs,".
|
47 |
-
The WER would be `50%` because 2 out of 4 words are incorrect.
|
48 |
-
```
|
49 |
|
50 |
-
|
|
|
|
|
|
|
51 |
|
52 |
-
|
|
|
|
|
|
|
53 |
|
54 |
-
|
55 |
-
|
56 |
-
speech as fast as it's spoken, while an RTF of 2 means it takes twice as long. Thus, **a lower RTF value indicates lower latency**.
|
57 |
|
58 |
```
|
59 |
-
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
```
|
|
|
|
|
62 |
|
63 |
-
|
|
|
64 |
|
65 |
## How to reproduce our results
|
66 |
|
@@ -86,7 +104,6 @@ are ranked based on their average WER scores, from lowest to highest.
|
|
86 |
| Dataset | Domain | Speaking Style | Train (h) | Dev (h) | Test (h) | Transcriptions | License |
|
87 |
|-----------------------------------------------------------------------------------------|-----------------------------|-----------------------|-----------|---------|----------|--------------------|-----------------|
|
88 |
| [LibriSpeech](https://huggingface.co/datasets/librispeech_asr) | Audiobook | Narrated | 960 | 11 | 11 | Normalised | CC-BY-4.0 |
|
89 |
-
| [Common Voice 9](https://huggingface.co/datasets/mozilla-foundation/common_voice_9_0) | Wikipedia | Narrated | 1409 | 27 | 27 | Punctuated & Cased | CC0-1.0 |
|
90 |
| [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) | European Parliament | Oratory | 523 | 5 | 5 | Punctuated | CC0 |
|
91 |
| [TED-LIUM](https://huggingface.co/datasets/LIUM/tedlium) | TED talks | Oratory | 454 | 2 | 3 | Normalised | CC-BY-NC-ND 3.0 |
|
92 |
| [GigaSpeech](https://huggingface.co/datasets/speechcolab/gigaspeech) | Audiobook, podcast, YouTube | Narrated, spontaneous | 2500 | 12 | 40 | Punctuated | apache-2.0 |
|
|
|
15 |
|
16 |
INTRODUCTION_TEXT = "📐 The 🤗 Open ASR Leaderboard ranks and evaluates speech recognition models \
|
17 |
on the Hugging Face Hub. \
|
18 |
+
\nWe report the Average [WER](https://huggingface.co/spaces/evaluate-metric/wer) (⬇️ lower the better) and [RTFx](https://github.com/NVIDIA/DeepLearningExamples/blob/master/Kaldi/SpeechRecognition/README.md#metrics) (⬆️ higher the better). Models are ranked based on their Average WER, from lowest to highest. Check the 📈 Metrics tab to understand how the models are evaluated. \
|
19 |
\nIf you want results for a model that is not listed here, you can submit a request for it to be included ✉️✨. \
|
20 |
\nThe leaderboard currently focuses on English speech recognition, and will be expanded to multilingual evaluation in later versions."
|
21 |
|
|
|
33 |
|
34 |
## Metrics
|
35 |
|
36 |
+
Models are evaluated jointly using the Word Error Rate (WER) and Inverse Real Time Factor (RTFx) metrics. The WER metric
|
37 |
+
is used to assess the accuracy of a system, and the RTFx the inference speed. Models are ranked in the leaderboard based
|
38 |
+
on their WER, lowest to highest.
|
39 |
+
|
40 |
+
Crucially, the WER and RTFx values are computed for the same inference run using a single script. The implication of this is two-fold:
|
41 |
+
1. The WER and RTFx values are coupled: for a given WER, one can expect to achieve the corresponding RTFx. This allows the proposer to trade off lower WER for higher RTFx should they wish.
|
42 |
+
2. The WER and RTFx values are averaged over all audios in the benchmark (in the order of thousands of audios).
|
43 |
+
|
44 |
+
For details on reproducing the benchmark numbers, refer to the [Open ASR GitHub repository](https://github.com/huggingface/open_asr_leaderboard#evaluate-a-model).
|
45 |
|
46 |
### Word Error Rate (WER)
|
47 |
|
48 |
Word Error Rate is used to measure the **accuracy** of automatic speech recognition systems. It calculates the percentage
|
49 |
of words in the system's output that differ from the reference (correct) transcript. **A lower WER value indicates higher accuracy**.
|
50 |
|
51 |
+
Take the following example:
|
|
|
|
|
|
|
52 |
|
53 |
+
| Reference: | the | cat | sat | on | the | mat |
|
54 |
+
|-------------|-----|-----|---------|-----|-----|-----|
|
55 |
+
| Prediction: | the | cat | **sit** | on | the | |
|
56 |
+
| Label: | ✅ | ✅ | S | ✅ | ✅ | D |
|
57 |
|
58 |
+
Here, we have:
|
59 |
+
* 1 substitution ("sit" instead of "sat")
|
60 |
+
* 0 insertions
|
61 |
+
* 1 deletion ("mat" is missing)
|
62 |
|
63 |
+
This gives 2 errors in total. To get our word error rate, we divide the total number of errors (substitutions + insertions + deletions) by the total number of words in our
|
64 |
+
reference (N), which for this example is 6:
|
|
|
65 |
|
66 |
```
|
67 |
+
WER = (S + I + D) / N = (1 + 0 + 1) / 6 = 0.333
|
68 |
+
```
|
69 |
+
|
70 |
+
Giving a WER of 0.33, or 33%. For a fair comparison, we calculate **zero-shot** (i.e. pre-trained models only) *normalised WER* for all the model checkpoints, meaning punctuation and casing are removed from the references and predictions. You can find the evaluation code on our [GitHub repository](https://github.com/huggingface/open_asr_leaderboard). To read more about how the WER is computed, refer to the [Audio Transformers Course](https://huggingface.co/learn/audio-course/chapter5/evaluation).
|
71 |
+
|
72 |
+
### Inverse Real Time Factor (RTFx)
|
73 |
+
|
74 |
+
Inverse Real Time Factor is a measure of the **latency** of automatic speech recognition systems, i.e. how long it takes a
|
75 |
+
model to process a given amount of speech. It is defined as:
|
76 |
```
|
77 |
+
RTFx = (number of seconds of audio inferred) / (compute time in seconds)
|
78 |
+
```
|
79 |
|
80 |
+
Therefore, an RTFx of 1 means a system processes speech as fast as it's spoken, while an RTFx of 2 means it takes half the time.
|
81 |
+
Thus, **a higher RTFx value indicates lower latency**.
|
82 |
|
83 |
## How to reproduce our results
|
84 |
|
|
|
104 |
| Dataset | Domain | Speaking Style | Train (h) | Dev (h) | Test (h) | Transcriptions | License |
|
105 |
|-----------------------------------------------------------------------------------------|-----------------------------|-----------------------|-----------|---------|----------|--------------------|-----------------|
|
106 |
| [LibriSpeech](https://huggingface.co/datasets/librispeech_asr) | Audiobook | Narrated | 960 | 11 | 11 | Normalised | CC-BY-4.0 |
|
|
|
107 |
| [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) | European Parliament | Oratory | 523 | 5 | 5 | Punctuated | CC0 |
|
108 |
| [TED-LIUM](https://huggingface.co/datasets/LIUM/tedlium) | TED talks | Oratory | 454 | 2 | 3 | Normalised | CC-BY-NC-ND 3.0 |
|
109 |
| [GigaSpeech](https://huggingface.co/datasets/speechcolab/gigaspeech) | Audiobook, podcast, YouTube | Narrated, spontaneous | 2500 | 12 | 40 | Punctuated | apache-2.0 |
|
init.py
CHANGED
@@ -14,7 +14,6 @@ hf_api = HfApi(
|
|
14 |
|
15 |
def load_all_info_from_dataset_hub():
|
16 |
eval_queue_repo = None
|
17 |
-
results_csv_path = None
|
18 |
requested_models = None
|
19 |
|
20 |
passed = True
|
@@ -40,7 +39,7 @@ def load_all_info_from_dataset_hub():
|
|
40 |
if csv_results is None:
|
41 |
passed = False
|
42 |
if not passed:
|
43 |
-
|
44 |
|
45 |
return eval_queue_repo, requested_models, csv_results
|
46 |
|
|
|
14 |
|
15 |
def load_all_info_from_dataset_hub():
|
16 |
eval_queue_repo = None
|
|
|
17 |
requested_models = None
|
18 |
|
19 |
passed = True
|
|
|
39 |
if csv_results is None:
|
40 |
passed = False
|
41 |
if not passed:
|
42 |
+
raise ValueError("No Hugging Face token provided. Skipping evaluation requests and results.")
|
43 |
|
44 |
return eval_queue_repo, requested_models, csv_results
|
45 |
|
utils_display.py
CHANGED
@@ -14,7 +14,7 @@ def fields(raw_class):
|
|
14 |
class AutoEvalColumn: # Auto evals column
|
15 |
model = ColumnContent("Model", "markdown")
|
16 |
avg_wer = ColumnContent("Average WER ⬇️", "number")
|
17 |
-
rtf = ColumnContent("
|
18 |
ami_wer = ColumnContent("AMI", "number")
|
19 |
e22_wer = ColumnContent("Earnings22", "number")
|
20 |
gs_wer = ColumnContent("Gigaspeech", "number")
|
@@ -23,7 +23,6 @@ class AutoEvalColumn: # Auto evals column
|
|
23 |
ss_wer = ColumnContent("SPGISpeech", "number")
|
24 |
tl_wer = ColumnContent("Tedlium", "number")
|
25 |
vp_wer = ColumnContent("Voxpopuli", "number")
|
26 |
-
cv_wer = ColumnContent("Common Voice", "number")
|
27 |
|
28 |
|
29 |
def make_clickable_model(model_name):
|
|
|
14 |
class AutoEvalColumn: # Auto evals column
|
15 |
model = ColumnContent("Model", "markdown")
|
16 |
avg_wer = ColumnContent("Average WER ⬇️", "number")
|
17 |
+
rtf = ColumnContent("RTFx ⬆️️", "number")
|
18 |
ami_wer = ColumnContent("AMI", "number")
|
19 |
e22_wer = ColumnContent("Earnings22", "number")
|
20 |
gs_wer = ColumnContent("Gigaspeech", "number")
|
|
|
23 |
ss_wer = ColumnContent("SPGISpeech", "number")
|
24 |
tl_wer = ColumnContent("Tedlium", "number")
|
25 |
vp_wer = ColumnContent("Voxpopuli", "number")
|
|
|
26 |
|
27 |
|
28 |
def make_clickable_model(model_name):
|