sanchit-gandhi
committed on
Commit
·
3155f54
1
Parent(s):
e676bd8
single tab
Browse files
app.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
import os
|
2 |
-
from functools import partial
|
3 |
|
4 |
import numpy as np
|
5 |
import unicodedata
|
@@ -64,30 +63,40 @@ target_dtype = np.int16
|
|
64 |
max_range = np.iinfo(target_dtype).max
|
65 |
|
66 |
|
67 |
-
def get_visualisation(idx, model="v2"):
|
68 |
idx -= 1
|
69 |
audio = dataset[idx]["audio"]
|
70 |
array = (audio["array"] * max_range).astype(np.int16)
|
71 |
sampling_rate = audio["sampling_rate"]
|
72 |
|
73 |
text1 = norm_target[idx]
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
wer_output = process_words(text1, text2, wer_default, wer_default)
|
77 |
-
wer_percentage = round(100 * wer_output.wer,
|
78 |
-
ier_percentage = round(
|
|
|
|
|
79 |
|
80 |
-
rel_length = round(len(text2.split()) / len(text1.split()),
|
81 |
|
82 |
diff = compare_string(text1, text2)
|
83 |
full_text = style_text(diff)
|
84 |
|
85 |
return (sampling_rate, array), wer_percentage, ier_percentage, rel_length, full_text
|
86 |
|
|
|
87 |
def get_side_by_side_visualisation(idx):
|
88 |
-
large_v2 = get_visualisation(idx, model="v2")
|
89 |
-
large_32_2 = get_visualisation(idx, model="32-2")
|
|
|
90 |
table = [large_v2[1:-1], large_32_2[1:-1]]
|
|
|
91 |
table[0] = ["large-v2", *table[0]]
|
92 |
table[1] = ["large-32-2", *table[1]]
|
93 |
return large_v2[0], table, large_v2[-1], large_32_2[-1]
|
@@ -95,76 +104,37 @@ def get_side_by_side_visualisation(idx):
|
|
95 |
|
96 |
if __name__ == "__main__":
|
97 |
with gr.Blocks() as demo:
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
)
|
106 |
-
btn = gr.Button("Analyse")
|
107 |
-
audio_out = gr.Audio(label="Audio input")
|
108 |
with gr.Row():
|
109 |
-
|
110 |
-
|
111 |
-
label="Insertion Error Rate (IER)"
|
112 |
-
)
|
113 |
-
relative_length = gr.Number(
|
114 |
-
label="Relative length (reference length / target length)"
|
115 |
-
)
|
116 |
-
text_out = gr.Markdown(label="Text difference")
|
117 |
-
|
118 |
-
btn.click(
|
119 |
-
fn=partial(get_visualisation, model="v2"),
|
120 |
-
inputs=slider,
|
121 |
-
outputs=[audio_out, wer, ier, relative_length, text_out],
|
122 |
-
)
|
123 |
-
with gr.Tab("large-32-2"):
|
124 |
-
gr.Markdown(
|
125 |
-
"Analyse the transcriptions generated by the Whisper large-32-2 model on the TEDLIUM dev set."
|
126 |
-
)
|
127 |
-
slider = gr.Slider(
|
128 |
-
minimum=1, maximum=len(norm_target), step=1, label="Dataset sample"
|
129 |
-
)
|
130 |
-
btn = gr.Button("Analyse")
|
131 |
-
audio_out = gr.Audio(label="Audio input")
|
132 |
with gr.Row():
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
btn.click(
|
143 |
-
fn=partial(get_visualisation, model="32-2"),
|
144 |
-
inputs=slider,
|
145 |
-
outputs=[audio_out, wer, ier, relative_length, text_out],
|
146 |
-
)
|
147 |
-
with gr.Tab("side-by-side"):
|
148 |
-
gr.Markdown(
|
149 |
-
"Analyse the transcriptions generated by the Whisper large-32-2 model on the TEDLIUM dev set."
|
150 |
-
)
|
151 |
-
slider = gr.Slider(
|
152 |
-
minimum=1, maximum=len(norm_target), step=1, label="Dataset sample"
|
153 |
-
)
|
154 |
-
btn = gr.Button("Analyse")
|
155 |
-
audio_out = gr.Audio(label="Audio input")
|
156 |
-
with gr.Column():
|
157 |
-
table = gr.Dataframe(headers=["Model", "Word Error Rate (WER)", "Insertion Error Rate (IER)", "Rel length (ref length / tgt length)"], height=1000)
|
158 |
-
with gr.Row():
|
159 |
-
gr.Markdown("large-v2 text diff")
|
160 |
-
gr.Markdown("large-32-2 text diff")
|
161 |
-
with gr.Row():
|
162 |
-
text_out_v2 = gr.Markdown(label="Text difference")
|
163 |
-
text_out_32_2 = gr.Markdown(label="Text difference")
|
164 |
-
|
165 |
-
btn.click(
|
166 |
-
fn=get_side_by_side_visualisation,
|
167 |
-
inputs=slider,
|
168 |
-
outputs=[audio_out, table, text_out_v2, text_out_32_2],
|
169 |
-
)
|
170 |
demo.launch()
|
|
|
1 |
import os
|
|
|
2 |
|
3 |
import numpy as np
|
4 |
import unicodedata
|
|
|
63 |
max_range = np.iinfo(target_dtype).max
|
64 |
|
65 |
|
66 |
+
def get_visualisation(idx, model="large-v2", round_dp=2):
    """Build the audio clip, error metrics and styled text diff for one sample.

    Args:
        idx: 1-based sample number; it is shifted to a 0-based index before
            any lookup.
        model: Which set of normalised predictions to compare against the
            target, either ``"large-v2"`` or ``"large-32-2"``.
        round_dp: Number of decimal places the reported metrics are rounded to.

    Returns:
        A 5-tuple ``((sampling_rate, array), wer_percentage, ier_percentage,
        rel_length, full_text)``.

    Raises:
        ValueError: If ``model`` is not one of the two supported names.
    """
    sample = idx - 1

    clip = dataset[sample]["audio"]
    # scale the waveform by `max_range` before casting it to 16-bit integers
    waveform = (clip["array"] * max_range).astype(np.int16)
    rate = clip["sampling_rate"]

    target = norm_target[sample]

    # guard clause: reject unsupported model names before touching predictions
    if model not in ("large-v2", "large-32-2"):
        raise ValueError(f"Got unknown model {model}, should be one of `'large-v2'` or `'large-32-2'`.")
    prediction = norm_pred_v2[sample] if model == "large-v2" else norm_pred_32_2[sample]

    measures = process_words(target, prediction, wer_default, wer_default)
    reference_length = len(measures.references[0])

    # both error rates are reported as percentages
    wer_percentage = round(100 * measures.wer, round_dp)
    ier_percentage = round(100 * measures.insertions / reference_length, round_dp)

    # whitespace-split word count of the prediction relative to the target
    predicted_words = len(prediction.split())
    target_words = len(target.split())
    rel_length = round(predicted_words / target_words, round_dp)

    full_text = style_text(compare_string(target, prediction))

    return (rate, waveform), wer_percentage, ier_percentage, rel_length, full_text
|
92 |
|
93 |
+
|
94 |
def get_side_by_side_visualisation(idx):
    """Collect the visualisation outputs of both models for one sample.

    Args:
        idx: Sample number, forwarded unchanged to ``get_visualisation``.

    Returns:
        A 4-tuple ``(audio, table, large_v2_text, large_32_2_text)``: the audio
        entry from the large-v2 result, one table row per model (model name
        followed by that model's metrics), and each model's text diff.
    """
    per_model = [
        (name, get_visualisation(idx, model=name))
        for name in ("large-v2", "large-32-2")
    ]

    # each row: model name, then everything between the leading audio entry
    # and the trailing text diff of that model's result
    table = [[name, *result[1:-1]] for name, result in per_model]

    v2_result = per_model[0][1]
    result_32_2 = per_model[1][1]
    return v2_result[0], table, v2_result[-1], result_32_2[-1]
|
|
|
104 |
|
105 |
if __name__ == "__main__":
    # Single-page demo: pick a dataset sample with the slider, press the button,
    # and get the audio, a per-model metrics table and the two text diffs.
    with gr.Blocks() as demo:
        # Adjacent string literals are concatenated verbatim, so each fragment
        # must carry its own trailing space to keep the sentences separated.
        gr.Markdown(
            "Analyse the transcriptions generated by the Whisper large-v2 and large-32-2 models on the TEDLIUM dev set. "
            "The transcriptions for both models are shown at the bottom of the demo. The text diff for each is computed "
            "relative to the target transcriptions. Insertions are displayed in <span style='background-color:Lightgreen'>green</span>, and "
            "deletions in <span style='background-color:#FFCCCB'><s>red</s></span>."
        )
        # sample numbers are 1-based; get_visualisation subtracts one before indexing
        slider = gr.Slider(
            minimum=1, maximum=len(norm_target), step=1, label="Dataset sample"
        )
        btn = gr.Button("Analyse")
        audio_out = gr.Audio(label="Audio input")
        with gr.Column():
            # NOTE(review): the last column is computed in get_visualisation as
            # predicted word count / target word count - confirm that the
            # "ref length" wording in the header is what is intended.
            table = gr.Dataframe(
                headers=[
                    "Model",
                    "Word Error Rate (WER)",
                    "Insertion Error Rate (IER)",
                    "Rel length (ref length / tgt length)",
                ],
                height=1000,
            )
            # column titles for the two text diffs, followed by the diffs themselves
            with gr.Row():
                gr.Markdown("**large-v2 text diff**")
                gr.Markdown("**large-32-2 text diff**")
            with gr.Row():
                text_out_v2 = gr.Markdown(label="Text difference")
                text_out_32_2 = gr.Markdown(label="Text difference")

        # the four outputs line up with the 4-tuple returned by
        # get_side_by_side_visualisation
        btn.click(
            fn=get_side_by_side_visualisation,
            inputs=slider,
            outputs=[audio_out, table, text_out_v2, text_out_32_2],
        )
    demo.launch()
|