Commit 8ae5c89 · Update app.py
Parent(s): f34b85f

app.py CHANGED
@@ -2,12 +2,20 @@ import numpy as np
 import streamlit as st
 from constants import WHISPER_MODELS, language_dict
 import streamlit as st
-from utils import
+from utils import (
+    translate_to_english,
+    detect_language,
+    write,
+    read,
+    get_key,
+
+)
 import whisperx as whisper
 import json
 import pandas as pd
 from pydub import AudioSegment
 import os
+import uuid
 
 if "btn1" not in st.session_state:
     st.session_state["btn1"] = False
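Note: the truncated `from utils import` line is replaced by a parenthesized multi-line import, and `import uuid` backs the uuid-based temporary filenames used later (`name = str(uuid.uuid1())`). The duplicated `import streamlit as st` (lines 2 and 4 on both sides) is harmless but could be dropped. For collision-safe scratch files, the standard library's tempfile module is an alternative worth noting; a minimal sketch, not part of this commit:

    import tempfile

    # delete=False keeps the file on disk after the handle closes,
    # so pydub/whisperx can reopen it by path later
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name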
@@ -52,14 +60,14 @@ with input:
         # on_change=disable_btn2,
         # disabled=st.session_state["btn1"],
     )
-
-
-
-
-
-
-
-    text_json = None
+    text_json = st.file_uploader(
+        label="Aligned JSON",
+        type=["json"],
+        help="Your aligned json file (Only if you need to skip transcribe)",
+        # disabled=st.session_state["btn2"],
+        # on_change=disable_btn1,
+    )
+    # text_json = None
 
     # st.markdown("""**model**""", unsafe_allow_html=True)
     model_name = st.selectbox(
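Note: `st.file_uploader` returns an `UploadedFile` (a file-like object) or `None`, and the rest of the script branches on `text_json is None` to choose between the transcribe path and the skip-transcribe alignment path. Since `UploadedFile` is file-like, the upload can be parsed directly; a minimal sketch with hypothetical names:

    import json
    import streamlit as st

    uploaded = st.file_uploader("Aligned JSON", type=["json"])
    if uploaded is not None:
        segments = json.load(uploaded)  # no temp file needed to parse the upload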
@@ -137,16 +145,16 @@ with input:
     )
     else:
         temperature = [temperature]
-    try:
-        if len(temperature) == 0:
-            st.error("Choose correct value for temperature")
-    except:
-        pass
-    # st.write(temperature)
     submit = st.button("Submit", type="primary")
 with output:
     st.header("Output")
-
+
+    segments_pre = st.empty()
+    segments_post = st.empty()
+    segments_post_json = st.empty()
+    segments_post2 = st.empty()
+    trans = st.empty()
+    lang = st.empty()
 
     name = str(uuid.uuid1())
     if submit:
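Note: the six `st.empty()` calls reserve fixed slots in the output column before `if submit:` runs; filling a slot later (e.g. `trans.text_area(...)`) renders into that reserved position rather than appending below, which lets both code paths write into the same layout. `segments_post_json` appears unused in this diff. A minimal illustration of the placeholder pattern:

    import streamlit as st

    slot = st.empty()                    # reserve a position now
    st.write("rendered after the slot")
    slot.text_area("label", "filled in later, displayed in the reserved slot")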
@@ -157,46 +165,37 @@ with output:
         if audio_uploaded.name.endswith(".wav"):
             temp = AudioSegment.from_wav(audio_uploaded)
             temp.export(f"{name}.wav")
+        if audio_uploaded.name.endswith(".mp3"):
+            temp = AudioSegment.from_wav(audio_uploaded)
+            temp.export(f"{name}.wav")
+        if language == "":
+            model = whisper.load_model(model_name)
+            with st.spinner("Detecting language..."):
+                detection = detect_language(f"{name}.wav", model)
+                language = detection.get("detected_language")
+            del model
+            if len(language) > 2:
+                language = get_key(language)
 
-
-            temp = AudioSegment.from_wav(audio_uploaded)
-            temp.export(f"{name}.wav")
+        if text_json is None:
 
-
-
-        if language == "":
+            with st.spinner("Running ... "):
+                decode = {"suppress_tokens": suppress_tokens, "beam_size": 5}
                 model = whisper.load_model(model_name)
-            with st.
-
-
-
-
-
-
-
-
-
-
-
-
-
-        with st.spinner("Running ... "):
-            decode = {"suppress_tokens": suppress_tokens, "beam_size": 5}
-            model = whisper.load_model(model_name)
-            with st.container():
-                with st.spinner(f"Running with {model_name} model"):
-                    result = model.transcribe(
-                        f"{name}.wav",
-                        language=language,
-                        patience=patience,
-                        initial_prompt=initial_prompt,
-                        condition_on_previous_text=condition_on_previous_text,
-                        temperature=temperature,
-                        compression_ratio_threshold=compression_ratio_threshold,
-                        logprob_threshold=logprob_threshold,
-                        no_speech_threshold=no_speech_threshold,
-                        **decode,
-                    )
+                with st.container():
+                    with st.spinner(f"Running with {model_name} model"):
+                        result = model.transcribe(
+                            f"{name}.wav",
+                            language=language,
+                            patience=patience,
+                            initial_prompt=initial_prompt,
+                            condition_on_previous_text=condition_on_previous_text,
+                            temperature=temperature,
+                            compression_ratio_threshold=compression_ratio_threshold,
+                            logprob_threshold=logprob_threshold,
+                            no_speech_threshold=no_speech_threshold,
+                            **decode,
+                        )
 
         if translate:
             result = translate_to_english(result, json=False)
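Note: the new branch detects the language only when none was chosen, then frees the detection model with `del model` before loading the transcription model; `get_key` presumably maps a full language name back to its short code. The `.mp3` branch, however, still decodes with `AudioSegment.from_wav`, and pydub's `export` defaults to MP3 encoding when no `format` argument is given, so MP3 uploads will likely fail or produce a mislabeled file. A corrected sketch, not what the commit does:

    from pydub import AudioSegment

    if audio_uploaded.name.endswith(".mp3"):
        temp = AudioSegment.from_mp3(audio_uploaded)  # decode MP3 input
        temp.export(f"{name}.wav", format="wav")      # write a real WAV for whisperx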
@@ -213,21 +212,6 @@ with output:
                 f"{name}.wav",
                 device=device,
             )
-
-            if text_json is not None:
-                if translate:
-                    result = translate_to_english(text_json, json=True)
-            with st.spinner("Running alignment model ..."):
-                model_a, metadata = whisper.load_align_model(
-                    language_code=language, device=device
-                )
-
-            result_aligned = whisper.align(
-                text_json, model_a, metadata, audio_uploaded.name, device
-            )
-
-            if text_json is None:
-                words_segments = result_aligned["word_segments"]
             write(
                 f"{name}.wav",
                 dtype=transcription,
@@ -237,13 +221,40 @@ with output:
             trans.text_area(
                 "transcription", trans_text, height=None, max_chars=None, key=None
             )
-
-
-
-
-
-
-
+            char_segments = []
+            word_segments = []
+
+            for x in range(len(result_aligned["segments"])):
+                word_segments.append(
+                    {
+                        "word-segments": result_aligned["segments"][x][
+                            "word-segments"
+                        ]
+                        .fillna("")
+                        .to_dict(orient="records")
+                    }
+                )
+                char_segments.append(
+                    {
+                        "char-segments": result_aligned["segments"][x][
+                            "char-segments"
+                        ]
+                        .fillna("")
+                        .to_dict(orient="records")
+                    }
+                )
+
+            for x in range(len(result_aligned["segments"])):
+
+                result_aligned["segments"][x]["word-segments"] = word_segments[x]
+                result_aligned["segments"][x]["char-segments"] = char_segments[x]
+            segments_pre.text_area(
+                "Segments before alignment",
+                result["segments"],
+                height=None,
+                max_chars=None,
+                key=None,
+            )
             segments_post.text_area(
                 "Word Segments after alignment",
                 result_aligned["word_segments"],
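Note: the `.fillna("")` calls imply that, in this whisperx version, each aligned segment stores its "word-segments" and "char-segments" as pandas objects, which `st.text_area` cannot display directly; `.fillna("").to_dict(orient="records")` swaps NaN for empty strings and flattens each frame into a list of row dicts. The committed code wraps each list in an extra single-key dict and uses two loops where one would do; a compacted sketch of the same conversion that assigns the lists back directly:

    for seg in result_aligned["segments"]:
        for key in ("word-segments", "char-segments"):
            # DataFrame -> JSON-friendly list of row dicts, NaN -> ""
            seg[key] = seg[key].fillna("").to_dict(orient="records")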
@@ -251,10 +262,6 @@ with output:
                 max_chars=None,
                 key=None,
             )
-            with open("segments.json", "w", encoding="utf-8") as f:
-
-                json.dump(result_aligned["word_segments"], f, indent=False)
-
             segments_post2.text_area(
                 "Segments after alignment",
                 result_aligned["segments"],
@@ -265,4 +272,100 @@ with output:
             lang.text_input(
                 "detected language", language_dict.get(language), disabled=True
             )
-        os.remove(f"{name}.wav")
+            os.remove(f"{name}.wav")
+        if text_json is not None:
+            with st.spinner("Running ... "):
+
+                model = whisper.load_model(model_name)
+                json_filname = str(uuid.uuid1())
+                data = json.load(text_json)
+
+                # Close the uploaded file
+                text_json.close()
+
+                # Write the JSON data to a new file
+                with open(f"{json_filname}.json", "w") as outfile:
+                    json.dump(data, outfile)
+
+                # with open("fold.json", "w", encoding="utf-8") as f:
+                #     json.dump(text_json, f)
+                with open(f"{json_filname}.json", "r", encoding="utf-8") as f:
+                    cont = json.load(f)
+
+                with st.spinner("Running alignment model ..."):
+                    model_a, metadata = whisper.load_align_model(
+                        language_code=language, device=device
+                    )
+                result_aligned = whisper.align(
+                    cont,
+                    model_a,
+                    metadata,
+                    f"{name}.wav",
+                    device=device,
+                )
+                words_segments = result_aligned["word_segments"]
+                write(
+                    f"{name}.wav",
+                    dtype=transcription,
+                    result_aligned=result_aligned,
+                )
+                trans_text = read(f"{name}.wav", transcription)
+                char_segments = []
+                word_segments = []
+
+                for x in range(len(result_aligned["segments"])):
+                    word_segments.append(
+                        {
+                            "word-segments": result_aligned["segments"][x][
+                                "word-segments"
+                            ]
+                            .fillna("")
+                            .to_dict(orient="records")
+                        }
+                    )
+                    char_segments.append(
+                        {
+                            "char-segments": result_aligned["segments"][x][
+                                "char-segments"
+                            ]
+                            .fillna("")
+                            .to_dict(orient="records")
+                        }
+                    )
+
+                for x in range(len(result_aligned["segments"])):
+
+                    result_aligned["segments"][x]["word-segments"] = word_segments[x]
+                    result_aligned["segments"][x]["char-segments"] = char_segments[x]
+                trans.text_area(
+                    "transcription", trans_text, height=None, max_chars=None, key=None
+                )
+                segments_pre.text_area(
+                    "Segments before alignment",
+                    cont,
+                    height=None,
+                    max_chars=None,
+                    key=None,
+                )
+
+                segments_post.text_area(
+                    "Word Segments after alignment",
+                    result_aligned["word_segments"],
+                    height=None,
+                    max_chars=None,
+                    key=None,
+                )
+
+                segments_post2.text_area(
+                    "Segments after alignment",
+                    result_aligned["segments"],
+                    expanded=False,
+                    height=None,
+                    max_chars=None,
+                    key=None,
+                )
+                lang.text_input(
+                    "detected language", language_dict.get(language), disabled=True
+                )
+                os.remove(f"{name}.wav")
+                os.remove(f"{json_filname}.json")
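Note: this branch round-trips the uploaded JSON through a uuid-named temp file only to read it straight back, so `cont` ends up equal to `data` and the upload could feed `whisper.align` directly; the `model = whisper.load_model(model_name)` result also goes unused here, since alignment only needs `load_align_model`. Separately, `expanded` is an `st.json` parameter, not an `st.text_area` one, so that keyword would be rejected at runtime. A trimmed sketch of the same alignment flow:

    data = json.load(text_json)  # parse the upload in memory
    text_json.close()

    with st.spinner("Running alignment model ..."):
        model_a, metadata = whisper.load_align_model(
            language_code=language, device=device
        )
    result_aligned = whisper.align(
        data, model_a, metadata, f"{name}.wav", device=device
    )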