Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -125,10 +125,11 @@ class WhisperxModel:
         return self.align_model.align(segments, audio_path)
 
 @spaces.GPU
-def load_models(whisper_backend_name, ssrspeech_model_name):
+def load_models(ssrspeech_model_name):
     global transcribe_model, align_model, ssrspeech_model
 
     alignment_model_name = "whisperX"
+    whisper_backend_name = "whisperX"
     if ssrspeech_model_name == "English":
         ssrspeech_model_name = "English"
         text_tokenizer = TextTokenizer(backend="espeak")
@@ -141,16 +142,8 @@ def load_models(whisper_backend_name, ssrspeech_model_name):
         language = "zh"
     whisper_model_name = "base"
 
-
-
-
-    if whisper_model_name is not None:
-        if whisper_backend_name == "whisper":
-            transcribe_model = WhisperModel(whisper_model_name, language)
-        else:
-            if align_model is None:
-                raise gr.Error("Align model required for whisperx backend")
-            transcribe_model = WhisperxModel(whisper_model_name, align_model, language)
+    align_model = WhisperxAlignModel(language)
+    transcribe_model = WhisperxModel(whisper_model_name, align_model, language)
 
     ssrspeech_fn = f"{MODELS_PATH}/{ssrspeech_model_name}.pth"
     if not os.path.exists(ssrspeech_fn):
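Taken together, these two hunks collapse load_models to a single code path. A minimal sketch of how the function presumably reads after this commit, reconstructed from the hunks above (it relies on app.py's existing imports and globals; lines outside the shown diff context, such as the English-branch language assignment, are assumptions):

    @spaces.GPU
    def load_models(ssrspeech_model_name):
        global transcribe_model, align_model, ssrspeech_model

        alignment_model_name = "whisperX"
        whisper_backend_name = "whisperX"  # pinned here instead of being a parameter
        if ssrspeech_model_name == "English":
            ssrspeech_model_name = "English"
            text_tokenizer = TextTokenizer(backend="espeak")
            language = "en"  # assumption: not visible in the diff context
        else:
            language = "zh"  # context line shown in the second hunk
        whisper_model_name = "base"

        # Both models are now constructed unconditionally; the old
        # whisper/whisperX branching and its align_model None-check are gone.
        align_model = WhisperxAlignModel(language)
        transcribe_model = WhisperxModel(whisper_model_name, align_model, language)

        ssrspeech_fn = f"{MODELS_PATH}/{ssrspeech_model_name}.pth"
        if not os.path.exists(ssrspeech_fn):
            ...  # checkpoint handling continues outside the shown context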
@@ -261,7 +254,7 @@ def run(seed, sub_amount, ssrspeech_model_choice, codec_audio_sr, codec_sr, top_
     target_transcript = replace_numbers_with_words(transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
     orig_transcript = replace_numbers_with_words(original_transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
 
-    orig_transcript, segments = transcribe(audio_path)
+    [orig_transcript, segments, _] = transcribe(audio_path)
     if language == 'zh':
         converter = opencc.OpenCC('t2s')
         orig_transcript = converter.convert(orig_transcript)
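This is the first of three transcribe() call sites in run() that switch from two-element to three-element unpacking; the same one-line change appears in the next two hunks. Since every caller now discards the third value with _, transcribe() itself was presumably changed to return an extra item, though that change falls outside the context shown in this diff.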
@@ -289,7 +282,7 @@ def run(seed, sub_amount, ssrspeech_model_choice, codec_audio_sr, codec_sr, top_
 
     audio, _ = librosa.load(audio_path, sr=16000, duration=cut_length)
     sf.write(audio_path, audio, 16000)
-    orig_transcript, segments = transcribe(audio_path)
+    [orig_transcript, segments, _] = transcribe(audio_path)
 
     if language == 'zh':
         converter = opencc.OpenCC('t2s')
@@ -377,7 +370,7 @@ def run(seed, sub_amount, ssrspeech_model_choice, codec_audio_sr, codec_sr, top_
     new_audio = new_audio[0].cpu()
     torchaudio.save(audio_path, new_audio, codec_audio_sr)
     if tts: # remove the start parts
-        new_transcript, new_segments = transcribe(audio_path)
+        [new_transcript, new_segments, _] = transcribe(audio_path)
         if language == 'zh':
             transcribe_state = align(traditional_to_simplified(new_segments), audio_path)
             transcribe_state['segments'] = traditional_to_simplified(transcribe_state['segments'])
@@ -411,13 +404,6 @@ demo_text = {
     },
 }
 
-all_demo_texts = {vv for k, v in demo_text.items() for kk, vv in v.items()}
-
-demo_words = ['0.069 Gwynplain 0.611', '0.671 had, 0.912', '0.952 besides, 1.414', '1.494 for 1.634', '1.695 his 1.835', '1.915 work 2.136', '2.196 and 2.297', '2.337 for 2.517', '2.557 his 2.678', '2.758 feats 3.019', '3.079 of 3.139', '3.2 strength, 3.561', '4.022 round 4.263', '4.303 his 4.444', '4.524 neck 4.705', '4.745 and 4.825', '4.905 over 5.086', '5.146 his 5.266', '5.307 shoulders, 5.768', '6.23 an 6.33', '6.531 esclavine 7.133', '7.213 of 7.293', '7.353 leather. 7.614']
-
-demo_words_info = [{'word': 'Gwynplain', 'start': 0.069, 'end': 0.611, 'score': 0.833}, {'word': 'had,', 'start': 0.671, 'end': 0.912, 'score': 0.879}, {'word': 'besides,', 'start': 0.952, 'end': 1.414, 'score': 0.863}, {'word': 'for', 'start': 1.494, 'end': 1.634, 'score': 0.89}, {'word': 'his', 'start': 1.695, 'end': 1.835, 'score': 0.669}, {'word': 'work', 'start': 1.915, 'end': 2.136, 'score': 0.916}, {'word': 'and', 'start': 2.196, 'end': 2.297, 'score': 0.766}, {'word': 'for', 'start': 2.337, 'end': 2.517, 'score': 0.808}, {'word': 'his', 'start': 2.557, 'end': 2.678, 'score': 0.786}, {'word': 'feats', 'start': 2.758, 'end': 3.019, 'score': 0.97}, {'word': 'of', 'start': 3.079, 'end': 3.139, 'score': 0.752}, {'word': 'strength,', 'start': 3.2, 'end': 3.561, 'score': 0.742}, {'word': 'round', 'start': 4.022, 'end': 4.263, 'score': 0.916}, {'word': 'his', 'start': 4.303, 'end': 4.444, 'score': 0.666}, {'word': 'neck', 'start': 4.524, 'end': 4.705, 'score': 0.908}, {'word': 'and', 'start': 4.745, 'end': 4.825, 'score': 0.882}, {'word': 'over', 'start': 4.905, 'end': 5.086, 'score': 0.847}, {'word': 'his', 'start': 5.146, 'end': 5.266, 'score': 0.791}, {'word': 'shoulders,', 'start': 5.307, 'end': 5.768, 'score': 0.729}, {'word': 'an', 'start': 6.23, 'end': 6.33, 'score': 0.854}, {'word': 'esclavine', 'start': 6.531, 'end': 7.133, 'score': 0.803}, {'word': 'of', 'start': 7.213, 'end': 7.293, 'score': 0.772}, {'word': 'leather.', 'start': 7.353, 'end': 7.614, 'score': 0.896}]
-
-
 def get_app():
     with gr.Blocks() as app:
         with gr.Row():
@@ -428,7 +414,6 @@ def get_app():
         with gr.Row():
             ssrspeech_model_choice = gr.Radio(label="ssrspeech model", value="English",
                                               choices=["English", "Mandarin"])
-            whisper_backend_choice = gr.Radio(label="Whisper backend", value="whisperX", choices=["whisperX", "whisper"])
 
         with gr.Row():
             with gr.Column(scale=2):
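With the backend pinned inside load_models(), the whisper_backend_choice radio has no remaining consumer, so the UI keeps only the ssrspeech model selector; it becomes the single component wired into load_models in the final hunk.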
@@ -440,7 +425,7 @@ def get_app():
 
             with gr.Column(scale=3):
                 with gr.Group():
-                    transcript = gr.Textbox(label="Text", lines=7, value=demo_text["
+                    transcript = gr.Textbox(label="Text", lines=7, value=demo_text["Edit"]["regular"])
 
                     with gr.Row():
                         mode = gr.Radio(label="Mode", choices=["Edit", "TTS"], value="Edit")
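The new default indexes demo_text["Edit"]["regular"], which lines up with the Mode radio below defaulting to "Edit".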
@@ -449,9 +434,6 @@ def get_app():
 
             with gr.Column(scale=2):
                 output_audio = gr.Audio(label="Output Audio")
-                with gr.Accordion("Inference transcript", open=False):
-                    inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False,
-                                                      info="Inference was performed on this transcript.")
 
         with gr.Row():
             with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
@@ -477,7 +459,7 @@ def get_app():
         success_output = gr.HTML()
 
         load_models_btn.click(fn=load_models,
-                              inputs=[
+                              inputs=[ssrspeech_model_choice],
                               outputs=[models_selector, success_output])
 
         semgents = gr.State() # not used
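A Gradio click handler's inputs list maps positionally onto the handler's parameters, which is why dropping whisper_backend_name from load_models() also trims inputs= to the one remaining component. A minimal, self-contained sketch of the pattern (component names reuse those from app.py; the handler body and single-output wiring are placeholders, not the app's actual logic):

    import gradio as gr

    def load_models(ssrspeech_model_name):
        # placeholder body; app.py loads the whisperX and SSR-Speech models here
        return f"Models loaded for {ssrspeech_model_name}"

    with gr.Blocks() as app:
        ssrspeech_model_choice = gr.Radio(label="ssrspeech model", value="English",
                                          choices=["English", "Mandarin"])
        load_models_btn = gr.Button("Load models")
        success_output = gr.HTML()
        # one input component -> one positional argument
        load_models_btn.click(fn=load_models,
                              inputs=[ssrspeech_model_choice],
                              outputs=[success_output])

    app.launch()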