OpenSound committed

Commit 9457d94 · 1 parent: cc9b589

Update app.py

Files changed (1): app.py (+9 -27)
app.py CHANGED
@@ -125,10 +125,11 @@ class WhisperxModel:
         return self.align_model.align(segments, audio_path)
 
 @spaces.GPU
-def load_models(whisper_backend_name, ssrspeech_model_name):
+def load_models(ssrspeech_model_name):
     global transcribe_model, align_model, ssrspeech_model
 
     alignment_model_name = "whisperX"
+    whisper_backend_name = "whisperX"
     if ssrspeech_model_name == "English":
         ssrspeech_model_name = "English"
         text_tokenizer = TextTokenizer(backend="espeak")
@@ -141,16 +142,8 @@ def load_models(whisper_backend_name, ssrspeech_model_name):
         language = "zh"
         whisper_model_name = "base"
 
-    if alignment_model_name is not None:
-        align_model = WhisperxAlignModel(language)
-
-    if whisper_model_name is not None:
-        if whisper_backend_name == "whisper":
-            transcribe_model = WhisperModel(whisper_model_name, language)
-        else:
-            if align_model is None:
-                raise gr.Error("Align model required for whisperx backend")
-            transcribe_model = WhisperxModel(whisper_model_name, align_model, language)
+    align_model = WhisperxAlignModel(language)
+    transcribe_model = WhisperxModel(whisper_model_name, align_model, language)
 
     ssrspeech_fn = f"{MODELS_PATH}/{ssrspeech_model_name}.pth"
    if not os.path.exists(ssrspeech_fn):
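Net effect: the plain-whisper backend and its guard rails are gone, and whisperX is always used for both transcription and alignment. Inside the wrapper classes (defined earlier in app.py, outside this diff), the pairing presumably boils down to the standard whisperx calls; a minimal sketch against the public whisperx API, with the device, model name, and input file as assumptions:

    import whisperx

    device = "cuda"  # assumption: the Space runs on GPU (@spaces.GPU)

    # Transcribe with a Whisper checkpoint, then force-align the segments
    # with a language-specific alignment model -- the two halves that
    # WhisperxModel and WhisperxAlignModel wrap in app.py.
    model = whisperx.load_model("base", device)
    audio = whisperx.load_audio("sample.wav")  # hypothetical input file
    result = model.transcribe(audio)

    align_model, metadata = whisperx.load_align_model(language_code="en", device=device)
    aligned = whisperx.align(result["segments"], align_model, metadata, audio, device)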
@@ -261,7 +254,7 @@ def run(seed, sub_amount, ssrspeech_model_choice, codec_audio_sr, codec_sr, top_
     target_transcript = replace_numbers_with_words(transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
     orig_transcript = replace_numbers_with_words(original_transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
 
-    orig_transcript, segments = transcribe(audio_path)
+    [orig_transcript, segments, _] = transcribe(audio_path)
     if language == 'zh':
         converter = opencc.OpenCC('t2s')
         orig_transcript = converter.convert(orig_transcript)
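For the Mandarin model, whisperX can emit Traditional Chinese characters, so each transcript is normalized through OpenCC's t2s (Traditional-to-Simplified) table before any comparison. For reference, the same conversion in isolation:

    import opencc

    converter = opencc.OpenCC('t2s')       # Traditional -> Simplified
    print(converter.convert('漢語 轉換'))  # -> '汉语 转换'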
@@ -289,7 +282,7 @@ def run(seed, sub_amount, ssrspeech_model_choice, codec_audio_sr, codec_sr, top_
 
     audio, _ = librosa.load(audio_path, sr=16000, duration=cut_length)
     sf.write(audio_path, audio, 16000)
-    orig_transcript, segments = transcribe(audio_path)
+    [orig_transcript, segments, _] = transcribe(audio_path)
 
     if language == 'zh':
         converter = opencc.OpenCC('t2s')
@@ -377,7 +370,7 @@ def run(seed, sub_amount, ssrspeech_model_choice, codec_audio_sr, codec_sr, top_
     new_audio = new_audio[0].cpu()
     torchaudio.save(audio_path, new_audio, codec_audio_sr)
     if tts: # remove the start parts
-        new_transcript, new_segments = transcribe(audio_path)
+        [new_transcript, new_segments, _] = transcribe(audio_path)
         if language == 'zh':
             transcribe_state = align(traditional_to_simplified(new_segments), audio_path)
             transcribe_state['segments'] = traditional_to_simplified(transcribe_state['segments'])
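All three call sites (the two in run() above and this one) now unpack three values from transcribe() and discard the third, so the helper, which was updated outside this diff, must return a 3-tuple. A hypothetical signature consistent with these call sites; the body, the wrapper call, and the third value are assumptions:

    def transcribe(audio_path):
        # transcribe_model is the global WhisperxModel set by load_models().
        segments = transcribe_model.transcribe(audio_path)  # assumed wrapper call
        transcript = " ".join(seg["text"].strip() for seg in segments)
        extra = None  # placeholder: the third return value is unused at these call sites
        return transcript, segments, extra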
@@ -411,13 +404,6 @@ demo_text = {
     },
 }
 
-all_demo_texts = {vv for k, v in demo_text.items() for kk, vv in v.items()}
-
-demo_words = ['0.069 Gwynplain 0.611', '0.671 had, 0.912', '0.952 besides, 1.414', '1.494 for 1.634', '1.695 his 1.835', '1.915 work 2.136', '2.196 and 2.297', '2.337 for 2.517', '2.557 his 2.678', '2.758 feats 3.019', '3.079 of 3.139', '3.2 strength, 3.561', '4.022 round 4.263', '4.303 his 4.444', '4.524 neck 4.705', '4.745 and 4.825', '4.905 over 5.086', '5.146 his 5.266', '5.307 shoulders, 5.768', '6.23 an 6.33', '6.531 esclavine 7.133', '7.213 of 7.293', '7.353 leather. 7.614']
-
-demo_words_info = [{'word': 'Gwynplain', 'start': 0.069, 'end': 0.611, 'score': 0.833}, {'word': 'had,', 'start': 0.671, 'end': 0.912, 'score': 0.879}, {'word': 'besides,', 'start': 0.952, 'end': 1.414, 'score': 0.863}, {'word': 'for', 'start': 1.494, 'end': 1.634, 'score': 0.89}, {'word': 'his', 'start': 1.695, 'end': 1.835, 'score': 0.669}, {'word': 'work', 'start': 1.915, 'end': 2.136, 'score': 0.916}, {'word': 'and', 'start': 2.196, 'end': 2.297, 'score': 0.766}, {'word': 'for', 'start': 2.337, 'end': 2.517, 'score': 0.808}, {'word': 'his', 'start': 2.557, 'end': 2.678, 'score': 0.786}, {'word': 'feats', 'start': 2.758, 'end': 3.019, 'score': 0.97}, {'word': 'of', 'start': 3.079, 'end': 3.139, 'score': 0.752}, {'word': 'strength,', 'start': 3.2, 'end': 3.561, 'score': 0.742}, {'word': 'round', 'start': 4.022, 'end': 4.263, 'score': 0.916}, {'word': 'his', 'start': 4.303, 'end': 4.444, 'score': 0.666}, {'word': 'neck', 'start': 4.524, 'end': 4.705, 'score': 0.908}, {'word': 'and', 'start': 4.745, 'end': 4.825, 'score': 0.882}, {'word': 'over', 'start': 4.905, 'end': 5.086, 'score': 0.847}, {'word': 'his', 'start': 5.146, 'end': 5.266, 'score': 0.791}, {'word': 'shoulders,', 'start': 5.307, 'end': 5.768, 'score': 0.729}, {'word': 'an', 'start': 6.23, 'end': 6.33, 'score': 0.854}, {'word': 'esclavine', 'start': 6.531, 'end': 7.133, 'score': 0.803}, {'word': 'of', 'start': 7.213, 'end': 7.293, 'score': 0.772}, {'word': 'leather.', 'start': 7.353, 'end': 7.614, 'score': 0.896}]
-
-
 def get_app():
     with gr.Blocks() as app:
         with gr.Row():
@@ -428,7 +414,6 @@ def get_app():
         with gr.Row():
             ssrspeech_model_choice = gr.Radio(label="ssrspeech model", value="English",
                                               choices=["English", "Mandarin"])
-            whisper_backend_choice = gr.Radio(label="Whisper backend", value="whisperX", choices=["whisperX", "whisper"])
 
         with gr.Row():
             with gr.Column(scale=2):
@@ -440,7 +425,7 @@ def get_app():
 
         with gr.Column(scale=3):
             with gr.Group():
-                transcript = gr.Textbox(label="Text", lines=7, value=demo_text["TTS"]["regular"])
+                transcript = gr.Textbox(label="Text", lines=7, value=demo_text["Edit"]["regular"])
 
                 with gr.Row():
                     mode = gr.Radio(label="Mode", choices=["Edit", "TTS"], value="Edit")
@@ -449,9 +434,6 @@ def get_app():
 
         with gr.Column(scale=2):
             output_audio = gr.Audio(label="Output Audio")
-            with gr.Accordion("Inference transcript", open=False):
-                inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False,
-                                                  info="Inference was performed on this transcript.")
 
         with gr.Row():
             with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
@@ -477,7 +459,7 @@ def get_app():
         success_output = gr.HTML()
 
         load_models_btn.click(fn=load_models,
-                              inputs=[whisper_backend_choice, ssrspeech_model_choice],
+                              inputs=[ssrspeech_model_choice],
                               outputs=[models_selector, success_output])
 
         semgents = gr.State() # not used
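With the backend radio removed, the button wiring passes a single component, and Gradio forwards its value as the lone positional argument of the new load_models(ssrspeech_model_name). A stripped-down, runnable sketch of the same pattern (a stand-alone demo, not the app itself; the stub body and single output are assumptions):

    import gradio as gr

    def load_models(ssrspeech_model_name):
        # Stub: the real function builds the whisperX and SSR-Speech models.
        return f"<b>{ssrspeech_model_name} models loaded</b>"

    with gr.Blocks() as app:
        ssrspeech_model_choice = gr.Radio(label="ssrspeech model", value="English",
                                          choices=["English", "Mandarin"])
        load_models_btn = gr.Button("Load models")
        success_output = gr.HTML()
        # One input component -> one argument; previously the backend radio
        # supplied a second value that load_models no longer accepts.
        load_models_btn.click(fn=load_models,
                              inputs=[ssrspeech_model_choice],
                              outputs=[success_output])

    app.launch()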
 