OpenSound committed on
Commit
4b8ea71
·
1 Parent(s): 0b57247

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -51
app.py CHANGED
@@ -31,16 +31,6 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
31
  def get_random_string():
32
  return "".join(str(uuid.uuid4()).split("-"))
33
 
34
- def traditional_to_simplified(segments):
35
- converter = opencc.OpenCC('t2s')
36
- seg_num = len(segments)
37
- for i in range(seg_num):
38
- words = segments[i]['words']
39
- for j in range(len(words)):
40
- segments[i]['words'][j]['word'] = converter.convert(segments[i]['words'][j]['word'])
41
- segments[i]['text'] = converter.convert(segments[i]['text'])
42
- return segments
43
-
44
  @spaces.GPU
45
  def seed_everything(seed):
46
  if seed != -1:
@@ -80,50 +70,14 @@ def get_mask_interval(transcribe_state, word_span):
80
 
81
  return (start, end)
82
 
83
-
84
- from whisperx import load_align_model
85
-
86
-
87
- @spaces.GPU
88
- class WhisperxAlignModel:
89
- def __init__(self, language):
90
- from whisperx import load_align_model
91
- self.model, self.metadata = load_align_model(language_code=language, device=device)
92
-
93
- def align(self, segments, audio_path):
94
- from whisperx import align, load_audio
95
- audio = load_audio(audio_path)
96
- return align(segments, self.model, self.metadata, audio, device, return_char_alignments=False)["segments"]
97
-
98
-
99
- @spaces.GPU
100
- class WhisperxModel:
101
- def __init__(self, model_name, align_model, language):
102
- from whisperx import load_model
103
- self.model = load_model(model_name, device, asr_options={"suppress_numerals": True, "max_new_tokens": None, "clip_timestamps": None, "hallucination_silence_threshold": None}, language=language)
104
- self.align_model = align_model
105
-
106
- def transcribe(self, audio_path):
107
- segments = self.model.transcribe(audio_path, batch_size=8)["segments"]
108
- for segment in segments:
109
- segment['text'] = replace_numbers_with_words(segment['text'])
110
- return self.align_model.align(segments, audio_path)
111
-
112
  from whisperx import load_align_model, load_model, load_audio
113
  from whisperx import align as align_func
114
 
115
-
116
  ssrspeech_model_name = "English"
117
  text_tokenizer = TextTokenizer(backend="espeak")
118
  language = "en"
119
  transcribe_model_name = "base.en"
120
 
121
- # align_model = WhisperxAlignModel(language)
122
- # transcribe_model = WhisperxModel(transcribe_model_name, align_model, language)
123
-
124
- # align_model, align_model_metadata = load_align_model(language_code=language, device=device)
125
- # transcribe_model = load_model(transcribe_model_name, device, asr_options={"suppress_numerals": True, "max_new_tokens": None, "clip_timestamps": None, "hallucination_silence_threshold": None}, language=language)
126
-
127
  ssrspeech_fn = f"{MODELS_PATH}/{ssrspeech_model_name}.pth"
128
  if not os.path.exists(ssrspeech_fn):
129
  os.system(f"wget https://huggingface.co/westbrook/SSR-Speech-{ssrspeech_model_name}/resolve/main/{ssrspeech_model_name}.pth -O " + ssrspeech_fn)
@@ -161,7 +115,10 @@ def get_transcribe_state(segments):
161
  def transcribe(audio_path):
162
  align_model, _ = load_align_model(language_code=language, device=device)
163
  transcribe_model = load_model(transcribe_model_name, device, asr_options={"suppress_numerals": True, "max_new_tokens": None, "clip_timestamps": None, "hallucination_silence_threshold": None}, language=language)
164
- segments = transcribe_model.transcribe(audio_path)
 
 
 
165
  state = get_transcribe_state(segments)
166
  success_message = "<span style='color:green;'>Success: Transcribe completed successfully!</span>"
167
 
@@ -350,12 +307,12 @@ demo_text = {
350
  def get_app():
351
  with gr.Blocks() as app:
352
  gr.Markdown("""
353
- # EzAudio: High-quality Text-to-Audio Generator
354
- Generate and edit audio from text using a diffusion transformer. Adjust advanced settings for more control.
355
 
356
- Learn more about 🟣**EzAudio** on the [EzAudio Homepage](https://haidog-yaqub.github.io/EzAudio-Page/).
357
 
358
- 🚀 The **EzAudio-ControlNet (Energy Envelope)** demo is now live! Try it on [🤗EzAudio-ControlNet Space](https://huggingface.co/spaces/OpenSound/EzAudio-ControlNet).
359
  """)
360
  with gr.Row():
361
  with gr.Column(scale=2):
 
31
  def get_random_string():
32
  return "".join(str(uuid.uuid4()).split("-"))
33
 
 
 
 
 
 
 
 
 
 
 
34
  @spaces.GPU
35
  def seed_everything(seed):
36
  if seed != -1:
 
70
 
71
  return (start, end)
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  from whisperx import load_align_model, load_model, load_audio
74
  from whisperx import align as align_func
75
 
 
76
  ssrspeech_model_name = "English"
77
  text_tokenizer = TextTokenizer(backend="espeak")
78
  language = "en"
79
  transcribe_model_name = "base.en"
80
 
 
 
 
 
 
 
81
  ssrspeech_fn = f"{MODELS_PATH}/{ssrspeech_model_name}.pth"
82
  if not os.path.exists(ssrspeech_fn):
83
  os.system(f"wget https://huggingface.co/westbrook/SSR-Speech-{ssrspeech_model_name}/resolve/main/{ssrspeech_model_name}.pth -O " + ssrspeech_fn)
 
115
  def transcribe(audio_path):
116
  align_model, _ = load_align_model(language_code=language, device=device)
117
  transcribe_model = load_model(transcribe_model_name, device, asr_options={"suppress_numerals": True, "max_new_tokens": None, "clip_timestamps": None, "hallucination_silence_threshold": None}, language=language)
118
+ segments = transcribe_model.transcribe(audio_path, batch_size=8)["segments"]
119
+ for segment in segments:
120
+ segment['text'] = replace_numbers_with_words(segment['text'])
121
+ segments = align_model.align(segments, audio_path)
122
  state = get_transcribe_state(segments)
123
  success_message = "<span style='color:green;'>Success: Transcribe completed successfully!</span>"
124
 
 
307
  def get_app():
308
  with gr.Blocks() as app:
309
  gr.Markdown("""
310
+ # SSR-Speech: High-quality Speech Editor and Text-to-Speech Synthesizer
311
+ Generate and edit speech from text. Adjust advanced settings for more control.
312
 
313
+ Learn more about 🟣**SSR-Speech** on the [SSR-Speech Homepage](https://wanghelin1997.github.io/SSR-Speech-Demo/).
314
 
315
+ 🚀 The **SSR-Speech (Mandarin)** demo is now live! Try it on [🤗SSR-Speech-Mandarin Space](https://huggingface.co/spaces/OpenSound/SSR-Speech-Mandarin).
316
  """)
317
  with gr.Row():
318
  with gr.Column(scale=2):