jiedong-yang committed
Commit 5367de0 · 1 Parent(s): 4ee15f5

Upload app.py

Files changed (1)
  1. app.py +13 -11
app.py CHANGED
@@ -10,6 +10,8 @@ from wordcloud import WordCloud, STOPWORDS
 from scipy.io.wavfile import write
 from espnet2.bin.tts_inference import Text2Speech
 
+from utils import *
+
 # load whisper model for ASR and BART for summarization
 asr_model = whisper.load_model('base.en')
 summarizer = gr.Interface.load("facebook/bart-large-cnn", src='huggingface')
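Note: later hunks lean on names pulled in by this wildcard import (audio_from_url and footer_html, and possibly speech_to_text), but utils.py itself is not part of this commit. A purely illustrative sketch of what such a module might export; the yt-dlp download and every signature here are assumptions, not the author's code:

# hypothetical utils.py -- only the names used in the diff are attested
import yt_dlp

footer_html = "<p style='text-align: center'>demo footer</p>"  # placeholder markup

def audio_from_url(url: str, out_path: str = "audio.wav") -> str:
    """Download the audio track of a video URL to a local wav file (assumed behaviour)."""
    opts = {
        "format": "bestaudio/best",
        "outtmpl": out_path.rsplit(".", 1)[0],  # yt-dlp appends the extension itself
        "postprocessors": [{"key": "FFmpegExtractAudio", "preferredcodec": "wav"}],
    }
    with yt_dlp.YoutubeDL(opts) as ydl:
        ydl.download([url])
    return out_path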
@@ -23,7 +25,7 @@ def load_model(name: str):
     :return:
     """
     global asr_model
-    asr_model = whisper.load_model(f"{name.lower()}.en")
+    asr_model = whisper.load_model(f"{name.lower()}")
     return name
 
 
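This pairs with the dropdown change later in the diff: the ".en" suffix now travels in the UI value ("Tiny.en", "Base.en") instead of being appended inside load_model, so the checkpoint name handed to Whisper is unchanged:

# before: "Tiny"    -> f"{name.lower()}.en" == "tiny.en"
# after:  "Tiny.en" -> f"{name.lower()}"    == "tiny.en"
assert f"{'Tiny'.lower()}.en" == f"{'Tiny.en'.lower()}" == "tiny.en"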
 
@@ -144,9 +146,6 @@ with demo:
     2. Generate transcription with Whisper (English Only)
     3. Summarize the transcribed speech
     4. Generate summary speech with the ESPNet model
-
-    model references:
-    - [Whisper](https://github.com/openai/whisper), [ESPNet](https://github.com/espnet/espnet_model_zoo)
     """)
 
     # data preparation
@@ -161,17 +160,13 @@ with demo:
 
     url.change(audio_from_url, inputs=url, outputs=speech)
 
-    examples = gr.Examples(examples=["https://www.youtube.com/watch?v=DuX4K4eeTz8",
-                                     "https://www.youtube.com/watch?v=nepOSEGHHCQ"],
-                           inputs=[url])
-
     # ASR
     text = gr.Textbox(label="Transcription", placeholder="transcription")
 
     with gr.Row():
-        default_values = dict(model='Base', bs=5, bo=5) if torch.cuda.is_available() \
-            else dict(model='Tiny', bs=1, bo=1)
-        model_options = gr.Dropdown(['Tiny', 'Base'], value=default_values['model'], label="models")
+        default_values = dict(model='Base.en', bs=5, bo=5) if torch.cuda.is_available() \
+            else dict(model='Tiny.en', bs=1, bo=1)
+        model_options = gr.Dropdown(['Tiny.en', 'Base.en'], value=default_values['model'], label="models")
         model_options.change(load_model, inputs=model_options, outputs=model_options)
 
         beam_size_slider = gr.Slider(1, 10, value=default_values['bs'], step=1, label="param: beam_size")
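The defaults scale with hardware: Base.en with beam_size=5 and best_of=5 when a GPU is available, Tiny.en with 1/1 on CPU. A rough sketch of how such values would reach Whisper's decoder; only the defaults come from the diff, the transcribe wiring is an assumption:

import torch
import whisper

gpu = torch.cuda.is_available()
defaults = dict(model='Base.en', bs=5, bo=5) if gpu else dict(model='Tiny.en', bs=1, bo=1)

model = whisper.load_model(defaults['model'].lower())
result = model.transcribe("audio.wav",
                          beam_size=defaults['bs'],  # beams kept during beam-search decoding
                          best_of=defaults['bo'])    # candidates considered when sampling
print(result["text"])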
@@ -202,6 +197,13 @@ with demo:
 
     text.change(wordcloud_func, inputs=text, outputs=image)
 
+    examples = gr.Examples(examples=["https://www.youtube.com/watch?v=DuX4K4eeTz8",
+                                     "https://www.youtube.com/watch?v=nepOSEGHHCQ"],
+                           fn=lambda x: speech_to_text(audio_from_url(x)),
+                           inputs=url, outputs=text, cache_examples=True)
+
+    gr.HTML(footer_html)
+
 
 if __name__ == '__main__':
     demo.launch()
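The examples block moves to the end and gains fn plus cache_examples=True, so Gradio precomputes the transcription for each example URL (running the download-and-transcribe lambda once) and replays the cached output when a visitor clicks an example, instead of re-running the whole pipeline. A minimal sketch of that behaviour, assuming Gradio 3.x semantics and a stand-in function:

import gradio as gr

def slow_fn(x: str) -> str:
    return x.upper()  # stand-in for speech_to_text(audio_from_url(x))

with gr.Blocks() as sketch:
    inp = gr.Textbox()
    out = gr.Textbox()
    gr.Examples(examples=["hello"], fn=slow_fn,
                inputs=inp, outputs=out, cache_examples=True)

The footer_html passed to gr.HTML is presumably the markup string imported from utils.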
 