jiedong-yang committed
Commit 5367de0 · 1 Parent(s): 4ee15f5

Upload app.py

Files changed (1)
  1. app.py +13 -11
app.py CHANGED
@@ -10,6 +10,8 @@ from wordcloud import WordCloud, STOPWORDS
 from scipy.io.wavfile import write
 from espnet2.bin.tts_inference import Text2Speech
 
+from utils import *
+
 # load whisper model for ASR and BART for summarization
 asr_model = whisper.load_model('base.en')
 summarizer = gr.Interface.load("facebook/bart-large-cnn", src='huggingface')
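Note: later hunks lean on names pulled in by this wildcard import (audio_from_url and footer_html, and possibly speech_to_text), but utils.py itself is not part of this commit. A purely illustrative sketch of what such a module might export; the yt-dlp download and every signature here are assumptions, not the author's code:

# hypothetical utils.py -- only the names used in the diff are attested
import yt_dlp

footer_html = "<p style='text-align: center'>demo footer</p>"  # placeholder markup

def audio_from_url(url: str, out_path: str = "audio.wav") -> str:
    """Download the audio track of a video URL to a local wav file (assumed behaviour)."""
    opts = {
        "format": "bestaudio/best",
        "outtmpl": out_path.rsplit(".", 1)[0],  # yt-dlp appends the extension itself
        "postprocessors": [{"key": "FFmpegExtractAudio", "preferredcodec": "wav"}],
    }
    with yt_dlp.YoutubeDL(opts) as ydl:
        ydl.download([url])
    return out_path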
@@ -23,7 +25,7 @@ def load_model(name: str):
     :return:
     """
     global asr_model
-    asr_model = whisper.load_model(f"{name.lower()}.en")
+    asr_model = whisper.load_model(f"{name.lower()}")
     return name
 
 
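This pairs with the dropdown change later in the diff: the ".en" suffix now travels in the UI value ("Tiny.en", "Base.en") instead of being appended inside load_model, so the checkpoint name handed to Whisper is unchanged:

# before: "Tiny"    -> f"{name.lower()}.en" == "tiny.en"
# after:  "Tiny.en" -> f"{name.lower()}"    == "tiny.en"
assert f"{'Tiny'.lower()}.en" == f"{'Tiny.en'.lower()}" == "tiny.en"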
 
@@ -144,9 +146,6 @@ with demo:
     2. Generate transcription with Whisper (English Only)
     3. Summarize the transcribed speech
     4. Generate summary speech with the ESPNet model
-
-    model references:
-    - [Whisper](https://github.com/openai/whisper), [ESPNet](https://github.com/espnet/espnet_model_zoo)
     """)
 
     # data preparation
@@ -161,17 +160,13 @@ with demo:
 
     url.change(audio_from_url, inputs=url, outputs=speech)
 
-    examples = gr.Examples(examples=["https://www.youtube.com/watch?v=DuX4K4eeTz8",
-                                     "https://www.youtube.com/watch?v=nepOSEGHHCQ"],
-                           inputs=[url])
-
     # ASR
     text = gr.Textbox(label="Transcription", placeholder="transcription")
 
     with gr.Row():
-        default_values = dict(model='Base', bs=5, bo=5) if torch.cuda.is_available() \
-            else dict(model='Tiny', bs=1, bo=1)
-        model_options = gr.Dropdown(['Tiny', 'Base'], value=default_values['model'], label="models")
+        default_values = dict(model='Base.en', bs=5, bo=5) if torch.cuda.is_available() \
+            else dict(model='Tiny.en', bs=1, bo=1)
+        model_options = gr.Dropdown(['Tiny.en', 'Base.en'], value=default_values['model'], label="models")
         model_options.change(load_model, inputs=model_options, outputs=model_options)
 
         beam_size_slider = gr.Slider(1, 10, value=default_values['bs'], step=1, label="param: beam_size")
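The defaults scale with hardware: Base.en with beam_size=5 and best_of=5 when a GPU is available, Tiny.en with 1/1 on CPU. A rough sketch of how such values would reach Whisper's decoder; only the defaults come from the diff, the transcribe wiring is an assumption:

import torch
import whisper

gpu = torch.cuda.is_available()
defaults = dict(model='Base.en', bs=5, bo=5) if gpu else dict(model='Tiny.en', bs=1, bo=1)

model = whisper.load_model(defaults['model'].lower())
result = model.transcribe("audio.wav",
                          beam_size=defaults['bs'],  # beams kept during beam-search decoding
                          best_of=defaults['bo'])    # candidates considered when sampling
print(result["text"])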
@@ -202,6 +197,13 @@ with demo:
 
     text.change(wordcloud_func, inputs=text, outputs=image)
 
+    examples = gr.Examples(examples=["https://www.youtube.com/watch?v=DuX4K4eeTz8",
+                                     "https://www.youtube.com/watch?v=nepOSEGHHCQ"],
+                           fn=lambda x: speech_to_text(audio_from_url(x)),
+                           inputs=url, outputs=text, cache_examples=True)
+
+    gr.HTML(footer_html)
+
 
 if __name__ == '__main__':
     demo.launch()
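The examples block moves to the end and gains fn plus cache_examples=True, so Gradio precomputes the transcription for each example URL (running the download-and-transcribe lambda once) and replays the cached output when a visitor clicks an example, instead of re-running the whole pipeline. A minimal sketch of that behaviour, assuming Gradio 3.x semantics and a stand-in function:

import gradio as gr

def slow_fn(x: str) -> str:
    return x.upper()  # stand-in for speech_to_text(audio_from_url(x))

with gr.Blocks() as sketch:
    inp = gr.Textbox()
    out = gr.Textbox()
    gr.Examples(examples=["hello"], fn=slow_fn,
                inputs=inp, outputs=out, cache_examples=True)

The footer_html passed to gr.HTML is presumably the markup string imported from utils.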
 