cahya committed
Commit 1b6ec9d · Parent(s): ce254f5

add model dropdown

Files changed (1): app.py (+31, -13)
app.py CHANGED
@@ -11,6 +11,20 @@ from gpuinfo import GPUInfo
 
 
 MODEL_NAME = "cahya/whisper-medium-id" # this always needs to stay in line 8 :D sorry for the hackiness
+whisper_models = {
+    "Indonesian Whisper Tiny": {
+        "name": "cahya/whisper-tiny-id",
+        "pipe": None,
+    },
+    "Indonesian Whisper Small": {
+        "name": "cahya/whisper-small-id",
+        "pipe": None,
+    },
+    "Indonesian Whisper Medium": {
+        "name": "cahya/whisper-medium-id",
+        "pipe": None,
+    },
+}
 lang = "id"
 title = "Indonesian Whisperer"
 description = "Cross Language Speech to Speech (Indonesian/English to 25 other languages) using OpenAI Whisper and Coqui TTS"
@@ -46,17 +60,18 @@ languages = {
 
 device = 0 if torch.cuda.is_available() else "cpu"
 
-pipe = pipeline(
-    task="automatic-speech-recognition",
-    model=MODEL_NAME,
-    chunk_length_s=30,
-    device=device,
-)
-
-pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
+for model in whisper_models:
+    whisper_models[model]["pipe"] = pipeline(
+        task="automatic-speech-recognition",
+        model=whisper_models[model]["name"],
+        chunk_length_s=30,
+        device=device,
+    )
+    whisper_models[model]["pipe"].model.config.forced_decoder_ids = \
+        whisper_models[model]["pipe"].tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
 
 
-def transcribe(microphone, file_upload):
+def transcribe(pipe, microphone, file_upload):
     warn_output = ""
     if (microphone is not None) and (file_upload is not None):
         warn_output = (
@@ -80,11 +95,12 @@ default_lang = "en"
 coquiTTS = CoquiTTS()
 
 
-def tts(language: str, audio_microphone: str, audio_file: str):
+def process(language: str, model: str, audio_microphone: str, audio_file: str):
     language = languages[language]
+    pipe = whisper_models[model]["pipe"]
     time_start = time.time()
     print(f"### {datetime.now()} TTS", language, audio_file)
-    transcription = transcribe(audio_microphone, audio_file)
+    transcription = transcribe(pipe, audio_microphone, audio_file)
     print(f"### {datetime.now()} transcribed:", transcription)
     translation = translate(transcription, language, "id")
     # return output
@@ -113,6 +129,8 @@ with gr.Blocks() as blocks:
     audio_microphone = gr.Audio(label="Microphone", source="microphone", type="filepath", optional=True)
     audio_upload = gr.Audio(label="Upload", source="upload", type="filepath", optional=True)
     language = gr.Dropdown([lang for lang in languages.keys()], label="Target Language", value="English")
+    model = gr.Dropdown([model for model in whisper_models.keys()],
+                        label="Whisper Model", value="Indonesian Whisper Medium")
     with gr.Row(): # mobile_collapse=False
         submit = gr.Button("Submit", variant="primary")
     examples = gr.Examples(examples=["data/Jokowi - 2022.mp3", "data/Soekarno - 1963.mp3", "data/JFK.mp3"],
@@ -131,8 +149,8 @@ with gr.Blocks() as blocks:
 
     # actions
     submit.click(
-        tts,
-        [language, audio_microphone, audio_upload],
+        process,
+        [language, model, audio_microphone, audio_upload],
         [text_source, text_target, audio, system_info],
     )
 
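Note on the pattern the commit introduces: it keys a registry dict by display name and lets the dropdown's selected string index into that registry inside the click handler. A minimal, self-contained sketch of the same wiring, assuming the Gradio Blocks API used by app.py; the fake_transcribe helper and the two-entry registry here are hypothetical stand-ins, not code from this repo:

import gradio as gr

# Registry pattern from the commit: display name -> model config.
models = {
    "Tiny": {"name": "cahya/whisper-tiny-id"},
    "Medium": {"name": "cahya/whisper-medium-id"},
}

def fake_transcribe(model_choice: str, text: str) -> str:
    # The dropdown passes its selected *string*, which keys into the
    # registry, just as process() does with whisper_models[model]["pipe"].
    cfg = models[model_choice]
    return f"[{cfg['name']}] {text}"

with gr.Blocks() as demo:
    model = gr.Dropdown(list(models.keys()), label="Whisper Model", value="Medium")
    text = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    submit = gr.Button("Submit")
    # Inputs are passed positionally in the order listed, which is why the
    # commit adds `model` both to the inputs list and to the fn signature.
    submit.click(fake_transcribe, [model, text], [out])

demo.launch()

Because submit.click passes its inputs positionally, adding model to the inputs list and to the function signature, as the commit does, is all the plumbing required.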
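One tradeoff worth flagging: the startup loop instantiates all three pipelines eagerly, so the tiny, small, and medium checkpoints stay resident in memory even when only one is ever selected. A hedged alternative, not what this commit does, would reuse the "pipe": None placeholders to lazy-load on first selection; this sketch assumes the whisper_models, lang, and device globals from app.py:

from transformers import pipeline

def get_pipe(model: str):
    # Sketch of a lazy-loading alternative, not the commit's implementation.
    # Builds the pipeline on first use instead of at startup, caching it in
    # the existing "pipe" slot of the whisper_models registry.
    entry = whisper_models[model]
    if entry["pipe"] is None:
        entry["pipe"] = pipeline(
            task="automatic-speech-recognition",
            model=entry["name"],
            chunk_length_s=30,
            device=device,
        )
        entry["pipe"].model.config.forced_decoder_ids = \
            entry["pipe"].tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
    return entry["pipe"]

The eager version in the commit trades longer startup and a larger memory footprint for predictable per-request latency; the lazy variant makes the first request on each model slow instead.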