j-tobias committed on
Commit e9daf29 • 1 Parent(s): e1e27eb

added model

Files changed (2):
  1. app.py +54 -14
  2. requirements.txt +4 -0
app.py CHANGED
@@ -23,14 +23,14 @@ client = InferenceClient(
     token=hf_token)
 
 
-
-
-processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
-model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
-model.config.forced_decoder_ids = None
-
-def chat(audio, chat:list):
-    transcription = transcribe(audio)
+def chat(audio, chat:list, asr_model:str):
+    if asr_model == "openai/whisper-large-v2":
+        transcription = transcribe_whisper_large_v2(audio)
+    elif asr_model == "openai/whisper-tiny.en":
+        transcription = transcribe_whisper_tiny_en(audio)
+    else:
+        raise ValueError(f"No Model found with the given choice: {asr_model}")
+
     chat.append({'role':'user','content':transcription})
     response = client.chat_completion(
         messages=chat,
@@ -40,7 +40,7 @@ def chat(audio, chat:list):
     chat.append({'role':'assistant','content':response})
     return chat
 
-def transcribe(audio):
+def transcribe_whisper_large_v2(audio):
     sr, audio = audio
     audio = audio.astype(np.float32)
     if len(audio.shape) > 2 and audio.shape[1] > 1:
@@ -52,8 +52,44 @@ def transcribe(audio):
     transcription = processor.tokenizer.normalize(transcription[0])
     return transcription
 
+def transcribe_whisper_tiny_en(audio):
+    sr, audio = audio
+    audio = audio.astype(np.float32)
+    if len(audio.shape) > 2 and audio.shape[1] > 1:
+        audio = np.mean(audio, axis=1)
+    audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
+    input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
+    predicted_ids = model.generate(input_features)
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+    transcription = processor.tokenizer.normalize(transcription[0])
+    return transcription
+
+
+def load_model(asr_model_choice:str):
+    global processor
+    global model
+    global model_flag
+
+    if asr_model_choice == "openai/whisper-large-v2":
+        processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
+        model.config.forced_decoder_ids = None
+        model_flag = "openai/whisper-large-v2"
+    elif asr_model_choice == "openai/whisper-tiny.en":
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+        model_flag = "openai/whisper-tiny.en"
+
+    print("Model Loaded: ", model_flag)
+
 with gr.Blocks() as app:
 
+    gr.Markdown("# VoiceBot")
+    gr.Markdown("Welcome to VoiceBot 👋, here is how it works")
+    gr.Markdown("This Bot can only be interacted with through your voice. Press record and say something; after you stop the recording, your audio is processed directly. You can choose between different models. The model you pick influences how well the Bot understands what you said, and better performance also comes with a longer wait. 😕")
+    gr.Markdown("Have fun playing around 🎉")
+    gr.Markdown("If you have any wishes for models or an idea, feel free to let me know 🙌")
+
     chatbot = gr.Chatbot(
         value=[{
             'role':'System',
@@ -71,12 +107,16 @@ with gr.Blocks() as app:
         scale=8
     )
 
-    # mode_option = gr.Radio(
-    #     choices=["online", "local"],
-    #     scale=1
-    # )
+    with gr.Accordion(label="Settings", open=False):
+
+        asr_model_choice = gr.Radio(
+            label="Select ASR Model",
+            choices=["openai/whisper-large-v2","openai/whisper-tiny.en"],
+            value="openai/whisper-tiny.en"
+        )
+        asr_model_choice.change(load_model, asr_model_choice)
 
     # Event listener for when the audio recording stops
-    audio_input.stop_recording(fn=chat, inputs=[audio_input, chatbot], outputs=chatbot)
+    audio_input.stop_recording(fn=chat, inputs=[audio_input, chatbot, asr_model_choice], outputs=chatbot)
 
     app.launch()
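For context, the pattern this commit introduces — a gr.Radio in a Settings accordion whose change event reloads the Whisper weights into module-level globals, with a stop_recording listener feeding the recording into chat() — can be sketched standalone roughly as below. This is a minimal sketch of the same wiring, not the app itself: the InferenceClient chat completion is replaced by an echo, and chatbot/audio component arguments beyond the essentials are assumptions.

import gradio as gr
import librosa
import numpy as np
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Module-level globals, mutated by load_model() as in the commit.
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")

def load_model(asr_model_choice: str):
    # Swap the global processor/model pair when the radio selection changes.
    global processor, model
    processor = WhisperProcessor.from_pretrained(asr_model_choice)
    model = WhisperForConditionalGeneration.from_pretrained(asr_model_choice)

def transcribe(audio):
    # Gradio delivers audio as (sample_rate, np.ndarray); Whisper wants 16 kHz mono float32.
    sr, data = audio
    data = data.astype(np.float32)
    if data.ndim > 1:
        data = np.mean(data, axis=1)  # downmix stereo to mono
    data = librosa.resample(data, orig_sr=sr, target_sr=16000)
    features = processor(data, sampling_rate=16000, return_tensors="pt").input_features
    ids = model.generate(features)
    return processor.batch_decode(ids, skip_special_tokens=True)[0]

def chat(audio, history: list):
    # The real app forwards the transcription to an LLM; here we just echo it back.
    history.append({'role': 'user', 'content': transcribe(audio)})
    return history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages")          # role/content dicts, as in the diff
    audio_input = gr.Audio(sources=["microphone"])
    choice = gr.Radio(
        choices=["openai/whisper-large-v2", "openai/whisper-tiny.en"],
        value="openai/whisper-tiny.en",
        label="Select ASR Model",
    )
    choice.change(load_model, choice)
    audio_input.stop_recording(fn=chat, inputs=[audio_input, chatbot], outputs=chatbot)

demo.launch()

One caveat worth noting about the design: because processor/model are globals swapped in place, a recording that arrives while load_model is still downloading weights can transcribe with a mismatched pair; the diff sidesteps the worst of it by also passing the radio value into chat() and dispatching on the model name there.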
requirements.txt CHANGED
@@ -33,7 +33,9 @@ markdown-it-py==3.0.0
 MarkupSafe==2.1.5
 matplotlib==3.9.2
 mdurl==0.1.2
+mpmath==1.3.0
 msgpack==1.0.8
+networkx==3.3
 numba==0.60.0
 numpy==2.0.1
 orjson==3.10.7
@@ -66,9 +68,11 @@ sniffio==1.3.1
 soundfile==0.12.1
 soxr==0.4.0
 starlette==0.38.2
+sympy==1.13.2
 threadpoolctl==3.5.0
 tokenizers==0.19.1
 tomlkit==0.12.0
+torch==2.4.0
 tqdm==4.66.5
 transformers==4.44.0
 typer==0.12.3
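The four new pins line up with the local Whisper path added above: torch itself, plus sympy, networkx, and mpmath, which appear to be torch's pure-Python transitive dependencies. A quick sanity check that the pinned stack imports and can run a local forward pass (a sketch, assuming the pins from this requirements.txt are installed):

import numpy as np
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

print(torch.__version__)  # expect 2.4.0 per the pin above

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")

audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
ids = model.generate(features)
print(processor.batch_decode(ids, skip_special_tokens=True))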