gorkemgoknar committed · Commit e548e55 · Parent(s): 245a4de

Update app.py

app.py CHANGED
@@ -19,6 +19,7 @@ import torch
 import nltk  # we'll use this to split into sentences
 nltk.download("punkt")
 
+import noisereduce as nr
 import subprocess
 import langid
 import uuid
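The punkt download above powers the app's sentence splitting: the LLM reply is tokenized into sentences so each one can be synthesized separately. A minimal sketch of that step (standalone, not part of this commit):

import nltk

nltk.download("punkt")  # one-time download of the punkt sentence tokenizer

text = "Hello there! How are you today? I am a chatbot."
print(nltk.sent_tokenize(text))
# ['Hello there!', 'How are you today?', 'I am a chatbot.']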
@@ -549,7 +550,7 @@ def generate_speech(history,chatbot_role,llm_model):
 
 
 # will generate speech audio file per sentence
-def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte=
+def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte=False):
     language = "autodetect"
 
     wav_bytestream = b""
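For context, return_as_byte controls whether the synthesized audio is handed to Gradio as raw bytes or as a temp-file path; this commit makes the file path the default. A hypothetical reduction of that branch (the helper name and simplified logic are illustrative, not from the commit):

import uuid

def audio_payload(wav_bytestream: bytes, return_as_byte: bool = False):
    # True: hand Gradio the raw WAV bytes directly.
    if return_as_byte:
        return wav_bytestream
    # False (the new default): write a unique temp file and return its path.
    audio_unique_filename = "/tmp/" + str(uuid.uuid4()) + ".wav"
    with open(audio_unique_filename, "wb") as f:
        f.write(wav_bytestream)
    return audio_unique_filename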
@@ -583,7 +584,7 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
     # sentence = sentence[:-1] + " " + sentence[-1]
 
     # regex does the job well
-    sentence= re.sub("([^\x00-\x7F]|\w)(\.|\。|\?|\!)",r"\1 \2",sentence)
+    sentence= re.sub("([^\x00-\x7F]|\w)(\.|\。|\?|\!)",r"\1 \2\2",sentence)
 
     print("Sentence for speech:", sentence)
 
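The only change here is the extra \2 in the replacement: the regex already padded sentence-final punctuation with a space, and it now also doubles the mark, which in practice nudges XTTS toward a longer pause. A standalone repro (not part of the commit):

import re

sentence = "Hello world. 你好。 Ready?"
print(re.sub(r"([^\x00-\x7F]|\w)(\.|\。|\?|\!)", r"\1 \2\2", sentence))
# Hello world .. 你好 。。 Ready ??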
@@ -618,23 +619,34 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
     # XTTS is actually using streaming response but we are playing audio by sentence
     # If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
     if audio_stream is not None:
-        wav_chunks = wave_header_chunk()
         frame_length = 0
         for chunk in audio_stream:
             try:
                 wav_bytestream += chunk
-                wav_chunks += chunk
                 frame_length += len(chunk)
             except:
                 # hack to continue on playing. sometimes last chunk is empty , will be fixed on next TTS
                 continue
 
+    # Filter output for better voice
+    filter_output=True
+    if filter_output:
+        data_s16 = np.frombuffer(wav_bytestream, dtype=np.int16, count=len(wav_bytestream)//2, offset=0)
+        float_data = data_s16 * 0.5**15
+        reduced_noise = nr.reduce_noise(y=float_data, sr=24000,prop_decrease =0.8,n_fft=1024)
+        wav_bytestream = (reduced_noise * 32767).astype(np.int16)
+        wav_bytestream = wav_bytestream.tobytes()
+
     if audio_stream is not None:
         if not return_as_byte:
             audio_unique_filename = "/tmp/"+ str(uuid.uuid4())+".wav"
-            with open(audio_unique_filename, "
-                f.
-
+            with wave.open(audio_unique_filename, "w") as f:
+                f.setnchannels(1)
+                # 2 bytes per sample.
+                f.setsampwidth(2)
+                f.setframerate(24000)
+                f.writeframes(wav_bytestream)
+
             return (history , gr.Audio.update(value=audio_unique_filename, autoplay=True))
         else:
             return (history , gr.Audio.update(value=wav_bytestream, autoplay=True))
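The added filter block scales the accumulated int16 PCM to floats in [-1, 1), applies noisereduce's spectral gating, rescales to int16, and then writes a 24 kHz mono WAV via the wave module, which also gives the file a proper RIFF header (the old path prepended one via the now-removed wave_header_chunk()). A self-contained sketch of the same round trip, with synthetic audio standing in for the XTTS stream:

import wave

import numpy as np
import noisereduce as nr

SR = 24000  # XTTS output sample rate

# Stand-in for the accumulated XTTS byte stream: one second of quiet noise.
wav_bytestream = (np.random.randn(SR) * 500).astype(np.int16).tobytes()

data_s16 = np.frombuffer(wav_bytestream, dtype=np.int16)
float_data = data_s16 * 0.5 ** 15                 # int16 -> float in [-1, 1)
reduced = nr.reduce_noise(y=float_data, sr=SR, prop_decrease=0.8, n_fft=1024)
pcm = (reduced * 32767).astype(np.int16)          # float -> int16

with wave.open("/tmp/denoised.wav", "w") as f:
    f.setnchannels(1)   # mono
    f.setsampwidth(2)   # 2 bytes per sample (int16)
    f.setframerate(SR)
    f.writeframes(pcm.tobytes())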
@@ -776,7 +788,7 @@ This Space demonstrates how to speak to a chatbot, based solely on open accessib
 It relies on following models :
 Speech to Text : [Whisper-large-v2](https://sanchit-gandhi-whisper-large-v2.hf.space/) as an ASR model, to transcribe recorded audio to text. It is called through a [gradio client](https://www.gradio.app/docs/client).
 LLM Mistral : [Mistral-7b-instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) as the chat model, GGUF Q5_K_M quantized version used locally via llama_cpp from [huggingface_hub](TheBloke/Mistral-7B-Instruct-v0.1-GGUF).
-LLM Zephyr : [Zephyr-7b-
+LLM Zephyr : [Zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) as the chat model. GGUF Q5_K_M quantized version used locally via llama_cpp from [huggingface.co/TheBloke](https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF).
 Text to Speech : [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a Multilingual TTS model, to generate the chatbot answers. This time, the model is hosted locally.
 
 Note:
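As the description says, the ASR step runs remotely over a gradio client rather than locally. A hypothetical call shape (the endpoint name and arguments are assumptions; inspect the Space's API for the real signature):

from gradio_client import Client

client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
print(client.view_api())  # lists the real endpoint names and parameters

# "/predict" and the single-filepath argument are assumptions for illustration.
transcript = client.predict("recording.wav", api_name="/predict")
print(transcript)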