gorkemgoknar committed • Commit 2b2b539 • Parent: ca0feab

Update app.py

app.py CHANGED
@@ -13,6 +13,7 @@ import torch
 import nltk # we'll use this to split into sentences
 nltk.download("punkt")
 
+import subprocess
 import langid
 import uuid
 
@@ -114,8 +115,8 @@ import numpy as np
 from gradio_client import Client
 from huggingface_hub import InferenceClient
 
-WHISPER_TIMEOUT = int(os.environ.get("WHISPER_TIMEOUT",
-whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
+WHISPER_TIMEOUT = int(os.environ.get("WHISPER_TIMEOUT", 45))
+whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/",timeout=WHISPER_TIMEOUT)
 text_client = InferenceClient(
     "mistralai/Mistral-7B-Instruct-v0.1",
     timeout=WHISPER_TIMEOUT,
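The hunk above gives both remote clients a shared, configurable request timeout (default 45 seconds). A minimal sketch of overriding it through the WHISPER_TIMEOUT environment variable before the clients are built; the 90-second value is purely illustrative, and the constructor calls simply mirror the lines added in this commit:

import os

from gradio_client import Client
from huggingface_hub import InferenceClient

# Illustrative override; without it the commit's default of 45 seconds is used.
os.environ["WHISPER_TIMEOUT"] = "90"

WHISPER_TIMEOUT = int(os.environ.get("WHISPER_TIMEOUT", 45))

# The Whisper transcription Space and the Mistral text client share the same timeout,
# mirroring the two lines added above.
whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/", timeout=WHISPER_TIMEOUT)
text_client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1", timeout=WHISPER_TIMEOUT)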
@@ -133,8 +134,25 @@ def get_latents(speaker_wav):
     ) = model.get_conditioning_latents(audio_path=speaker_wav)
     return gpt_cond_latent, diffusion_conditioning, speaker_embedding
 
-def get_latents(speaker_wav):
-
+def get_latents(speaker_wav,voice_cleanup=False):
+    if (voice_cleanup):
+        try:
+            cleanup_filter="lowpass=8000,highpass=75,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
+            resample_filter="-ac 1 -ar 22050"
+            out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"  #ffmpeg to know output format
+            #we will use newer ffmpeg as that has afftn denoise filter
+            shell_command = f"ffmpeg -y -i {speaker_wav} -af {cleanup_filter} {resample_filter} {out_filename}".split(" ")
+
+            command_result = subprocess.run([item for item in shell_command], capture_output=False, text=True, check=True)
+            speaker_wav=out_filename
+            print("Filtered microphone input")
+        except subprocess.CalledProcessError:
+            # There was an error - command exited with non-zero code
+            print("Error: failed filtering, use original microphone input")
+    else:
+        speaker_wav=speaker_wav
+
+    # create as function as we can populate here with voice cleanup/filtering
     (
         gpt_cond_latent,
         diffusion_conditioning,
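The new voice_cleanup branch shells out to ffmpeg to band-pass the reference recording (75 Hz to 8 kHz), trim leading and trailing silence, and resample to mono 22.05 kHz before the conditioning latents are computed. A standalone sketch of that step, assuming ffmpeg is on PATH; the function name, the example filename, and passing the arguments as a list (rather than the commit's .split(" ")) are illustrative choices, not the app's own code:

import subprocess
import uuid

def clean_reference_audio(speaker_wav: str) -> str:
    # Band-pass, strip silence from both ends, and resample to mono 22.05 kHz,
    # following the filter chain added in this commit.
    cleanup_filter = (
        "lowpass=8000,highpass=75,"
        "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
        "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
    )
    out_filename = f"{speaker_wav}{uuid.uuid4()}.wav"  # extension tells ffmpeg the output format
    cmd = ["ffmpeg", "-y", "-i", speaker_wav, "-af", cleanup_filter, "-ac", "1", "-ar", "22050", out_filename]
    try:
        subprocess.run(cmd, capture_output=True, text=True, check=True)
        return out_filename
    except subprocess.CalledProcessError:
        # Fall back to the unfiltered input, as the commit does.
        return speaker_wav

# latents = get_latents(clean_reference_audio("mic_input.wav"))  # hypothetical usage

Building the argument list directly also keeps paths containing spaces intact, which a plain split(" ") would not.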
@@ -161,11 +179,9 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=2
     return wav_buf.read()
 
 xtts_supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
-def
-    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
-
+def detect_language(prompt):
     # Fast language autodetection
-    if len(prompt)>15
+    if len(prompt)>15:
         language_predicted=langid.classify(prompt)[0].strip() # strip need as there is space at end!
         if language_predicted == "zh":
             #we use zh-cn on xtts
@@ -181,7 +197,12 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
         # Hard to detect language fast in short sentence, use english default
         language = "en"
         print(f"Language: Prompt is short or autodetect language disabled using english for xtts")
+
+    return language
 
+def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
+    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
+
     try:
         t0 = time.time()
         chunks = model.inference_stream(
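Language detection is now factored into a detect_language(prompt) helper that returns the code fed to XTTS. A compact sketch of the same behavior with langid, reusing the xtts_supported_languages list from the hunk above; falling back to "en" for predictions outside that list is an assumption, since that branch is not visible in this diff:

import langid

xtts_supported_languages = ["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]

def detect_language_sketch(prompt: str) -> str:
    # Very short prompts are unreliable for langid, so default to English (as the commit does).
    if len(prompt) <= 15:
        return "en"
    predicted = langid.classify(prompt)[0].strip()  # original comment notes a trailing space, hence strip()
    if predicted == "zh":
        predicted = "zh-cn"  # XTTS expects the zh-cn code
    return predicted if predicted in xtts_supported_languages else "en"

print(detect_language_sketch("Bonjour, comment allez-vous aujourd'hui ?"))  # likely "fr"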
@@ -197,7 +218,7 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
                 first_chunk_time = time.time() - t0
                 metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
                 first_chunk = False
-            print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+            #print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
 
             # In case output is required to be multiple voice files
             # out_file = f'{char}_{i}.wav'
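With streaming in place, only the latency of the first audio chunk is reported and the per-chunk print is commented out. The timing pattern, reduced to a small generator wrapper; time_to_first_chunk is an illustrative name, not a function in app.py:

import time

def time_to_first_chunk(chunks):
    # Same pattern as above: time only the first streamed chunk, then pass chunks through.
    t0 = time.time()
    first_chunk = True
    for chunk in chunks:
        if first_chunk:
            print(f"Latency to first audio chunk: {round((time.time() - t0) * 1000)} milliseconds")
            first_chunk = False
        yield chunk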
@@ -368,22 +389,48 @@ def get_sentence(history, system_prompt=""):
     sentence_hash_list = []
 
     text_to_generate = ""
+    stored_sentence = None
+    stored_sentence_hash = None
     for character in generate(history[-1][0], history[:-1]):
         history[-1][1] = character
         # It is coming word by word
 
         text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())
-
         if len(text_to_generate) > 1:
             dif = len(text_to_generate) - len(sentence_list)
 
             if dif == 1 and len(sentence_list) != 0:
                 continue
 
-
-
-            sentence_hash = hash(sentence)
+            if dif == 2 and len(sentence_list) != 0 and stored_sentence is not None:
+                continue
 
+            # All this complexity due to trying append first short sentence to next one for proper language auto-detect
+            if stored_sentence is not None and stored_sentence_hash is None and dif>1:
+                #means we consumed stored sentence and should look at next sentence to generate
+                sentence = text_to_generate[len(sentence_list)+1]
+            elif stored_sentence is not None and len(text_to_generate)>2 and stored_sentence_hash is not None:
+                print("Appending stored")
+                sentence = stored_sentence + text_to_generate[len(sentence_list)+1]
+                stored_sentence_hash = None
+            else:
+                sentence = text_to_generate[len(sentence_list)]
+
+            # too short sentence just append to next one if there is any
+            # this is for proper language detection
+            if len(sentence)<=15 and stored_sentence_hash is None and stored_sentence is None:
+                if sentence[-1] in [".","!","?"]:
+                    if stored_sentence_hash != hash(sentence):
+                        stored_sentence = sentence
+                        stored_sentence_hash = hash(sentence)
+                        print("Storing:",stored_sentence)
+                        continue
+
+
+            sentence_hash = hash(sentence)
+            if stored_sentence_hash is not None and sentence_hash == stored_sentence_hash:
+                continue
+
             if sentence_hash not in sentence_hash_list:
                 sentence_hash_list.append(sentence_hash)
                 sentence_list.append(sentence)
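Most of the added complexity holds back a very short first sentence and prepends it to the next one, so language auto-detection sees enough text to work with. The idea reduces to the following self-contained sketch; buffered_sentences and min_len are illustrative names, and nltk's punkt tokenizer is assumed to be downloaded, as it is at the top of app.py:

import nltk

def buffered_sentences(text: str, min_len: int = 15):
    # Sentences shorter than min_len are held back and prepended to the following
    # sentence, mirroring the stored_sentence logic added in this hunk.
    stored = None
    for sentence in nltk.sent_tokenize(text):
        if stored is not None:
            sentence = stored + " " + sentence
            stored = None
        if len(sentence) <= min_len:
            stored = sentence  # too short for reliable language detection; wait for more text
            continue
        yield sentence
    if stored is not None:
        yield stored  # flush whatever is left at the end

print(list(buffered_sentences("Hi. This sentence is long enough for language detection to work.")))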
@@ -394,9 +441,14 @@ def get_sentence(history, system_prompt=""):
     last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())[-1]
     sentence_hash = hash(last_sentence)
     if sentence_hash not in sentence_hash_list:
+        if stored_sentence is not None and stored_sentence_hash is not None:
+            last_sentence = stored_sentence + last_sentence
+            stored_sentence = stored_sentence_hash = None
+            print("Last Sentence with stored:",last_sentence)
+
         sentence_hash_list.append(sentence_hash)
         sentence_list.append(last_sentence)
-        print("
+        print("Last Sentence: ", last_sentence)
 
     yield (last_sentence, history)
 
@@ -408,6 +460,7 @@ def generate_speech(history):
     wav_bytestream = b""
     for sentence, history in get_sentence(history):
         print(sentence)
+
         # Sometimes prompt </s> coming on output remove it
         # Some post process for speech only
         sentence = sentence.replace("</s>", "")
@@ -417,9 +470,9 @@ def generate_speech(history):
         sentence = sentence.replace("```", "")
         sentence = sentence.replace("(", " ")
         sentence = sentence.replace(")", " ")
-
+
         # A fast fix for last chacter, may produce weird sounds if it is with text
-        if sentence[-1] in ["!", "?", ".", ","]:
+        if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
             # just add a space
             sentence = sentence[:-1] + " " + sentence[-1]
         print("Sentence for speech:", sentence)
@@ -436,7 +489,12 @@ def generate_speech(history):
         print("SPLITTED LONG SENTENCE:",sentence_list)
 
         for sentence in sentence_list:
+
             if any(c.isalnum() for c in sentence):
+                if language=="autodetect":
+                    #on first call autodetect, nexts sentence calls will use same language
+                    language = detect_language(sentence)
+
                 #exists at least 1 alphanumeric (utf-8)
                 audio_stream = get_voice_streaming(
                     sentence, language, latent_map["Female_Voice"]
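With detect_language available, generate_speech now detects the language once, on the first sentence that contains real text, and reuses it for every later sentence of the reply. A minimal sketch of that flow; speak_sentences is an illustrative stand-in for the synthesis loop, not a function in the app:

import langid

def speak_sentences(sentences, language="autodetect"):
    # Detect the language on the first real sentence, then reuse it, as the hunk above does.
    for sentence in sentences:
        if not any(c.isalnum() for c in sentence):
            continue  # skip punctuation-only fragments
        if language == "autodetect":
            language = langid.classify(sentence)[0].strip()
        print(f"[{language}] would synthesize: {sentence}")

speak_sentences(["Hola, buenos días a todos.", "¿Cómo estás hoy?"])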
@@ -511,7 +569,7 @@ def generate_speech(history):
         print("RuntimeError: non device-side assert error:", str(e))
         raise e
 
-    time.sleep(1
+    time.sleep(1)
     wav_bytestream = wave_header_chunk() + wav_bytestream
     outfile = "combined.wav"
     with open(outfile, "wb") as f:
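Once streaming finishes, the accumulated PCM bytes are prefixed with a header from wave_header_chunk() and written to combined.wav. A sketch of such a header builder using the standard wave module, following the signature visible in an earlier hunk header; the 24000 Hz default is an assumption (the diff truncates the sample_rate value), and the body is a reconstruction rather than the app's exact implementation:

import io
import wave

def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
    # Build a RIFF/WAV header (plus any frames passed in) in memory, so raw PCM
    # chunks can simply be appended afterwards.
    wav_buf = io.BytesIO()
    with wave.open(wav_buf, "wb") as vfout:
        vfout.setnchannels(channels)
        vfout.setsampwidth(sample_width)
        vfout.setframerate(sample_rate)
        vfout.writeframes(frame_input)
    wav_buf.seek(0)
    return wav_buf.read()

# wav_bytestream = wave_header_chunk() + wav_bytestream   # as in the hunk above
# with open("combined.wav", "wb") as f:
#     f.write(wav_bytestream)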
@@ -587,4 +645,4 @@ Note:
 - iOS (Iphone/Ipad) devices may not experience voice due to autoplay being disabled on these devices by Vendor"""
 )
 demo.queue()
-demo.launch(debug=True)
+demo.launch(debug=True)