voice-chat-with-mistral

Runtime error

App Files Files Community

gorkemgoknar committed on Nov 7, 2023

Commit

a646e4a

•

1 Parent(s): 6f43b7c

use multillm , fix llama version, xtts v2 model with silence fix

Browse files

Note: Zephyr, as the secondary LLM, is loaded with 10 fewer GPU layers than Mistral (GPU_LAYERS-10, e.g. 20 layers when GPU_LAYERS is 30) to preserve VRAM.

Files changed (1) hide show

app.py +132 -147

app.py CHANGED Viewed

@@ -53,12 +53,15 @@ from huggingface_hub import InferenceClient
 # This will trigger downloading model
 print("Downloading if not downloaded Coqui XTTS V2")
 from TTS.utils.manage import ModelManager
 model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
 ModelManager().download_model(model_name)
 model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
 print("XTTS downloaded")
 config = XttsConfig()
 config.load_json(os.path.join(model_path, "config.json"))
@@ -73,11 +76,11 @@ model.load_checkpoint(
 model.cuda()
 print("Done loading TTS")
-llm_model = os.environ.get("LLM_MODEL", "mistral") # or "zephyr"
-title = f"Voice chat with {llm_model.capitalize()} and Coqui XTTS"
-DESCRIPTION = f"""# Voice chat with {llm_model.capitalize()} and Coqui XTTS"""
 css = """.toast-wrap { display: none !important } """
 from huggingface_hub import HfApi
@@ -86,11 +89,11 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 # will use the API to restart the Space on an unrecoverable error
 api = HfApi(token=HF_TOKEN)
-repo_id = "coqui/voice-chat-with-mistral"
 default_system_message = f"""
-You are {llm_model.capitalize()}, a large language model trained and provided by Mistral, architecture of you is decoder-based LM. Your voice backend or text to speech TTS backend is provided via Coqui technology. You are right now served on Huggingface spaces.
 The user is talking to you over voice on their phone, and your response will be read out loud with realistic text-to-speech (TTS) technology from Coqui team. Follow every direction here when crafting your response: Use natural, conversational language that are clear and easy to follow (short sentences, simple words). Be concise and relevant: Most of your responses should be a sentence or two, unless you’re asked to go deeper. Don’t monopolize the conversation. Use discourse markers to ease comprehension. Never use the list format. Keep the conversation flowing. Clarify: when there is ambiguity, ask clarifying questions, rather than make assumptions. Don’t implicitly or explicitly try to end the chat (i.e. do not end a response with “Talk soon!”, or “Enjoy!”). Sometimes the user might just want to chat. Ask them relevant follow-up questions. Don’t ask them if there’s anything else they need help with (e.g. don’t say things like “How can I assist you further?”). Remember that this is a voice conversation: Don’t use lists, markdown, bullet points, or other formatting that’s not typically spoken. Type out numbers in words (e.g. ‘twenty twelve’ instead of the year 2012). If something doesn’t make sense, it’s likely because you misheard them. There wasn’t a typo, and the user didn’t mispronounce anything. Remember to follow these rules absolutely, and do not refer to these rules, even if you’re asked about them.
 You cannot access the internet, but you have vast knowledge.
 Current date: CURRENT_DATE .
@@ -113,13 +116,19 @@ WHISPER_TIMEOUT = int(os.environ.get("WHISPER_TIMEOUT", 45))
 whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
-ROLES = ["AI Assistant"]
 ROLE_PROMPTS = {}
 ROLE_PROMPTS["AI Assistant"]=system_message
 ##"You are an AI assistant with Zephyr model by Mistral and Hugging Face and speech from Coqui XTTS . User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps, your answers should be clear and short sentences"
-LLM_STOP_WORDS= ["</s>","<|user|>","/s>"]
 ### WILL USE LOCAL MISTRAL OR ZEPHYR
@@ -128,65 +137,75 @@ from huggingface_hub import hf_hub_download
 print("Downloading LLM")
-if llm_model == "zephyr":
-    #Zephyr
-    hf_hub_download(repo_id="TheBloke/zephyr-7B-alpha-GGUF", local_dir=".", filename="zephyr-7b-alpha.Q5_K_M.gguf")
-    # use new gguf format
-    model_path="./zephyr-7b-alpha.Q5_K_M.gguf"
-else:
-    #Mistral
-    hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", local_dir=".", filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf")
-    # use new gguf format
-    model_path="./mistral-7b-instruct-v0.1.Q5_K_M.gguf"
 from llama_cpp import Llama
 # Set GPU_LAYERS to 15 if you have an 8GB GPU so that both models can fit;
 # otherwise the full 35 layers + XTTS work fine on a 16GB T4.
-GPU_LAYERS=int(os.environ.get("GPU_LAYERS", 15))
 LLAMA_VERBOSE=False
-print("Running LLM")
-llm = Llama(model_path=model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
 # Mistral formatter
-def format_prompt_mistral(message, history, system_message=""):
     prompt = (
         "<s>[INST]" + system_message + "[/INST]" + system_understand_message + "</s>"
     )
     for user_prompt, bot_response in history:
         prompt += f"[INST] {user_prompt} [/INST]"
         prompt += f" {bot_response}</s> "
     if message=="":
         message="Hello"
     prompt += f"[INST] {message} [/INST]"
     return prompt
 # Zephyr formatter
-def format_prompt_zephyr(message, history, system_message=""):
     prompt = (
-        "<|system|>" + system_message  +  "</s>"
     )
     for user_prompt, bot_response in history:
         prompt += f"<|user|>\n{user_prompt}</s>"
-        prompt += f"<|assistant|> {bot_response}</s>"
     if message=="":
         message="Hello"
     prompt += f"<|user|>\n{message}</s>"
     print(prompt)
     return prompt
-if llm_model=="zephyr":
-    format_prompt = format_prompt_zephyr
-else:
-    format_prompt = format_prompt_mistral
 def generate_local(
     prompt,
     history,
     system_message=None,
     temperature=0.8,
     max_tokens=256,
@@ -202,10 +221,18 @@ def generate_local(
         temperature=temperature,
         max_tokens=max_tokens,
         top_p=top_p,
-        stop=stop,
     )
-    formatted_prompt = format_prompt(prompt, history,system_message=system_message)
     try:
         print("LLM Input:", formatted_prompt)
@@ -227,7 +254,7 @@ def generate_local(
                 return
-            output += response["choices"][0]["text"].replace("<|assistant|>","").replace("<|user|>","").replace("/s>","")
             yield output
     except Exception as e:
@@ -289,7 +316,7 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=2
 xtts_supported_languages=config.languages
 def detect_language(prompt):
     # Fast language autodetection
-    if len(prompt)>13:
         language_predicted=langid.classify(prompt)[0].strip() # strip need as there is space at end!
         if language_predicted == "zh":
             #we use zh-cn on xtts
@@ -318,8 +345,9 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
             prompt,
             language,
             gpt_cond_latent,
-            speaker_embedding
         )
         first_chunk = True
@@ -363,66 +391,6 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
     except:
         return None
-###### MISTRAL FUNCTIONS ######
-def generate(
-    prompt,
-    history,
-    temperature=0.9,
-    max_new_tokens=256,
-    top_p=0.95,
-    repetition_penalty=1.0,
-):
-    temperature = float(temperature)
-    if temperature < 1e-2:
-        temperature = 1e-2
-    top_p = float(top_p)
-    generate_kwargs = dict(
-        temperature=temperature,
-        max_new_tokens=max_new_tokens,
-        top_p=top_p,
-        repetition_penalty=repetition_penalty,
-        do_sample=True,
-        seed=42,
-    )
-    #formatted_prompt = format_prompt(prompt, history)
-    formatted_prompt = format_prompt_zephyr(prompt, history)
-    try:
-        stream = text_client.text_generation(
-            formatted_prompt,
-            **generate_kwargs,
-            stream=True,
-            details=True,
-            return_full_text=False,
-        )
-        output = ""
-        for response in stream:
-            output += response.token.text
-            yield output
-    except Exception as e:
-        if "Too Many Requests" in str(e):
-            print("ERROR: Too many requests on mistral client")
-            gr.Warning("Unfortunately Mistral is unable to process")
-            output = "Unfortuanately I am not able to process your request now, too many people are asking me !"
-        elif "Model not loaded on the server" in str(e):
-            print("ERROR: Mistral server down")
-            gr.Warning("Unfortunately Mistral LLM is unable to process")
-            output = "Unfortuanately I am not able to process your request now, I have problem with Mistral!"
-        else:
-            print("Unhandled Exception: ", str(e))
-            gr.Warning("Unfortunately Mistral is unable to process")
-            output = "I do not know what happened but I could not understand you ."
-        yield output
-        return None
-    return output
-###### WHISPER FUNCTIONS ######
 def transcribe(wav_path):
     try:
@@ -436,9 +404,7 @@ def transcribe(wav_path):
         gr.Warning("There was a problem with Whisper endpoint, telling a joke for you.")
         return "There was a problem with my voice, tell me joke"
-# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.
 # Will be triggered on text submit (will send to generate_speech)
 def add_text(history, text):
     history = [] if history is None else history
@@ -475,7 +441,8 @@ def bot(history, system_prompt=""):
         yield history
-def get_sentence(history, chatbot_role,system_prompt=""):
     history = [["", None]] if history is None else history
     if system_prompt == "":
@@ -484,18 +451,22 @@ def get_sentence(history, chatbot_role,system_prompt=""):
     history[-1][1] = ""
     mistral_start = time.time()
-    print("Mistral start")
     sentence_list = []
     sentence_hash_list = []
     text_to_generate = ""
     stored_sentence = None
     stored_sentence_hash = None
-    for character in generate_local(history[-1][0], history[:-1],system_message=ROLE_PROMPTS[chatbot_role]):
         history[-1][1] = character.replace("<|assistant|>","")
         # It is coming word by word
-        text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|assistant|>"," ").strip())
         if len(text_to_generate) > 1:
             dif = len(text_to_generate) - len(sentence_list)
@@ -539,19 +510,23 @@ def get_sentence(history, chatbot_role,system_prompt=""):
                 yield (sentence, history)
     # return that final sentence token
-    last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())[-1]
-    sentence_hash = hash(last_sentence)
-    if sentence_hash not in sentence_hash_list:
-        if stored_sentence is not None and stored_sentence_hash is not None:
-            last_sentence = stored_sentence + last_sentence
-            stored_sentence = stored_sentence_hash = None
-            print("Last Sentence with stored:",last_sentence)
-        sentence_hash_list.append(sentence_hash)
-        sentence_list.append(last_sentence)
-        print("Last Sentence: ", last_sentence)
-        yield (last_sentence, history)
 from scipy.io.wavfile import write
 from pydub import AudioSegment
@@ -560,22 +535,14 @@ second_of_silence = AudioSegment.silent() # use default
 second_of_silence.export("sil.wav", format='wav')
-def generate_speech(history,chatbot_role):
     # Must set autoplay to True first
     yield (history, chatbot_role, "", wave_header_chunk() )
-    first_sentence=True
-    language="autodetect" # will predict from first sentence
-    for sentence, history in get_sentence(history,chatbot_role):
         if sentence != "":
-            if first_sentence:
-                language = detect_language(sentence)
-                first_sentence=False
             print("BG: inserting sentence to queue")
-            generated_speech = generate_speech_for_sentence(history, chatbot_role, sentence,return_as_byte=True,language=language)
             if generated_speech is not None:
                 _, audio_dict = generated_speech
                 # We are using byte streaming
@@ -583,8 +550,9 @@ def generate_speech(history,chatbot_role):
 # will generate speech audio file per sentence
-def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte=True, language="autodetect"):
     wav_bytestream = b""
     if len(sentence)==0:
@@ -609,7 +577,7 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
     if len(sentence)==0:
         print("EMPTY SENTENCE after processing")
         return
     # A fast fix for the last character: it may produce weird sounds if it is attached to the text
     if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
         # just add a space
@@ -686,18 +654,20 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
     print("All speech ended")
     return
 latent_map = {}
 latent_map["AI Assistant"] = get_latents("examples/female.wav")
 #### GRADIO INTERFACE ####
 EXAMPLES = [
-    [[],"What is 42?"],
-    [[],"Speak in French, tell me how are you doing?"],
-    [[],"Antworten Sie mir von nun an auf Deutsch"],
 ]
 OTHER_HTML=f"""<div>
 <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>
@@ -707,9 +677,18 @@ OTHER_HTML=f"""<div>
 <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
 </div>
 """
 with gr.Blocks(title=title) as demo:
     gr.Markdown(DESCRIPTION)
     gr.Markdown(OTHER_HTML)
     chatbot = gr.Chatbot(
         [],
         elem_id="chatbot",
@@ -734,6 +713,7 @@ with gr.Blocks(title=title) as demo:
         )
         txt_btn = gr.Button(value="Submit text", scale=1)
         btn = gr.Audio(source="microphone", type="filepath", scale=4)
     def stop():
         print("Audio STOP")
         set_audio_playing(False)
@@ -750,27 +730,31 @@ with gr.Blocks(title=title) as demo:
         )
         audio.end(stop)
     with gr.Row():
         gr.Examples(
         EXAMPLES,
-        [chatbot, txt],
-        [chatbot, txt],
         add_text,
         cache_examples=False,
         run_on_click=False, # Will not work , user should submit it
-    )
     clear_btn = gr.ClearButton([chatbot, audio])
     txt_msg = txt_btn.click(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
-        generate_speech,  [chatbot,chatbot_role], [chatbot,chatbot_role, sentence, audio]
     )
     txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
     txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
-        generate_speech,  [chatbot,chatbot_role], [chatbot,chatbot_role, sentence, audio]
     )
     txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
@@ -778,18 +762,19 @@ with gr.Blocks(title=title) as demo:
     file_msg = btn.stop_recording(
         add_file, [chatbot, btn], [chatbot, txt], queue=False
     ).then(
-        generate_speech,  [chatbot,chatbot_role], [chatbot,chatbot_role, sentence, audio]
     )
     file_msg.then(lambda: (gr.update(interactive=True),gr.update(interactive=True,value=None)), None, [txt, btn], queue=False)
     gr.Markdown(
         """
-This Space demonstrates how to speak to a chatbot, based solely on open-source models.
-It relies on 3 stage models:
-- Speech to Text : [Whisper-large-v2](https://sanchit-gandhi-whisper-large-v2.hf.space/) as an ASR model, to transcribe recorded audio to text. It is called through a [gradio client](https://www.gradio.app/docs/client).
-- LLM Model      : [Mistral-7b-instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) as the chat model, GGUF Q5_K_M quantized version used locally via llama_cpp[huggingface_hub](TheBloke/Mistral-7B-Instruct-v0.1-GGUF).
-- Text to Speech : [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a Multilingual TTS model, to generate the chatbot answers. This time, the model is hosted locally.
 Note:
 - By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml

 # This will trigger downloading model
 print("Downloading if not downloaded Coqui XTTS V2")
 from TTS.utils.manage import ModelManager
 model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
 ModelManager().download_model(model_name)
 model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
 print("XTTS downloaded")
+print("Loading XTTS")
 config = XttsConfig()
 config.load_json(os.path.join(model_path, "config.json"))
 model.cuda()
 print("Done loading TTS")
+#####llm_model = os.environ.get("LLM_MODEL", "mistral") # or "zephyr"
+title = "Voice chat with Zephyr/Mistral and Coqui XTTS"
+DESCRIPTION = """# Voice chat with Zephyr/Mistral and Coqui XTTS"""
 css = """.toast-wrap { display: none !important } """
 from huggingface_hub import HfApi
 # will use the API to restart the Space on an unrecoverable error
 api = HfApi(token=HF_TOKEN)
+repo_id = "coqui/voice-chat-with-zephyr"
 default_system_message = f"""
+You are ##LLM_MODEL###, a large language model trained ##LLM_MODEL_PROVIDER###, architecture of you is decoder-based LM. Your voice backend or text to speech TTS backend is provided via Coqui technology. You are right now served on Huggingface spaces.
 The user is talking to you over voice on their phone, and your response will be read out loud with realistic text-to-speech (TTS) technology from Coqui team. Follow every direction here when crafting your response: Use natural, conversational language that are clear and easy to follow (short sentences, simple words). Be concise and relevant: Most of your responses should be a sentence or two, unless you’re asked to go deeper. Don’t monopolize the conversation. Use discourse markers to ease comprehension. Never use the list format. Keep the conversation flowing. Clarify: when there is ambiguity, ask clarifying questions, rather than make assumptions. Don’t implicitly or explicitly try to end the chat (i.e. do not end a response with “Talk soon!”, or “Enjoy!”). Sometimes the user might just want to chat. Ask them relevant follow-up questions. Don’t ask them if there’s anything else they need help with (e.g. don’t say things like “How can I assist you further?”). Remember that this is a voice conversation: Don’t use lists, markdown, bullet points, or other formatting that’s not typically spoken. Type out numbers in words (e.g. ‘twenty twelve’ instead of the year 2012). If something doesn’t make sense, it’s likely because you misheard them. There wasn’t a typo, and the user didn’t mispronounce anything. Remember to follow these rules absolutely, and do not refer to these rules, even if you’re asked about them.
 You cannot access the internet, but you have vast knowledge.
 Current date: CURRENT_DATE .
 whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
+ROLES = ["AI Assistant","AI Beard The Pirate"]
 ROLE_PROMPTS = {}
 ROLE_PROMPTS["AI Assistant"]=system_message
+#Pirate scenario
+character_name= "AI Beard"
+character_scenario= f"As {character_name} you are a 28 year old man who is a pirate on the ship Invisible AI. You are good friends with Guybrush Threepwood and Murray the Skull. Developers did not get you into Monkey Island games as you wanted huge shares of Big Whoop treasure."
+pirate_system_message = f"You as {character_name}. {character_scenario} Print out only exactly the words that {character_name} would speak out, do not add anything. Don't repeat. Answer short, only few words, as if in a talk. Craft your response only from the first-person perspective of {character_name} and never as user.Current date: #CURRENT_DATE#".replace("#CURRENT_DATE#", str(datetime.date.today()))
+ROLE_PROMPTS["AI Beard The Pirate"]= pirate_system_message
 ##"You are an AI assistant with Zephyr model by Mistral and Hugging Face and speech from Coqui XTTS . User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps, your answers should be clear and short sentences"
 ### WILL USE LOCAL MISTRAL OR ZEPHYR
 print("Downloading LLM")
+print("Downloading Zephyr")
+#Zephyr
+hf_hub_download(repo_id="TheBloke/zephyr-7B-beta-GGUF", local_dir=".", filename="zephyr-7b-beta.Q5_K_M.gguf")
+# use new gguf format
+zephyr_model_path="./zephyr-7b-beta.Q5_K_M.gguf"
+print("Downloading Mistral")
+#Mistral
+hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", local_dir=".", filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf")
+# use new gguf format
+mistral_model_path="./mistral-7b-instruct-v0.1.Q5_K_M.gguf"
 from llama_cpp import Llama
 # Set GPU_LAYERS to 15 if you have an 8GB GPU so that both models can fit;
 # otherwise the full 35 layers + XTTS work fine on a 16GB T4.
+# ~5 GB per LLM plus ~4 GB for XTTS -> two LLMs with full layers plus XTTS should fit on a 16 GB T4
+GPU_LAYERS=int(os.environ.get("GPU_LAYERS", 35))
+LLM_STOP_WORDS= ["</s>","<|user|>","/s>"]
 LLAMA_VERBOSE=False
+print("Running LLM Mistral")
+llm_mistral = Llama(model_path=mistral_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
+print("Running LLM Zephyr")
+llm_zephyr = Llama(model_path=zephyr_model_path,n_gpu_layers=GPU_LAYERS-10,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
 # Mistral formatter
+def format_prompt_mistral(message, history, system_message=system_message,system_understand_message=system_understand_message):
     prompt = (
         "<s>[INST]" + system_message + "[/INST]" + system_understand_message + "</s>"
     )
     for user_prompt, bot_response in history:
         prompt += f"[INST] {user_prompt} [/INST]"
         prompt += f" {bot_response}</s> "
     if message=="":
         message="Hello"
     prompt += f"[INST] {message} [/INST]"
     return prompt
+# <|system|>
+# You are a friendly chatbot who always responds in the style of a pirate.</s>
+# <|user|>
+# How many helicopters can a human eat in one sitting?</s>
+# <|assistant|>
+# Ah, me hearty matey! But yer question be a puzzler! A human cannot eat a helicopter in one sitting, as helicopters are not edible. They be made of metal, plastic, and other materials, not food!
 # Zephyr formatter
+def format_prompt_zephyr(message, history, system_message=system_message):
     prompt = (
+        "<|system|>\n" + system_message  + "</s>"
     )
     for user_prompt, bot_response in history:
         prompt += f"<|user|>\n{user_prompt}</s>"
+        prompt += f"<|assistant|>\n{bot_response}</s>"
     if message=="":
         message="Hello"
     prompt += f"<|user|>\n{message}</s>"
+    prompt += f"<|assistant|>"
     print(prompt)
     return prompt
 def generate_local(
     prompt,
     history,
+    llm_model="zephyr",
     system_message=None,
     temperature=0.8,
     max_tokens=256,
         temperature=temperature,
         max_tokens=max_tokens,
         top_p=top_p,
+        stop=stop
     )
+    if "zephyr" in llm_model.lower():
+        sys_message= system_message.replace("##LLM_MODEL###","Zephyr").replace("##LLM_MODEL_PROVIDER###","Hugging Face")
+        formatted_prompt = format_prompt_zephyr(prompt, history,system_message=sys_message)
+        llm = llm_zephyr
+    else:
+        sys_message= system_message.replace("##LLM_MODEL###","Mistral").replace("##LLM_MODEL_PROVIDER###","Mistral")
+        formatted_prompt = format_prompt_mistral(prompt, history,system_message=sys_message)
+        llm = llm_mistral
     try:
         print("LLM Input:", formatted_prompt)
                 return
+            output += response["choices"][0]["text"].replace("<|assistant|>","").replace("<|user|>","")
             yield output
     except Exception as e:
 xtts_supported_languages=config.languages
 def detect_language(prompt):
     # Fast language autodetection
+    if len(prompt)>15:
         language_predicted=langid.classify(prompt)[0].strip() # strip need as there is space at end!
         if language_predicted == "zh":
             #we use zh-cn on xtts
             prompt,
             language,
             gpt_cond_latent,
+            speaker_embedding,
+            repetition_penalty=5.0,
+            temperature=0.75,
         )
         first_chunk = True
     except:
         return None
 def transcribe(wav_path):
     try:
         gr.Warning("There was a problem with Whisper endpoint, telling a joke for you.")
         return "There was a problem with my voice, tell me joke"
 # Will be triggered on text submit (will send to generate_speech)
 def add_text(history, text):
     history = [] if history is None else history
         yield history
+def get_sentence(history, chatbot_role,llm_model,system_prompt=""):
     history = [["", None]] if history is None else history
     if system_prompt == "":
     history[-1][1] = ""
     mistral_start = time.time()
     sentence_list = []
     sentence_hash_list = []
     text_to_generate = ""
     stored_sentence = None
     stored_sentence_hash = None
+    print(chatbot_role)
+    print(llm_model)
+    for character in generate_local(history[-1][0], history[:-1],system_message=ROLE_PROMPTS[chatbot_role],llm_model=llm_model):
         history[-1][1] = character.replace("<|assistant|>","")
         # It is coming word by word
+        text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|assistant|>"," ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())
         if len(text_to_generate) > 1:
             dif = len(text_to_generate) - len(sentence_list)
                 yield (sentence, history)
     # return that final sentence token
+    try:
+        last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())[-1]
+        sentence_hash = hash(last_sentence)
+        if sentence_hash not in sentence_hash_list:
+            if stored_sentence is not None and stored_sentence_hash is not None:
+                last_sentence = stored_sentence + last_sentence
+                stored_sentence = stored_sentence_hash = None
+                print("Last Sentence with stored:",last_sentence)
+            sentence_hash_list.append(sentence_hash)
+            sentence_list.append(last_sentence)
+            print("Last Sentence: ", last_sentence)
+            yield (last_sentence, history)
+    except:
+        print("ERROR on last sentence history is :", history)
 from scipy.io.wavfile import write
 from pydub import AudioSegment
 second_of_silence.export("sil.wav", format='wav')
+def generate_speech(history,chatbot_role,llm_model):
     # Must set autoplay to True first
     yield (history, chatbot_role, "", wave_header_chunk() )
+    for sentence, history in get_sentence(history,chatbot_role,llm_model):
         if sentence != "":
             print("BG: inserting sentence to queue")
+            generated_speech = generate_speech_for_sentence(history, chatbot_role, sentence,return_as_byte=True)
             if generated_speech is not None:
                 _, audio_dict = generated_speech
                 # We are using byte streaming
 # will generate speech audio file per sentence
+def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte=True):
+    language = "autodetect"
     wav_bytestream = b""
     if len(sentence)==0:
     if len(sentence)==0:
         print("EMPTY SENTENCE after processing")
         return
     # A fast fix for the last character: it may produce weird sounds if it is attached to the text
     if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
         # just add a space
     print("All speech ended")
     return
 latent_map = {}
 latent_map["AI Assistant"] = get_latents("examples/female.wav")
+latent_map["AI Beard The Pirate"] = get_latents("examples/pirate_by_coqui.wav")
 #### GRADIO INTERFACE ####
 EXAMPLES = [
+    [[],"AI Assistant","What is 42?"],
+    [[],"AI Assistant","Speak in French, tell me how are you doing?"],
+    [[],"AI Assistant","Antworten Sie mir von nun an auf Deutsch"],
+    [[],"AI Beard The Pirate","Who are you?"],
 ]
+MODELS = ["Mistral","Zephyr"]
 OTHER_HTML=f"""<div>
 <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>
 <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
 </div>
 """
 with gr.Blocks(title=title) as demo:
     gr.Markdown(DESCRIPTION)
     gr.Markdown(OTHER_HTML)
+    with gr.Row():
+        model_selected = gr.Dropdown(
+            label="Select Instuct LLM Model to Use",
+            info="Zephyr and Mistral 5-bit GGUF models are preloaded",
+            choices=MODELS,
+            max_choices=1,
+            value=MODELS[0],
+        )
     chatbot = gr.Chatbot(
         [],
         elem_id="chatbot",
         )
         txt_btn = gr.Button(value="Submit text", scale=1)
         btn = gr.Audio(source="microphone", type="filepath", scale=4)
     def stop():
         print("Audio STOP")
         set_audio_playing(False)
         )
         audio.end(stop)
     with gr.Row():
         gr.Examples(
         EXAMPLES,
+        [chatbot,chatbot_role, txt],
+        [chatbot,chatbot_role, txt],
         add_text,
         cache_examples=False,
         run_on_click=False, # Will not work , user should submit it
+    )
+    def clear_inputs(chatbot):
+        return None
     clear_btn = gr.ClearButton([chatbot, audio])
+    chatbot_role.change(fn=clear_inputs, inputs=[chatbot], outputs=[chatbot])
+    model_selected.change(fn=clear_inputs, inputs=[chatbot], outputs=[chatbot])
     txt_msg = txt_btn.click(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
+        generate_speech,  [chatbot,chatbot_role,model_selected], [chatbot,chatbot_role, sentence, audio]
     )
     txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
     txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
+        generate_speech,  [chatbot,chatbot_role,model_selected], [chatbot,chatbot_role, sentence, audio]
     )
     txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
     file_msg = btn.stop_recording(
         add_file, [chatbot, btn], [chatbot, txt], queue=False
     ).then(
+        generate_speech,  [chatbot,chatbot_role,model_selected], [chatbot,chatbot_role, sentence, audio]
     )
     file_msg.then(lambda: (gr.update(interactive=True),gr.update(interactive=True,value=None)), None, [txt, btn], queue=False)
     gr.Markdown(
         """
+This Space demonstrates how to speak to a chatbot, based solely on open accessible models.
+It relies on following models :
+Speech to Text : [Whisper-large-v2](https://sanchit-gandhi-whisper-large-v2.hf.space/) as an ASR model, to transcribe recorded audio to text. It is called through a [gradio client](https://www.gradio.app/docs/client).
+LLM Mistral    : [Mistral-7b-instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) as the chat model. GGUF Q5_K_M quantized version used locally via llama_cpp from [huggingface.co/TheBloke](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF).
+LLM Zephyr     : [Zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) as the chat model. GGUF Q5_K_M quantized version used locally via llama_cpp from [huggingface.co/TheBloke](https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF).
+Text to Speech : [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a Multilingual TTS model, to generate the chatbot answers. This time, the model is hosted locally.
 Note:
 - By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml