Spaces: Runtime error
gorkemgoknar committed · Commit 6dcf90d · 1 parent: fcfbb80
Mistral use endpoint
app.py CHANGED
@@ -90,7 +90,7 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 # will use api to restart space on a unrecoverable error
 api = HfApi(token=HF_TOKEN)
 
-repo_id = "coqui/voice-chat-with-
+repo_id = "coqui/voice-chat-with-zephyr"
 
 
 default_system_message = f"""
@@ -147,29 +147,31 @@ print("Downloading Mistral 7B Instruct")
 hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", local_dir=".", filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf")
 mistral_model_path="./mistral-7b-instruct-v0.1.Q5_K_M.gguf"
 
-print("Downloading Yi-6B")
+#print("Downloading Yi-6B")
 #Yi-6B
-hf_hub_download(repo_id="TheBloke/Yi-6B-GGUF", local_dir=".", filename="yi-6b.Q5_K_M.gguf")
-yi_model_path="./yi-6b.Q5_K_M.gguf"
+#hf_hub_download(repo_id="TheBloke/Yi-6B-GGUF", local_dir=".", filename="yi-6b.Q5_K_M.gguf")
+#yi_model_path="./yi-6b.Q5_K_M.gguf"
 
 
 from llama_cpp import Llama
 # set GPU_LAYERS to 15 if you have a 8GB GPU so both models can fit in
 # else 35 full layers + XTTS works fine on T4 16GB
 # 5gb per llm, 4gb XTTS -> full layers should fit T4 16GB , 2LLM + XTTS
-GPU_LAYERS=int(os.environ.get("GPU_LAYERS",
+GPU_LAYERS=int(os.environ.get("GPU_LAYERS", 5))
 
-LLM_STOP_WORDS= ["</s>","<|user|>","/s>"]
+LLM_STOP_WORDS= ["</s>","<|user|>","/s>","<EOT>"]
 
 LLAMA_VERBOSE=False
-print("Running LLM Mistral")
-llm_mistral = Llama(model_path=mistral_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
+print("Running LLM Mistral as InferenceClient")
+#llm_mistral = Llama(model_path=mistral_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
+llm_mistral = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")
+
 
 print("Running LLM Zephyr")
-llm_zephyr = Llama(model_path=zephyr_model_path,n_gpu_layers=GPU_LAYERS
+llm_zephyr = Llama(model_path=zephyr_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
 
-print("Running Yi LLM")
-llm_yi = Llama(model_path=yi_model_path,n_gpu_layers=GPU_LAYERS
+#print("Running Yi LLM")
+#llm_yi = Llama(model_path=yi_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE,model_type="mistral")
 
 
 # Mistral formatter
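The comments in the hunk above carry the VRAM budget behind GPU_LAYERS: roughly 5 GB per local 5-bit GGUF LLM plus about 4 GB for XTTS, so two local LLMs plus XTTS (~14 GB) just fit a 16 GB T4, while moving Mistral to the inference endpoint leaves only Zephyr local (~9 GB). A minimal sketch of that arithmetic, assuming the rough per-model figures from the comment; the variable names are illustrative only and not part of the commit:

```python
# Rough VRAM budgeting implied by the comments above; figures are the comment's
# estimates, not measurements.
import os

GGUF_LLM_GB = 5      # per 5-bit quantized 7B model, per the comment
XTTS_GB = 4          # XTTS estimate, per the comment
T4_GB = 16

before = 2 * GGUF_LLM_GB + XTTS_GB   # Mistral + Zephyr both local: ~14 GB, tight on a T4
after = 1 * GGUF_LLM_GB + XTTS_GB    # only Zephyr local after this commit: ~9 GB

# GPU_LAYERS stays configurable: the comment suggests 15 layers for an 8 GB GPU,
# 35 (all layers of a 7B model) on a 16 GB T4; the commit's default is 5.
gpu_layers = int(os.environ.get("GPU_LAYERS", 5))
print(before, after, gpu_layers)
```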
@@ -186,6 +188,20 @@ def format_prompt_mistral(message, history, system_message=system_message,system
     prompt += f"[INST] {message} [/INST]"
     return prompt
 
+def format_prompt_yi(message, history, system_message=system_message,system_understand_message=system_understand_message):
+    prompt = (
+        "<s>[INST] <<SYS>>\n" + system_message + "\n<</SYS>>\n\n[/INST]"
+    )
+    for user_prompt, bot_response in history:
+        prompt += f"[INST] {user_prompt} [/INST]"
+        prompt += f" {bot_response}</s> "
+
+    if message=="":
+        message="Hello"
+    prompt += f"[INST] {message} [/INST]"
+    return prompt
+
+
 # <|system|>
 # You are a friendly chatbot who always responds in the style of a pirate.</s>
 # <|user|>
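For reference, a small worked example of the string the new format_prompt_yi builds; the system message and chat turns are placeholders for illustration, not the app's real prompts:

```python
# Illustrative only: reproduces the concatenation done by format_prompt_yi above.
system_message = "You are a helpful assistant."      # placeholder
history = [("Hi", "Hello! How can I help?")]         # placeholder turn
message = "What is XTTS?"                            # placeholder

prompt = "<s>[INST] <<SYS>>\n" + system_message + "\n<</SYS>>\n\n[/INST]"
for user_prompt, bot_response in history:
    prompt += f"[INST] {user_prompt} [/INST]"
    prompt += f" {bot_response}</s> "
prompt += f"[INST] {message} [/INST]"

print(prompt)
# <s>[INST] <<SYS>>
# You are a helpful assistant.
# <</SYS>>
#
# [/INST][INST] Hi [/INST] Hello! How can I help?</s> [INST] What is XTTS? [/INST]
```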
@@ -208,13 +224,14 @@ def format_prompt_zephyr(message, history, system_message=system_message):
     print(prompt)
     return prompt
 
+
 def generate_local(
     prompt,
     history,
     llm_model="zephyr",
     system_message=None,
-    temperature=0.
-    max_tokens=
+    temperature=0.8,
+    max_tokens=256,
     top_p=0.95,
     stop = LLM_STOP_WORDS
     ):
@@ -239,37 +256,64 @@ def generate_local(
         llm_provider= "01.ai"
         llm_model = "Yi"
         llm = llm_yi
+        max_tokens= round(max_tokens/2)
     else:
         llm_provider= "Mistral"
         llm_model = "Mistral"
         llm = llm_mistral
     sys_message= system_message.replace("##LLM_MODEL###",llm_model).replace("##LLM_MODEL_PROVIDER###",llm_provider)
     sys_system_understand_message = system_understand_message.replace("##LLM_MODEL###",llm_model).replace("##LLM_MODEL_PROVIDER###",llm_provider)
-
-
+
+    if "yi" in llm_model.lower():
+        formatted_prompt = format_prompt_yi(prompt, history,system_message=sys_message,system_understand_message=sys_system_understand_message)
+    else:
+        formatted_prompt = format_prompt_mistral(prompt, history,system_message=sys_message,system_understand_message=sys_system_understand_message)
 
     try:
         print("LLM Input:", formatted_prompt)
-
-
-
-
-
-
-
-
-
-
-
-
+        if llm_model=="Mistral":
+            # USE Mistral endpoint
+            generate_kwargs = dict(
+                temperature=temperature,
+                max_new_tokens=max_tokens,
+                top_p=top_p,
+            )
+
+            stream = llm_mistral.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+            output = ""
+            for response in stream:
+                character = response.token.text
+                if "<|user|>" in character:
+                    # end of context
+                    return
+
+                if emoji.is_emoji(character):
+                    # Bad emoji not a meaning messes chat from next lines
+                    return
 
-
-
-
+                output += character
+                yield output
+        else:
+            # Local GGUF
+            stream = llm(
+                formatted_prompt,
+                **generate_kwargs,
+                stream=True,
+            )
+            output = ""
+            for response in stream:
+                character= response["choices"][0]["text"]
+
+                if "<|user|>" in character:
+                    # end of context
+                    return
+
+                if emoji.is_emoji(character):
+                    # Bad emoji not a meaning messes chat from next lines
+                    return
 
-
-
-        yield output
+                output += response["choices"][0]["text"].replace("<|assistant|>","").replace("<|user|>","")
+                yield output
 
     except Exception as e:
         if "Too Many Requests" in str(e):
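The heart of the commit is the new Mistral branch above: instead of calling the local llama_cpp model, generate_local now streams tokens from the hosted Mistral endpoint via huggingface_hub's InferenceClient. A self-contained sketch of that call path, using the same model id and generation parameters as the diff; the prompt string is a placeholder:

```python
# Minimal sketch of the endpoint-backed streaming used in the Mistral branch.
from huggingface_hub import InferenceClient

client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")

stream = client.text_generation(
    "[INST] Say hello in one short sentence. [/INST]",  # placeholder prompt
    temperature=0.8,
    max_new_tokens=256,
    top_p=0.95,
    stream=True,
    details=True,            # yield token objects rather than plain strings
    return_full_text=False,
)

output = ""
for response in stream:
    output += response.token.text   # same access pattern as generate_local above
print(output)
```

The `details=True` flag is what makes each streamed item carry a `token` object, which is why both the diff and this sketch read `response.token.text` rather than treating the item as a plain string.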
@@ -697,7 +741,7 @@ EXAMPLES = [
 
 ]
 
-MODELS = ["Mistral 7B Instruct","Zephyr 7B Beta"
+MODELS = ["Mistral 7B Instruct","Zephyr 7B Beta"]
 
 OTHER_HTML=f"""<div>
 <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>
@@ -714,7 +758,7 @@ with gr.Blocks(title=title) as demo:
         with gr.Row():
             model_selected = gr.Dropdown(
                 label="Select Instuct LLM Model to Use",
-                info="Mistral, Zephyr,
+                info="Mistral, Zephyr: Mistral uses inference endpoint, Zephyr is 5 bit GGUF",
                 choices=MODELS,
                 max_choices=1,
                 value=MODELS[0],
@@ -802,9 +846,8 @@ with gr.Blocks(title=title) as demo:
 This Space demonstrates how to speak to a chatbot, based solely on open accessible models.
 It relies on following models :
 Speech to Text : [Whisper-large-v2](https://sanchit-gandhi-whisper-large-v2.hf.space/) as an ASR model, to transcribe recorded audio to text. It is called through a [gradio client](https://www.gradio.app/docs/client).
-LLM Mistral : [Mistral-7b-instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) as the chat model
+LLM Mistral : [Mistral-7b-instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) as the chat model.
 LLM Zephyr : [Zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) as the chat model. GGUF Q5_K_M quantized version used locally via llama_cpp from [huggingface.co/TheBloke](https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF).
-LLM Yi : [Yi-6B](https://huggingface.co/01-ai/Yi-6B) as the chat model. GGUF Q5_K_M quantized version used locally via llama_cpp from [huggingface.co/TheBloke](https://huggingface.co/TheBloke/Yi-6B-GGUF).
 Text to Speech : [Coqui's XTTS V2](https://huggingface.co/spaces/coqui/xtts) as a Multilingual TTS model, to generate the chatbot answers. This time, the model is hosted locally.
 
 Note:
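The description above says the Whisper Space is called through a gradio client. A hedged sketch of what such a call can look like with gradio_client; the endpoint name and argument list are assumptions for illustration, not taken from this commit:

```python
# Hedged illustration of calling the Whisper Space mentioned in the description.
# The api_name and parameters are assumptions, not part of this commit.
from gradio_client import Client

whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
transcript = whisper_client.predict(
    "recording.wav",       # path to the recorded audio (placeholder)
    "transcribe",          # task selector (assumed)
    api_name="/predict",   # assumed endpoint name
)
print(transcript)
```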
@@ -813,4 +856,4 @@ Note:
 - iOS (Iphone/Ipad) devices may not experience voice due to autoplay being disabled on these devices by Vendor"""
 )
 demo.queue()
-demo.launch(debug=True)
+demo.launch(debug=True,share=True)