Spaces: Runtime error
gorkemgoknar committed · Commit 6dcf90d · 1 parent: fcfbb80
Mistral use endpoint
app.py CHANGED
@@ -90,7 +90,7 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 # will use api to restart space on a unrecoverable error
 api = HfApi(token=HF_TOKEN)
 
-repo_id = "coqui/voice-chat-with-
+repo_id = "coqui/voice-chat-with-zephyr"
 
 
 default_system_message = f"""
@@ -147,29 +147,31 @@ print("Downloading Mistral 7B Instruct")
 hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", local_dir=".", filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf")
 mistral_model_path="./mistral-7b-instruct-v0.1.Q5_K_M.gguf"
 
-print("Downloading Yi-6B")
+#print("Downloading Yi-6B")
 #Yi-6B
-hf_hub_download(repo_id="TheBloke/Yi-6B-GGUF", local_dir=".", filename="yi-6b.Q5_K_M.gguf")
-yi_model_path="./yi-6b.Q5_K_M.gguf"
+#hf_hub_download(repo_id="TheBloke/Yi-6B-GGUF", local_dir=".", filename="yi-6b.Q5_K_M.gguf")
+#yi_model_path="./yi-6b.Q5_K_M.gguf"
 
 
 from llama_cpp import Llama
 # set GPU_LAYERS to 15 if you have a 8GB GPU so both models can fit in
 # else 35 full layers + XTTS works fine on T4 16GB
 # 5gb per llm, 4gb XTTS -> full layers should fit T4 16GB , 2LLM + XTTS
-GPU_LAYERS=int(os.environ.get("GPU_LAYERS",
+GPU_LAYERS=int(os.environ.get("GPU_LAYERS", 5))
 
-LLM_STOP_WORDS= ["</s>","<|user|>","/s>"]
+LLM_STOP_WORDS= ["</s>","<|user|>","/s>","<EOT>"]
 
 LLAMA_VERBOSE=False
-print("Running LLM Mistral")
-llm_mistral = Llama(model_path=mistral_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
+print("Running LLM Mistral as InferenceClient")
+#llm_mistral = Llama(model_path=mistral_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
+llm_mistral = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")
+
 
 print("Running LLM Zephyr")
-llm_zephyr = Llama(model_path=zephyr_model_path,n_gpu_layers=GPU_LAYERS
+llm_zephyr = Llama(model_path=zephyr_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
 
-print("Running Yi LLM")
-llm_yi = Llama(model_path=yi_model_path,n_gpu_layers=GPU_LAYERS
+#print("Running Yi LLM")
+#llm_yi = Llama(model_path=yi_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE,model_type="mistral")
 
 
 # Mistral formatter
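The comments in the hunk above carry the VRAM budget behind GPU_LAYERS: roughly 5 GB per local 5-bit GGUF LLM plus about 4 GB for XTTS, so two local LLMs plus XTTS (~14 GB) just fit a 16 GB T4, while moving Mistral to the inference endpoint leaves only Zephyr local (~9 GB). A minimal sketch of that arithmetic, assuming the rough per-model figures from the comment; the variable names are illustrative only and not part of the commit:

```python
# Rough VRAM budgeting implied by the comments above; figures are the comment's
# estimates, not measurements.
import os

GGUF_LLM_GB = 5      # per 5-bit quantized 7B model, per the comment
XTTS_GB = 4          # XTTS estimate, per the comment
T4_GB = 16

before = 2 * GGUF_LLM_GB + XTTS_GB   # Mistral + Zephyr both local: ~14 GB, tight on a T4
after = 1 * GGUF_LLM_GB + XTTS_GB    # only Zephyr local after this commit: ~9 GB

# GPU_LAYERS stays configurable: the comment suggests 15 layers for an 8 GB GPU,
# 35 (all layers of a 7B model) on a 16 GB T4; the commit's default is 5.
gpu_layers = int(os.environ.get("GPU_LAYERS", 5))
print(before, after, gpu_layers)
```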
@@ -186,6 +188,20 @@ def format_prompt_mistral(message, history, system_message=system_message,system
     prompt += f"[INST] {message} [/INST]"
     return prompt
 
+def format_prompt_yi(message, history, system_message=system_message,system_understand_message=system_understand_message):
+    prompt = (
+        "<s>[INST] <<SYS>>\n" + system_message + "\n<</SYS>>\n\n[/INST]"
+    )
+    for user_prompt, bot_response in history:
+        prompt += f"[INST] {user_prompt} [/INST]"
+        prompt += f" {bot_response}</s> "
+
+    if message=="":
+        message="Hello"
+    prompt += f"[INST] {message} [/INST]"
+    return prompt
+
+
 # <|system|>
 # You are a friendly chatbot who always responds in the style of a pirate.</s>
 # <|user|>
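For reference, a small worked example of the string the new format_prompt_yi builds; the system message and chat turns are placeholders for illustration, not the app's real prompts:

```python
# Illustrative only: reproduces the concatenation done by format_prompt_yi above.
system_message = "You are a helpful assistant."      # placeholder
history = [("Hi", "Hello! How can I help?")]         # placeholder turn
message = "What is XTTS?"                            # placeholder

prompt = "<s>[INST] <<SYS>>\n" + system_message + "\n<</SYS>>\n\n[/INST]"
for user_prompt, bot_response in history:
    prompt += f"[INST] {user_prompt} [/INST]"
    prompt += f" {bot_response}</s> "
prompt += f"[INST] {message} [/INST]"

print(prompt)
# <s>[INST] <<SYS>>
# You are a helpful assistant.
# <</SYS>>
#
# [/INST][INST] Hi [/INST] Hello! How can I help?</s> [INST] What is XTTS? [/INST]
```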
@@ -208,13 +224,14 @@ def format_prompt_zephyr(message, history, system_message=system_message):
     print(prompt)
     return prompt
 
+
 def generate_local(
     prompt,
     history,
     llm_model="zephyr",
     system_message=None,
-    temperature=0.
-    max_tokens=
+    temperature=0.8,
+    max_tokens=256,
     top_p=0.95,
     stop = LLM_STOP_WORDS
     ):
@@ -239,37 +256,64 @@ def generate_local(
         llm_provider= "01.ai"
         llm_model = "Yi"
         llm = llm_yi
+        max_tokens= round(max_tokens/2)
     else:
         llm_provider= "Mistral"
         llm_model = "Mistral"
         llm = llm_mistral
     sys_message= system_message.replace("##LLM_MODEL###",llm_model).replace("##LLM_MODEL_PROVIDER###",llm_provider)
     sys_system_understand_message = system_understand_message.replace("##LLM_MODEL###",llm_model).replace("##LLM_MODEL_PROVIDER###",llm_provider)
-
-
+
+    if "yi" in llm_model.lower():
+        formatted_prompt = format_prompt_yi(prompt, history,system_message=sys_message,system_understand_message=sys_system_understand_message)
+    else:
+        formatted_prompt = format_prompt_mistral(prompt, history,system_message=sys_message,system_understand_message=sys_system_understand_message)
 
     try:
         print("LLM Input:", formatted_prompt)
-
-
-
-
-
-
-
-
-
-
-
-
+        if llm_model=="Mistral":
+            # USE Mistral endpoint
+            generate_kwargs = dict(
+                temperature=temperature,
+                max_new_tokens=max_tokens,
+                top_p=top_p,
+            )
+
+            stream = llm_mistral.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+            output = ""
+            for response in stream:
+                character = response.token.text
+                if "<|user|>" in character:
+                    # end of context
+                    return
+
+                if emoji.is_emoji(character):
+                    # Bad emoji not a meaning messes chat from next lines
+                    return
 
-
-
-
+                output += character
+                yield output
+        else:
+            # Local GGUF
+            stream = llm(
+                formatted_prompt,
+                **generate_kwargs,
+                stream=True,
+            )
+            output = ""
+            for response in stream:
+                character= response["choices"][0]["text"]
+
+                if "<|user|>" in character:
+                    # end of context
+                    return
+
+                if emoji.is_emoji(character):
+                    # Bad emoji not a meaning messes chat from next lines
+                    return
 
-
-
-        yield output
+                output += response["choices"][0]["text"].replace("<|assistant|>","").replace("<|user|>","")
+                yield output
 
     except Exception as e:
         if "Too Many Requests" in str(e):
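The heart of the commit is the new Mistral branch above: instead of calling the local llama_cpp model, generate_local now streams tokens from the hosted Mistral endpoint via huggingface_hub's InferenceClient. A self-contained sketch of that call path, using the same model id and generation parameters as the diff; the prompt string is a placeholder:

```python
# Minimal sketch of the endpoint-backed streaming used in the Mistral branch.
from huggingface_hub import InferenceClient

client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")

stream = client.text_generation(
    "[INST] Say hello in one short sentence. [/INST]",  # placeholder prompt
    temperature=0.8,
    max_new_tokens=256,
    top_p=0.95,
    stream=True,
    details=True,            # yield token objects rather than plain strings
    return_full_text=False,
)

output = ""
for response in stream:
    output += response.token.text   # same access pattern as generate_local above
print(output)
```

The `details=True` flag is what makes each streamed item carry a `token` object, which is why both the diff and this sketch read `response.token.text` rather than treating the item as a plain string.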
@@ -697,7 +741,7 @@ EXAMPLES = [
 
 ]
 
-MODELS = ["Mistral 7B Instruct","Zephyr 7B Beta"
+MODELS = ["Mistral 7B Instruct","Zephyr 7B Beta"]
 
 OTHER_HTML=f"""<div>
 <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>
@@ -714,7 +758,7 @@ with gr.Blocks(title=title) as demo:
         with gr.Row():
             model_selected = gr.Dropdown(
                 label="Select Instuct LLM Model to Use",
-                info="Mistral, Zephyr,
+                info="Mistral, Zephyr: Mistral uses inference endpoint, Zephyr is 5 bit GGUF",
                 choices=MODELS,
                 max_choices=1,
                 value=MODELS[0],
@@ -802,9 +846,8 @@ with gr.Blocks(title=title) as demo:
 This Space demonstrates how to speak to a chatbot, based solely on open accessible models.
 It relies on following models :
 Speech to Text : [Whisper-large-v2](https://sanchit-gandhi-whisper-large-v2.hf.space/) as an ASR model, to transcribe recorded audio to text. It is called through a [gradio client](https://www.gradio.app/docs/client).
-LLM Mistral : [Mistral-7b-instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) as the chat model
+LLM Mistral : [Mistral-7b-instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) as the chat model.
 LLM Zephyr : [Zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) as the chat model. GGUF Q5_K_M quantized version used locally via llama_cpp from [huggingface.co/TheBloke](https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF).
-LLM Yi : [Yi-6B](https://huggingface.co/01-ai/Yi-6B) as the chat model. GGUF Q5_K_M quantized version used locally via llama_cpp from [huggingface.co/TheBloke](https://huggingface.co/TheBloke/Yi-6B-GGUF).
 Text to Speech : [Coqui's XTTS V2](https://huggingface.co/spaces/coqui/xtts) as a Multilingual TTS model, to generate the chatbot answers. This time, the model is hosted locally.
 
 Note:
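The description above says the Whisper Space is called through a gradio client. A hedged sketch of what such a call can look like with gradio_client; the endpoint name and argument list are assumptions for illustration, not taken from this commit:

```python
# Hedged illustration of calling the Whisper Space mentioned in the description.
# The api_name and parameters are assumptions, not part of this commit.
from gradio_client import Client

whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
transcript = whisper_client.predict(
    "recording.wav",       # path to the recorded audio (placeholder)
    "transcribe",          # task selector (assumed)
    api_name="/predict",   # assumed endpoint name
)
print(transcript)
```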
@@ -813,4 +856,4 @@ Note:
 - iOS (Iphone/Ipad) devices may not experience voice due to autoplay being disabled on these devices by Vendor"""
 )
 demo.queue()
-demo.launch(debug=True)
+demo.launch(debug=True,share=True)