gorkemgoknar committed on
Commit 6dcf90d
1 Parent(s): fcfbb80

Mistral use endpoint

Files changed (1)
  1. app.py +81 -38
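The gist of the commit: Mistral is no longer loaded as a local GGUF through llama_cpp; it is queried through huggingface_hub's InferenceClient and streamed token by token, while Zephyr stays local and Yi is commented out. A minimal sketch of the new endpoint path, using the model id from the diff (the prompt text and sampling values here are only illustrative):

from huggingface_hub import InferenceClient

client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")
# With stream=True and details=True each chunk carries a token object,
# which is why app.py below reads response.token.text.
stream = client.text_generation(
    "<s>[INST] Hello [/INST]",   # illustrative prompt, not the Space's real one
    max_new_tokens=256,
    temperature=0.8,
    top_p=0.95,
    stream=True,
    details=True,
    return_full_text=False,
)
answer = ""
for chunk in stream:
    answer += chunk.token.text
print(answer)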
app.py CHANGED
@@ -90,7 +90,7 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 # will use api to restart space on a unrecoverable error
 api = HfApi(token=HF_TOKEN)
 
-repo_id = "coqui/voice-chat-with-mistral"
+repo_id = "coqui/voice-chat-with-zephyr"
 
 
 default_system_message = f"""
@@ -147,29 +147,31 @@ print("Downloading Mistral 7B Instruct")
 hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", local_dir=".", filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf")
 mistral_model_path="./mistral-7b-instruct-v0.1.Q5_K_M.gguf"
 
-print("Downloading Yi-6B")
+#print("Downloading Yi-6B")
 #Yi-6B
-hf_hub_download(repo_id="TheBloke/Yi-6B-GGUF", local_dir=".", filename="yi-6b.Q5_K_M.gguf")
-yi_model_path="./yi-6b.Q5_K_M.gguf"
+#hf_hub_download(repo_id="TheBloke/Yi-6B-GGUF", local_dir=".", filename="yi-6b.Q5_K_M.gguf")
+#yi_model_path="./yi-6b.Q5_K_M.gguf"
 
 
 from llama_cpp import Llama
 # set GPU_LAYERS to 15 if you have a 8GB GPU so both models can fit in
 # else 35 full layers + XTTS works fine on T4 16GB
 # 5gb per llm, 4gb XTTS -> full layers should fit T4 16GB , 2LLM + XTTS
-GPU_LAYERS=int(os.environ.get("GPU_LAYERS", 35))
+GPU_LAYERS=int(os.environ.get("GPU_LAYERS", 5))
 
-LLM_STOP_WORDS= ["</s>","<|user|>","/s>"]
+LLM_STOP_WORDS= ["</s>","<|user|>","/s>","<EOT>"]
 
 LLAMA_VERBOSE=False
-print("Running LLM Mistral")
-llm_mistral = Llama(model_path=mistral_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
+print("Running LLM Mistral as InferenceClient")
+#llm_mistral = Llama(model_path=mistral_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
+llm_mistral = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")
+
 
 print("Running LLM Zephyr")
-llm_zephyr = Llama(model_path=zephyr_model_path,n_gpu_layers=GPU_LAYERS-15,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
+llm_zephyr = Llama(model_path=zephyr_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
 
-print("Running Yi LLM")
-llm_yi = Llama(model_path=yi_model_path,n_gpu_layers=GPU_LAYERS-15,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
+#print("Running Yi LLM")
+#llm_yi = Llama(model_path=yi_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE,model_type="mistral")
 
 
 # Mistral formatter
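The GPU_LAYERS comments spell out the memory budget: roughly 5 GB per local GGUF LLM plus about 4 GB for XTTS, so 35 fully offloaded layers fit a 16 GB T4 while one LLM runs locally. With Mistral moved to the endpoint, the default drops from 35 to 5, so only a few Zephyr layers are offloaded unless the variable is set. A small sketch of how the setting is consumed (mirrors the diff; the printout is illustrative):

import os

# Budget from the comment above (rough estimates, not measurements):
# ~5 GB per local GGUF LLM + ~4 GB for XTTS on a 16 GB T4.
GPU_LAYERS = int(os.environ.get("GPU_LAYERS", 5))  # new default: only Zephyr is local now
print(f"llama_cpp will offload n_gpu_layers={GPU_LAYERS}; "
      "set GPU_LAYERS=35 for full offload on a 16 GB T4, or ~15 on an 8 GB card")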
@@ -186,6 +188,20 @@ def format_prompt_mistral(message, history, system_message=system_message,system
     prompt += f"[INST] {message} [/INST]"
     return prompt
 
+def format_prompt_yi(message, history, system_message=system_message,system_understand_message=system_understand_message):
+    prompt = (
+        "<s>[INST] <<SYS>>\n" + system_message + "\n<</SYS>>\n\n[/INST]"
+    )
+    for user_prompt, bot_response in history:
+        prompt += f"[INST] {user_prompt} [/INST]"
+        prompt += f" {bot_response}</s> "
+
+    if message=="":
+        message="Hello"
+    prompt += f"[INST] {message} [/INST]"
+    return prompt
+
+
 # <|system|>
 # You are a friendly chatbot who always responds in the style of a pirate.</s>
 # <|user|>
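The new format_prompt_yi reuses the Llama-2-style [INST]/<<SYS>> template rather than a Yi-specific one (the Yi model itself is commented out elsewhere in this commit). A hypothetical call showing the string it assembles; the history and system message are invented for illustration:

# assuming format_prompt_yi as defined in the hunk above
history = [("Hi", "Hello, I am Yi.")]
print(format_prompt_yi("How are you?", history, system_message="You are a helpful voice assistant."))
# <s>[INST] <<SYS>>
# You are a helpful voice assistant.
# <</SYS>>
#
# [/INST][INST] Hi [/INST] Hello, I am Yi.</s> [INST] How are you? [/INST]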
@@ -208,13 +224,14 @@ def format_prompt_zephyr(message, history, system_message=system_message):
     print(prompt)
     return prompt
 
+
 def generate_local(
     prompt,
     history,
     llm_model="zephyr",
     system_message=None,
-    temperature=0.85,
-    max_tokens=128,
+    temperature=0.8,
+    max_tokens=256,
     top_p=0.95,
     stop = LLM_STOP_WORDS
):
@@ -239,37 +256,64 @@ def generate_local(
         llm_provider= "01.ai"
         llm_model = "Yi"
         llm = llm_yi
+        max_tokens= round(max_tokens/2)
     else:
         llm_provider= "Mistral"
         llm_model = "Mistral"
         llm = llm_mistral
     sys_message= system_message.replace("##LLM_MODEL###",llm_model).replace("##LLM_MODEL_PROVIDER###",llm_provider)
     sys_system_understand_message = system_understand_message.replace("##LLM_MODEL###",llm_model).replace("##LLM_MODEL_PROVIDER###",llm_provider)
-    formatted_prompt = format_prompt_mistral(prompt, history,system_message=sys_message,system_understand_message=sys_system_understand_message)
-
+
+    if "yi" in llm_model.lower():
+        formatted_prompt = format_prompt_yi(prompt, history,system_message=sys_message,system_understand_message=sys_system_understand_message)
+    else:
+        formatted_prompt = format_prompt_mistral(prompt, history,system_message=sys_message,system_understand_message=sys_system_understand_message)
 
     try:
         print("LLM Input:", formatted_prompt)
-        stream = llm(
-            formatted_prompt,
-            **generate_kwargs,
-            stream=True,
-        )
-        output = ""
-        for response in stream:
-            character= response["choices"][0]["text"]
-
-            if "<|user|>" in character:
-                # end of context
-                return
+        if llm_model=="Mistral":
+            # USE Mistral endpoint
+            generate_kwargs = dict(
+                temperature=temperature,
+                max_new_tokens=max_tokens,
+                top_p=top_p,
+            )
+
+            stream = llm_mistral.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+            output = ""
+            for response in stream:
+                character = response.token.text
+                if "<|user|>" in character:
+                    # end of context
+                    return
+
+                if emoji.is_emoji(character):
+                    # Bad emoji not a meaning messes chat from next lines
+                    return
 
-            if emoji.is_emoji(character):
-                # Bad emoji not a meaning messes chat from next lines
-                return
+                output += character
+                yield output
+        else:
+            # Local GGUF
+            stream = llm(
+                formatted_prompt,
+                **generate_kwargs,
+                stream=True,
+            )
+            output = ""
+            for response in stream:
+                character= response["choices"][0]["text"]
+
+                if "<|user|>" in character:
+                    # end of context
+                    return
+
+                if emoji.is_emoji(character):
+                    # Bad emoji not a meaning messes chat from next lines
+                    return
 
-
-            output += character.replace("<|assistant|>","").replace("<|user|>","")
-            yield output
+                output += response["choices"][0]["text"].replace("<|assistant|>","").replace("<|user|>","")
+                yield output
 
     except Exception as e:
         if "Too Many Requests" in str(e):
@@ -697,7 +741,7 @@ EXAMPLES = [
 
 ]
 
-MODELS = ["Mistral 7B Instruct","Zephyr 7B Beta","Yi 6B"]
+MODELS = ["Mistral 7B Instruct","Zephyr 7B Beta"]
 
 OTHER_HTML=f"""<div>
 <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>
@@ -714,7 +758,7 @@ with gr.Blocks(title=title) as demo:
     with gr.Row():
         model_selected = gr.Dropdown(
             label="Select Instuct LLM Model to Use",
-            info="Mistral, Zephyr, Yi : 5-bit GGUF models are preloaded",
+            info="Mistral, Zephyr: Mistral uses inference endpoint, Zephyr is 5 bit GGUF",
             choices=MODELS,
             max_choices=1,
             value=MODELS[0],
@@ -802,9 +846,8 @@ with gr.Blocks(title=title) as demo:
 This Space demonstrates how to speak to a chatbot, based solely on open accessible models.
 It relies on following models :
 Speech to Text : [Whisper-large-v2](https://sanchit-gandhi-whisper-large-v2.hf.space/) as an ASR model, to transcribe recorded audio to text. It is called through a [gradio client](https://www.gradio.app/docs/client).
-LLM Mistral : [Mistral-7b-instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) as the chat model, GGUF Q5_K_M quantized version used locally via llama_cpp[huggingface_hub](TheBloke/Mistral-7B-Instruct-v0.1-GGUF).
+LLM Mistral : [Mistral-7b-instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) as the chat model.
 LLM Zephyr : [Zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) as the chat model. GGUF Q5_K_M quantized version used locally via llama_cpp from [huggingface.co/TheBloke](https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF).
-LLM Yi : [Yi-6B](https://huggingface.co/01-ai/Yi-6B) as the chat model. GGUF Q5_K_M quantized version used locally via llama_cpp from [huggingface.co/TheBloke](https://huggingface.co/TheBloke/Yi-6B-GGUF).
 Text to Speech : [Coqui's XTTS V2](https://huggingface.co/spaces/coqui/xtts) as a Multilingual TTS model, to generate the chatbot answers. This time, the model is hosted locally.
 
 Note:
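The description notes that Whisper-large-v2 is reached through a gradio client rather than loaded in the Space. A rough sketch of that kind of call; the api_name and argument list are assumptions here, so check the Space's "Use via API" page for the real signature:

from gradio_client import Client

# Space URL taken from the description above
asr = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
# hypothetical endpoint and arguments; the actual Space may expect (audio, task) or similar
text = asr.predict("recording.wav", "transcribe", api_name="/predict")
print(text)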
@@ -813,4 +856,4 @@ Note:
 - iOS (Iphone/Ipad) devices may not experience voice due to autoplay being disabled on these devices by Vendor"""
     )
     demo.queue()
-    demo.launch(debug=True)
+    demo.launch(debug=True,share=True)