# ruslanmv's picture
# Update app.py
# 3ecaff4 verified
from huggingface_hub import hf_hub_download
import logging
import sys
import gradio as gr
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import (
messages_to_prompt,
completion_to_prompt,
)
def download(model):
    """Fetch the weights registered for ``model`` and return the download result.

    The repository id and file name are looked up in the module-level
    ``model_info`` table, and ``MODELS_PATH`` is passed to
    ``hf_hub_download`` as the cache directory.

    Args:
        model: A key of ``model_info``.

    Returns:
        Whatever ``hf_hub_download`` returns (the path of the local file).

    Raises:
        KeyError: If ``model`` is not a key of ``model_info``.
    """
    entry = model_info[model]
    return hf_hub_download(
        repo_id=entry["repo_id"],
        filename=entry["filename"],
        resume_download=True,
        cache_dir=MODELS_PATH,
    )
# Directory handed to hf_hub_download as its cache location.
MODELS_PATH = "./models"

# Registry of selectable models: display name -> repository id and file name.
# Falcon-7B-Instruct (TheBloke/Falcon-7B-Instruct-GGML,
# falcon-7b-instruct.ggccv1.q4_1.bin) is currently disabled.
model_info = {
    display_name: {"repo_id": repo_id, "filename": filename}
    for display_name, repo_id, filename in (
        (
            "Llama-2-13B-chat",
            "TheBloke/Llama-2-13B-chat-GGUF",
            "llama-2-13b-chat.Q4_K_M.gguf",
        ),
        (
            "Mistral-7B-Instruct-v0.2",
            "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
            "mistral-7b-instruct-v0.2.Q4_K_M.gguf",
        ),
        (
            "zephyr-7B-beta",
            "TheBloke/zephyr-7B-beta-GGUF",
            "zephyr-7b-beta.Q4_K_M.gguf",
        ),
        (
            "vicuna-7B-v1.5",
            "TheBloke/vicuna-7B-v1.5-GGUF",
            "vicuna-7b-v1.5.Q4_K_M.gguf",
        ),
        (
            "CodeLlama-7B",
            "TheBloke/CodeLlama-7B-GGUF",
            "codellama-7b.Q4_K_M.gguf",
        ),
    )
}

# Display names in registry order.
models = list(model_info)
# Fetch every registered model at startup.
# NOTE(review): presumably done so that switching models in the UI later does
# not have to wait for a download - confirm this eager fetch is intended.
for model_name in models:
    download(model_name)

# Path of the model that is loaded by default. Going through download() takes
# the repository id and file name from model_info instead of repeating them
# here, so the two cannot drift apart.
mistral_model_path = download("Mistral-7B-Instruct-v0.2")

# Default chat model, built once when the module is loaded; predict() reads it
# through the module-level name ``llm``.
# NOTE(review): the earlier note on this call asked for n_gpu_layers of at
# least 1 when a GPU is used, but the value passed is -1. That is presumably
# llama.cpp's "offload every layer" setting - confirm against the
# llama-cpp-python documentation and adjust for the target hardware.
llm = LlamaCPP(
    # Pre-downloaded local file; a ``model_url`` could be passed instead to
    # have the model downloaded automatically.
    model_path=mistral_model_path,
    temperature=0.1,
    max_new_tokens=256,
    # Kept below 4096 (the Llama 2 context window this figure was chosen
    # against) to leave some headroom.
    context_window=3900,
    # Extra keyword arguments forwarded to __call__().
    generate_kwargs={},
    # Extra keyword arguments forwarded to __init__().
    model_kwargs={"n_gpu_layers": -1},
    # Helpers that transform inputs into the Llama 2 prompt format.
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
def model_initialization(model):
    """Switch the chat backend to ``model`` and return a status message.

    Wired to the model dropdown's ``change`` event; the returned string is
    written to the "Model Status" textbox.

    Args:
        model: Display name selected in the dropdown. It must be a key of the
            module-level ``model_info`` registry.

    Returns:
        A human-readable status string: a confirmation naming the new model,
        or the "please select" hint when the selection is empty or unknown
        (in which case the currently loaded model is left untouched).
    """
    # predict() reads the module-level ``llm``. Without this declaration the
    # assignment below would only create a local and the chat would keep
    # answering with the previously loaded model.
    global llm

    # Stop here for an empty or unknown selection instead of attempting a
    # download with an empty repository id.
    if not isinstance(model, str) or model not in model_info:
        gr.Warning("please select at least one model")
        return "please select at least one model"

    gr.Info("model downloading and configuration process has been started, please wait...")

    # Resolve repository and file name through the shared registry so the
    # names used here cannot diverge from the ones fetched at startup.
    model_path = download(model)

    llm = LlamaCPP(
        model_path=model_path,
        temperature=0.1,
        max_new_tokens=256,
        # Kept below 4096 to leave some headroom.
        context_window=3900,
        # Extra keyword arguments forwarded to __call__().
        generate_kwargs={},
        # Extra keyword arguments forwarded to __init__().
        model_kwargs={"n_gpu_layers": -1},
        # Helpers that transform inputs into the Llama 2 prompt format.
        messages_to_prompt=messages_to_prompt,
        completion_to_prompt=completion_to_prompt,
        verbose=True,
    )

    gr.Info("model has been configured and ready to chat")
    return "model has been configured and ready to chat, your current model is " + model
def predict(message, history):
    """Stream the model's completion of ``message`` for the chat UI.

    Args:
        message: The text the user just sent.
        history: Earlier turns of the conversation. It is part of the
            signature but not used: every message is completed on its own.

    Yields:
        The reply accumulated so far, once per streamed chunk, so the caller
        can show the answer growing as it is generated.
    """
    partial_reply = ""
    for chunk in llm.stream_complete(message):
        # Each delta is appended as-is: joining the pieces with a separator
        # would insert a space between every streamed fragment.
        # A missing delta is treated as an empty piece.
        partial_reply += chunk.delta or ""
        yield partial_reply
with gr.Blocks() as UI:
    # NOTE: from this point on ``models`` refers to the dropdown component,
    # no longer to the module-level list of model names.
    models = gr.Dropdown(
        [
            "CodeLlama-7B",
            "Llama-2-13B-chat",
            "Mistral-7B-Instruct-v0.2",
            "zephyr-7B-beta",
            "vicuna-7B-v1.5",
        ],
        # The dropdown is single-select and model_initialization compares the
        # value against one name, so preselect the model that is loaded at
        # startup rather than passing the whole list of choices as the value.
        value="Mistral-7B-Instruct-v0.2",
        label="please select at least one model",
        info="default model is Mistral-7B-Instruct-v0.2",
    )
    # Status line, updated with the string returned by model_initialization.
    textInfo = gr.Textbox(
        value="current model is Mistral-7B-Instruct-v0.2",
        label="Model Status",
    )

    # Chatbot interface
    chatUI = gr.ChatInterface(
        predict,
        title="Open Source LLM ChatBot",
        description="Ask any question",
        theme="soft",
        examples=[
            "Hello",
            "are you LLM model?",
            "how can i finetune a pre-trained LLM model?",
            "How can i build a chatbot using local open-souce LLM ?",
        ],
        cache_examples=False,
        submit_btn="Send Message",
        retry_btn=None,
        undo_btn="Delete Previous",
        clear_btn="Clear",
    )

    # Load the newly selected model whenever the dropdown changes.
    models.change(fn=model_initialization, inputs=[models], outputs=[textInfo])
# Launch the Gradio app only when this file is executed directly.
if __name__ == "__main__":
    UI.launch(debug=True)