import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Model identifier from Hugging Face
model_repo = "ID2223-Lab/llama_lora_merged_GGUF"  # Hugging Face model ID

# Download the GGUF file from Hugging Face
model_path = hf_hub_download(repo_id=model_repo, filename="FineTune_Llama.gguf")
# Load the GGUF model using llama-cpp-python
print("Loading model...")
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=8)  # Adjust threads as needed
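# Optional (assumes GPU hardware and a GPU-enabled llama-cpp-python build):
# layers can be offloaded to the GPU, e.g. n_gpu_layers=-1 to offload all of them:
# llm = Llama(model_path=model_path, n_ctx=2048, n_threads=8, n_gpu_layers=-1)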
print("Model loaded!") | |
# Function for inference
def chat_with_model(user_input, chat_history):
    """
    Process user input and generate a response from the model.

    :param user_input: User's input string
    :param chat_history: List of (user_message, ai_response) pairs
    :return: Updated chat history, twice (once for the Chatbot display, once for the State)
    """
    # Construct the prompt from the chat history
    prompt = ""
    for user, ai in chat_history:
        prompt += f"User: {user}\nAI: {ai}\n"
    prompt += f"User: {user_input}\nAI:"  # Add the latest user input

    # Generate a response; llama-cpp's default max_tokens (16) would truncate replies,
    # and stop=["User:"] keeps the model from writing the user's next turn itself
    raw_response = llm(prompt, max_tokens=256, stop=["User:"])["choices"][0]["text"].strip()

    # Clean the response (drop any leaked "User:" continuation, just in case)
    response = raw_response.split("User:")[0].strip()

    # Update chat history with the new turn
    chat_history.append((user_input, response))
    return chat_history, chat_history
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🦙 LLaMA Chatbot fine-tuned on FineTome-100k")
    chatbot = gr.Chatbot(label="Chat with the Model")
    with gr.Row():
        with gr.Column(scale=4):
            user_input = gr.Textbox(label="Your Message", placeholder="Type a message...")
        with gr.Column(scale=1):
            submit_btn = gr.Button("Send")
    chat_history = gr.State([])
    # Link components
    submit_btn.click(
        chat_with_model,
        inputs=[user_input, chat_history],
        outputs=[chatbot, chat_history],
        show_progress=True,
    )
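    # Optional convenience, not in the original wiring: pressing Enter in the
    # textbox triggers the same handler as the Send button
    user_input.submit(
        chat_with_model,
        inputs=[user_input, chat_history],
        outputs=[chatbot, chat_history],
        show_progress=True,
    )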
# Launch the Gradio app
demo.launch()