import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Hugging Face repository that hosts the fine-tuned GGUF weights.
model_repo = "ID2223-Lab/llama_lora_merged_GGUF"

# Context window handed to llama.cpp and the cap on tokens generated per reply.
# Both are needed together to decide how much chat history fits in a prompt.
N_CTX = 2048
MAX_NEW_TOKENS = 256

# Download the GGUF file from Hugging Face; the call returns its local path.
model_path = hf_hub_download(repo_id=model_repo, filename="FineTune_Llama.gguf")

# Load the GGUF model using llama-cpp-python.
print("Loading model...")
llm = Llama(model_path=model_path, n_ctx=N_CTX, n_threads=8)  # Adjust threads as needed
print("Model loaded!")


def _build_prompt(turns, user_input):
    """Render earlier turns plus the new message as a "User:/AI:" transcript."""
    parts = [f"User: {user}\nAI: {ai}\n" for user, ai in turns]
    parts.append(f"User: {user_input}\nAI:")  # Leave the final AI turn open
    return "".join(parts)


def chat_with_model(user_input, chat_history):
    """
    Process user input and generate a response from the model.

    :param user_input: User's input string
    :param chat_history: List of (user_message, ai_response) pairs; it is
        extended in place with the new turn
    :return: The updated chat history twice, once for the Chatbot display and
        once for the State component
    """
    # Nothing to send: skip generation and leave the conversation untouched.
    if not user_input or not user_input.strip():
        return chat_history, chat_history

    # Drop the oldest turns from the *prompt* (not from the visible history)
    # until the prompt plus the reply budget fits in the context window;
    # otherwise llama.cpp rejects the request once the chat grows long.
    turns = list(chat_history)
    prompt = _build_prompt(turns, user_input)
    while turns and len(llm.tokenize(prompt.encode("utf-8"))) + MAX_NEW_TOKENS > N_CTX:
        turns.pop(0)
        prompt = _build_prompt(turns, user_input)

    # max_tokens is set explicitly because the library default (16) truncates
    # replies; the stop sequence ends generation before the model starts
    # writing the next "User:" turn itself.
    completion = llm(prompt, max_tokens=MAX_NEW_TOKENS, stop=["User:"])
    response = completion["choices"][0]["text"].strip()

    # Update chat history with the new turn.
    chat_history.append((user_input, response))
    return chat_history, chat_history


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🦙 LLaMA Chatbot finetune with FineTome-100k")

    chatbot = gr.Chatbot(label="Chat with the Model")

    with gr.Row():
        with gr.Column(scale=4):
            user_input = gr.Textbox(label="Your Message", placeholder="Type a message...")
        with gr.Column(scale=1):
            submit_btn = gr.Button("Send")

    # Per-session conversation state, passed into and returned from the handler.
    chat_history = gr.State([])

    # Link components
    submit_btn.click(
        chat_with_model,
        inputs=[user_input, chat_history],
        outputs=[chatbot, chat_history],
        show_progress=True,
    )

# Launch the Gradio app
demo.launch()