import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from huggingface_hub import hf_hub_download

# Hugging Face repository IDs and GGUF filenames
base_model_repo = "unsloth/Llama-3.2-3B-Instruct-GGUF"
base_model_file = "Llama-3.2-3B-Instruct-Q8_0.gguf"
adapter_repo = "Mat17892/llama_lora_gguf"
adapter_file = "llama_lora_adapter.gguf"

# Download the GGUF files (cached locally; also reused by the
# llama-cpp-python alternative sketched at the bottom of this file)
print("Downloading base model...")
base_model_path = hf_hub_download(repo_id=base_model_repo, filename=base_model_file)

print("Downloading LoRA adapter...")
lora_adapter_path = hf_hub_download(repo_id=adapter_repo, filename=adapter_file)

# Load the tokenizer and base model.
# from_pretrained() cannot take a bare .gguf file path; for GGUF checkpoints
# transformers expects the repo id plus a `gguf_file` argument, and it
# dequantizes the weights to full precision on load.
print("Loading base model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_repo, gguf_file=base_model_file)
base_model = AutoModelForCausalLM.from_pretrained(base_model_repo, gguf_file=base_model_file)

# Load the LoRA adapter.
# NOTE: PEFT cannot read GGUF adapter files, so we load from the repo id and
# assume it also hosts the adapter in PEFT format (adapter_config.json plus
# adapter weights). If the repo only ships the .gguf adapter, use the
# llama-cpp-python route sketched at the bottom of this file instead.
print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(base_model, adapter_repo)

print("Model is ready!")

# Function for inference
def chat_with_model(user_input, chat_history):
    """
    Generate a response from the model using the chat history and user input.
    """
    # Prepare the prompt. (For an instruct-tuned model,
    # tokenizer.apply_chat_template would match the training format more
    # closely; this simple User:/AI: transcript keeps the example minimal.)
    prompt = ""
    for user, ai in chat_history:
        prompt += f"User: {user}\nAI: {ai}\n"
    prompt += f"User: {user_input}\nAI:"  # Add the latest user input

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt")

    # Generate a response, then decode only the newly generated tokens --
    # decoding outputs[0] in full would echo the entire prompt back.
    outputs = model.generate(**inputs, max_new_tokens=200, pad_token_id=tokenizer.eos_token_id)
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    # The model may keep writing imaginary "User:" turns; cut them off.
    response = response.split("User:")[0].strip()

    # Update chat history
    chat_history.append((user_input, response))
    return chat_history, chat_history

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🦙 LLaMA Chatbot with Base Model and LoRA Adapter")
    chatbot = gr.Chatbot(label="Chat with the Model")
    with gr.Row():
        with gr.Column(scale=4):
            user_input = gr.Textbox(label="Your Message", placeholder="Type a message...")
        with gr.Column(scale=1):
            submit_btn = gr.Button("Send")
    chat_history = gr.State([])

    # Link components
    submit_btn.click(
        chat_with_model,
        inputs=[user_input, chat_history],
        outputs=[chatbot, chat_history],
        show_progress=True,
    )

# Launch the Gradio app
demo.launch()
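
# --- Alternative: apply the GGUF LoRA directly with llama-cpp-python ---
# Both files downloaded above are GGUF, so they can be used as-is with
# llama-cpp-python, which applies a GGUF LoRA adapter at load time via its
# `lora_path` argument. A minimal sketch, assuming
# `pip install llama-cpp-python` (parameter values here are illustrative,
# not tuned, and this is untested against these exact files):
#
#   from llama_cpp import Llama
#
#   llm = Llama(
#       model_path=base_model_path,    # GGUF base model downloaded above
#       lora_path=lora_adapter_path,   # GGUF LoRA adapter downloaded above
#       n_ctx=4096,                    # context window size
#   )
#   result = llm.create_chat_completion(
#       messages=[{"role": "user", "content": "Hello!"}],
#       max_tokens=200,
#   )
#   print(result["choices"][0]["message"]["content"])
#
# This route keeps the Q8_0 weights quantized instead of dequantizing them,
# so it needs far less memory than the transformers path above.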