import gradio as gr
import os
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Force CPU-only execution
os.environ["LLAMA_CPP_USE_CUDA"] = "0"  # Ensure CUDA is disabled

# Application title and description
title = "Mistral-7B-Instruct-GGUF Run On CPU"
description = """
🔎 [Mistral AI's Mistral 7B Instruct v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) [GGUF format model](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF), a 4-bit quantized (balanced-quality) GGUF version, running on CPU. English only (other languages are supported, but with reduced quality). Using [GitHub - llama.cpp](https://github.com/ggerganov/llama.cpp).
"""

# Model setup
model_dir = "models"
model_name = "unsloth.Q4_K_M.gguf"
model_path = os.path.join(model_dir, model_name)

# Download the model if not already present
hf_hub_download(
    repo_id="vislupus/bulgarian-joke-master-gemma-2-2b-it-bnb-4bit-gguf",
    filename=model_name,
    local_dir=model_dir,
)

# Check that the model file exists
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model file not found at {model_path}")

# Load the model using llama_cpp
print("Loading the model...")
llm = Llama(model_path=model_path)
print("Model loaded successfully!")


# Define the function to generate responses
def generate_response(prompt, temperature=0.7, top_p=1.0, max_tokens=256):
    """
    Generate a response from the model.

    Args:
        prompt (str): The user's input prompt.
        temperature (float): Sampling temperature.
        top_p (float): Top-p sampling parameter.
        max_tokens (int): Maximum number of tokens to generate.

    Returns:
        str: The model's response.
    """
    try:
        response = llm(prompt, max_tokens=max_tokens, temperature=temperature, top_p=top_p)
        return response["choices"][0]["text"].strip()
    except Exception as e:
        return f"Error generating response: {e}"


# Set up the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 🦙 Llama GGUF Model Chatbot")
    gr.Markdown(description)

    # Input box for the user prompt
    prompt_input = gr.Textbox(label="Your Prompt", placeholder="Type your message here...", lines=5)

    # Advanced settings
    with gr.Accordion("Advanced Settings", open=False):
        temperature = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=1.0, step=0.01, label="Top-p")
        max_tokens = gr.Slider(16, 512, value=256, step=16, label="Max Tokens")

    # Output box for the model's response
    response_output = gr.Textbox(label="Model Response", placeholder="The model's response will appear here...", lines=10)

    # Generate button
    generate_button = gr.Button("Generate Response")

    # Connect inputs and outputs
    generate_button.click(
        generate_response,
        inputs=[prompt_input, temperature, top_p, max_tokens],
        outputs=[response_output],
    )

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()
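
# --- Optional CPU tuning (a minimal sketch, not part of the app above) ---
# llama-cpp-python exposes constructor parameters that often matter on
# CPU-only hosts. The values below are assumptions, not tuned for this model:
#
#     llm = Llama(
#         model_path=model_path,
#         n_ctx=2048,                 # context window; smaller values use less RAM
#         n_threads=os.cpu_count(),   # number of CPU threads used for generation
#         n_gpu_layers=0,             # keep all layers on the CPU
#     )
#
# For token-by-token output in the UI, calling llm(prompt, ..., stream=True)
# returns an iterator of chunks whose text is in chunk["choices"][0]["text"].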