import time

import gradio as gr
from llama_cpp import Llama

# Load the model. Quantization is baked into the GGUF file itself, so there is
# no runtime "quantize" flag; GPU offloading and threading are configured with
# n_gpu_layers and n_threads (extra kwargs are forwarded to Llama.__init__).
llm = Llama.from_pretrained(
    repo_id="Ahil1991/Bee-8.3B",
    filename="Bee 8.3B.gguf",
    n_gpu_layers=-1,  # Offload all layers to the GPU (needs a GPU-enabled build; use 0 for CPU-only)
    n_threads=4,      # CPU threads; adjust to the cores available
)

# Generate a "typing" effect by yielding progressively longer slices of the reply
def chat_with_model(user_input):
    messages = [{"role": "user", "content": user_input}]

    # Get the complete response from the model
    response = llm.create_chat_completion(messages=messages)
    full_response = response["choices"][0]["message"]["content"]

    # Stream the response as if it were being typed in real time
    for i in range(1, len(full_response) + 1):
        yield full_response[:i]  # Yield a progressively larger prefix
        time.sleep(0.05)         # Simulated typing speed (adjust as needed)

# Create a Gradio interface that streams the generator's output
iface = gr.Interface(
    fn=chat_with_model,  # Generator function that handles the input
    inputs="text",       # Input: text from the user
    outputs="text",      # Output: streamed response text
    title="Chat with Bee 8.3B Model",
    description="Ask anything and get responses from Bee in real time!",
    live=False,          # Only run when the user hits Submit
)

# Launch the Gradio interface
iface.launch()
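
# A minimal alternative sketch: stream tokens straight from the model instead of
# simulating typing. llama-cpp-python's create_chat_completion accepts
# stream=True and then yields OpenAI-style chunks whose "delta" dict may carry
# new text. To try it, define this function above the gr.Interface call and
# pass fn=chat_with_model_streaming instead of fn=chat_with_model.
def chat_with_model_streaming(user_input):
    messages = [{"role": "user", "content": user_input}]
    partial = ""
    for chunk in llm.create_chat_completion(messages=messages, stream=True):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            partial += delta["content"]
            yield partial  # Gradio re-renders the output box with each yield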