import time

import gradio as gr
from llama_cpp import Llama

# Load the model. Quantization is baked into the GGUF file itself, so there is
# no runtime "quantize" flag; GPU offloading and threading are configured with
# n_gpu_layers and n_threads (extra kwargs are forwarded to Llama.__init__).
llm = Llama.from_pretrained(
    repo_id="Ahil1991/Bee-8.3B",
    filename="Bee 8.3B.gguf",
    n_gpu_layers=-1,  # Offload all layers to the GPU (needs a GPU-enabled build; use 0 for CPU-only)
    n_threads=4,      # CPU threads; adjust to the cores available
)

# Generate a "typing" effect by yielding progressively longer slices of the reply
def chat_with_model(user_input):
    messages = [{"role": "user", "content": user_input}]

    # Get the complete response from the model
    response = llm.create_chat_completion(messages=messages)
    full_response = response["choices"][0]["message"]["content"]

    # Stream the response as if it were being typed in real time
    for i in range(1, len(full_response) + 1):
        yield full_response[:i]  # Yield a progressively larger prefix
        time.sleep(0.05)         # Simulated typing speed (adjust as needed)

# Create a Gradio interface that streams the generator's output
iface = gr.Interface(
    fn=chat_with_model,  # Generator function that handles the input
    inputs="text",       # Input: text from the user
    outputs="text",      # Output: streamed response text
    title="Chat with Bee 8.3B Model",
    description="Ask anything and get responses from Bee in real time!",
    live=False,          # Only run when the user hits Submit
)

# Launch the Gradio interface
iface.launch()
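
# A minimal alternative sketch: stream tokens straight from the model instead of
# simulating typing. llama-cpp-python's create_chat_completion accepts
# stream=True and then yields OpenAI-style chunks whose "delta" dict may carry
# new text. To try it, define this function above the gr.Interface call and
# pass fn=chat_with_model_streaming instead of fn=chat_with_model.
def chat_with_model_streaming(user_input):
    messages = [{"role": "user", "content": user_input}]
    partial = ""
    for chunk in llm.create_chat_completion(messages=messages, stream=True):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            partial += delta["content"]
            yield partial  # Gradio re-renders the output box with each yield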