# Bee_8B_HF_Space / app.py
# Ahil1991's picture
# Update app.py
# 4bf76fd verified
import gradio as gr
from llama_cpp import Llama
import time
# Load your LLaMA model with GPU support, quantization, and multi-threading
# Load the GGUF model from the Hugging Face Hub.
# NOTE: the previous kwargs (`use_gpu`, `quantize`, `num_threads`) are not
# llama-cpp-python parameters — they were silently ignored. The real knobs
# are `n_gpu_layers` (GPU offload) and `n_threads` (CPU threads); the
# quantization level is baked into the .gguf file itself and cannot be
# changed at load time.
llm = Llama.from_pretrained(
    repo_id="Ahil1991/Bee-8.3B",
    filename="Bee 8.3B.gguf",
    n_gpu_layers=-1,  # offload all layers to GPU when one is available; 0 = CPU only
    n_threads=4,      # CPU threads for generation — tune to available cores
)
# Function to generate a "typing" effect by yielding each part of the response
def chat_with_model(user_input):
    """Stream the model's reply to a single user message.

    Args:
        user_input: The user's message as plain text.

    Yields:
        str: The reply accumulated so far, growing as each new token is
        generated, so a Gradio streaming output renders it in real time.
    """
    messages = [{"role": "user", "content": user_input}]

    # Stream tokens directly from the model instead of waiting for the full
    # completion and replaying it character-by-character with time.sleep
    # (the old approach added ~50 ms of artificial latency per character
    # on top of the full generation time).
    partial = ""
    for chunk in llm.create_chat_completion(messages=messages, stream=True):
        delta = chunk["choices"][0].get("delta", {})
        token = delta.get("content")
        if token:  # first/last chunks may carry only role metadata
            partial += token
            yield partial
# Create a Gradio interface that streams the response
# Assemble the Gradio front-end: one text box in, one streamed text box out.
# Because the handler is a generator, Gradio updates the output with every
# value it yields.
ui_config = dict(
    fn=chat_with_model,
    inputs="text",
    outputs="text",
    title="Chat with Bee 8B Model",
    description="Ask anything and get responses from Bee in real-time!",
    live=False,  # run only on Submit, not on every keystroke
)
iface = gr.Interface(**ui_config)

# Start the web server for the Space.
iface.launch()