import gradio as gr
from llama_cpp import Llama
import time

# Load the LLaMA model from the Hugging Face Hub.
# Note: llama-cpp-python has no use_gpu/quantize/num_threads arguments;
# quantization is already baked into the GGUF file itself. GPU offload and
# CPU threading are controlled with n_gpu_layers and n_threads instead.
llm = Llama.from_pretrained(
    repo_id="Ahil1991/Bee-8.3B",
    filename="Bee 8.3B.gguf",
    n_gpu_layers=-1,  # Offload all layers to the GPU if one is available
    n_threads=4,      # CPU threads used for any layers left on the CPU
)
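# --- Optional smoke test (a sketch, not part of the original app) ---
# Confirms the model loaded and can answer before wiring up the UI:
#
#   test = llm.create_chat_completion(
#       messages=[{"role": "user", "content": "Say hello in one sentence."}]
#   )
#   print(test["choices"][0]["message"]["content"])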
# Generate a "typing" effect by yielding progressively longer prefixes of the response
def chat_with_model(user_input):
    messages = [
        {
            "role": "user",
            "content": user_input
        }
    ]

    # Get the complete response from the model
    response = llm.create_chat_completion(messages=messages)

    # Extract the full content from the response
    full_response = response['choices'][0]['message']['content']

    # Stream the response back as if it were being typed in real time
    for i in range(1, len(full_response) + 1):
        yield full_response[:i]  # Yield a progressively longer prefix
        time.sleep(0.05)         # Simulated typing speed (adjust as needed)
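# --- Optional variant (a sketch, not in the original app) ---
# llama-cpp-python can stream tokens directly by passing stream=True to
# create_chat_completion; each chunk carries an OpenAI-style "delta".
# This shows output as it is generated instead of simulating typing after
# the full completion has finished.
def chat_with_model_streaming(user_input):
    messages = [{"role": "user", "content": user_input}]
    partial = ""
    for chunk in llm.create_chat_completion(messages=messages, stream=True):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            partial += delta["content"]
            yield partial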
# Create a Gradio interface that streams the generator's output
iface = gr.Interface(
    fn=chat_with_model,              # Generator function that handles the input
    inputs="text",                   # Input: text from the user
    outputs="text",                  # Output: streamed response as text
    title="Chat with Bee 8B Model",  # Title of the Gradio app
    description="Ask anything and get responses from Bee in real-time!",
    live=False                       # Only run when the user hits Submit
)
# Launch the Gradio interface
iface.launch()
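# --- Example client call (a sketch; assumes the app is running locally) ---
# gr.Interface exposes its endpoint as "/predict" by default, so the app
# can be queried programmatically with gradio_client:
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860")
#   print(client.predict("Hello, Bee!", api_name="/predict"))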