import gradio as gr
from llama_cpp import Llama
import time
# Load the LLaMA model with GPU offload and multi-threading.
# Quantization is already baked into the GGUF file, so no extra flag is needed.
llm = Llama.from_pretrained(
    repo_id="Ahil1991/Bee-8.3B",
    filename="Bee 8.3B.gguf",
    n_gpu_layers=-1,  # Offload all layers to the GPU if one is available
    n_threads=4       # Adjust based on the CPU cores available (CPU inference only)
)
# Function to generate a "typing" effect by yielding each part of the response
def chat_with_model(user_input):
    messages = [
        {
            "role": "user",
            "content": user_input
        }
    ]

    # Get the full response from the model
    response = llm.create_chat_completion(messages=messages)

    # Extract the full content from the response
    full_response = response['choices'][0]['message']['content']

    # Stream the response as if it were being typed in real time
    for i in range(1, len(full_response) + 1):
        yield full_response[:i]  # Yield progressively larger chunks of the response
        time.sleep(0.05)         # Simulate typing speed (adjust as needed)
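
# Optional alternative (untested sketch, assuming llama-cpp-python's OpenAI-style
# streaming API): create_chat_completion(..., stream=True) yields chunks whose
# "delta" carries the newly generated tokens, so the reply can be streamed as it
# is produced instead of simulating typing after the full response is ready.
#
# def chat_with_model_streaming(user_input):
#     messages = [{"role": "user", "content": user_input}]
#     partial = ""
#     for chunk in llm.create_chat_completion(messages=messages, stream=True):
#         delta = chunk["choices"][0]["delta"]
#         partial += delta.get("content", "")
#         yield partial  # Gradio renders each progressively longer string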

# Create a Gradio interface that streams the response
iface = gr.Interface(
    fn=chat_with_model,              # The generator function that handles input
    inputs="text",                   # Input: text from the user
    outputs="text",                  # Output: streamed response as text
    title="Chat with Bee 8B Model",  # Title for the Gradio app
    description="Ask anything and get responses from Bee in real-time!",
    live=False                       # Only process input when the user hits Submit
)
# Launch the Gradio interface
iface.launch()