import gradio as gr
from llama_cpp import Llama
import time

# Load the LLaMA model from the Hugging Face Hub.
# Note: llama-cpp-python has no use_gpu/quantize/num_threads arguments;
# quantization is already baked into the GGUF file itself. GPU offload and
# CPU threading are controlled with n_gpu_layers and n_threads instead.
llm = Llama.from_pretrained(
    repo_id="Ahil1991/Bee-8.3B",
    filename="Bee 8.3B.gguf",
    n_gpu_layers=-1,  # Offload all layers to the GPU if one is available
    n_threads=4,      # CPU threads used for any layers left on the CPU
)
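# --- Optional smoke test (a sketch, not part of the original app) ---
# Confirms the model loaded and can answer before wiring up the UI:
#
#   test = llm.create_chat_completion(
#       messages=[{"role": "user", "content": "Say hello in one sentence."}]
#   )
#   print(test["choices"][0]["message"]["content"])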
# Generate a "typing" effect by yielding progressively longer prefixes of the response
def chat_with_model(user_input):
    messages = [
        {
            "role": "user",
            "content": user_input
        }
    ]

    # Get the complete response from the model
    response = llm.create_chat_completion(messages=messages)

    # Extract the full content from the response
    full_response = response['choices'][0]['message']['content']

    # Stream the response back as if it were being typed in real time
    for i in range(1, len(full_response) + 1):
        yield full_response[:i]  # Yield a progressively longer prefix
        time.sleep(0.05)         # Simulated typing speed (adjust as needed)
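# --- Optional variant (a sketch, not in the original app) ---
# llama-cpp-python can stream tokens directly by passing stream=True to
# create_chat_completion; each chunk carries an OpenAI-style "delta".
# This shows output as it is generated instead of simulating typing after
# the full completion has finished.
def chat_with_model_streaming(user_input):
    messages = [{"role": "user", "content": user_input}]
    partial = ""
    for chunk in llm.create_chat_completion(messages=messages, stream=True):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            partial += delta["content"]
            yield partial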
# Create a Gradio interface that streams the generator's output
iface = gr.Interface(
    fn=chat_with_model,              # Generator function that handles the input
    inputs="text",                   # Input: text from the user
    outputs="text",                  # Output: streamed response as text
    title="Chat with Bee 8B Model",  # Title of the Gradio app
    description="Ask anything and get responses from Bee in real-time!",
    live=False                       # Only run when the user hits Submit
)
# Launch the Gradio interface
iface.launch()
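# --- Example client call (a sketch; assumes the app is running locally) ---
# gr.Interface exposes its endpoint as "/predict" by default, so the app
# can be queried programmatically with gradio_client:
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860")
#   print(client.predict("Hello, Bee!", api_name="/predict"))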