Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,10 +2,13 @@ import gradio as gr
|
|
2 |
from llama_cpp import Llama
|
3 |
import time
|
4 |
|
5 |
-
# Load your LLaMA model
|
6 |
llm = Llama.from_pretrained(
|
7 |
repo_id="Ahil1991/Bee-V.01-7B",
|
8 |
filename="Bee-V.01.gguf",
|
|
|
|
|
|
|
9 |
)
|
10 |
|
11 |
# Function to generate a "typing" effect by yielding each part of the response
|
@@ -34,7 +37,8 @@ iface = gr.Interface(
|
|
34 |
inputs="text", # Input: text from user
|
35 |
outputs="text", # Output: streamed response as text
|
36 |
title="Chat with Bee 8B Model", # Title for the Gradio app
|
37 |
-
description="Ask anything and get responses from Bee in real-time!"
|
|
|
38 |
)
|
39 |
|
40 |
# Launch the Gradio interface
|
|
|
2 |
from llama_cpp import Llama
|
3 |
import time
|
4 |
|
5 |
+
# Load the Bee GGUF model from the Hugging Face Hub.
#
# Keyword arguments other than repo_id/filename are forwarded to the Llama
# constructor, which accepts and ignores names it does not recognise. Only real
# llama-cpp-python parameters are therefore used here; made-up names such as
# use_gpu / quantize / num_threads would be dropped without any error.
#
# NOTE: quantization is not a load-time option. It is determined by the GGUF
# file that is downloaded; to run a 4-bit or 8-bit model, point `filename` at a
# GGUF that was exported with that quantization.
llm = Llama.from_pretrained(
    repo_id="Ahil1991/Bee-V.01-7B",
    filename="Bee-V.01.gguf",
    n_gpu_layers=-1,  # Offload all layers to the GPU; has no effect on a CPU-only build
    n_threads=4,  # CPU threads used for generation; adjust to the cores available
)
|
13 |
|
14 |
# Function to generate a "typing" effect by yielding each part of the response
|
|
|
37 |
inputs="text", # Input: text from user
|
38 |
outputs="text", # Output: streamed response as text
|
39 |
title="Chat with Bee 8B Model", # Title for the Gradio app
|
40 |
+
description="Ask anything and get responses from Bee in real-time!",
|
41 |
+
live=False # Set to False to only process input when the user hits Submit
|
42 |
)
|
43 |
|
44 |
# Launch the Gradio interface
|