Spaces:

Ahil1991
/

Bee_8B_HF_Space

Sleeping

Ahil1991 committed on Sep 15, 2024

Commit

97b6124

verified ·

1 Parent(s): 127d2d3

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -2,10 +2,13 @@ import gradio as gr
 from llama_cpp import Llama
 import time
-# Load your LLaMA model
 llm = Llama.from_pretrained(
     repo_id="Ahil1991/Bee-V.01-7B",
     filename="Bee-V.01.gguf",
 )
 # Function to generate a "typing" effect by yielding each part of the response
@@ -34,7 +37,8 @@ iface = gr.Interface(
     inputs="text",                   # Input: text from user
     outputs="text",                  # Output: streamed response as text
     title="Chat with Bee 8B Model",  # Title for the Gradio app
-    description="Ask anything and get responses from Bee in real-time!"
 )
 # Launch the Gradio interface

 from llama_cpp import Llama
 import time
+# Load your LLaMA model with GPU support, quantization, and multi-threading
 llm = Llama.from_pretrained(
     repo_id="Ahil1991/Bee-V.01-7B",
     filename="Bee-V.01.gguf",
+    use_gpu=True,  # Enable GPU if available
+    quantize="4bit",  # Quantization for speed (4-bit or 8-bit, adjust based on needs)
+    num_threads=4  # Adjust based on CPU cores available (only for CPU use)
 )
 # Function to generate a "typing" effect by yielding each part of the response
     inputs="text",                   # Input: text from user
     outputs="text",                  # Output: streamed response as text
     title="Chat with Bee 8B Model",  # Title for the Gradio app
+    description="Ask anything and get responses from Bee in real-time!",
+    live=False  # Set to False to only process input when the user hits Submit
 )
 # Launch the Gradio interface