Ahil1991 committed on
Commit
97b6124
·
verified ·
1 Parent(s): 127d2d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -2
app.py CHANGED
@@ -2,10 +2,13 @@ import gradio as gr
2
  from llama_cpp import Llama
3
  import time
4
 
5
- # Load your LLaMA model
6
  llm = Llama.from_pretrained(
7
  repo_id="Ahil1991/Bee-V.01-7B",
8
  filename="Bee-V.01.gguf",
 
 
 
9
  )
10
 
11
  # Function to generate a "typing" effect by yielding each part of the response
@@ -34,7 +37,8 @@ iface = gr.Interface(
34
  inputs="text", # Input: text from user
35
  outputs="text", # Output: streamed response as text
36
  title="Chat with Bee 8B Model", # Title for the Gradio app
37
- description="Ask anything and get responses from Bee in real-time!"
 
38
  )
39
 
40
  # Launch the Gradio interface
 
2
  from llama_cpp import Llama
3
  import time
4
 
5
+ # Load your LLaMA model with GPU support, quantization, and multi-threading
6
  llm = Llama.from_pretrained(
7
  repo_id="Ahil1991/Bee-V.01-7B",
8
  filename="Bee-V.01.gguf",
9
+ use_gpu=True, # Enable GPU if available
10
+ quantize="4bit", # Quantization for speed (4-bit or 8-bit, adjust based on needs)
11
+ num_threads=4 # Adjust based on CPU cores available (only for CPU use)
12
  )
13
 
14
  # Function to generate a "typing" effect by yielding each part of the response
 
37
  inputs="text", # Input: text from user
38
  outputs="text", # Output: streamed response as text
39
  title="Chat with Bee 8B Model", # Title for the Gradio app
40
+ description="Ask anything and get responses from Bee in real-time!",
41
+ live=False # Set to False to only process input when the user hits Submit
42
  )
43
 
44
  # Launch the Gradio interface