Update app.py
app.py CHANGED
@@ -1,18 +1,15 @@
 import gradio as gr
-from gpt4all import GPT4All
-from huggingface_hub import hf_hub_download
 import os
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download

 # Force CPU-only execution
-os.environ["LLAMA_CPP_USE_CUDA"] = "0"
-os.environ["FORCE_CPU"] = "1"
-
-title = "Mistral-7B-Instruct-GGUF Run On CPU-Basic Free Hardware"
+os.environ["LLAMA_CPP_USE_CUDA"] = "0"  # Ensure CUDA is disabled

+# Application title and description
+title = "Mistral-7B-Instruct-GGUF Run On CPU"
 description = """
-[Mistral AI's Mistral 7B Instruct v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) [GGUF format model](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF), 4-bit quantization balanced quality GGUF version, running on CPU. English Only (supports other languages but with reduced quality). Using [GitHub - llama.cpp](https://github.com/ggerganov/llama.cpp)
-Running on CPU-Basic free hardware. Suggest duplicating this space to run without a queue.
-Mistral does not support system prompt symbols (such as `<<SYS>>`) now, input your system prompt in the first message if needed. Learn more: [Guardrailing Mistral 7B](https://docs.mistral.ai/usage/guardrailing).
+[Mistral AI's Mistral 7B Instruct v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) [GGUF format model](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF), 4-bit quantization balanced quality GGUF version, running on CPU. English Only (supports other languages but with reduced quality). Using [GitHub - llama.cpp](https://github.com/ggerganov/llama.cpp).
 """

 # Model setup
@@ -20,7 +17,7 @@ model_dir = "models"
 model_name = "unsloth.Q4_K_M.gguf"
 model_path = os.path.join(model_dir, model_name)

-# Download model if not already present
+# Download the model if not already present
 hf_hub_download(
     repo_id="vislupus/bulgarian-joke-master-gemma-2-2b-it-bnb-4bit-gguf",
     filename=model_name,
@@ -29,39 +26,60 @@ hf_hub_download(

 # Check if the model file exists
 if not os.path.exists(model_path):
-    raise
 [ 32 removed lines not shown ]
+    raise FileNotFoundError(f"Model file not found at {model_path}")
+
+# Load the model using llama_cpp
+print("Loading the model...")
+llm = Llama(model_path=model_path)
+print("Model loaded successfully!")
+
+# Define the function to generate responses
+def generate_response(prompt, temperature=0.7, top_p=1.0, max_tokens=256):
+    """
+    Generate a response from the model.
+
+    Args:
+        prompt (str): The user's input prompt.
+        temperature (float): Sampling temperature.
+        top_p (float): Top-p sampling parameter.
+        max_tokens (int): Maximum number of tokens to generate.
+
+    Returns:
+        str: The model's response.
+    """
+    try:
+        response = llm(prompt, max_tokens=max_tokens, temperature=temperature, top_p=top_p)
+        return response["choices"][0]["text"].strip()
+    except Exception as e:
+        return f"Error generating response: {e}"
+
+# Set up Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("# 🦙 Llama GGUF Model Chatbot")
+    gr.Markdown(description)
+
+    # Input box for the user prompt
+    prompt_input = gr.Textbox(label="Your Prompt", placeholder="Type your message here...", lines=5)
+
+    # Advanced settings
+    with gr.Accordion("Advanced Settings", open=False):
+        temperature = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
+        top_p = gr.Slider(0.1, 1.0, value=1.0, step=0.01, label="Top-p")
+        max_tokens = gr.Slider(16, 512, value=256, step=16, label="Max Tokens")
+
+    # Output box for the model's response
+    response_output = gr.Textbox(label="Model Response", placeholder="The model's response will appear here...", lines=10)
+
+    # Generate button
+    generate_button = gr.Button("Generate Response")
+
+    # Connect inputs and outputs
+    generate_button.click(
+        generate_response,
+        inputs=[prompt_input, temperature, top_p, max_tokens],
+        outputs=[response_output]
+    )

+# Launch the Gradio app
 if __name__ == "__main__":
-    [ 1 removed line not shown ]
+    demo.launch()
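
The hunks above end before the closing arguments of hf_hub_download, so the diff does not show where the downloaded file is written. For the later os.path.exists(model_path) check to pass, the download presumably has to land inside model_dir; a minimal sketch of such a call, assuming a local_dir argument that is not visible in this diff:

    from huggingface_hub import hf_hub_download
    import os

    model_dir = "models"
    model_name = "unsloth.Q4_K_M.gguf"
    model_path = os.path.join(model_dir, model_name)

    # Assumption: local_dir pins the file to models/unsloth.Q4_K_M.gguf,
    # matching the os.path.exists(model_path) check in app.py.
    hf_hub_download(
        repo_id="vislupus/bulgarian-joke-master-gemma-2-2b-it-bnb-4bit-gguf",
        filename=model_name,
        local_dir=model_dir,
    )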
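
For reference, a minimal sketch of the new llama_cpp code path exercised outside the Gradio UI. The n_ctx, n_threads, and n_gpu_layers values are assumptions for CPU-Basic hardware, not part of this commit; app.py itself constructs Llama(model_path=model_path) with defaults.

    from llama_cpp import Llama

    llm = Llama(
        model_path="models/unsloth.Q4_K_M.gguf",
        n_ctx=2048,      # context window size (assumed)
        n_threads=2,     # CPU-Basic exposes few cores (assumed)
        n_gpu_layers=0,  # keep every layer on the CPU
    )

    # Calling the model returns an OpenAI-style completion dict; the generated
    # text lives under choices[0]["text"], which generate_response() strips.
    out = llm("Tell me a short joke.", max_tokens=64, temperature=0.7, top_p=1.0)
    print(out["choices"][0]["text"].strip())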