Update app.py
app.py CHANGED
@@ -3,28 +3,25 @@ import os
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 
-# Force CPU-only execution
 os.environ["LLAMA_CPP_USE_CUDA"] = "0"  # Ensure CUDA is disabled
 
-
-title = "Mistral-7B-Instruct-GGUF Run On CPU"
+title = "Gemma 2 2B - Bulgarian Joke Master - GGUF"
 description = """
-🔎 [
+🔎 [Gemma 2 2B](https://huggingface.co/unsloth/gemma-2-2b-bnb-4bit) fine-tuned for Bulgarian jokes, running on CPU in GGUF format.
+This model is fine-tuned for generating humorous content in Bulgarian, utilizing the [Llama.cpp library](https://github.com/ggerganov/llama.cpp).
+Running on CPU, it can still produce impressive results, although larger models may require more processing power.
 """
 
-# Model setup
 model_dir = "models"
 model_name = "unsloth.Q4_K_M.gguf"
 model_path = os.path.join(model_dir, model_name)
 
-# Download the model if not already present
 hf_hub_download(
     repo_id="vislupus/bulgarian-joke-master-gemma-2-2b-it-bnb-4bit-gguf",
     filename=model_name,
     local_dir=model_dir
 )
 
-# Check if the model file exists
 if not os.path.exists(model_path):
     raise FileNotFoundError(f"Model file not found at {model_path}")
 
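A side note on the download block above: hf_hub_download returns the local path of the file it fetched, so the os.path.join and the FileNotFoundError check could be folded into the call itself. A minimal sketch of that variant (not part of this commit):

from huggingface_hub import hf_hub_download

# hf_hub_download returns the path of the downloaded (or already cached) file,
# so the path does not need to be re-derived with os.path.join afterwards.
model_path = hf_hub_download(
    repo_id="vislupus/bulgarian-joke-master-gemma-2-2b-it-bnb-4bit-gguf",
    filename="unsloth.Q4_K_M.gguf",
    local_dir="models",
)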
@@ -33,53 +30,36 @@ print("Loading the model...")
 llm = Llama(model_path=model_path)
 print("Model loaded successfully!")
 
-
-def generate_response(prompt, temperature=0.7, top_p=1.0, max_tokens=256):
+def generate_response(messages, temperature=0.7, top_p=1.0, max_tokens=256):
     """
     Generate a response from the model.
-
     Args:
-
+        messages (list): List of conversation history in a tuple format (user, assistant).
         temperature (float): Sampling temperature.
         top_p (float): Top-p sampling parameter.
         max_tokens (int): Maximum number of tokens to generate.
-
     Returns:
         str: The model's response.
     """
+    prompt = ""
+    for user_message, assistant_message in messages:
+        prompt += f"<start_of_turn>user\n{user_message}\n<end_of_turn>"
+        prompt += f"<start_of_turn>model\n{assistant_message}\n<end_of_turn>"
+
+    prompt += "<start_of_turn>user\n" + messages[-1][0] + "\n<end_of_turn>"
+
     try:
         response = llm(prompt, max_tokens=max_tokens, temperature=temperature, top_p=top_p)
         return response["choices"][0]["text"].strip()
     except Exception as e:
         return f"Error generating response: {e}"
 
-
-
-
-
-
-
-
-
-
-    with gr.Accordion("Advanced Settings", open=False):
-        temperature = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
-        top_p = gr.Slider(0.1, 1.0, value=1.0, step=0.01, label="Top-p")
-        max_tokens = gr.Slider(16, 512, value=256, step=16, label="Max Tokens")
-
-    # Output box for the model's response
-    response_output = gr.Textbox(label="Model Response", placeholder="The model's response will appear here...", lines=10)
-
-    # Generate button
-    generate_button = gr.Button("Generate Response")
-
-    # Connect inputs and outputs
-    generate_button.click(
-        generate_response,
-        inputs=[prompt_input, temperature, top_p, max_tokens],
-        outputs=[response_output]
-    )
-
-# Launch the Gradio app
-if __name__ == "__main__":
-    demo.launch(share=True)
+with gr.ChatInterface(
+    fn=generate_response,
+    title=title,
+    description=description,
+    theme="huggingface",
+    allow_screenshot=False,
+    examples=[["Hello, tell me a Bulgarian joke!"]]
+) as demo:
+    demo.launch(share=True)
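Two things stand out in the new Gradio wiring. gr.ChatInterface calls its fn with the latest user message and the running history as separate arguments, while the generate_response above takes a single messages list; and the prompt it assembles never opens a final <start_of_turn>model turn, so the model is not actually cued to answer as the assistant. The sketch below is one way the handler could be shaped to match what ChatInterface passes in; it is an assumption about the intended behaviour, not the commit's code, and it reuses the Gemma turn tags from the diff. The chat_fn name, the slider wiring via additional_inputs, and the stop token are choices of this sketch.

import gradio as gr
from llama_cpp import Llama

# Assumes the GGUF file was already downloaded by the code above.
llm = Llama(model_path="models/unsloth.Q4_K_M.gguf")

def chat_fn(message, history, temperature=0.7, top_p=1.0, max_tokens=256):
    # With the default tuple-style history, each entry is a (user, assistant) pair;
    # the current message is passed separately and is not part of history.
    prompt = ""
    for user_message, assistant_message in history:
        prompt += f"<start_of_turn>user\n{user_message}<end_of_turn>\n"
        prompt += f"<start_of_turn>model\n{assistant_message}<end_of_turn>\n"
    # Open a model turn so the completion continues as the assistant.
    prompt += f"<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n"
    response = llm(prompt, max_tokens=max_tokens, temperature=temperature,
                   top_p=top_p, stop=["<end_of_turn>"])
    return response["choices"][0]["text"].strip()

demo = gr.ChatInterface(
    fn=chat_fn,
    title="Gemma 2 2B - Bulgarian Joke Master - GGUF",
    additional_inputs=[
        gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=1.0, step=0.01, label="Top-p"),
        gr.Slider(16, 512, value=256, step=16, label="Max Tokens"),
    ],
)

if __name__ == "__main__":
    demo.launch()

The theme="huggingface" string and the allow_screenshot argument in the commit may not be accepted by recent Gradio releases, and ChatInterface is usually assigned and launched directly rather than used as a context manager, so both are worth checking against the Gradio version pinned in the Space.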
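One more loading detail. The Space leans on the LLAMA_CPP_USE_CUDA environment variable to stay on the CPU; whether or not the installed llama-cpp-python build reads that variable, the Llama constructor has explicit knobs for the same intent. A sketch with assumed values (2048-token context, one thread per core):

import os
from llama_cpp import Llama

# n_gpu_layers=0 keeps every layer on the CPU even if a CUDA-enabled wheel is installed;
# n_ctx and n_threads are assumed values, not taken from the commit.
llm = Llama(
    model_path=os.path.join("models", "unsloth.Q4_K_M.gguf"),
    n_ctx=2048,
    n_threads=os.cpu_count(),
    n_gpu_layers=0,
)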