import os

# Ensure CUDA is disabled before llama_cpp is imported; this Space runs on CPU only.
os.environ["LLAMA_CPP_USE_CUDA"] = "0"

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
title = "Gemma 2 2B - Bulgarian Joke Master - GGUF"
description = """
[Gemma 2 2B](https://huggingface.co/unsloth/gemma-2-2b-bnb-4bit) fine-tuned for Bulgarian jokes, running on CPU in GGUF format.
The model is fine-tuned to generate humorous content in Bulgarian and is served with the [llama.cpp library](https://github.com/ggerganov/llama.cpp).
Even running on CPU, it can still produce impressive results, although larger models may require more processing power.
"""
model_dir = "models"
model_name = "unsloth.Q4_K_M.gguf"
model_path = os.path.join(model_dir, model_name)

# Fetch the quantized (Q4_K_M) GGUF weights from the Hugging Face Hub.
hf_hub_download(
    repo_id="vislupus/bulgarian-joke-master-gemma-2-2b-it-bnb-4bit-gguf",
    filename=model_name,
    local_dir=model_dir
)
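# Note (not in the original code): hf_hub_download() returns the local file path,
# so the path could also be taken directly from its return value, e.g.
#   model_path = hf_hub_download(repo_id=..., filename=model_name, local_dir=model_dir)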
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model file not found at {model_path}")

print("Loading the model...")
llm = Llama(model_path=model_path)
print("Model loaded successfully!")
# Define the function that gr.ChatInterface calls to generate responses.
def generate_response(message, history, temperature=0.7, top_p=1.0, max_tokens=256):
    """
    Generate a response from the model.

    Args:
        message (str): The latest user message.
        history (list): Conversation history in the format:
            [{'role': 'user', 'content': '...'}, {'role': 'assistant', 'content': '...'}]
        temperature (float): Sampling temperature.
        top_p (float): Top-p sampling parameter.
        max_tokens (int): Maximum number of tokens to generate.

    Returns:
        str: The model's response.
    """
    # Rebuild the conversation with Gemma's chat-template markers.
    prompt = ""
    for turn in history:
        role = turn["role"]
        content = turn["content"]
        if role == "user":
            prompt += f"<start_of_turn>user\n{content}<end_of_turn>\n"
        elif role == "assistant":
            prompt += f"<start_of_turn>model\n{content}<end_of_turn>\n"
    # Append the current user message and cue the model to reply.
    prompt += f"<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n"

    try:
        response = llm(prompt, max_tokens=max_tokens, temperature=temperature, top_p=top_p)
        return response["choices"][0]["text"].strip()
    except Exception as e:
        return f"Error generating response: {e}"
demo = gr.ChatInterface(
    fn=generate_response,
    type="messages",  # history is passed as role/content dicts (requires a recent Gradio release)
    title=title,
    description=description,
    theme="huggingface",
    examples=["Hello, tell me a Bulgarian joke!"]
)

if __name__ == "__main__":
    demo.launch(share=True)
    llm.close()