import os

import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Force CPU-only inference for llama.cpp.
os.environ["LLAMA_CPP_USE_CUDA"] = "0"

title = "Gemma 2 2B - Bulgarian Joke Master - GGUF"
description = """
🔎 [Gemma 2 2B](https://huggingface.co/unsloth/gemma-2-2b-bnb-4bit) fine-tuned for Bulgarian jokes, running on CPU in GGUF format.

This model is fine-tuned for generating humorous content in Bulgarian, using the [llama.cpp library](https://github.com/ggerganov/llama.cpp). Even on CPU it can produce impressive results, although larger models may require more processing power.
"""

model_dir = "models"
model_name = "unsloth.Q4_K_M.gguf"
model_path = os.path.join(model_dir, model_name)

# Download the GGUF weights from the Hugging Face Hub into the local model directory.
hf_hub_download(
    repo_id="vislupus/bulgarian-joke-master-SmolLM2-135M-Instruct-bnb-4bit-gguf",
    filename=model_name,
    local_dir=model_dir,
)

if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model file not found at {model_path}")

print("Loading the model...")
llm = Llama(model_path=model_path)
print("Model loaded successfully!")


def generate_response(message, history, temperature=0.7, top_p=1.0, max_tokens=256):
    """Generate a response based on the user's message and the conversation history."""
    try:
        # Gradio's ChatInterface passes history as a list of (user, assistant) pairs;
        # rebuild it as a plain-text prompt with role markers.
        conversation = ""
        for user_msg, assistant_msg in history:
            conversation += f"user\n{user_msg}\n"
            if assistant_msg:
                conversation += f"assistant\n{assistant_msg}\n"
        conversation += f"user\n{message}\n"

        response = llm(conversation, max_tokens=max_tokens, temperature=temperature, top_p=top_p)
        return response["choices"][0]["text"].strip()
    except Exception as e:
        return f"Error generating response: {e}"


if __name__ == "__main__":
    gguf_demo = gr.ChatInterface(
        generate_response,
        title=title,
        description=description,
    )
    gguf_demo.launch(share=True)

    # Release llama.cpp resources once the Gradio server stops.
    try:
        if llm:
            llm.close()
    except Exception as e:
        print(f"Error closing model: {e}")