import gradio as gr
import os
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
os.environ["LLAMA_CPP_USE_CUDA"] = "0" # Ensure CUDA is disabled
title = "Gemma 2 2B - Bulgarian Joke Master - GGUF"
description = """
🔎 [Gemma 2 2B](https://huggingface.co/unsloth/gemma-2-2b-bnb-4bit) fine-tuned for Bulgarian jokes, running on CPU in GGUF format.
The model generates humorous content in Bulgarian and is served with the [Llama.cpp library](https://github.com/ggerganov/llama.cpp).
Even on CPU it can produce impressive results, although larger models may require more processing power.
"""
model_dir = "models"
model_name = "unsloth.Q4_K_M.gguf"
model_path = os.path.join(model_dir, model_name)
hf_hub_download(
    repo_id="vislupus/bulgarian-joke-master-gemma-2-2b-it-bnb-4bit-gguf",
    filename=model_name,
    local_dir=model_dir
)
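# With local_dir set, the downloaded file is placed inside `models/`, so it should
# end up exactly at model_path; hf_hub_download also returns that path directly.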
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model file not found at {model_path}")
print("Loading the model...")
llm = Llama(model_path=model_path)
print("Model loaded successfully!")
# Define the function to generate responses
def generate_response(message, history, temperature=0.7, top_p=1.0, max_tokens=256):
    """
    Generate a response from the model (used as the gr.ChatInterface callback).

    Args:
        message (str): The latest user message.
        history (list): Previous turns as provided by Gradio, either as
            [user, assistant] pairs or as {'role': ..., 'content': ...} dicts.
        temperature (float): Sampling temperature.
        top_p (float): Top-p sampling parameter.
        max_tokens (int): Maximum number of tokens to generate.

    Returns:
        str: The model's response.
    """
    # Normalize the history into (role, content) pairs.
    turns = []
    for item in history:
        if isinstance(item, dict):
            turns.append((item["role"], item["content"]))
        else:
            user_msg, assistant_msg = item
            if user_msg:
                turns.append(("user", user_msg))
            if assistant_msg:
                turns.append(("assistant", assistant_msg))
    turns.append(("user", message))

    # Build the prompt with Gemma's chat turn markers and cue the model's reply.
    prompt = ""
    for role, content in turns:
        if role == "user":
            prompt += f"<start_of_turn>user\n{content}<end_of_turn>\n"
        elif role == "assistant":
            prompt += f"<start_of_turn>model\n{content}<end_of_turn>\n"
    prompt += "<start_of_turn>model\n"

    try:
        # Stop at the end-of-turn marker so the reply does not run into a new turn.
        response = llm(prompt, max_tokens=max_tokens, temperature=temperature, top_p=top_p, stop=["<end_of_turn>"])
        return response["choices"][0]["text"].strip()
    except Exception as e:
        return f"Error generating response: {e}"
demo = gr.ChatInterface(
    fn=generate_response,
    title=title,
    description=description,
    theme="huggingface",
    examples=[["Hello, tell me a Bulgarian joke!"]]
)

if __name__ == "__main__":
    try:
        demo.launch(share=True)
    finally:
        llm.close()
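# Note: gr.ChatInterface also accepts additional_inputs, which could be used to
# expose the temperature/top_p/max_tokens parameters of generate_response as UI
# controls; here they simply keep their default values.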