import os
from threading import Thread
from typing import Iterator
import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
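# Generation limits. MAX_INPUT_TOKEN_LENGTH can be overridden via the environment.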
MAX_MAX_NEW_TOKENS = 8192
DEFAULT_MAX_NEW_TOKENS = 2048
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
DESCRIPTION = """\
# Turkish LLaMA 8B Chat
This Space demonstrates [Turkish-Llama-8b-DPO-v0.1](https://huggingface.co/ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1) by the YTU COSMOS Research Group, an 8B-parameter model fine-tuned for Turkish language understanding and generation. Feel free to play with it, or duplicate it to run generations without a queue!
🔎 This model is the newest and most advanced iteration of CosmosLLaMa, developed by merging two distinctly trained CosmosLLaMa-Instruct DPO models.
🤖 The model is optimized for Turkish language tasks and can handle various text generation scenarios including conversations, instructions, and general text completion.
💡 You can also try the model on the official demo page: [cosmos.yildiz.edu.tr/cosmosllama](https://cosmos.yildiz.edu.tr/cosmosllama)
"""
LICENSE = """
<p/>
---
This demo uses [Turkish-Llama-8b-DPO-v0.1](https://huggingface.co/ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1) by the YTU COSMOS Research Group
and is governed by the original Llama 3 license.
"""
if not torch.cuda.is_available():
DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
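# The model and tokenizer are loaded only when a GPU is available; this demo is GPU-only.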
if torch.cuda.is_available():
model_id = "ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1"
model = AutoModelForCausalLM.from_pretrained(
model_id,
device_map="auto",
torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.use_default_system_prompt = False
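    # Stop generation at either the EOS token or Llama 3's end-of-turn token <|eot_id|>.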
TERMINATORS = [
tokenizer.eos_token_id,
tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
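# On ZeroGPU Spaces, @spaces.GPU allocates a GPU for the duration of each call.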
@spaces.GPU
def generate(
message: str,
chat_history: list[dict],
system_prompt: str = "",
    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
temperature: float = 0.6,
top_p: float = 0.9,
top_k: int = 50,
repetition_penalty: float = 1.0,
) -> Iterator[str]:
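    # Build the conversation in the messages format expected by apply_chat_template.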
conversation = []
if system_prompt:
conversation.append({"role": "system", "content": system_prompt})
conversation += chat_history
conversation.append({"role": "user", "content": message})
input_ids = tokenizer.apply_chat_template(
conversation,
add_generation_prompt=True,
return_tensors="pt"
)
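    # Keep only the most recent tokens if the conversation exceeds the input budget.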
if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
input_ids = input_ids.to(model.device)
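    # Stream decoded text as it is generated; model.generate runs in a background thread
    # so this function can iterate over the streamer and yield partial output.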
streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
        eos_token_id=TERMINATORS,
    )
t = Thread(target=model.generate, kwargs=generate_kwargs)
t.start()
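    # Accumulate streamed chunks and yield the growing response for the chat UI.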
outputs = []
for text in streamer:
outputs.append(text)
yield "".join(outputs)
chat_interface = gr.ChatInterface(
fn=generate,
additional_inputs=[
gr.Textbox(
label="System prompt",
lines=6,
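            # Default system prompt, in Turkish: "You are an AI assistant. The user will
            # give you a task. Your goal is to complete the task as faithfully as you can.
            # Think step by step while performing the task and justify your steps."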
value="Sen bir yapay zeka asistanısın. Kullanıcı sana bir görev verecek. Amacın görevi olabildiğince sadık bir şekilde tamamlamak. Görevi yerine getirirken adım adım düşün ve adımlarını gerekçelendir.",
),
gr.Slider(
label="Max new tokens",
minimum=1,
maximum=MAX_MAX_NEW_TOKENS,
step=1,
value=DEFAULT_MAX_NEW_TOKENS,
),
gr.Slider(
label="Temperature",
minimum=0.1,
maximum=4.0,
step=0.1,
value=0.6,
),
gr.Slider(
label="Top-p (nucleus sampling)",
minimum=0.05,
maximum=1.0,
step=0.05,
value=0.9,
),
gr.Slider(
label="Top-k",
minimum=1,
maximum=1000,
step=1,
value=50,
),
gr.Slider(
label="Repetition penalty",
minimum=1.0,
maximum=2.0,
step=0.05,
value=1.0,
),
],
stop_btn=None,
examples=[
["Merhaba! Nasılsın?"],
["Yapay zeka alanında açık kaynak kodun faydaları nelerdir?"],
],
cache_examples=False,
type="messages",
)
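# Page layout: description, duplicate button, the chat interface, and the license note.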
with gr.Blocks(css_paths="style.css", fill_height=True) as demo:
gr.Markdown(DESCRIPTION)
gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
chat_interface.render()
gr.Markdown(LICENSE)
if __name__ == "__main__":
demo.queue(max_size=20).launch()