Spaces:

Dorjzodovsuren
/

Mongolian_Llama3

Runtime error

App Files Files Community

Mongolian_Llama3 / app.py

Dorjzodovsuren

Update app.py

8b42dad verified about 2 months ago

raw

history blame

4.4 kB

	# import gradio as gr
	# from huggingface_hub import InferenceClient

	# """
	# For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
	# """
	# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


	# def respond(
	# message,
	# history: list[tuple[str, str]],
	# system_message,
	# max_tokens,
	# temperature,
	# top_p,
	# ):
	# messages = [{"role": "system", "content": system_message}]

	# for val in history:
	# if val[0]:
	# messages.append({"role": "user", "content": val[0]})
	# if val[1]:
	# messages.append({"role": "assistant", "content": val[1]})

	# messages.append({"role": "user", "content": message})

	# response = ""

	# for message in client.chat_completion(
	# messages,
	# max_tokens=max_tokens,
	# stream=True,
	# temperature=temperature,
	# top_p=top_p,
	# ):
	# token = message.choices[0].delta.content

	# response += token
	# yield response


	# """
	# For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
	# """
	# demo = gr.ChatInterface(
	# respond,
	# additional_inputs=[
	# gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
	# gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
	# gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
	# gr.Slider(
	# minimum=0.1,
	# maximum=1.0,
	# value=0.95,
	# step=0.05,
	# label="Top-p (nucleus sampling)",
	# ),
	# ],
	# )


	# if __name__ == "__main__":
	# demo.launch()

	import torch
	import gradio as gr
	from threading import Thread
	from peft import PeftModel, PeftConfig
	from unsloth import FastLanguageModel
	from transformers import TextStreamer
	from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer

	# ---------------------------------------------------------------------------
	# Model loading and one-time inference setup (runs at import time).
	# ---------------------------------------------------------------------------

	# Settings forwarded to FastLanguageModel.from_pretrained below.
	max_seq_length = 1024
	dtype = torch.float16
	load_in_4bit = True

	# Load the fine-tuned model together with its tokenizer.
	model, tokenizer = FastLanguageModel.from_pretrained(
	    model_name = "Dorjzodovsuren/Mongolian_Llama3-v0.1",
	    max_seq_length = max_seq_length,
	    dtype = dtype,
	    load_in_4bit = load_in_4bit,
	    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
	)

	EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

	# Prompt template with three positional slots: instruction, input, response.
	alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

	### Instruction:
	{}

	### Input:
	{}

	### Response:
	{}"""


	# Enable native 2x faster inference
	FastLanguageModel.for_inference(model)

	# Create a text streamer
	text_streamer = TextStreamer(tokenizer, skip_prompt=False,skip_special_tokens=True)

	# Get the device based on GPU availability
	device = 'cuda' if torch.cuda.is_available() else 'cpu'

	# Move model into device.
	# A model loaded with load_in_4bit=True is left where the loader put it:
	# Transformers can raise ValueError when `.to()` is called on a
	# bitsandbytes-quantized model (depending on library versions), which would
	# abort the app at startup. Only an unquantized model is moved explicitly.
	if not load_in_4bit:
	    model = model.to(device)

	class StopOnTokens(StoppingCriteria):
	    """Stopping criterion that ends generation when the newest token is a stop id.

	    Args:
	        stop_ids: Iterable of token ids that should end generation. Defaults to
	            ``(29, 0)``, the ids that used to be hard-coded in ``__call__``.
	    """

	    def __init__(self, stop_ids=(29, 0)):
	        super().__init__()
	        # NOTE(review): the default ids are presumably tied to a specific
	        # tokenizer vocabulary -- confirm they are the intended stop tokens for
	        # the tokenizer loaded by this app (e.g. against tokenizer.eos_token_id).
	        self.stop_ids = frozenset(int(token_id) for token_id in stop_ids)

	    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
	        """Return True when the last token of the first sequence is a stop id.

	        Only ``input_ids[0]`` is inspected; ``scores`` and ``kwargs`` are ignored.
	        """
	        # Read the last token once instead of comparing the tensor per stop id.
	        last_token = int(input_ids[0][-1])
	        return last_token in self.stop_ids

	# The current implementation does not use the conversation history: each message is answered on its own.
	# Experimenting with different generation hyperparameters is highly recommended to compare output quality.
	def predict(message, history):
	    """Stream the model's reply to ``message`` for the chat interface.

	    Args:
	        message: The user's latest message; it fills the instruction slot of
	            ``alpaca_prompt``.
	        history: Accepted for the chat callback signature but not used.

	    Yields:
	        The reply text accumulated so far, once per streamed chunk that is
	        appended.
	    """
	    stop_criterion = StopOnTokens()

	    # Only the instruction slot is filled; input and response stay empty.
	    prompt = alpaca_prompt.format(message, "", "")
	    encoded = tokenizer([prompt], return_tensors="pt").to(device)

	    token_stream = TextIteratorStreamer(
	        tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True
	    )

	    # Start from the tokenized inputs, then layer the generation settings on top.
	    generation_args = dict(encoded)
	    generation_args.update(
	        streamer=token_stream,
	        max_new_tokens=1024,
	        top_p=0.95,
	        temperature=0.001,
	        repetition_penalty=1.1,
	        stopping_criteria=StoppingCriteriaList([stop_criterion]),
	    )

	    # Run generate() on a worker thread so this generator can consume the
	    # streamer while text is still being produced.
	    worker = Thread(target=model.generate, kwargs=generation_args)
	    worker.start()

	    reply = ""
	    for chunk in token_stream:
	        # A chunk that is exactly '<' is dropped and nothing is yielded for it.
	        if chunk == '<':
	            continue
	        reply += chunk
	        yield reply

	# Wrap predict in a Gradio chat interface, then start serving it
	# (launch is called with show_api=True).
	demo = gr.ChatInterface(predict)
	demo.launch(show_api=True)