"""Gradio chat UI that streams replies from a locally loaded Unsloth model.

The model and tokenizer are loaded once, at import time. ``predict`` is the
streaming callback passed to ``gr.ChatInterface``; it runs ``model.generate``
on a background thread and yields the growing reply as text arrives.
"""

from threading import Thread

import gradio as gr
import torch
from huggingface_hub import InferenceClient
from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer

# NOTE(review): Unsloth reportedly prefers to be imported before transformers
# so that its patches apply; the relative order is left as-is here -- confirm
# before reordering.
from unsloth import FastLanguageModel

# For more information on `huggingface_hub` Inference API support, please check
# the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
#
# Hosted-inference alternative (currently disabled; the local model is used):
# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

max_seq_length = 2048
dtype = torch.float16
load_in_4bit = True
model_id = "giustinod/TestLogica-AZService"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_id,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference


class StopOnTokens(StoppingCriteria):
    """Stop generation as soon as the newest token is one of ``stop_ids``.

    Only the first sequence of the batch (``input_ids[0]``) is inspected.

    Args:
        stop_ids: Token ids that end generation. Defaults to ``(29, 0)``.
            NOTE(review): token ids are tokenizer-specific; confirm these two
            are the intended stop tokens for the tokenizer loaded above.
    """

    def __init__(self, stop_ids=(29, 0)):
        super().__init__()
        # Built once so each generation step is a single set lookup.
        self.stop_ids = frozenset(int(stop_id) for stop_id in stop_ids)

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the last token of the first sequence is a stop id."""
        return int(input_ids[0][-1]) in self.stop_ids


def _build_prompt(message, history):
    """Flatten prior (user, bot) turns plus the new message into one prompt string."""
    # ``history or []`` tolerates a None/empty history; ``list(...)`` tolerates
    # a tuple. The new message is appended as a turn with an empty bot reply.
    turns = list(history or []) + [[message, ""]]
    # NOTE(review): assumes each history item is an indexable (user, bot) pair.
    return "".join("\n:" + str(turn[0]) + "\n:" + str(turn[1]) for turn in turns)


def predict(message, history):
    """Stream the model's reply to ``message`` given the chat ``history``.

    Args:
        message: The new user message.
        history: Earlier turns as a sequence of ``[user, bot]`` pairs; may be
            empty or None.

    Yields:
        The accumulated reply text after each streamed chunk that is not
        exactly ``'<'``.
    """
    prompt = _build_prompt(message, history)
    model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    # The streamer is created with a 10 second timeout on waiting for text.
    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=10.0,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        top_p=0.95,
        top_k=1000,
        temperature=1.0,
        num_beams=1,
        stopping_criteria=StoppingCriteriaList([StopOnTokens()]),
    )

    # generate() blocks until completion, so it runs on a worker thread while
    # this generator drains the streamer.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    partial_message = ""
    for new_token in streamer:
        # Chunks that are exactly '<' are dropped from the displayed reply.
        if new_token != "<":
            partial_message += new_token
            yield partial_message

    # The streamer is exhausted, so generation has signalled its end; reap the
    # worker instead of leaving it dangling.
    worker.join()


if __name__ == "__main__":
    gr.ChatInterface(predict).launch()