import os

import torch
import gradio as gr
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    GenerationConfig,
)

model_path = os.environ.get("HF_REPO_ID")
access_token = os.environ.get("HF_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(model_path, token=access_token)

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # load_in_8bit=use_8_bit,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    token=access_token,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

device = "cuda" if torch.cuda.is_available() else "cpu"


def generate(
    question,
    context=None,
    temperature=0.7,
    top_p=0.7,
    top_k=40,
    num_beams=4,
    max_new_tokens=256,
):
    prompt = f"### CONTEXT:\n{context}\n\n### QUESTION:\n{question}\n\n### ANSWER:"
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)

    generation_config = GenerationConfig(
        do_sample=True,  # without this, temperature/top_p/top_k are ignored
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
    )

    # with torch.autocast("cuda"):
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    seq = generation_output.sequences[0]
    output = tokenizer.decode(seq)
    return output


"""
For more information on `huggingface_hub` Inference API support, please check the docs:
https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,  # provided by the UI but not used in the prompt
    max_tokens,
    temperature,
    top_p,
):
    # Rebuild the conversation as patient/therapist turns
    # (Bengali labels: "রোগী" = patient, "থেরাপিস্ট" = therapist).
    context = ""
    for chat in history:
        context += f"রোগী: {chat[0]}\nথেরাপিস্ট: {chat[1]}\n"

    answer = generate(
        message,
        context,
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=max_tokens,
    ).split("### ANSWER:")[1]

    # decode() keeps special tokens, so cut the answer at the end-of-sequence token.
    if tokenizer.eos_token and tokenizer.eos_token in answer:
        answer = answer.split(tokenizer.eos_token)[0]
    return answer.strip()


"""
For information on how to customize the ChatInterface, peruse the gradio docs:
https://www.gradio.app/docs/chatinterface
"""

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=256, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.7,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    demo.launch()
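
"""
A minimal sketch of querying this app programmatically with `gradio_client`,
assuming it is deployed as a Hugging Face Space. "user/space-id" is a
placeholder, not the real repo id; the positional arguments follow the
additional_inputs order above, and gr.ChatInterface exposes its handler at
the "/chat" endpoint:

    from gradio_client import Client

    client = Client("user/space-id")
    result = client.predict(
        "How have you been sleeping lately?",  # message
        "You are a friendly Chatbot.",         # system message (unused here)
        256,                                   # max new tokens
        0.7,                                   # temperature
        0.7,                                   # top-p
        api_name="/chat",
    )
    print(result)
"""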