import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
#from unsloth import FastLanguageModel  # only needed for the Unsloth loader path (commented out below)

# Replace with your model name
#MODEL_NAME = "ssirikon/Gemma7b-bnb-Unsloth"
#MODEL_NAME = "unsloth/gemma-7b-bnb-4bit"
MODEL_NAME = "Lohith9459/QnAD2_gemma7b"

# Load the model and tokenizer (max_seq_length and load_in_4bit are only used
# by the alternative Unsloth loader; see the corrected sketch at the end of the file)
max_seq_length = 512
dtype = torch.bfloat16
load_in_4bit = True
#model = FastLanguageModel.from_pretrained(MODEL_NAME, max_seq_length=max_seq_length, dtype=dtype, load_in_4bit=load_in_4bit)
#tokenizer = model.tokenizer
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=dtype, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def generate_answer(question):
    """Build an Alpaca-style prompt, generate, and return the text after '### Response:'."""
    instruction = "Generate an answer for the following question in less than two sentences."
    formatted_text = f"""Below is an instruction that describes a task. \
Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{question}

### Response:
"""
    # Move inputs to the device the model was placed on by device_map="auto"
    inputs = tokenizer([formatted_text], return_tensors="pt").to(model.device)
    # Stream tokens to stdout while generating
    text_streamer = TextStreamer(tokenizer)
    generated_ids = model.generate(**inputs, streamer=text_streamer, max_new_tokens=512)
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    def get_answer(text):
        start_tag = "### Response:"
        # Find where the response section begins
        start_idx = text.find(start_tag)
        if start_idx == -1:
            return None  # Response tag not found
        # Everything after the tag is the model's answer
        return text[start_idx + len(start_tag):].strip()

    return get_answer(generated_text)

# Create the Gradio interface
demo = gr.Interface(
    fn=generate_answer,
    inputs=gr.Textbox(lines=5, label="Ask Question on AI/ML"),
    outputs=gr.Textbox(label="G-15 Gemma7b Model Generated Answer"),
)

demo.launch()
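
# --- Alternative loader (sketch) ---
# A minimal, untested sketch of the Unsloth 4-bit path commented out above,
# assuming the `unsloth` package is installed. Note that
# FastLanguageModel.from_pretrained returns a (model, tokenizer) tuple, so
# `tokenizer = model.tokenizer` from the original comment is not needed.
# Kept commented out so the Transformers loader above stays the active path.
#
# from unsloth import FastLanguageModel
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name=MODEL_NAME,
#     max_seq_length=max_seq_length,
#     dtype=dtype,
#     load_in_4bit=load_in_4bit,
# )
# FastLanguageModel.for_inference(model)  # enable Unsloth's faster inference mode
#
# Quick sanity check without the UI (hypothetical example question):
# print(generate_answer("What is gradient descent?"))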