import torch
import gradio as gr
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline

# Load the fine-tuned model and its tokenizer
model_name = "midrees2806/2Krows_uoe_edu"
tokenizer = LlamaTokenizer.from_pretrained(model_name)
# float16 weights are intended for GPU inference; on CPU, float32 is the
# safe choice (half-precision ops are unsupported or very slow on CPU)
model = LlamaForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    device_map="cpu",
)

# Wrap the model and tokenizer in a text-generation pipeline
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

def generate_response(prompt):
    # Wrap the prompt in the LLaMA-2 instruction format
    input_text = f"[INST] {prompt} [/INST]"
    # Generate a completion; raise max_new_tokens for longer answers
    response = pipe(input_text, max_new_tokens=50)
    # The pipeline returns the prompt plus the completion, so keep only
    # the text that follows the closing [/INST] tag
    answer = response[0]["generated_text"].split("[/INST]")[-1].strip()
    return answer

# Gradio interface: a single text box in, a single text box out
iface = gr.Interface(
    fn=generate_response,
    inputs="text",
    outputs="text",
    title="LLaMA-2 Chatbot",
    description="Ask the fine-tuned LLaMA-2 model anything!",
)

# Launch the Gradio app
iface.launch()