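"""KatzBot: a Gradio question-answering demo for the deepapaikar/katzbot-phi2 model.

Loads the fine-tuned Phi-2 model and tokenizer, generates answers with sampling,
and serves them through a simple Gradio interface.
"""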
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gradio as gr

# Select the device: use CUDA when available, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    "deepapaikar/katzbot-phi2",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("deepapaikar/katzbot-phi2", trust_remote_code=True)

# Phi-2's tokenizer has no pad token by default; reuse EOS so padding works
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def predict_answer(question, token=25):
    messages = [{"role": "user", "content": question}]
    # Build the prompt string from the chat template
    prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Tokenize the prompt and move the tensors to the target device
    inputs = tokenizer(prompt_text, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Run inference; device_map="auto" has already placed the model, so no .to(device) here
    model.eval()
    with torch.no_grad():
        output_sequences = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=int(token),  # generate at most `token` new tokens
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            pad_token_id=tokenizer.pad_token_id,
        )
    # Decode the generated sequence back to text
    output_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
    return output_text
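
# Example call (using the interface's placeholder question):
#   predict_answer("Where is Yeshiva University located?", token=25)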

def gradio_predict(question, token):
    # Thin wrapper so Gradio can call the predictor with (question, token)
    answer = predict_answer(question, token)
    return answer

# Define the Gradio interface
iface = gr.Interface(
    fn=gradio_predict,
    inputs=[
        gr.Textbox(label="Question", placeholder="e.g. Where is Yeshiva University located?", scale=4),
        gr.Slider(2, 100, value=25, step=1, label="Token Count", info="Choose between 2 and 100"),
    ],
    outputs=gr.TextArea(label="Answer"),
    title="KatzBot",
    description="Phi2-trial1",
)
# Launch the app
iface.queue().launch(debug=True)