# testapp / app.py
# thamada's picture
# Update app.py
# 6af59ad verified
import time
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
def run_LLM(model, tokenizer, streamer, prompt):
    """Sample one completion for ``prompt`` and report its token count.

    Args:
        model: Object exposing a ``device`` attribute and a ``generate``
            method that accepts ``input_ids`` plus sampling keyword arguments.
        tokenizer: Object exposing ``encode`` and ``decode``.
        streamer: Accepted for signature compatibility; it is not forwarded
            to ``generate`` by this function.
        prompt: Text that is encoded and used as the generation input.

    Returns:
        tuple: ``(output_text, n_tokens)`` where ``n_tokens`` is the length of
        the first sequence returned by ``generate`` and ``output_text`` is
        that same sequence decoded by the tokenizer.
    """
    # Sampling configuration. The new-token cap is deliberately enormous, so
    # generation is presumably ended by the model's own stop condition.
    sampling_options = {
        "max_new_tokens": 3000000,
        "do_sample": True,
        "temperature": 0.8,
    }

    encoded_prompt = tokenizer.encode(prompt, return_tensors="pt")
    generated = model.generate(
        input_ids=encoded_prompt.to(model.device),
        **sampling_options,
    )

    # Only the first returned sequence is measured and decoded.
    first_sequence = generated[0]
    sequence_length = len(first_sequence)
    decoded_text = tokenizer.decode(first_sequence)
    return (decoded_text, sequence_length)
def display_message():
    """Benchmark repeated generation and return a per-run timing log.

    Loads the ``cyberagent/calm2-7b-chat`` model and tokenizer, then calls
    ``run_LLM`` ten times with the same fixed prompt. Model loading happens
    before the timer starts, so it is not included in the reported times.

    Returns:
        str: One newline-terminated line per run, with space-separated fields:
        run number, lap time, mean lap time, tokens per second for this run,
        cumulative tokens per second, token count for this run, cumulative
        token count, cumulative time. Times are ``time.perf_counter``
        differences; token counts are whatever ``run_LLM`` reports.
    """
    model_name = "cyberagent/calm2-7b-chat"
    n_runs = 10

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="cuda",
        torch_dtype="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Same text as the original two-line literal, written with an explicit
    # escape so the value does not depend on source indentation.
    prompt = "わが国の経済について今後の予想を教えてください。\nASSISTANT: "

    log_lines = []
    total_tokens = 0
    t_sum = 0.0
    t_prev = time.perf_counter()
    for run_no in range(1, n_runs + 1):
        _, n_tokens = run_LLM(model, tokenizer, streamer, prompt)
        total_tokens += n_tokens

        # A lap spans from the end of the previous run to the end of this one.
        t_curr = time.perf_counter()
        t_lap = t_curr - t_prev
        t_prev = t_curr
        t_sum += t_lap

        t_avg = t_sum / run_no
        speed_now = n_tokens / t_lap
        speed_avg = total_tokens / t_sum

        log_lines.append(
            "%d %f %f %f %f %d %d %f"
            % (run_no, t_lap, t_avg, speed_now, speed_avg,
               n_tokens, total_tokens, t_sum)
        )

    # Terminate every row with a real newline; the previous code appended the
    # two literal characters "¥n", which left the log on a single line.
    return "".join(line + "\n" for line in log_lines)
if __name__ == "__main__":
    # Serve the benchmark as a Gradio app: no input components, and the
    # string returned by display_message is shown as text output.
    demo = gr.Interface(fn=display_message, inputs=None, outputs="text")
    demo.launch()