"""Gradio front end that benchmarks text generation with calm2-7b-chat."""

import time

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

MODEL_NAME = "cyberagent/calm2-7b-chat"

# NOTE(review): this literal was reconstructed from a whitespace-collapsed
# view of the file; the separator before "ASSISTANT:" may originally have
# been a newline rather than a space -- confirm against the model's expected
# chat format.
PROMPT = "わが国の経済について今後の予想を教えてください。 ASSISTANT: "

# Number of generation runs performed per benchmark.
N_RUNS = 10

# Generation settings used by default in run_LLM.  The token limit is
# deliberately huge, so in practice generation is presumably ended by the
# model itself (EOS / context limit) rather than by this cap.
MAX_NEW_TOKENS = 3000000
TEMPERATURE = 0.8


def run_LLM(model, tokenizer, streamer, prompt,
            max_new_tokens=MAX_NEW_TOKENS, temperature=TEMPERATURE):
    """Generate a sampled completion for *prompt*.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        streamer: Accepted for interface compatibility; it is currently not
            passed to ``generate`` and therefore has no effect.
        prompt: Text to feed to the model.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.

    Returns:
        A ``(output_text, n_tokens)`` tuple where ``output_text`` is the
        decoded first returned sequence and ``n_tokens`` is the length of
        that sequence.
    """
    token_ids = tokenizer.encode(prompt, return_tensors="pt")
    output_ids = model.generate(
        input_ids=token_ids.to(model.device),
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
    )
    sequence = output_ids[0]
    # NOTE(review): this is the length of the whole returned sequence; for a
    # decoder-only model that presumably includes the prompt tokens -- confirm
    # whether tokens/sec should count only newly generated tokens.
    n_tokens = len(sequence)
    output_text = tokenizer.decode(sequence)
    return (output_text, n_tokens)


def display_message():
    """Run the generation benchmark and return its log as text.

    Loads the model and tokenizer, runs ``N_RUNS`` generations of ``PROMPT``
    and times each one (model loading is not included in the timings).

    Returns:
        One line per run, each terminated by a newline, with the
        space-separated columns: run number, lap seconds, average lap
        seconds, tokens/sec for this run, cumulative tokens/sec, tokens in
        this run, total tokens, total seconds.
    """
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME, device_map="cuda", torch_dtype="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    lines = []
    total_tokens = 0
    t_sum = 0.0
    t_prev = time.perf_counter()
    for run in range(1, N_RUNS + 1):
        # The generated text itself is not needed for the benchmark.
        _, n_tokens = run_LLM(model, tokenizer, streamer, PROMPT)
        total_tokens += n_tokens

        t_curr = time.perf_counter()
        t_lap = t_curr - t_prev
        t_prev = t_curr
        t_sum += t_lap

        t_avg = t_sum / run
        speed_now = n_tokens / t_lap
        speed_avg = total_tokens / t_sum
        lines.append(
            f"{run:d} {t_lap:f} {t_avg:f} {speed_now:f} {speed_avg:f} "
            f"{n_tokens:d} {total_tokens:d} {t_sum:f}"
        )

    # Terminate every row with a real newline (the previous code appended the
    # literal characters "¥n", so all rows ran together on one line).
    return "".join(f"{line}\n" for line in lines)


if __name__ == '__main__':
    iface = gr.Interface(fn=display_message, inputs=None, outputs="text")
    iface.launch()