import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Model name (official repository)
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

# Load the model onto the CPU
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
).to("cpu")  # explicitly move to CPU
# Simple chat function
def predict(query, history=None):
    if history is None:
        history = []
    # Rebuild the full conversation from the accumulated history so the
    # model keeps context across turns, then format it with the model's
    # chat template (Qwen2.5-Instruct expects chat-formatted prompts)
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": query})
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Encode the prompt; keep the tensors on CPU
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to("cpu")
    attention_mask = inputs["attention_mask"].to("cpu")
    # Inference
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=128,
            do_sample=True,
            top_p=0.9,
            temperature=0.8
        )
    # Decode only the newly generated tokens (skip the prompt)
    output_text = tokenizer.decode(
        output_ids[0][input_ids.shape[1]:],
        skip_special_tokens=True
    )
    # Update the conversation history
    history.append((query, output_text))
    return history, history
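# Note: the (user, bot) tuple history matches Gradio's classic Chatbot
# format; recent Gradio releases also accept an OpenAI-style "messages"
# format via gr.Chatbot(type="messages"), if you prefer that shape.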
# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Qwen2.5-0.5B-Instruct (CPU) Test Demo")
    chatbot = gr.Chatbot(label="Qwen Chatbot")
    msg = gr.Textbox(label="Enter your question or message")
    state = gr.State([])
    submit = gr.Button("Send")
    submit.click(
        fn=predict,
        inputs=[msg, state],
        outputs=[chatbot, state]
    )
# Launch the app
demo.launch(server_name="0.0.0.0", server_port=7860)
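# Binding to 0.0.0.0 on port 7860 is what a hosted environment such as a
# Hugging Face Space expects; when running locally, a bare demo.launch()
# (which defaults to 127.0.0.1:7860) is enough.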