import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Model name (official repository)
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model on the CPU (Qwen2.5 is supported natively by recent
# transformers releases, so trust_remote_code is not required)
model = AutoModelForCausalLM.from_pretrained(model_name).to("cpu")

# Simple chat function
def predict(query, history=None):
    if history is None:
        history = []

    # Rebuild the message list from the (user, assistant) history so the
    # model actually conditions on earlier turns, then apply the chat
    # template the Instruct model was trained with.
    messages = []
    for user_turn, bot_turn in history:
        messages.append({"role": "user", "content": user_turn})
        messages.append({"role": "assistant", "content": bot_turn})
    messages.append({"role": "user", "content": query})
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Encode the prompt as CPU tensors
    inputs = tokenizer(text, return_tensors="pt").to("cpu")

    # Inference
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=True,
            top_p=0.9,
            temperature=0.8,
        )

    # Decode only the newly generated tokens (skip the echoed prompt)
    output_text = tokenizer.decode(
        output_ids[0][inputs["input_ids"].shape[1]:],
        skip_special_tokens=True,
    )

    # Update the chat history
    history.append((query, output_text))
    return history, history

# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Qwen2.5-0.5B-Instruct (CPU) Test Demo")
    chatbot = gr.Chatbot(label="Qwen Chatbot")
    msg = gr.Textbox(label="Enter your question or message")
    state = gr.State([])
    submit = gr.Button("Send")
    submit.click(
        fn=predict,
        inputs=[msg, state],
        outputs=[chatbot, state],
    )

# Start the server
demo.launch(server_name="0.0.0.0", server_port=7860)
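
# On a CPU, generating 128 tokens can take many seconds, so streaming partial
# output noticeably improves perceived latency. The sketch below is an
# optional variant, not part of the original demo: it reuses the same `model`
# and `tokenizer`, runs generate() on a worker thread with
# transformers.TextIteratorStreamer, and turns the handler into a generator
# that Gradio renders incrementally. To try it, define it before demo.launch()
# and point submit.click at predict_stream instead of predict.
from threading import Thread
from transformers import TextIteratorStreamer

def predict_stream(query, history=None):
    if history is None:
        history = []
    # Same prompt construction as predict()
    messages = []
    for user_turn, bot_turn in history:
        messages.append({"role": "user", "content": user_turn})
        messages.append({"role": "assistant", "content": bot_turn})
    messages.append({"role": "user", "content": query})
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(text, return_tensors="pt").to("cpu")

    # skip_prompt=True drops the echoed prompt; only new tokens are streamed
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    # generate() blocks, so run it on a background thread and consume the
    # streamer on this one
    thread = Thread(
        target=model.generate,
        kwargs=dict(
            **inputs,
            max_new_tokens=128,
            do_sample=True,
            top_p=0.9,
            temperature=0.8,
            streamer=streamer,
        ),
    )
    thread.start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        # Each yield pushes the in-progress turn to the Chatbot and the State
        yield history + [(query, partial)], history + [(query, partial)]
    thread.join()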