import gradio as gr
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer, pipeline

# Load the OpenVINO-quantized model and its tokenizer
model_id = "hsuwill000/Qwen2.5-1.5B-Instruct-openvino-8bit"
model = OVModelForCausalLM.from_pretrained(model_id, device="CPU")  # explicitly target the CPU device
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Create the text-generation pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)


def respond(message, history):
    try:
        # Flatten the conversation history into a single prompt
        input_text = message
        if history:
            input_text = "\n".join(f"User: {user}\nBot: {bot}" for user, bot in history) + f"\nUser: {message}"

        # Generate a response
        response = pipe(
            input_text,
            max_length=512,
            truncation=True,
            num_return_sequences=1,
            do_sample=True,          # required for temperature/top_p to take effect
            temperature=0.7,         # controls generation diversity
            top_p=0.9,               # nucleus sampling; controls generation quality
            return_full_text=False,  # return only the new text, not the echoed prompt
        )
        reply = response[0]["generated_text"].strip()

        # Append the new turn to the history
        history.append((message, reply))
        return history
    except Exception as e:
        print(f"Error: {e}")
        return history + [(message, "Sorry, something went wrong. Please try again.")]


# Clear the chat history
def clear_history():
    return []


# Set up the Gradio chat interface
with gr.Blocks() as demo:
    gr.Markdown("# Qwen2.5-1.5B-Instruct-openvino Chat")
    gr.Markdown("Chat with the Qwen2.5-1.5B-Instruct-openvino model.")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Your Message")
    clear_btn = gr.Button("Clear History")
    msg.submit(respond, [msg, chatbot], chatbot)
    clear_btn.click(clear_history, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.launch()
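
# --- Optional sketch (an assumption, not part of the original app) ---
# Qwen2.5-Instruct checkpoints ship with a chat template, so formatting the
# history via tokenizer.apply_chat_template generally prompts the model better
# than the plain "User:/Bot:" concatenation used in respond() above. This
# helper (build_chat_prompt is a hypothetical name) is a minimal sketch that
# reuses the `tokenizer` loaded at the top of this script; it is never called
# here.
def build_chat_prompt(message, history):
    # Convert (user, bot) tuples into the role/content dicts the template expects.
    messages = []
    for user_turn, bot_turn in history:
        messages.append({"role": "user", "content": user_turn})
        messages.append({"role": "assistant", "content": bot_turn})
    messages.append({"role": "user", "content": message})
    # add_generation_prompt=True appends the assistant header so the model
    # continues as the assistant instead of extending the user's turn.
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

# Example usage, swapped into respond() in place of the manual concatenation:
#     input_text = build_chat_prompt(message, history)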