import gradio as gr
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer, pipeline

# Load the OpenVINO-quantized model and its tokenizer
model_id = "hsuwill000/Qwen2.5-1.5B-Instruct-openvino-8bit"
model = OVModelForCausalLM.from_pretrained(model_id, device="CPU")  # explicitly target the CPU device
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Create the text-generation pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)


def respond(message, history):
    try:
        # Flatten the conversation history into a single prompt
        input_text = message
        if history:
            input_text = "\n".join(f"User: {user}\nBot: {bot}" for user, bot in history) + f"\nUser: {message}"

        # Generate a response
        response = pipe(
            input_text,
            max_length=512,
            truncation=True,
            num_return_sequences=1,
            do_sample=True,          # required for temperature/top_p to take effect
            temperature=0.7,         # controls generation diversity
            top_p=0.9,               # nucleus sampling; controls generation quality
            return_full_text=False,  # return only the new text, not the echoed prompt
        )
        reply = response[0]["generated_text"].strip()

        # Append the new turn to the history
        history.append((message, reply))
        return history
    except Exception as e:
        print(f"Error: {e}")
        return history + [(message, "Sorry, something went wrong. Please try again.")]


# Clear the chat history
def clear_history():
    return []


# Set up the Gradio chat interface
with gr.Blocks() as demo:
    gr.Markdown("# Qwen2.5-1.5B-Instruct-openvino Chat")
    gr.Markdown("Chat with the Qwen2.5-1.5B-Instruct-openvino model.")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Your Message")
    clear_btn = gr.Button("Clear History")
    msg.submit(respond, [msg, chatbot], chatbot)
    clear_btn.click(clear_history, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.launch()
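
# --- Optional sketch (an assumption, not part of the original app) ---
# Qwen2.5-Instruct checkpoints ship with a chat template, so formatting the
# history via tokenizer.apply_chat_template generally prompts the model better
# than the plain "User:/Bot:" concatenation used in respond() above. This
# helper (build_chat_prompt is a hypothetical name) is a minimal sketch that
# reuses the `tokenizer` loaded at the top of this script; it is never called
# here.
def build_chat_prompt(message, history):
    # Convert (user, bot) tuples into the role/content dicts the template expects.
    messages = []
    for user_turn, bot_turn in history:
        messages.append({"role": "user", "content": user_turn})
        messages.append({"role": "assistant", "content": bot_turn})
    messages.append({"role": "user", "content": message})
    # add_generation_prompt=True appends the assistant header so the model
    # continues as the assistant instead of extending the user's turn.
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

# Example usage, swapped into respond() in place of the manual concatenation:
#     input_text = build_chat_prompt(message, history)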