import torch
import gradio as gr
import spaces
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig

# Path to the fine-tuned PEFT adapter on the Hugging Face Hub
MODEL_PATH = "sagar007/phi2_25k"
peft_config = PeftConfig.from_pretrained(MODEL_PATH)


def load_model():
    # Load the base Phi-2 model in half precision
    base_model = AutoModelForCausalLM.from_pretrained(
        "microsoft/phi-2",
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    # Attach the fine-tuned PEFT adapter and load its tokenizer
    model = PeftModel.from_pretrained(base_model, MODEL_PATH)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    return model, tokenizer


# Load once at startup rather than on every request
model, tokenizer = load_model()


# Zero-GPU Spaces allocate a GPU via the `spaces.GPU` decorator;
# `InferenceClient` has no `.gpu` decorator, so it is not used here.
@spaces.GPU(duration=120)
def generate_response(instruction, max_length=512):
    try:
        prompt = f"Instruction: {instruction}\nResponse:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
            )
        # Return only the text after the "Response:" marker
        return tokenizer.decode(outputs[0], skip_special_tokens=True).split("Response:")[-1].strip()
    except Exception as e:
        print(f"Error: {e}")
        return "Sorry, I encountered an error. Please try again."


def chatbot(message, history):
    return generate_response(message)


demo = gr.ChatInterface(
    chatbot,
    title="Phi-2 Zero-GPU Chat",
    description="Fine-tuned Phi-2 model running on Hugging Face Zero-GPU Spaces",
    examples=[
        ["Explain quantum computing in simple terms"],
        ["Write a poem about artificial intelligence"],
        ["How do I make a perfect omelette?"],
    ],
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()