import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# --- 1. Check CUDA Availability and Set Device ---
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using device: {device} ({torch.cuda.get_device_name(0)})")
else:
    print("CUDA is not available. Falling back to CPU.")
    device = torch.device("cpu")

# --- 2. Load Tokenizer (with error handling) ---
MODEL_PATH = "sagar007/phi2_25k"
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    # Phi-2 ships without a pad token; reuse the EOS token for padding.
    tokenizer.pad_token = tokenizer.eos_token
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    exit()

# --- 3. Load Base Model (Optimized for GPU) ---
try:
    base_model = AutoModelForCausalLM.from_pretrained(
        "microsoft/phi-2",
        torch_dtype=torch.float16,  # float16 halves memory use on GPU
        device_map="auto",          # let accelerate place the weights automatically
        trust_remote_code=True,
    )
except Exception as e:
    print(f"Error loading base model: {e}")
    exit()

# --- 4. Load PEFT Adapter ---
try:
    model = PeftModel.from_pretrained(base_model, MODEL_PATH)
except Exception as e:
    print(f"Error loading PEFT model: {e}")
    exit()

# The model was already dispatched by device_map="auto"; calling .to(device) on an
# accelerate-dispatched model is redundant and can raise an error, so we skip it.
model.eval()

# --- 5. Generation Function (Optimized for GPU) ---
def generate_response(instruction, max_new_tokens=512):
    prompt = f"Instruction: {instruction}\nResponse:"
    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,  # budget for the reply only, not the prompt
                num_return_sequences=1,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # The decoded text includes the prompt; keep only what follows "Response:".
        return response.split("Response:", 1)[-1].strip()
    except Exception as e:
        print(f"Error during generation: {e}")
        return "Error during response generation."

# --- 6. Gradio Interface ---
def chatbot(message, history):
    # History is intentionally ignored: each turn is answered independently.
    return generate_response(message)

demo = gr.ChatInterface(
    chatbot,
    title="Fine-tuned Phi-2 Chatbot (GPU)",
    description="A chatbot backed by a fine-tuned version of the Phi-2 model, running on GPU.",
    theme="default",
    examples=[
        "Explain the concept of machine learning.",
        "Write a short story about a robot learning to paint.",
        "What are some effective ways to reduce stress?",
    ],
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()
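
# --- Optional: merge the adapter for faster inference (a minimal sketch) ---
# PEFT's merge_and_unload() folds the LoRA weights into the base model so that
# generation no longer routes through the adapter layers. If you want this, call
# it right after loading the adapter in step 4, before model.eval():
#
#     model = model.merge_and_unload()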
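
# --- Optional: token streaming (a sketch using transformers' TextIteratorStreamer) ---
# gr.ChatInterface accepts a generator function, so the reply can stream into the
# chat instead of arriving all at once: run generate() on a background thread and
# yield the accumulating text. A hedged sketch (chatbot_streaming is a hypothetical
# drop-in replacement for the chatbot function above):
#
#     from threading import Thread
#     from transformers import TextIteratorStreamer
#
#     def chatbot_streaming(message, history):
#         inputs = tokenizer(f"Instruction: {message}\nResponse:",
#                            return_tensors="pt").to(model.device)
#         streamer = TextIteratorStreamer(tokenizer, skip_prompt=True,
#                                         skip_special_tokens=True)
#         Thread(target=model.generate,
#                kwargs=dict(**inputs, streamer=streamer, max_new_tokens=512)).start()
#         text = ""
#         for chunk in streamer:
#             text += chunk
#             yield text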