from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Path to your local model directory
model_path = "C:/Users/YourName/Documents/fine_tuned_llama_3_2"  # Update this path accordingly

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load the model with optimized settings
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,  # Use float16 for efficiency
    device_map="auto"           # Automatically place the model on GPU if available
)
# Note: with device_map="auto", the model is already dispatched to the right device,
# so we do NOT call model.to(device) here (doing so can raise an error).

print("✅ Model loaded successfully!")

# Function to generate text
def generate_text(prompt, max_new_tokens=200):
    # Move inputs to the same device the model was dispatched to
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # max_new_tokens counts only generated tokens, unlike max_length (prompt + output)
    output = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage
prompt = "The future of AI is"
print("\nGenerated Output:\n", generate_text(prompt))
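
# Optional: a minimal sketch of sampling-based generation, reusing the model and
# tokenizer loaded above. The temperature/top_p values are illustrative defaults,
# not tuned settings; adjust them for your fine-tuned model.
def generate_text_sampled(prompt, max_new_tokens=200):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,   # sample instead of greedy decoding
        temperature=0.7,  # soften the token distribution
        top_p=0.9,        # nucleus sampling
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

print("\nSampled Output:\n", generate_text_sampled(prompt))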