from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from huggingface_hub import snapshot_download
from safetensors.torch import load_file


class ModelInput(BaseModel):
    prompt: str
    max_new_tokens: int = 2048


app = FastAPI()

# Define model paths
base_model_path = "HuggingFaceTB/SmolLM2-135M-Instruct"
adapter_path = "khurrameycon/SmolLM-135M-Instruct-qa_pairs_converted.json-25epochs"

try:
    # Load the base model
    print("Loading base model...")
    model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,  # may need torch.float32 on CPU-only torch builds
        device_map="cpu",  # explicitly run on CPU
        # load_in_8bit=True,  # enable int8 quantization
        # device_map="auto",
        trust_remote_code=True,
    )

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)

    # Download adapter weights
    print("Downloading adapter weights...")
    adapter_path_local = snapshot_download(repo_id=adapter_path)

    # Load the safetensors file
    print("Loading adapter weights...")
    adapter_file = f"{adapter_path_local}/adapter_model.safetensors"
    state_dict = load_file(adapter_file)

    # Load state dict into model; strict=False silently skips any adapter keys
    # that do not match the base model's parameter names
    print("Applying adapter weights...")
    model.load_state_dict(state_dict, strict=False)

    print("Model and adapter loaded successfully!")

except Exception as e:
    print(f"Error during model loading: {e}")
    raise


def generate_response(model, tokenizer, instruction, max_new_tokens=2048):
    """Generate a response from the model based on an instruction."""
    try:
        # Tokenize the raw instruction and move it to the model's device
        inputs = tokenizer.encode(instruction, return_tensors="pt").to(model.device)

        # Generate response
        outputs = model.generate(
            inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
        )

        # Decode and return the output
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response
    except Exception as e:
        raise ValueError(f"Error generating response: {e}")


@app.post("/generate")
async def generate_text(input: ModelInput):
    try:
        response = generate_response(
            model=model,
            tokenizer=tokenizer,
            instruction=input.prompt,
            max_new_tokens=input.max_new_tokens,
        )
        return {"generated_text": response}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/")
async def root():
    return {"message": "Welcome to the Model API!"}