from fastapi import FastAPI, Request, Depends
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
import os
import sys

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextStreamer
from huggingface_hub import login

# Check whether we are executing inside a Hugging Face Space
SPACE_NAME = os.getenv("SPACE_NAME")
if SPACE_NAME is not None:
    print(f"Running inside the {SPACE_NAME} Space.")

# The Hub access token must be provided as a (secret) environment variable
try:
    HUGGINGFACE_TOKEN = os.environ["HF_ACCESS_TOKEN"]
except KeyError:
    print('The environment variable "HF_ACCESS_TOKEN" was not found. Please configure it in your Space settings.')
    sys.exit(1)

try:
    # Log in to the Hugging Face Hub with the token from the environment
    login(token=HUGGINGFACE_TOKEN)
except Exception as e:
    print(f"Failed to log in to the Hugging Face Hub ({e}). Check the HF_ACCESS_TOKEN environment variable.")
    sys.exit(1)

# Model loading: 4-bit NF4 quantization via bitsandbytes so the 70B model fits in GPU memory
base_model_id = "152334H/miqu-1-70b-sf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Tokenizer loading (the model reuses the Llama-2-70B tokenizer, which is a gated repo)
eval_tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-70b-hf",
    add_bos_token=True,
    trust_remote_code=True,
    token=HUGGINGFACE_TOKEN,
)

# Streamer: prints tokens to stdout as they are generated (useful for Space logs)
streamer = TextStreamer(eval_tokenizer)

# App definition
app = FastAPI()


# Helper dependency that reads the raw request body as bytes
async def parse_raw(request: Request) -> bytes:
    return await request.body()


# Generate text from a plain-text prompt
def generate_text(prompt: str) -> str:
    model_input = eval_tokenizer(prompt, return_tensors="pt").to("cuda")
    base_model.eval()
    with torch.no_grad():
        generated_sequences = base_model.generate(
            **model_input,
            max_new_tokens=4096,
            repetition_penalty=1.1,
            do_sample=True,
            temperature=1.0,
            streamer=streamer,
        )
    return eval_tokenizer.decode(generated_sequences[0], skip_special_tokens=True)


# Route for generating text from the raw request body
@app.post("/generate_text")
async def generate_text_route(data: bytes = Depends(parse_raw)):
    input_text = data.decode("utf-8")
    if not input_text:
        return JSONResponse({"error": "Empty input received."}, status_code=400)
    return {"output": generate_text(input_text)}


# Mount static files
app.mount("/static", StaticFiles(directory="static"), name="static")
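
# Example usage (a sketch; assumes this file is named main.py and the server is run
# with uvicorn on port 7860, the port Hugging Face Spaces expects -- adjust to your setup):
#
#   uvicorn main:app --host 0.0.0.0 --port 7860
#   curl -X POST http://localhost:7860/generate_text --data "Write a haiku about GPUs."
#
# The endpoint treats the raw request body as UTF-8 text and responds with
# JSON of the form {"output": "<generated text>"}.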