"""FastAPI service exposing a text-generation endpoint backed by Phi-3 mini.

The model and tokenizer are loaded once at application startup and reused by
the ``POST /inference`` route.
"""

import os

import torch
from fastapi import FastAPI, Request
from transformers import AutoModelForCausalLM, AutoTokenizer

# Set writable cache directories for Hugging Face Spaces
# os.environ["TRANSFORMERS_CACHE"] = "/app/.cache"
# os.environ["HF_HOME"] = "/app/.cache"

app = FastAPI()

MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

# Upper bound on the number of tokens generated per request. This is passed as
# ``max_new_tokens`` so the budget does not shrink as the prompt gets longer.
MAX_NEW_TOKENS = 200

# Populated once by load_model() when the application starts.
model = None
tokenizer = None


@app.on_event("startup")
async def load_model():
    """Load the tokenizer and model for MODEL_NAME into module-level globals."""
    global model, tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        torch_dtype=torch.float32,
    )


@app.post("/inference")
async def inference(request: Request):
    """Generate a completion for the ``prompt`` field of a JSON request body.

    Args:
        request: Incoming request whose body is expected to be a JSON object
            with a ``"prompt"`` entry.

    Returns:
        ``{"response": <decoded text>}`` on success, or ``{"error": <message>}``
        when the body is not valid JSON or carries no prompt.
    """
    try:
        data = await request.json()
    except ValueError:
        # Malformed JSON (or undecodable bytes): report it in the same error
        # shape the handler uses elsewhere instead of failing with a 500.
        return {"error": "Invalid JSON body"}

    # A JSON array/scalar has no "prompt" key; treat it like a missing prompt
    # rather than crashing on ``.get``.
    prompt = data.get("prompt", "") if isinstance(data, dict) else ""
    if not prompt:
        return {"error": "No prompt provided"}

    # The model is placed with device_map="auto", so it is not necessarily on
    # the CPU; send the inputs to wherever the model actually lives.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # NOTE(review): generate() is a synchronous call and runs on the event
    # loop here; consider offloading it if concurrent requests matter.
    outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": result}