Spaces:
Sleeping
Sleeping
import os | |
from fastapi import FastAPI, Request | |
from transformers import AutoTokenizer, AutoModelForCausalLM | |
import torch | |
# Writable cache directories for Hugging Face Spaces (overrides below are currently disabled):
#os.environ["TRANSFORMERS_CACHE"] = "/app/.cache" | |
#os.environ["HF_HOME"] = "/app/.cache" | |
# Hub identifier of the checkpoint handed to from_pretrained() in load_model().
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

# Application object for this service.
app = FastAPI()
async def load_model():
    """Populate the module-level ``tokenizer`` and ``model`` globals.

    Both objects are created with ``from_pretrained(MODEL_NAME)``. The model
    is requested in float32 with ``device_map="auto"``.

    NOTE(review): no startup hook or decorator is visible on this coroutine
    in this view -- confirm something awaits it before ``inference`` runs,
    otherwise ``model`` and ``tokenizer`` are undefined at request time.
    """
    global model, tokenizer

    # Keyword arguments for the model load, gathered in one place.
    model_options = {"device_map": "auto", "torch_dtype": torch.float32}

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_options)
async def inference(request: Request):
    """Generate a completion for the ``prompt`` field of a JSON request body.

    Args:
        request: Incoming request whose body should be a JSON object with a
            non-empty string under the ``"prompt"`` key.

    Returns:
        ``{"response": <decoded text>}`` on success, or ``{"error": <message>}``
        when the body is not valid JSON or carries no usable prompt.

    Relies on the module-level ``tokenizer`` and ``model`` set by
    ``load_model``.

    NOTE(review): no route decorator is visible on this coroutine in this
    view -- confirm it is registered on ``app`` somewhere.
    """
    try:
        data = await request.json()
    except ValueError:
        # json.JSONDecodeError is a ValueError; an empty or malformed body
        # should produce an error payload rather than an unhandled exception.
        return {"error": "Invalid JSON body"}

    # A JSON list/string/number has no .get(); treat it as "no prompt".
    prompt = data.get("prompt", "") if isinstance(data, dict) else ""
    if not prompt or not isinstance(prompt, str):
        return {"error": "No prompt provided"}

    # The model is loaded with device_map="auto", so it may not live on the
    # CPU; send the inputs to wherever the model actually is.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # max_length would count the prompt tokens too, leaving no budget (or an
    # error) for prompts of ~200 tokens or more; bound the new tokens instead.
    outputs = model.generate(**inputs, max_new_tokens=200)

    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": result}