from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama

app = FastAPI()
# Load the model (from_pretrained downloads the GGUF weights
# from the Hugging Face Hub on first run, then reuses the cache)
llm = Llama.from_pretrained(
    repo_id="unsloth/phi-4-GGUF",
    filename="phi-4-Q4_K_M.gguf",
)
# Define request model
class ChatRequest(BaseModel):
    system_prompt: str
    query: str
# Register the endpoint; a plain `def` lets FastAPI run the blocking
# llama.cpp call in a worker thread instead of stalling the event loop
@app.post("/chat")
def chat(request: ChatRequest):
    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": request.system_prompt},
            {"role": "user", "content": request.query},
        ]
    )
    return {"response": response}