from fastapi import FastAPI, Form
from llama_cpp import Llama

import uvicorn

# Note: Form parameters require the python-multipart package to be installed.

# Initialize the FastAPI app
app = FastAPI()

# Download and load the Llama model from the Hugging Face Hub
llm = Llama.from_pretrained(
    repo_id="HuggingFaceTB/SmolLM2-360M-Instruct-GGUF",
    filename="smollm2-360m-instruct-q8_0.gguf",  # GGUF quantization to fetch from the repo
)

# Endpoint that generates a response from the model based on user input
@app.post("/ask/")
async def ask_question(prompt: str = Form(...)):
    # Format the prompt as a chat message
    messages = [{"role": "user", "content": prompt}]
    # Generate a response using the model
    response = llm.create_chat_completion(messages=messages)
    response_content = response["choices"][0]["message"]["content"]
    return {"response": response_content}

# Optional endpoint to sanity-check the model with a fixed question
@app.get("/test/")
async def test():
    messages = [{"role": "user", "content": "What is the capital of France?"}]
    response = llm.create_chat_completion(messages=messages)
    response_content = response["choices"][0]["message"]["content"]
    return {"test_response": response_content}

if __name__ == "__main__":
    # Assumes this file is saved as main.py
    uvicorn.run("main:app", host="0.0.0.0", port=8000)
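
# --- Example usage ---
# A minimal smoke-test sketch, assuming the server is running locally on
# port 8000 and the `requests` package is installed (neither is part of the
# script above). The /ask/ endpoint expects a form field named "prompt":
#
#   import requests
#   r = requests.post("http://localhost:8000/ask/", data={"prompt": "Hello!"})
#   print(r.json()["response"])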