from fastapi import FastAPI, Form
from llama_cpp import Llama
import uvicorn
# Initialize FastAPI app
app = FastAPI()
# Load the Llama model
llm = Llama.from_pretrained(
    repo_id="HuggingFaceTB/SmolLM2-360M-Instruct-GGUF",
    filename="smollm2-360m-instruct-q8_0.gguf",  # GGUF file downloaded from the repo; swap for another quantization if desired
)
# Endpoint to generate response from model based on user input
@app.post("/ask/")
async def ask_question(prompt: str = Form(...)):
    # Format the prompt as a single-turn chat message
    messages = [
        {"role": "user", "content": prompt}
    ]
    # Generate a response using Llama
    response = llm.create_chat_completion(messages=messages)
    response_content = response["choices"][0]["message"]["content"]
    return {"response": response_content}
# Endpoint to test a simple query (optional)
@app.get("/test/")
async def test():
    # Test the model with a simple, fixed question
    messages = [{"role": "user", "content": "What is the capital of France?"}]
    response = llm.create_chat_completion(messages=messages)
    response_content = response["choices"][0]["message"]["content"]
    return {"test_response": response_content}
if __name__ == "__main__":
    uvicorn.run("main:app", host="0.0.0.0", port=8000)
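
# Quick way to exercise the two endpoints once the server is running
# (a sketch, assuming the app is reachable on localhost:8000; adjust host/port as needed):
#
#   curl -X POST -F "prompt=Tell me a short joke" http://localhost:8000/ask/
#   curl http://localhost:8000/test/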