saifeddinemk committed
Commit efadfea
1 Parent(s): 9e9c793

Fixed app v2

Files changed (1): app.py +25 -14
app.py CHANGED
@@ -1,18 +1,23 @@
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from llama_cpp import Llama
+from functools import lru_cache
+import asyncio
+import uvicorn
 
 # Initialize FastAPI app
 app = FastAPI()
 
-# Load the Llama model
-try:
-    llm = Llama.from_pretrained(
-        repo_id="QuantFactory/SecurityLLM-GGUF",
-        filename="SecurityLLM.Q8_0.gguf",
-    )
-except Exception as e:
-    raise RuntimeError(f"Failed to load model: {e}")
+# Lazily load the Llama model; lru_cache keeps one instance per worker process
+@lru_cache(maxsize=1)
+def load_model():
+    try:
+        return Llama.from_pretrained(
+            repo_id="QuantFactory/SecurityLLM-GGUF",
+            filename="SecurityLLM.Q8_0.gguf",
+        )
+    except Exception as e:
+        raise RuntimeError(f"Failed to load model: {e}")
 
 # Define request model for log data
 class LogRequest(BaseModel):
@@ -25,6 +30,7 @@ class AnalysisResponse(BaseModel):
 # Define the route for security log analysis
 @app.post("/analyze_security_logs", response_model=AnalysisResponse)
 async def analyze_security_logs(request: LogRequest):
+    llm = load_model()
     try:
         # Security-focused prompt
         prompt = (
@@ -33,21 +39,26 @@ async def analyze_security_logs(request: LogRequest):
             "Provide details on potential threats, IPs involved, and suggest actions if any threats are detected.\n\n"
             f"{request.log_data}"
         )
-
-        # Generate response from the model
-        response = llm.create_chat_completion(
+
+        # Generate the response in a worker thread so the event loop stays free
+        response = await asyncio.to_thread(
+            llm.create_chat_completion,
             messages=[
                 {
                     "role": "user",
                     "content": prompt
                 }
-            ]
+            ],
+            max_tokens=512  # Limit the response length
         )
-
+
         # Extract and return the analysis text
         analysis_text = response["choices"][0]["message"]["content"]
         return AnalysisResponse(analysis=analysis_text)
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
-# To run the app, use: uvicorn app:app --reload
+# Run the app with uvicorn; uvicorn ignores "workers" when reload is enabled,
+# so reload is omitted here (run "uvicorn app:app --reload" during development)
+if __name__ == "__main__":
+    uvicorn.run("app:app", host="0.0.0.0", port=8000, workers=4)
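
For reference, a minimal client sketch for the updated endpoint (not part of the commit): it assumes the server is reachable at localhost:8000 as configured in the uvicorn.run call above, uses the third-party requests package, and sends an invented SSH log line through the log_data field that the handler reads.

import requests

# Hypothetical example request; 203.0.113.7 is a documentation-range IP
payload = {"log_data": "Failed password for root from 203.0.113.7 port 22 ssh2"}

resp = requests.post(
    "http://localhost:8000/analyze_security_logs",  # assumes a local deployment
    json=payload,
    timeout=120,  # local model inference can take a while
)
resp.raise_for_status()
print(resp.json()["analysis"])  # the AnalysisResponse.analysis field

Because create_chat_completion now runs through asyncio.to_thread, a long generation no longer blocks the event loop, and lru_cache(maxsize=1) means each worker process pays the model-load cost only once, on its first request.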