Files changed (1)
  1. inference.py +80 -0
inference.py ADDED
@@ -0,0 +1,80 @@
import logging
import time

from fastapi import FastAPI, Request
from vllm import LLM, SamplingParams
from vllm.utils import random_uuid

from chat_template import format_chat

app = FastAPI()
logger = logging.getLogger()
logger.setLevel(logging.INFO)


def model_fn(model_dir):
    # The model weights are already inside the container, so vLLM loads them
    # from the local path instead of downloading them.
    model = LLM(
        model=model_dir,  # local path, e.g. /opt/ml/model
        trust_remote_code=True,
        dtype="float16",
        gpu_memory_utilization=0.9,
    )
    return model


# Global model handle, populated once at startup
model = None


@app.on_event("startup")
async def startup_event():
    global model
    model = model_fn("/opt/ml/model")


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    try:
        data = await request.json()

        messages = data.get("messages", [])
        formatted_prompt = format_chat(messages)

        # vLLM's SamplingParams has no do_sample flag and expects max_tokens
        # rather than max_new_tokens.
        sampling_params = SamplingParams(
            temperature=data.get("temperature", 0.7),
            top_p=data.get("top_p", 0.9),
            max_tokens=data.get("max_tokens", data.get("max_new_tokens", 512)),
        )

        outputs = model.generate(formatted_prompt, sampling_params)
        generated_text = outputs[0].outputs[0].text

        # Report real token counts from vLLM instead of character lengths.
        prompt_tokens = len(outputs[0].prompt_token_ids)
        completion_tokens = len(outputs[0].outputs[0].token_ids)

        response = {
            "id": f"chatcmpl-{random_uuid()}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": "qwen-72b",
            "choices": [{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": generated_text
                },
                "finish_reason": "stop"
            }],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens
            }
        }

        return response

    except Exception as e:
        logger.error(f"Exception during prediction: {e}")
        return {"error": str(e)}


@app.get("/ping")
def ping():
    # Health check endpoint
    return {"status": "healthy"}
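For local testing, a minimal client sketch against the endpoint this file defines. The host and port are assumptions (they depend on how uvicorn serves the app, e.g. `uvicorn inference:app --port 8080`); the payload fields match what the handler reads from the request body.

# Hypothetical smoke test; adjust host/port to your deployment.
import requests

payload = {
    "messages": [
        {"role": "user", "content": "Say hello in one sentence."}
    ],
    "temperature": 0.7,
    "top_p": 0.9,
    "max_tokens": 64,
}

resp = requests.post(
    "http://localhost:8080/v1/chat/completions",  # assumed address
    json=payload,
    timeout=120,
)
resp.raise_for_status()
body = resp.json()
print(body["choices"][0]["message"]["content"])
print(body["usage"])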