Rúben Almeida committed

Add GPU support

Commit · 4d163d0
Parent(s): 1a1e448
Files changed:
- .dockerignore +4 -0
- Dockerfile +1 -1
- main.py +10 -2
- requirements.txt +1 -1
.dockerignore
ADDED
@@ -0,0 +1,4 @@
+**.venv
+**.pytest_cache
+**__pycache__
+**.env
Dockerfile
CHANGED
@@ -18,7 +18,7 @@ RUN pip install --upgrade pip
 RUN pip install -U setuptools wheel
 
 # Install torch cpu version
-RUN pip install -U torch torchvision torchaudio
+RUN pip install -U torch torchvision torchaudio
 
 # Copy the requirements file into the container
 COPY --chown=user requirements.txt .
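A note on the torch lines: the removed and re-added install commands render identically in this view, and the comment still says "cpu version". On Linux, the default PyPI index ships CUDA-enabled torch wheels, while CPU-only wheels come from PyTorch's dedicated CPU index, so the pre-change line presumably pinned that index; the flag just isn't visible here. A minimal Python sketch to confirm which build actually landed in the image, run inside the container:

import torch

# The version string usually carries a local tag: "+cpu" for CPU-only
# wheels, "+cuXXX" for CUDA builds (it may be absent on default wheels).
print(torch.__version__)

# True only if torch was built with CUDA and a GPU plus driver are visible.
print(torch.cuda.is_available())

# CUDA toolkit version the wheel was built against; None on CPU-only builds.
print(torch.version.cuda)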
main.py
CHANGED
@@ -1,3 +1,4 @@
+import torch
 import zipfile
 from typing import Union
 from awq import AutoAWQForCausalLM
@@ -6,13 +7,20 @@ from tempfile import NamedTemporaryFile
 from contextlib import asynccontextmanager
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import RedirectResponse, FileResponse
-from dto import AWQConvertionRequest, GGUFConvertionRequest, GPTQConvertionRequest
+from .dto import AWQConvertionRequest, GGUFConvertionRequest, GPTQConvertionRequest
 
 ### FastAPI Initialization
 @asynccontextmanager
 async def lifespan(app:FastAPI):
+    torch.cuda.empty_cache()
+
+    print("Starting FastAPI server...")
+    print(f"Running on {'cuda' if torch.cuda.is_available() else 'cpu'}")
+
     yield
 
+    torch.cuda.empty_cache()
+
 app = FastAPI(title="Huggingface Safetensor Model Converter to AWQ", version="0.1.0", lifespan=lifespan)
 ### -------
 
@@ -26,7 +34,7 @@ def redirect_to_docs():
 def convert(request: AWQConvertionRequest)->Union[FileResponse, dict]:
 
     try:
-        model = AutoAWQForCausalLM.from_pretrained(request.hf_model_name,
+        model = AutoAWQForCausalLM.from_pretrained(request.hf_model_name, trust_remote_code=True)
     except TypeError as e:
         raise HTTPException(status_code=400, detail=f"Is this model supported by AWQ Quantization? Check:https://github.com/mit-han-lab/llm-awq?tab=readme-ov-file {e}")
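For context, the convert handler above only shows the model load; the usual AutoAWQ flow after from_pretrained, per the autoawq README, looks roughly like the sketch below. The model name and quantization settings are illustrative examples, not values taken from this repo:

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "facebook/opt-125m"  # example model, not from this repo

# Load the FP16 model and its tokenizer.
model = AutoAWQForCausalLM.from_pretrained(model_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Typical 4-bit AWQ settings from the AutoAWQ README.
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

# Quantize (runs on the GPU when one is available, which is what this
# commit enables) and write the quantized weights to disk.
model.quantize(tokenizer, quant_config=quant_config)
model.save_quantized("opt-125m-awq")
tokenizer.save_pretrained("opt-125m-awq")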
requirements.txt
CHANGED
@@ -8,7 +8,7 @@ pydantic
 fastapi[standard]
 transformers
 huggingface_hub[hf_xet]
-autoawq
+autoawq>=0.2.8
 pytest
 requests
 environs
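For completeness, a minimal client call against the running converter could look like the sketch below. The /convert route and port are guesses based on the convert handler name above, not documented endpoints; only the hf_model_name field comes from the code itself.

import requests

# NOTE: route and port are assumptions for illustration; hf_model_name
# mirrors the field read by the convert handler above.
response = requests.post(
    "http://localhost:8000/convert",
    json={"hf_model_name": "facebook/opt-125m"},  # example model name
    timeout=3600,  # model download + quantization can take a long time
)
print(response.status_code)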