Rúben Almeida
committed on
Commit
·
1a1e448
1
Parent(s):
59098b0
Revert to using CPU resources in the quantization process
Browse files - .vscode/launch.json +24 -0
- Dockerfile +2 -2
- main.py +2 -2
- requirements.txt +1 -1
.vscode/launch.json
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
// Use IntelliSense to learn about possible attributes.
|
3 |
+
// Hover to view descriptions of existing attributes.
|
4 |
+
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
5 |
+
"version": "0.2.0",
|
6 |
+
"configurations": [
|
7 |
+
{
|
8 |
+
"name": "Python Debugger: FastAPI",
|
9 |
+
"type": "debugpy",
|
10 |
+
"request": "launch",
|
11 |
+
"module": "uvicorn",
|
12 |
+
"args": [
|
13 |
+
"main:app",
|
14 |
+
"--reload",
|
15 |
+
"--port",
|
16 |
+
"5000",
|
17 |
+
"--host",
|
18 |
+
"0.0.0.0"
|
19 |
+
],
|
20 |
+
"jinja": true,
|
21 |
+
"cwd": "${workspaceFolder}",
|
22 |
+
}
|
23 |
+
]
|
24 |
+
}
|
Dockerfile
CHANGED
@@ -17,8 +17,8 @@ RUN pip install --upgrade pip
|
|
17 |
|
18 |
RUN pip install -U setuptools wheel
|
19 |
|
20 |
-
# Install torch
|
21 |
-
RUN pip install -U torch torchvision torchaudio
|
22 |
|
23 |
# Copy the requirements file into the container
|
24 |
COPY --chown=user requirements.txt .
|
|
|
17 |
|
18 |
RUN pip install -U setuptools wheel
|
19 |
|
20 |
+
# Install torch cpu version
|
21 |
+
RUN pip install -U torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
|
22 |
|
23 |
# Copy the requirements file into the container
|
24 |
COPY --chown=user requirements.txt .
|
main.py
CHANGED
@@ -6,7 +6,7 @@ from tempfile import NamedTemporaryFile
|
|
6 |
from contextlib import asynccontextmanager
|
7 |
from fastapi import FastAPI, HTTPException
|
8 |
from fastapi.responses import RedirectResponse, FileResponse
|
9 |
-
from
|
10 |
|
11 |
### FastAPI Initialization
|
12 |
@asynccontextmanager
|
@@ -26,7 +26,7 @@ def redirect_to_docs():
|
|
26 |
def convert(request: AWQConvertionRequest)->Union[FileResponse, dict]:
|
27 |
|
28 |
try:
|
29 |
-
model = AutoAWQForCausalLM.from_pretrained(request.hf_model_name)
|
30 |
except TypeError as e:
|
31 |
raise HTTPException(status_code=400, detail=f"Is this model supported by AWQ Quantization? Check:https://github.com/mit-han-lab/llm-awq?tab=readme-ov-file {e}")
|
32 |
|
|
|
6 |
from contextlib import asynccontextmanager
|
7 |
from fastapi import FastAPI, HTTPException
|
8 |
from fastapi.responses import RedirectResponse, FileResponse
|
9 |
+
from dto import AWQConvertionRequest, GGUFConvertionRequest, GPTQConvertionRequest
|
10 |
|
11 |
### FastAPI Initialization
|
12 |
@asynccontextmanager
|
|
|
26 |
def convert(request: AWQConvertionRequest)->Union[FileResponse, dict]:
|
27 |
|
28 |
try:
|
29 |
+
model = AutoAWQForCausalLM.from_pretrained(request.hf_model_name, device_map="cpu", trust_remote_code=True)
|
30 |
except TypeError as e:
|
31 |
raise HTTPException(status_code=400, detail=f"Is this model supported by AWQ Quantization? Check:https://github.com/mit-han-lab/llm-awq?tab=readme-ov-file {e}")
|
32 |
|
requirements.txt
CHANGED
@@ -8,7 +8,7 @@ pydantic
|
|
8 |
fastapi[standard]
|
9 |
transformers
|
10 |
huggingface_hub[hf_xet]
|
11 |
-
autoawq>=0.2.8
|
12 |
pytest
|
13 |
requests
|
14 |
environs
|
|
|
8 |
fastapi[standard]
|
9 |
transformers
|
10 |
huggingface_hub[hf_xet]
|
11 |
+
autoawq[cpu]>=0.2.8
|
12 |
pytest
|
13 |
requests
|
14 |
environs
|