MekkCyber committed
Commit: 29baea6
Parent(s): 1c806d1

    test zero gpu
app.py CHANGED
@@ -61,14 +61,22 @@ model = AutoModel.from_pretrained("{model_name}")"""
     return model_card
 
 @spaces.GPU
+def load_model_gpu(model_name, quantization_config, auth_token) :
+    return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
+
+def load_model_cpu(model_name, quantization_config, auth_token) :
+    return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
+
 def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None, device="cuda"):
     print(f"Quantizing model: {quantization_type}")
     if quantization_type == "int4_weight_only" :
         quantization_config = TorchAoConfig(quantization_type, group_size=group_size)
     else :
         quantization_config = TorchAoConfig(quantization_type)
-    model = AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
-
+    if device == "cuda" :
+        model = load_model_gpu(model_name, quantization_config, auth_token)
+    else :
+        model = load_model_cpu(model_name, quantization_config, auth_token)
 
     return model
 
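For reference, the flow in quantize_model above matches the standard transformers TorchAo integration: a TorchAoConfig is passed to from_pretrained, and the weights are quantized while the checkpoint loads. A minimal, self-contained sketch of that flow, assuming transformers and torchao are installed; the model name and the int8 scheme are illustrative choices, not taken from this Space:

# Minimal sketch of the TorchAo quantization flow used above.
# Assumes `transformers` and `torchao` are installed. The model name and
# scheme are illustrative; int8_weight_only also runs on CPU, unlike the
# int4_weight_only scheme guarded against below.
import torch
from transformers import AutoModelForCausalLM, TorchAoConfig

quantization_config = TorchAoConfig("int8_weight_only")
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",                     # small model for a quick test
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
)
print(model)  # quantized linear layers show up in the module repr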
@@ -110,11 +118,11 @@ def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToke
         return exists_message
     if quantization_type == "int4_weight_only" and device == "cpu" :
         return "int4_weight_only not supported on cpu"
-    try :
-        quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username, device)
-        return save_model(quantized_model, model_name, quantization_type, group_size, profile.username, oauth_token, quantized_model_name)
-    except Exception as e :
-        return e
+    # try :
+    quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username, device)
+    return save_model(quantized_model, model_name, quantization_type, group_size, profile.username, oauth_token, quantized_model_name)
+    # except Exception as e :
+    #     return e
 
 
 with gr.Blocks(theme=gr.themes.Soft()) as app:
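The commit message refers to ZeroGPU Spaces, where @spaces.GPU marks a function that needs a GPU: hardware is attached for the duration of the call and released afterwards, which is why the CUDA load path is isolated in its own decorated helper while the CPU path stays undecorated. A minimal sketch of the pattern, assuming the spaces package that Hugging Face provides on ZeroGPU hardware; the function and tensor below are illustrative, not part of this Space:

# Minimal sketch of the ZeroGPU pattern, assuming the `spaces` package
# available on Hugging Face ZeroGPU Spaces. The function is illustrative.
import spaces
import torch

@spaces.GPU  # a GPU is attached only while this call runs
def double_on_gpu(x: torch.Tensor) -> torch.Tensor:
    return (x.to("cuda") * 2).cpu()

print(double_on_gpu(torch.ones(3)))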