MekkCyber committed on
Commit
29baea6
1 Parent(s): 1c806d1

test zero gpu

Files changed (1)
  1. app.py +15 -7
app.py CHANGED
@@ -61,14 +61,22 @@ model = AutoModel.from_pretrained("{model_name}")"""
     return model_card
 
 @spaces.GPU
+def load_model_gpu(model_name, quantization_config, auth_token):
+    return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
+
+def load_model_cpu(model_name, quantization_config, auth_token):
+    return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
+
 def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None, device="cuda"):
     print(f"Quantizing model: {quantization_type}")
     if quantization_type == "int4_weight_only":
         quantization_config = TorchAoConfig(quantization_type, group_size=group_size)
     else:
         quantization_config = TorchAoConfig(quantization_type)
-
-    model = AutoModel.from_pretrained(model_name, device_map=device, torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
+    if device == "cuda":
+        model = load_model_gpu(model_name, quantization_config, auth_token)
+    else:
+        model = load_model_cpu(model_name, quantization_config, auth_token)
 
     return model
 
@@ -110,11 +118,11 @@ def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToke
         return exists_message
     if quantization_type == "int4_weight_only" and device == "cpu":
         return "int4_weight_only not supported on cpu"
-    try:
-        quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username, device)
-        return save_model(quantized_model, model_name, quantization_type, group_size, profile.username, oauth_token, quantized_model_name)
-    except Exception as e:
-        return e
+    # try:
+    quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username, device)
+    return save_model(quantized_model, model_name, quantization_type, group_size, profile.username, oauth_token, quantized_model_name)
+    # except Exception as e:
+    #     return e
 
 
 with gr.Blocks(theme=gr.themes.Soft()) as app:
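
For context, "test zero gpu" presumably refers to Hugging Face ZeroGPU Spaces, where a GPU is attached only while a function decorated with @spaces.GPU is executing; that would explain why this commit isolates the CUDA model load in a dedicated load_model_gpu helper instead of decorating all of quantize_model. A minimal sketch of that pattern, assuming the spaces package available on Spaces ZeroGPU hardware (the model id and duration below are illustrative, not from this repo):

import spaces
import torch
from transformers import AutoModel

@spaces.GPU(duration=120)  # assumption: ZeroGPU allocates a GPU only for the duration of this call
def run_on_gpu(model_name):
    # Inside the decorated function CUDA is available; outside it is not.
    return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="cuda")

model = run_on_gpu("facebook/opt-125m")  # hypothetical model id

Keeping the decorated function small means the GPU is held only for the load itself, while CPU-only paths (like load_model_cpu above) never request an allocation.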