Steven10429 committed on
Commit 039130e · 1 Parent(s): abba0b6
Files changed (2)
  1. .gitignore +2 -1
  2. app.py +32 -12
.gitignore CHANGED
@@ -1,2 +1,3 @@
 *.log
-output
+output
+temp
app.py CHANGED
@@ -53,8 +53,7 @@ def check_system_resources(model_name):
     log.info(f"Total system memory: {MEMORY}GB")
 
     model_size_gb = get_model_size_in_gb(model_name)
-    required_memory_gb_16bit = model_size_gb * 1.5
-    required_memory_gb = required_memory_gb_16bit
+    required_memory_gb = model_size_gb * 2.5
 
 
     log.info(f"Estimated required memory for model: {required_memory_gb:.1f}GB")
@@ -124,20 +123,21 @@ def download_and_merge_model(base_model_name, lora_model_name, output_dir, devic
     """
     os.makedirs("temp", exist_ok=True)
     log.info("Loading base model...")
-    model = AutoModelForCausalLM.from_pretrained(base_model_name, low_cpu_mem_usage=True, device_map="auto", force_download=True, trust_remote_code=True, torch_dtype=torch.float16)
+    model = AutoModelForCausalLM.from_pretrained(base_model_name, low_cpu_mem_usage=True, device_map="auto", force_download=True, trust_remote_code=True, torch_dtype=torch.float16, cache_dir="temp")
     log.info("Loading adapter tokenizer...")
     adapter_tokenizer = AutoTokenizer.from_pretrained(lora_model_name, trust_remote_code=True, device_map="auto", force_download=True)
     log.info("Resizing token embeddings...")
     added_tokens_decoder = adapter_tokenizer.added_tokens_decoder
     model.resize_token_embeddings(adapter_tokenizer.vocab_size + len(added_tokens_decoder))
     log.info("Loading LoRA adapter...")
-    peft_model = PeftModel.from_pretrained(model, lora_model_name, low_cpu_mem_usage=True, device_map="auto", force_download=True, trust_remote_code=True, torch_dtype=torch.float16)
+    peft_model = PeftModel.from_pretrained(model, lora_model_name, low_cpu_mem_usage=True, device_map="auto", force_download=True, trust_remote_code=True, torch_dtype=torch.float16, cache_dir="temp")
     log.info("Merging and unloading model...")
     model = peft_model.merge_and_unload()
     log.info("Saving model...")
     model.save_pretrained(output_dir)
     adapter_tokenizer.save_pretrained(output_dir)
     del model, peft_model
+    shutil.rmtree("temp") # to save space due to huggingface space limit(50GB)
     return output_dir
 
 @timeit
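The two cache_dir="temp" arguments added above route the Hub downloads into the temp directory the function already creates, so the new shutil.rmtree("temp") can reclaim that space once the merged model has been written out. A minimal standalone sketch of the same merge flow, with placeholder model IDs rather than the ones this Space receives:

    import shutil
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel

    base_model_name = "org/base-model"    # placeholder IDs, not from the commit
    lora_model_name = "org/lora-adapter"

    # download the base model into a throwaway cache directory
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name, torch_dtype=torch.float16, low_cpu_mem_usage=True, cache_dir="temp"
    )
    tokenizer = AutoTokenizer.from_pretrained(lora_model_name)
    peft_model = PeftModel.from_pretrained(model, lora_model_name, cache_dir="temp")
    model = peft_model.merge_and_unload()  # fold the LoRA weights into the base model
    model.save_pretrained("output")
    tokenizer.save_pretrained("output")
    shutil.rmtree("temp")                  # drop the download cache to stay under the disk quota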
@@ -192,22 +192,25 @@ def quantize(model_path, repo_id, quant_method=None):
     os.makedirs(model_output_dir, exist_ok=True)
 
     # Intermediate file is kept under the model_output directory
-    guff_16 = os.path.join(model_output_dir, f"{repo_id}-f16.gguf")
+    guff_16_path =f"./{repo_id}-f16.gguf"
 
-    if not os.path.exists(guff_16):
+    if not os.path.exists(guff_16_path):
         log.info(f"Converting the model to GGML format")
         convert_script = os.path.join(llamacpp_dir, "convert_hf_to_gguf.py")
-        convert_cmd = f"python {convert_script} {model_path} --outfile {guff_16}"
+        convert_cmd = f"python {convert_script} {model_path} --outfile {guff_16_path}"
         print(f"syscall:[{convert_cmd}]")
         os.system(convert_cmd)
     else:
         log.info(f"GGML intermediate file already exists, skipping conversion")
+
+    if quant_method == "fp16":
+        return guff_16_path # for upload to hub
 
     # Final file is kept under the model_output directory
     final_path = os.path.join(model_output_dir, f"{repo_id}-{quant_method}.gguf")
     log.info(f"Running {quant_method} quantization")
     quantize_bin = os.path.join(llamacpp_dir, "build", "bin", "llama-quantize")
-    quant_cmd = f"{quantize_bin} {guff_16} {final_path} {quant_method}"
+    quant_cmd = f"{quantize_bin} {guff_16_path} {final_path} {quant_method}"
     print(f"syscall:[{quant_cmd}]")
 
     if not os.path.exists(final_path):
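The quantize step shells out to llama.cpp twice: convert_hf_to_gguf.py first writes an f16 GGUF (now into the current working directory), then llama-quantize turns that file into the requested precision; with this commit an fp16 request returns the intermediate file directly instead of invoking llama-quantize. A sketch of the same two-step pipeline using subprocess instead of os.system (the llama.cpp checkout path, model directory, and Q4_K_M method are assumptions for illustration):

    import subprocess
    from pathlib import Path

    llamacpp_dir = Path("llama.cpp")   # assumed location of the llama.cpp checkout
    model_path = Path("output")        # merged Hugging Face model directory (assumed)
    f16_gguf = Path("./model-f16.gguf")
    quant_gguf = Path("./model-Q4_K_M.gguf")

    # Step 1: export the HF checkpoint to an f16 GGUF file
    subprocess.run(
        ["python", str(llamacpp_dir / "convert_hf_to_gguf.py"),
         str(model_path), "--outfile", str(f16_gguf)],
        check=True,
    )

    # Step 2: quantize the f16 GGUF with the requested method (here Q4_K_M)
    subprocess.run(
        [str(llamacpp_dir / "build" / "bin" / "llama-quantize"),
         str(f16_gguf), str(quant_gguf), "Q4_K_M"],
        check=True,
    )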
@@ -294,12 +297,9 @@ def process_model(base_model_name, lora_model_name, repo_name, quant_methods, hf
     model_path = download_and_merge_model(base_model_name, lora_model_name, output_dir, device)
 
 
-    # Quantize the model
-    for quant_method in quant_methods:
-        quantize(output_dir, repo_name, quant_method=quant_method)
-
     create_readme(repo_name, base_model_name, lora_model_name, quant_methods)
 
+
     # Upload the merged model and the quantized models
     api.upload_large_folder(
         folder_path=model_path,
@@ -310,6 +310,26 @@ def process_model(base_model_name, lora_model_name, repo_name, quant_methods, hf
     )
     log.info("Upload completed.")
 
+    # remove model for space limit
+    shutil.rmtree(model_path)
+
+    os.makedirs(os.path.join(output_dir, "quantized"), exist_ok=True)
+    if len(quant_methods) > 0:
+        quantize(output_dir, repo_name, "fp16") # for
+    # Quantize the model
+    for quant_method in quant_methods:
+        quantize(output_dir, repo_name, quant_method=quant_method)
+    os.system(f"mv ./{repo_name}-f16.gguf ./{output_dir}/quantized/")
+
+    api.upload_folder(
+        folder_path=os.path.join(output_dir, "quantized"),
+        path_in_repo="quantized",
+        repo_id=repo_name,
+        repo_type="model",
+        num_workers=os.cpu_count() if os.cpu_count() > 4 else 4,
+        print_report_every=10,
+    )
+
     # rm -rf model_path
     shutil.rmtree(model_path)
     log.info("Removed model from local")
 