Ffftdtd5dtft committed (verified)
Commit a34ae2a · Parent(s): 3218113

Update app.py

Files changed (1)
  1. app.py +32 -143
app.py CHANGED
@@ -1,52 +1,33 @@
 import os
-import shutil
 import subprocess
 import signal
-os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
 import gradio as gr
-
-from huggingface_hub import create_repo, HfApi
-from huggingface_hub import snapshot_download
-from huggingface_hub import whoami
-from huggingface_hub import ModelCard
-
-from gradio_huggingfacehub_search import HuggingfaceHubSearch
-
+from huggingface_hub import create_repo, HfApi, snapshot_download, whoami, ModelCard
 from apscheduler.schedulers.background import BackgroundScheduler
-
 from textwrap import dedent

 HF_TOKEN = os.environ.get("HF_TOKEN")

 def generate_importance_matrix(model_path, train_data_path):
     imatrix_command = f"./llama-imatrix -m ../{model_path} -f {train_data_path} -ngl 99 --output-frequency 10"
-
     os.chdir("llama.cpp")
-
-    print(f"Current working directory: {os.getcwd()}")
-    print(f"Files in the current directory: {os.listdir('.')}")
-
+
     if not os.path.isfile(f"../{model_path}"):
         raise Exception(f"Model file not found: {model_path}")
-
-    print("Running imatrix command...")
+
     process = subprocess.Popen(imatrix_command, shell=True)
-
+
     try:
-        process.wait(timeout=60) # added wait
+        process.wait(timeout=60)
     except subprocess.TimeoutExpired:
-        print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
         process.send_signal(signal.SIGINT)
         try:
-            process.wait(timeout=5) # grace period
+            process.wait(timeout=5)
         except subprocess.TimeoutExpired:
-            print("Imatrix proc still didn't term. Forcefully terminating process...")
             process.kill()
-
+
     os.chdir("..")

-    print("Importance matrix generation completed.")
-
 def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
     if oauth_token.token is None:
         raise ValueError("You have to be logged in.")
@@ -56,24 +37,16 @@ def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, s
     split_cmd += f" --split-max-size {split_max_size}"
     split_cmd += f" {model_path} {model_path.split('.')[0]}"

-    print(f"Split command: {split_cmd}")
-
     result = subprocess.run(split_cmd, shell=True, capture_output=True, text=True)
-    print(f"Split command stdout: {result.stdout}")
-    print(f"Split command stderr: {result.stderr}")

     if result.returncode != 0:
         raise Exception(f"Error splitting the model: {result.stderr}")
-    print("Model split successfully!")
-

     sharded_model_files = [f for f in os.listdir('.') if f.startswith(model_path.split('.')[0])]
     if sharded_model_files:
-        print(f"Sharded model files: {sharded_model_files}")
         api = HfApi(token=oauth_token.token)
         for file in sharded_model_files:
             file_path = os.path.join('.', file)
-            print(f"Uploading file: {file_path}")
             try:
                 api.upload_file(
                     path_or_fileobj=file_path,
@@ -84,86 +57,52 @@ def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, s
                 raise Exception(f"Error uploading file {file_path}: {e}")
     else:
         raise Exception("No sharded files found.")
-
-    print("Sharded model has been uploaded successfully!")

 def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
     if oauth_token.token is None:
         raise ValueError("You must be logged in to use GGUF-my-repo")
+
     model_name = model_id.split('/')[-1]
     fp16 = f"{model_name}.fp16.gguf"

     try:
         api = HfApi(token=oauth_token.token)

-        # Download the full model
         dl_pattern = ["*.md", "*.json", "*.model"]
-
-        # Add support for different model types (text, image, audio, etc.)
-        model_types = [
-            "*.safetensors",
-            "*.bin",
-            "*.pt",
-            "*.onnx",
-            "*.h5",
-            "*.tflite",
-            "*.ckpt",
-            "*.pb",
-            "*.tar",
-            "*.xml",
-            "*.caffemodel",
-        ]
-
+        model_types = ["*.safetensors", "*.bin", "*.pt", "*.onnx", "*.h5", "*.tflite", "*.ckpt", "*.pb", "*.tar", "*.xml", "*.caffemodel"]
         dl_pattern.extend(model_types)
-
-        # Download all relevant model files
         api.snapshot_download(repo_id=model_id, local_dir=model_name, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
-        print("Model downloaded successfully!")
-        print(f"Current working directory: {os.getcwd()}")
-        print(f"Model directory contents: {os.listdir(model_name)}")

         conversion_script = "convert_hf_to_gguf.py"
         fp16_conversion = f"python llama.cpp/{conversion_script} {model_name} --outtype f16 --outfile {fp16}"
         result = subprocess.run(fp16_conversion, shell=True, capture_output=True)
-        print(result)
+
         if result.returncode != 0:
             raise Exception(f"Error converting to fp16: {result.stderr}")
-        print("Model converted to fp16 successfully!")
-        print(f"Converted model path: {fp16}")

         imatrix_path = "llama.cpp/imatrix.dat"

         if use_imatrix:
-            if train_data_file:
-                train_data_path = train_data_file.name
-            else:
-                train_data_path = "groups_merged.txt" #fallback calibration dataset
-
-            print(f"Training data file path: {train_data_path}")
-
+            train_data_path = train_data_file.name if train_data_file else "groups_merged.txt"
             if not os.path.isfile(train_data_path):
                 raise Exception(f"Training data file not found: {train_data_path}")
-
             generate_importance_matrix(fp16, train_data_path)
-        else:
-            print("Not using imatrix quantization.")
+
         username = whoami(oauth_token.token)["name"]
         quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
        quantized_gguf_path = quantized_gguf_name
+
         if use_imatrix:
             quantise_ggml = f"./llama.cpp/llama-quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
         else:
             quantise_ggml = f"./llama.cpp/llama-quantize {fp16} {quantized_gguf_path} {q_method}"
+
         result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
         if result.returncode != 0:
             raise Exception(f"Error quantizing: {result.stderr}")
-        print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
-        print(f"Quantized model path: {quantized_gguf_path}")

-        # Create an empty repository
         new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{imatrix_q_method if use_imatrix else q_method}-GGUF", exist_ok=True, private=private_repo)
         new_repo_id = new_repo_url.repo_id
-        print("Repo created successfully!", new_repo_url)

         try:
             card = ModelCard.load(model_id, token=oauth_token.token)
@@ -179,99 +118,49 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
 # {new_repo_id}
 This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
 Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
-
+
 ## Use with llama.cpp
 Install llama.cpp through brew (works on Mac and Linux)
-
-```bash
-brew install llama.cpp
-
-```
-Invoke the llama.cpp server or the CLI.
-
-### CLI:
 ```bash
-llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
+brew install gguf
 ```
-
-### Server:
+
+## Use llama.cpp quantized model
+- Download the model:
 ```bash
-llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
-```
-
-Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.
-Step 1: Clone llama.cpp from GitHub.
-```
-git clone https://github.com/ggerganov/llama.cpp
+curl -L -o {quantized_gguf_name} https://huggingface.co/{new_repo_id}/raw/main/{quantized_gguf_name}
 ```
-Step 2: Move into the llama.cpp folder and build it with `LLAMA_CURL=1` flag along with other hardware-specific flags (for ex: LLAMA_CUDA=1 for Nvidia GPUs on Linux).
-```
-cd llama.cpp
-```
-Step 3: Quantize your downloaded fp16 model into a gguf for inference.
-
+
 ```bash
-./llama.cpp/convert-hf-to-gguf.py /path/to/your/hf-model --outtype f16 --outfile llama.gguf
+./main -m {quantized_gguf_name} --prompt "Tell me about gguf"
 ```
-## License
-{card.data.license if card.data.license else "The original license applied to the model {model_id}"}
-## Limitations and Biases
-The original limitations and biases of the model {model_id} apply to this quantized GGUF model as well.
 """
 )
+        card.save(new_repo_id, token=oauth_token.token)

-        # Upload the model card
-        api.upload_file(
-            path_or_fileobj=card.to_json_string().encode("utf-8"),
-            path_in_repo="README.md",
-            repo_id=new_repo_id,
-        )
-        print("Model card uploaded!")
-
-        # Check whether the model should be split
         if split_model:
-            split_upload_model(
-                model_path=quantized_gguf_path,
-                repo_id=new_repo_id,
-                oauth_token=oauth_token,
-                split_max_tensors=split_max_tensors,
-                split_max_size=split_max_size
-            )
+            split_upload_model(quantized_gguf_name, new_repo_id, oauth_token, split_max_tensors, split_max_size)
         else:
-            print(f"Uploading quantized model to {new_repo_id}...")
             api.upload_file(
-                path_or_fileobj=quantized_gguf_path,
+                path_or_fileobj=quantized_gguf_name,
                 path_in_repo=quantized_gguf_name,
                 repo_id=new_repo_id,
+                token=oauth_token.token,
             )
-            print("Model uploaded successfully!")
-
-        shutil.rmtree(model_name)
-        print("Cleaned up local files.")
-
-        print(f"Process completed successfully! Your quantized GGUF model is available at: https://huggingface.co/{new_repo_id}")
-        return f"Model successfully quantized and uploaded to {new_repo_id}!"
-
+        return f"Done processing {new_repo_id}"
     except Exception as e:
-        print(f"Exception during processing: {e}")
-        return f"An error occurred: {str(e)}"
+        return f"Error processing model: {str(e)}"

 def setup_scheduler():
     scheduler = BackgroundScheduler()
-    scheduler.add_job(restart_space, 'interval', hours=6)
     scheduler.start()
+    return scheduler

-def restart_space():
-    api = HfApi(token=HF_TOKEN)
-    api.restart_space(repo_id="ggml-org/gguf-my-repo", hardware="cpu-basic")
-    print("Space restarted successfully!")
-
-# Setup Gradio interface with updated support
 with gr.Blocks() as demo:
-    model_id = HuggingfaceHubSearch(label="Select a model from HuggingFace Hub").launch()
-    q_method = gr.Dropdown(choices=["q4_0", "q4_1", "q5_0", "q5_1", "q8_0"], label="Quantization method")
+    model_id = gr.Textbox(label="Enter Model ID", placeholder="Enter model ID from HuggingFace Hub")
+    q_method = gr.Dropdown(choices=["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"], label="Quantization method")
     use_imatrix = gr.Checkbox(label="Use imatrix quantization")
-    imatrix_q_method = gr.Dropdown(choices=["q4_0", "q4_1", "q5_0", "q5_1", "q8_0"], label="Imatrix Quantization method", visible=False)
+    imatrix_q_method = gr.Dropdown(choices=["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"], label="Imatrix Quantization method", visible=False)
     train_data_file = gr.File(label="Upload calibration dataset for imatrix")
     private_repo = gr.Checkbox(label="Make repo private")
     split_model = gr.Checkbox(label="Split model before uploading")
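
The diff shown here cuts off before the rest of the Gradio wiring in app.py, so the event handling is not visible in this commit view. As a rough, self-contained sketch (not code from this commit: the stub handler, the component names `submit_btn` and `output`, and the shortened choice lists are illustrative assumptions), components like the ones above are typically toggled and connected to a click handler like this:

```python
import gradio as gr

def process_model_stub(model_id, q_method, use_imatrix, imatrix_q_method):
    # Illustrative stand-in for process_model(); the real handler also takes
    # the repo/split options and an injected gr.OAuthToken.
    method = imatrix_q_method if use_imatrix else q_method
    return f"Would quantize {model_id} with {method}"

with gr.Blocks() as demo:
    model_id = gr.Textbox(label="Enter Model ID", placeholder="Enter model ID from HuggingFace Hub")
    q_method = gr.Dropdown(choices=["Q4_K_M", "Q5_K_M", "Q8_0"], label="Quantization method")
    use_imatrix = gr.Checkbox(label="Use imatrix quantization")
    imatrix_q_method = gr.Dropdown(choices=["IQ4_XS", "Q4_K_M"], label="Imatrix Quantization method", visible=False)

    # Show the imatrix dropdown only when imatrix quantization is enabled.
    use_imatrix.change(lambda v: gr.update(visible=v), inputs=use_imatrix, outputs=imatrix_q_method)

    submit_btn = gr.Button("Quantize and upload")  # hypothetical component
    output = gr.Markdown()                         # hypothetical component
    submit_btn.click(
        process_model_stub,
        inputs=[model_id, q_method, use_imatrix, imatrix_q_method],
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()
```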