Kaileh57 committed on
Commit 49c6950 · 1 Parent(s): 29e0bb8
Files changed (6)
  1. Dockerfile +24 -0
  2. app.py +474 -566
  3. groups_merged.txt +0 -0
  4. requirements.txt +3 -4
  5. setup.sh +0 -48
  6. start.sh +31 -0
Dockerfile ADDED
@@ -0,0 +1,24 @@
+ FROM python:3.10-slim
+
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     cmake \
+     git \
+     wget \
+     && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements and install Python dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the rest of the code
+ COPY . .
+
+ # Make start script executable
+ RUN chmod +x start.sh
+
+ # Run the start script
+ CMD ["./start.sh"]
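For local testing outside of a Hugging Face Space, the image can be built and run roughly as follows. This is a sketch: the `ursa-minor-quantizer` tag is arbitrary, `HF_TOKEN` is whatever write-capable token `huggingface_hub` should pick up for `whoami()` and the uploads, and `GRADIO_SERVER_NAME`/port 7860 are only needed because Gradio binds to localhost by default outside Spaces.

```bash
# Build the image from the repository root (tag name is illustrative)
docker build -t ursa-minor-quantizer .

# Run it with a write-capable Hugging Face token and expose Gradio's default port
docker run -it --rm \
  -e HF_TOKEN=hf_xxx \
  -e GRADIO_SERVER_NAME=0.0.0.0 \
  -p 7860:7860 \
  ursa-minor-quantizer
```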
app.py CHANGED
@@ -4,639 +4,547 @@ import signal
  import time
  import json
  from datetime import datetime
  import threading
- import logging
  import gradio as gr
- from huggingface_hub import HfApi, login, whoami
- from pathlib import Path
- import shutil
- # Configure logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
- logger = logging.getLogger(__name__)
- # Constants
- SOURCE_REPO = "Sculptor-AI/Ursa_Minor"
- HF_TOKEN = os.environ.get("HF_TOKEN")
- CONVERSION_SCRIPT = "./llama.cpp/convert.py"
- MODEL_CACHE_DIR = "model_cache"
- TEMP_DIR = "temp_outputs"
- # Restored full quantization set, sorted from smallest to largest
  QUANT_CONFIGS = [
- {"name": "Q2_K", "size_gb": 0.8, "notes": "smallest size"},
- {"name": "Q3_K_S", "size_gb": 0.9, "notes": "small size"},
- {"name": "Q3_K_M", "size_gb": 0.9, "notes": "lower quality"},
- {"name": "Q3_K_L", "size_gb": 1.0, "notes": ""},
- {"name": "IQ4_XS", "size_gb": 1.0, "notes": ""},
- {"name": "Q4_K_S", "size_gb": 1.0, "notes": "fast, recommended"},
- {"name": "Q4_K_M", "size_gb": 1.1, "notes": "fast, recommended"},
- {"name": "Q5_K_S", "size_gb": 1.2, "notes": "good balance"},
- {"name": "Q5_K_M", "size_gb": 1.2, "notes": ""},
- {"name": "Q6_K", "size_gb": 1.4, "notes": "very good quality"},
- {"name": "Q8_0", "size_gb": 1.7, "notes": "fast, best quality"},
- {"name": "f16", "size_gb": 3.2, "notes": "16 bpw, full precision"}
  ]
- # State variables
- state = {
- "last_checked": None,
  "last_commit_hash": None,
- "is_up_to_date": True,
- "is_processing": False,
  "current_quant": None,
  "progress": 0,
- "total_quants": len(QUANT_CONFIGS),
- "completed_quants": [],
- "failed_quants": [],
- "out_of_memory": False,
- "last_error": None,
- "status_message": "Ready to check for updates"
  }
- # Initialize HF API
- hf_api = HfApi(token=HF_TOKEN)
- # Set up llama.cpp tools on first run
- if not os.path.exists("./llama.cpp/convert.py"):
- try:
- logger.info("Setting up llama.cpp tools...")
- subprocess.run(["bash", "setup.sh"], check=True)
- logger.info("Setup completed successfully")
- except subprocess.CalledProcessError as e:
- logger.error(f"Error setting up llama.cpp tools: {e}")
- raise
- # Helper functions
- def save_state():
- with open("state.json", "w") as f:
- # Create a serializable copy of the state
- serializable_state = state.copy()
- serializable_state["last_checked"] = str(serializable_state["last_checked"]) if serializable_state["last_checked"] else None
- json.dump(serializable_state, f)
-
- def load_state():
- global state
  try:
- if os.path.exists("state.json"):
- with open("state.json", "r") as f:
- loaded_state = json.load(f)
- # Convert string back to datetime if it exists
- if loaded_state.get("last_checked"):
- loaded_state["last_checked"] = datetime.fromisoformat(loaded_state["last_checked"])
- state.update(loaded_state)
- except Exception as e:
- logger.error(f"Error loading state: {e}")
- def get_latest_commit():
  try:
- repo_info = hf_api.repo_info(repo_id=SOURCE_REPO)
- return repo_info.sha
  except Exception as e:
- logger.error(f"Error getting latest commit: {e}")
  return None
  def check_for_updates():
- global state
-
- state["last_checked"] = datetime.now()
- latest_commit = get_latest_commit()
- if latest_commit and latest_commit != state["last_commit_hash"]:
- logger.info(f"New commit detected: {latest_commit}")
- state["last_commit_hash"] = latest_commit
- state["is_up_to_date"] = False
- state["status_message"] = f"Updates detected in {SOURCE_REPO}. Ready to generate quantizations."
- else:
- state["is_up_to_date"] = True
- state["status_message"] = f"No updates detected in {SOURCE_REPO}. Last checked: {state['last_checked'].strftime('%Y-%m-%d %H:%M:%S')}"
- save_state()
- return state["status_message"]
-
- def download_model():
  try:
- # Create cache directory if it doesn't exist
- os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
-
- # Clean up any previous downloads to save space
- if os.path.exists(os.path.join(MODEL_CACHE_DIR, os.path.basename(SOURCE_REPO))):
- shutil.rmtree(os.path.join(MODEL_CACHE_DIR, os.path.basename(SOURCE_REPO)))
-
- # Get model repo information to find the smallest safetensors file
- logger.info(f"Getting repository information for {SOURCE_REPO}")
- files = hf_api.list_repo_files(repo_id=SOURCE_REPO)
-
- # Filter for safetensors files (which are the model weights)
- safetensors_files = [f for f in files if f.endswith(".safetensors")]
-
- if not safetensors_files:
- raise Exception(f"No safetensors files found in {SOURCE_REPO}")
-
- # Download only required files instead of the entire repo to save space
- # This includes model config and one weights file
- required_files = [
- "config.json",
- "tokenizer.json",
- "tokenizer_config.json",
- safetensors_files[0] # Just take the first weights file
- ]
-
- # Create the model directory
- model_dir = os.path.join(MODEL_CACHE_DIR, os.path.basename(SOURCE_REPO))
- os.makedirs(model_dir, exist_ok=True)
- # Download only the required files
- for file in required_files:
- if file in files:
- logger.info(f"Downloading {file}")
- hf_api.hf_hub_download(
- repo_id=SOURCE_REPO,
- filename=file,
- local_dir=model_dir,
- token=HF_TOKEN
- )
- return model_dir
  except Exception as e:
- logger.error(f"Error downloading model: {e}")
- state["last_error"] = str(e)
- return None
- def process_quantization():
- global state
-
- if state["is_processing"]:
- return "Already processing quantizations. Please wait."
-
- state["is_processing"] = True
- state["progress"] = 0
- state["completed_quants"] = []
- state["failed_quants"] = []
- state["out_of_memory"] = False
- state["last_error"] = None
- state["status_message"] = "Starting quantization process..."
-
- # Start the processing in a separate thread
- thread = threading.Thread(target=quantization_worker)
- thread.daemon = True
- thread.start()
- return "Quantization process started. Please wait for it to complete."
-
- def quantization_worker():
- global state
-
- try:
- # Download the model
- model_path = download_model()
- if not model_path:
- state["is_processing"] = False
- state["status_message"] = "Failed to download model. Check logs for details."
- return
-
- # Create temporary output directory
- os.makedirs(TEMP_DIR, exist_ok=True)
-
- # Get model name from the source repo
- model_name = os.path.basename(SOURCE_REPO).lower()
-
- # Process each quantization configuration - we'll do one at a time to save memory
- total_quants = len(QUANT_CONFIGS)
-
- for i, quant_config in enumerate(QUANT_CONFIGS):
- if state["out_of_memory"]:
- # Skip further processing if we've hit memory limits
- break
-
- quant_name = quant_config["name"]
- state["current_quant"] = quant_name
- state["progress"] = (i / total_quants) * 100
- state["status_message"] = f"Processing {quant_name} quantization ({i+1}/{total_quants})"
- logger.info(f"Processing quantization: {quant_name}")
- try:
- # Free up memory between quantizations - this is crucial for the free tier
- if i > 0:
- # Clean up previous files
- for file in os.listdir(TEMP_DIR):
- file_path = os.path.join(TEMP_DIR, file)
- if os.path.isfile(file_path):
- os.remove(file_path)
-
- # Output path for this quantization
- quant_output_path = os.path.join(TEMP_DIR, f"{model_name}-{quant_name.lower()}.gguf")
- # Check available disk space before starting
  try:
- statvfs = os.statvfs(TEMP_DIR)
- free_space_gb = (statvfs.f_frsize * statvfs.f_bavail) / (1024 * 1024 * 1024)
- logger.info(f"Available disk space: {free_space_gb:.2f} GB")
- # Skip if we don't have enough disk space
- if free_space_gb < quant_config["size_gb"] * 1.5: # 50% buffer
- logger.warning(f"Not enough disk space for {quant_name} quantization. Need {quant_config['size_gb'] * 1.5:.2f} GB, have {free_space_gb:.2f} GB")
- state["failed_quants"].append(f"{quant_name} (disk space)")
- continue
- except Exception as e:
- logger.warning(f"Could not check disk space: {e}")
-
- # Run the conversion+quantization in one step to save memory
- # We'll use direct conversion to the target quantization format
- logger.info(f"Converting and quantizing directly to {quant_name}")
-
- # Command to convert and quantize in one step
- quantize_cmd = [
- "python",
- "./llama.cpp/convert.py",
- model_path,
- "--outfile", quant_output_path,
- "--outtype", quant_name.lower()
- ]
-
- # Create a process for monitoring memory usage
- quantize_process = subprocess.Popen(
- quantize_cmd,
- shell=False,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- text=True
- )
-
- # Poll the process and monitor system resources
- while quantize_process.poll() is None:
- # Check if we're getting low on memory
  try:
- with open('/proc/meminfo', 'r') as f:
- meminfo = f.read()
-
- # Extract available memory
- available_mem = 0
- for line in meminfo.split('\n'):
- if 'MemAvailable:' in line:
- available_mem = int(line.split()[1]) / 1024 # Convert to MB
- break
-
- # If memory is critically low (less than 500MB), kill the process
- if available_mem < 500:
- logger.warning(f"Memory critically low ({available_mem:.2f} MB). Terminating quantization.")
- quantize_process.terminate()
- state["out_of_memory"] = True
- state["failed_quants"].append(f"{quant_name} (OOM)")
- break
  except Exception as e:
- logger.warning(f"Could not check memory usage: {e}")
- # Wait a bit before checking again
- time.sleep(5)
-
- # Check if the process completed successfully
- if quantize_process.poll() is None:
- # Process is still running, kill it
- quantize_process.terminate()
- try:
- quantize_process.wait(timeout=10)
- except subprocess.TimeoutExpired:
- quantize_process.kill()
- raise Exception("Quantization process timed out or was terminated")
-
- # Get process output
- stdout, stderr = quantize_process.communicate()
-
- if quantize_process.returncode != 0:
- if "out of memory" in stderr.lower() or "allocation failed" in stderr.lower() or "not enough memory" in stderr.lower():
- logger.warning(f"Out of memory during {quant_name} quantization")
- state["out_of_memory"] = True
- state["failed_quants"].append(f"{quant_name} (OOM)")
- continue
  else:
- raise Exception(f"Error during {quant_name} quantization: {stderr}")
-
- # Check if the file was created and has reasonable size
- if not os.path.exists(quant_output_path) or os.path.getsize(quant_output_path) < 1000000:
- raise Exception(f"Quantization produced invalid or empty file")
-
- # Create or update repository
- repo_name = f"{model_name}-{quant_name.lower()}-gguf"
- username = hf_api.whoami()["name"]
- repo_id = f"{username}/{repo_name}"
-
- try:
- # Check if repo exists
- hf_api.repo_info(repo_id=repo_id)
- logger.info(f"Repository {repo_id} already exists")
- except Exception:
- # Create repo if it doesn't exist
- logger.info(f"Creating repository {repo_id}")
- hf_api.create_repo(repo_id=repo_id, exist_ok=True)
-
- # Upload quantized model
- logger.info(f"Uploading quantized model to {repo_id}")
-
- # Create a simple README first (it's smaller)
- readme_content = f"""# {model_name.capitalize()} - {quant_name} GGUF
- This repository contains a {quant_name} quantized GGUF version of [{SOURCE_REPO}](https://huggingface.co/{SOURCE_REPO}).
- ## Details
- - **Quantization Type:** {quant_name}
- - **Approximate Size:** {quant_config['size_gb']} GB
- - **Notes:** {quant_config['notes']}
- - **Original Model:** [Sculptor-AI/Ursa_Minor](https://huggingface.co/{SOURCE_REPO})
- - **Auto-generated by:** GGUF Quantizer Space
- ## Usage with llama.cpp
  ```bash
- # CLI
- llama-cli --hf-repo {repo_id} --hf-file {model_name}-{quant_name.lower()}.gguf -p "Your prompt here"
- # Server
- llama-server --hf-repo {repo_id} --hf-file {model_name}-{quant_name.lower()}.gguf -c 2048
  ```
  """
-
- readme_path = os.path.join(TEMP_DIR, "README.md")
- with open(readme_path, "w") as f:
- f.write(readme_content)
-
- # Upload README first (it's smaller)
- hf_api.upload_file(
- path_or_fileobj=readme_path,
- path_in_repo="README.md",
- repo_id=repo_id
- )
-
- # Then upload the model with LFS - this might take a while
- try:
- upload_start_time = time.time()
- max_upload_time = 60 * 60 # 1 hour max upload time
-
- # Create a thread to monitor the upload
- upload_success = [False]
- upload_error = [None]
- upload_done = [False]
-
- def upload_file_with_timeout():
- try:
- hf_api.upload_file(
- path_or_fileobj=quant_output_path,
- path_in_repo=f"{model_name}-{quant_name.lower()}.gguf",
- repo_id=repo_id
- )
- upload_success[0] = True
  except Exception as e:
- upload_error[0] = e
- finally:
- upload_done[0] = True
-
- upload_thread = threading.Thread(target=upload_file_with_timeout)
- upload_thread.daemon = True
- upload_thread.start()
- # Wait for upload to complete or timeout
- while not upload_done[0]:
- if time.time() - upload_start_time > max_upload_time:
- logger.warning(f"Upload timed out after {max_upload_time/60:.1f} minutes")
- break
- time.sleep(10)
-
- if upload_success[0]:
- state["completed_quants"].append(quant_name)
- logger.info(f"Successfully processed {quant_name} quantization")
- else:
- error_msg = str(upload_error[0]) if upload_error[0] else "Upload timed out"
- logger.error(f"Failed to upload quantized model: {error_msg}")
- state["failed_quants"].append(f"{quant_name} (upload failed)")
- state["last_error"] = error_msg
- except Exception as upload_error:
- logger.error(f"Failed to upload quantized model: {upload_error}")
- state["failed_quants"].append(f"{quant_name} (upload failed)")
- state["last_error"] = str(upload_error)
- # Delete the large file immediately after upload to save space
- try:
- os.remove(quant_output_path)
- except Exception as rm_error:
- logger.warning(f"Could not remove temporary file: {rm_error}")
-
- except subprocess.TimeoutExpired as timeout_error:
- logger.error(f"Timeout during {quant_name} quantization: {timeout_error}")
- state["failed_quants"].append(f"{quant_name} (timeout)")
- state["last_error"] = f"Quantization timed out after 30 minutes"
- except Exception as e:
- logger.error(f"Error processing {quant_name} quantization: {e}")
- state["failed_quants"].append(quant_name)
- state["last_error"] = str(e)
-
- # Final cleanup
- try:
- shutil.rmtree(TEMP_DIR)
- except Exception as e:
- logger.warning(f"Error cleaning up temporary files: {e}")
-
- # Clean up model cache to save space
- try:
- shutil.rmtree(MODEL_CACHE_DIR)
  except Exception as e:
- logger.warning(f"Error cleaning up model cache: {e}")
- state["progress"] = 100
- state["is_up_to_date"] = True
- state["is_processing"] = False
-
- if state["out_of_memory"]:
- last_successful = state["completed_quants"][-1] if state["completed_quants"] else "None"
- state["status_message"] = f"Quantization process stopped due to memory limitations after {last_successful}. Smaller quantizations completed successfully."
- elif state["failed_quants"]:
- state["status_message"] = f"Quantization process completed with some failures. {len(state['completed_quants'])}/{total_quants} quantizations were successful."
- else:
- state["status_message"] = f"Quantization process completed successfully. All {len(state['completed_quants'])}/{total_quants} quantizations were created."
-
- except Exception as e:
- logger.error(f"Error in quantization worker: {e}")
- state["is_processing"] = False
- state["last_error"] = str(e)
- state["status_message"] = f"Error during quantization process: {str(e)}"
-
- save_state()
- # Create Gradio interface
- def create_interface():
- with gr.Blocks(title="Ursa_Minor GGUF Quantizer", css="footer {visibility: hidden}") as demo:
- with gr.Row():
- gr.Markdown("# Ursa_Minor GGUF Auto Quantizer")
  with gr.Row():
  with gr.Column(scale=2):
- status_md = gr.Markdown(value=f"### Status: {state['status_message']}")
  with gr.Row():
  check_button = gr.Button("Check for Updates", variant="primary")
- process_button = gr.Button("Generate Quantizations", variant="secondary")
-
- with gr.Row():
- last_check = gr.Markdown(value=f"Last Checked: {state['last_checked'].strftime('%Y-%m-%d %H:%M:%S') if state['last_checked'] else 'Never'}")
-
- with gr.Row():
- up_to_date = gr.Markdown(value=f"Up to Date: {'Yes' if state['is_up_to_date'] else 'No'}")
-
- with gr.Accordion("Details", open=True):
- with gr.Row():
- progress = gr.Slider(
- minimum=0,
- maximum=100,
- value=state["progress"],
- label="Progress",
- interactive=False
- )
-
- current_task = gr.Markdown(value="")
-
- with gr.Row():
- completed_md = gr.Markdown(value="### Completed Quantizations")
- completed_list = gr.Markdown(value="None")
-
- with gr.Row():
- failed_md = gr.Markdown(value="### Failed Quantizations")
- failed_list = gr.Markdown(value="None")
-
- with gr.Row():
- error_md = gr.Markdown(value="### Last Error")
- error_text = gr.Markdown(value="None")
-
- with gr.Column(scale=1):
- gr.Markdown("### Quantization Types")
- quant_table = gr.DataFrame(
- value=[[q["name"], f"{q['size_gb']} GB", q["notes"]] for q in QUANT_CONFIGS],
- headers=["Type", "Size", "Notes"],
- interactive=False
- )
- # Functions to update the UI
  def update_status():
- # Simply update the text components without changing button properties
- status_text = f"### Status: {state['status_message']}"
- last_check_text = f"Last Checked: {state['last_checked'].strftime('%Y-%m-%d %H:%M:%S') if state['last_checked'] else 'Never'}"
- up_to_date_text = f"Up to Date: {'Yes' if state['is_up_to_date'] else 'No'}"
-
- current_task_text = ""
- if state["is_processing"]:
- current_quant = state["current_quant"] or "Preparing"
- current_task_text = f"Current Task: Processing {current_quant} quantization"
-
- completed_text = "None"
- if state["completed_quants"]:
- completed_items = []
- for q in state["completed_quants"]:
- model_name = os.path.basename(SOURCE_REPO).lower()
- username = hf_api.whoami()["name"]
- repo_id = f"{username}/{model_name}-{q.lower()}-gguf"
- completed_items.append(f"- [{q}](https://huggingface.co/{repo_id})")
- completed_text = "\n".join(completed_items)
-
- failed_text = "None"
- if state["failed_quants"]:
- failed_items = []
- for q in state["failed_quants"]:
- if "(" in q: # Check if it has a reason in parentheses
- name, reason = q.split(" (", 1)
- reason = reason.rstrip(")")
- failed_items.append(f"- {name} (Reason: {reason})")
- else:
- failed_items.append(f"- {q}")
- failed_text = "\n".join(failed_items)
-
- error_text = "None"
- if state["last_error"]:
- error_text = f"```\n{state['last_error']}\n```"
-
- return [
- status_text,
- last_check_text,
- up_to_date_text,
- state["progress"],
- current_task_text,
- completed_text,
- failed_text,
- error_text
- ]
- # Register event handlers
- check_button.click(
- fn=lambda: check_for_updates(),
- outputs=[status_md]
- ).then(
- fn=update_status,
- outputs=[
- status_md,
- last_check,
- up_to_date,
- progress,
- current_task,
- completed_list,
- failed_list,
- error_text
- ]
- )
- process_button.click(
- fn=lambda: process_quantization(),
- outputs=[status_md]
- ).then(
- fn=update_status,
- outputs=[
- status_md,
- last_check,
- up_to_date,
- progress,
- current_task,
- completed_list,
- failed_list,
- error_text
- ]
- )
- # Add an interval for updating the UI during processing
- demo.load(
- fn=update_status,
- outputs=[
- status_md,
- last_check,
- up_to_date,
- progress,
- current_task,
- completed_list,
- failed_list,
- error_text
- ]
- )
- # Schedule periodic checks for updates - but less frequently for free tier
- def scheduled_check():
- while True:
- try:
- if not state["is_processing"]:
- check_for_updates()
- except Exception as e:
- logger.error(f"Error in scheduled check: {e}")
- # Check less frequently to avoid waking up the space too often
- time.sleep(14400) # Check every 4 hours instead of hourly
- # Only start the scheduler thread if we're not in a debugging environment
- if not os.environ.get("GRADIO_DEBUG"):
- scheduler_thread = threading.Thread(target=scheduled_check)
- scheduler_thread.daemon = True
- scheduler_thread.start()
- logger.info("Started background update checker")
- return demo
- # Initialize state from disk
- load_state()
- # Create and launch the interface
- demo = create_interface()
- demo.queue(max_size=10).launch(debug=True, show_api=False)
 
 
  import time
  import json
  from datetime import datetime
+ from pathlib import Path
  import threading
+ import traceback
+
+ os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
  import gradio as gr
+ from huggingface_hub import HfApi, commit_info, list_repo_files, hf_hub_download, login, whoami
+ from apscheduler.schedulers.background import BackgroundScheduler
+ # MODEL_REPO to monitor
+ SOURCE_MODEL_REPO = "Sculptor-AI/Ursa_Minor"
+ CONVERSION_SCRIPT = "./llama.cpp/convert_hf_to_gguf.py"
+ STATUS_FILE = "status.json"
+
+ # Quantization configurations in order of processing
  QUANT_CONFIGS = [
+ {"type": "Q2_K", "size_gb": 0.8, "notes": ""},
+ {"type": "Q3_K_S", "size_gb": 0.9, "notes": ""},
+ {"type": "Q3_K_M", "size_gb": 0.9, "notes": "lower quality"},
+ {"type": "Q3_K_L", "size_gb": 1.0, "notes": ""},
+ {"type": "IQ4_XS", "size_gb": 1.0, "notes": ""},
+ {"type": "Q4_K_S", "size_gb": 1.0, "notes": "fast, recommended"},
+ {"type": "Q4_K_M", "size_gb": 1.1, "notes": "fast, recommended"},
+ {"type": "Q5_K_S", "size_gb": 1.2, "notes": ""},
+ {"type": "Q5_K_M", "size_gb": 1.2, "notes": ""},
+ {"type": "Q6_K", "size_gb": 1.4, "notes": "very good quality"},
+ {"type": "Q8_0", "size_gb": 1.7, "notes": "fast, best quality"},
+ {"type": "f16", "size_gb": 3.2, "notes": "16 bpw, overkill"}
  ]
+ # Global variables for process state
+ processing_lock = threading.Lock()
+ current_status = {
+ "status": "Not started",
+ "last_check": None,
+ "last_updated": None,
  "last_commit_hash": None,
  "current_quant": None,
+ "quant_status": {},
  "progress": 0,
+ "error": None,
+ "log": []
  }
+ def escape(s: str) -> str:
+ """Escape HTML for logging"""
+ s = s.replace("&", "&amp;")
+ s = s.replace("<", "&lt;")
+ s = s.replace(">", "&gt;")
+ s = s.replace('"', "&quot;")
+ s = s.replace("\n", "<br/>")
+ return s
+ def log_message(message: str, error: bool = False):
+ """Add message to log with timestamp"""
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+ log_entry = f"[{timestamp}] {message}"
+ print(log_entry)
+ current_status["log"].append(log_entry)
+ if error:
+ current_status["error"] = message
+
+ # Keep log size manageable
+ if len(current_status["log"]) > 100:
+ current_status["log"] = current_status["log"][-100:]
+
+ # Save current status to file
+ save_status()
+
+ def save_status():
+ """Save current status to file"""
+ with open(STATUS_FILE, 'w') as f:
+ json.dump(current_status, f)
+
+ def load_status():
+ """Load status from file if it exists"""
+ global current_status
+ if os.path.exists(STATUS_FILE):
+ try:
+ with open(STATUS_FILE, 'r') as f:
+ current_status = json.load(f)
+ except Exception as e:
+ log_message(f"Error loading status file: {str(e)}", error=True)
+
+ def generate_importance_matrix(model_path: str, train_data_path: str, output_path: str):
+ """Generate importance matrix for a model"""
+ imatrix_command = [
+ "./llama.cpp/llama-imatrix",
+ "-m", model_path,
+ "-f", train_data_path,
+ "-ngl", "99",
+ "--output-frequency", "10",
+ "-o", output_path,
+ ]
+
+ if not os.path.isfile(model_path):
+ raise Exception(f"Model file not found: {model_path}")
+
+ log_message(f"Running imatrix command for {model_path}...")
+ process = subprocess.Popen(imatrix_command, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
  try:
+ # Monitor the process for output to provide updates
+ for line in process.stdout:
+ log_message(f"imatrix: {line.strip()}")
+
+ process.wait(timeout=3600) # 1 hour timeout
+ except subprocess.TimeoutExpired:
+ log_message("Imatrix computation timed out. Sending SIGINT to allow graceful termination...", error=True)
+ process.send_signal(signal.SIGINT)
+ try:
+ process.wait(timeout=60) # 1 minute grace period
+ except subprocess.TimeoutExpired:
+ log_message("Imatrix process still didn't terminate. Forcefully terminating process...", error=True)
+ process.kill()
+
+ stderr = process.stderr.read()
+ if stderr:
+ log_message(f"Imatrix stderr: {stderr}")
+
+ log_message("Importance matrix generation completed.")
+ def get_last_commit(repo_id: str):
+ """Get the last commit hash of a repository"""
  try:
+ info = commit_info(repo_id)
+ return info.commit_id
  except Exception as e:
+ log_message(f"Error getting commit info: {str(e)}", error=True)
  return None
  def check_for_updates():
+ """Check if the source model has been updated"""
+ if processing_lock.locked():
+ log_message("Already processing, skipping update check")
+ return False
+ current_status["status"] = "Checking for updates"
+ current_status["last_check"] = datetime.now().isoformat()
  try:
+ # Get the latest commit hash
+ latest_commit = get_last_commit(SOURCE_MODEL_REPO)
+ if latest_commit is None:
+ current_status["status"] = "Error checking for updates"
+ return False
+ log_message(f"Latest commit hash: {latest_commit}")
+ log_message(f"Previous commit hash: {current_status.get('last_commit_hash')}")
+ if current_status.get("last_commit_hash") != latest_commit:
+ current_status["status"] = "Update detected"
+ current_status["last_commit_hash"] = latest_commit
+ save_status()
+ return True
+ else:
+ current_status["status"] = "Up to date"
+ save_status()
+ return False
  except Exception as e:
+ log_message(f"Error checking for updates: {str(e)}", error=True)
+ current_status["status"] = "Error checking for updates"
+ save_status()
+ return False
+ def process_model():
+ """Process the model to create all quantized versions"""
+ if processing_lock.locked():
+ log_message("Already processing, cannot start another process")
+ return
+ with processing_lock:
+ try:
+ # Validate authentication
+ try:
+ user_info = whoami()
+ log_message(f"Processing as user: {user_info['name']}")
+ except Exception as e:
+ log_message(f"Authentication error: {str(e)}. Please make sure you're logged in.", error=True)
+ current_status["status"] = "Authentication error"
+ save_status()
+ return
+ api = HfApi()
+ model_name = SOURCE_MODEL_REPO.split('/')[-1]
+ current_status["status"] = "Processing"
+ current_status["progress"] = 0
+ save_status()
+ # Prepare directories
+ if not os.path.exists("downloads"):
+ os.makedirs("downloads")
+ if not os.path.exists("outputs"):
+ os.makedirs("outputs")
+
+ log_message(f"Starting model processing for {SOURCE_MODEL_REPO}")
+
+ # Create temp directories for processing
+ with Path("outputs").resolve() as outdir:
+ log_message(f"Output directory: {outdir}")
+ # Download the model
+ log_message(f"Downloading model from {SOURCE_MODEL_REPO}")
  try:
+ local_dir = Path("downloads") / model_name
+ log_message(f"Local directory: {local_dir}")
+ # Check and download pattern
+ dl_pattern = ["*.md", "*.json", "*.model"]
  try:
+ files = list_repo_files(SOURCE_MODEL_REPO)
+ has_safetensors = any(file.endswith(".safetensors") for file in files)
+ pattern = "*.safetensors" if has_safetensors else "*.bin"
+ dl_pattern.append(pattern)
+ log_message(f"Using download pattern: {dl_pattern}")
  except Exception as e:
+ log_message(f"Error checking repo files: {str(e)}", error=True)
+ dl_pattern.append("*.safetensors")
+ dl_pattern.append("*.bin")
+ # Download the model
+ api.snapshot_download(
+ repo_id=SOURCE_MODEL_REPO,
+ local_dir=local_dir,
+ local_dir_use_symlinks=False,
+ allow_patterns=dl_pattern
+ )
+ log_message("Model downloaded successfully!")
+ # Check for adapter config - if it's a LoRA adapter, this won't work
+ config_dir = local_dir / "config.json"
+ adapter_config_dir = local_dir / "adapter_config.json"
+ if os.path.exists(adapter_config_dir) and not os.path.exists(config_dir):
+ raise Exception('adapter_config.json is present. If you are converting a LoRA adapter to GGUF, please use a different tool.')
+
+ # Convert to FP16 first
+ fp16_path = str(outdir / f"{model_name}.fp16.gguf")
+ log_message(f"Converting model to FP16: {fp16_path}")
+
+ result = subprocess.run([
+ "python", CONVERSION_SCRIPT, str(local_dir), "--outtype", "f16", "--outfile", fp16_path
+ ], shell=False, capture_output=True, text=True)
+
+ if result.returncode != 0:
+ raise Exception(f"Error converting to fp16: {result.stderr}")
+
+ log_message("Model converted to fp16 successfully!")
+
+ # Generate importance matrix for IQ quantizations
+ imatrix_path = str(outdir / "imatrix.dat")
+ train_data_path = "llama.cpp/groups_merged.txt" # Default calibration dataset
+
+ if not os.path.isfile(train_data_path):
+ log_message(f"Warning: Training data file not found: {train_data_path}. Some quantizations may not work.", error=True)
  else:
+ try:
+ generate_importance_matrix(fp16_path, train_data_path, imatrix_path)
+ except Exception as e:
+ log_message(f"Error generating importance matrix: {str(e)}", error=True)
+ imatrix_path = None
+
+ # Process each quantization type
+ total_quants = len(QUANT_CONFIGS)
+ for i, quant_config in enumerate(QUANT_CONFIGS):
+ quant_type = quant_config["type"]
+ current_status["current_quant"] = quant_type
+ current_status["progress"] = int((i / total_quants) * 100)
+ save_status()
+
+ log_message(f"Processing quantization {i+1}/{total_quants}: {quant_type}")
+
+ try:
+ # Check if this is an IQ quantization
+ is_iq_quant = quant_type.startswith("IQ")
+
+ # Skip if we don't have imatrix and this is an IQ quant
+ if is_iq_quant and (imatrix_path is None or not os.path.exists(imatrix_path)):
+ log_message(f"Skipping {quant_type} as importance matrix is not available", error=True)
+ current_status["quant_status"][quant_type] = "Skipped - No imatrix"
+ continue
+
+ # Set up the repo name
+ username = user_info["name"]
+ repo_name = f"{model_name}-{quant_type}-GGUF"
+ repo_id = f"{username}/{repo_name}"
+
+ # Set up output path
+ quant_file_name = f"{model_name.lower()}-{quant_type.lower()}.gguf"
+ if is_iq_quant and quant_type != "f16":
+ quant_file_name = f"{model_name.lower()}-{quant_type.lower()}-imat.gguf"
+
+ quant_file_path = str(outdir / quant_file_name)
+
+ # Run quantization
+ if is_iq_quant and quant_type != "f16":
+ quantize_cmd = [
+ "./llama.cpp/llama-quantize",
+ "--imatrix", imatrix_path, fp16_path, quant_file_path, quant_type
+ ]
+ else:
+ quantize_cmd = [
+ "./llama.cpp/llama-quantize",
+ fp16_path, quant_file_path, quant_type
+ ]
+
+ log_message(f"Running quantization command: {' '.join(quantize_cmd)}")
+ result = subprocess.run(quantize_cmd, shell=False, capture_output=True, text=True)
+
+ if result.returncode != 0:
+ if "out of memory" in result.stderr.lower():
+ log_message(f"Out of memory error quantizing {quant_type}. Skipping larger models.", error=True)
+ current_status["quant_status"][quant_type] = "Failed - Out of memory"
+ # Break the loop to skip larger models
+ break
+ else:
+ raise Exception(f"Error quantizing {quant_type}: {result.stderr}")
+
+ log_message(f"Quantized successfully with {quant_type}!")
+
+ # Create the repo if it doesn't exist
+ log_message(f"Creating/updating repo {repo_id}")
+ try:
+ repo_url = api.create_repo(repo_id=repo_id, exist_ok=True)
+ log_message(f"Repo URL: {repo_url}")
+ except Exception as e:
+ log_message(f"Error creating repo: {str(e)}", error=True)
+ current_status["quant_status"][quant_type] = "Failed - Repo creation error"
+ continue
+
+ # Create README with model info
+ log_message("Creating README")
+ readme_content = f"""# {repo_name}
+ This model was converted to GGUF format from [`{SOURCE_MODEL_REPO}`](https://huggingface.co/{SOURCE_MODEL_REPO}) using llama.cpp.
+ ## Quantization: {quant_type}
+ Approximate size: {quant_config['size_gb']} GB
+ Notes: {quant_config['notes']}
+ ## Use with llama.cpp
+ Install llama.cpp through brew (works on Mac and Linux)
+
+ ```bash
+ brew install llama.cpp
+ ```
+ Invoke the llama.cpp server or the CLI.
+ ### CLI:
+ ```bash
+ llama-cli --hf-repo {repo_id} --hf-file {quant_file_name} -p "The meaning to life and the universe is"
+ ```
+ ### Server:
  ```bash
+ llama-server --hf-repo {repo_id} --hf-file {quant_file_name} -c 2048
+ ```
+
+ Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.
+ Step 1: Clone llama.cpp from GitHub.
+ ```
+ git clone https://github.com/ggerganov/llama.cpp
+ ```
+
+ Step 2: Move into the llama.cpp folder and build it with `LLAMA_CURL=1` flag along with other hardware-specific flags (for ex: LLAMA_CUDA=1 for Nvidia GPUs on Linux).
+ ```
+ cd llama.cpp && LLAMA_CURL=1 make
+ ```
+
+ Step 3: Run inference through the main binary.
+ ```
+ ./llama-cli --hf-repo {repo_id} --hf-file {quant_file_name} -p "The meaning to life and the universe is"
+ ```
+ or
+ ```
+ ./llama-server --hf-repo {repo_id} --hf-file {quant_file_name} -c 2048
  ```
+
+ ## Auto-generated
+ This model version was automatically generated when updates were detected in the source repository.
+ Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
  """
+ readme_path = outdir / "README.md"
+ with open(readme_path, 'w') as f:
+ f.write(readme_content)
+
+ # Upload the quantized model and README
+ log_message(f"Uploading quantized model: {quant_file_path}")
+ try:
+ api.upload_file(
+ path_or_fileobj=quant_file_path,
+ path_in_repo=quant_file_name,
+ repo_id=repo_id,
+ )
+
+ api.upload_file(
+ path_or_fileobj=str(readme_path),
+ path_in_repo="README.md",
+ repo_id=repo_id,
+ )
+
+ if os.path.isfile(imatrix_path) and is_iq_quant:
+ log_message(f"Uploading imatrix.dat")
+ api.upload_file(
+ path_or_fileobj=imatrix_path,
+ path_in_repo="imatrix.dat",
+ repo_id=repo_id,
+ )
+
+ log_message(f"Successfully uploaded {quant_type} quantization!")
+ current_status["quant_status"][quant_type] = "Success"
+ except Exception as e:
+ log_message(f"Error uploading files: {str(e)}", error=True)
+ current_status["quant_status"][quant_type] = f"Failed - Upload error: {str(e)}"
+
  except Exception as e:
+ log_message(f"Error processing {quant_type}: {str(e)}", error=True)
+ current_status["quant_status"][quant_type] = f"Failed: {str(e)}"
+ # Continue with the next quantization
+ # Update status after completion
+ current_status["status"] = "Completed"
+ current_status["progress"] = 100
+ current_status["last_updated"] = datetime.now().isoformat()
+ log_message("Model processing completed!")
+ except Exception as e:
+ log_message(f"Error during model processing: {str(e)}", error=True)
+ current_status["status"] = "Error"
+ current_status["error"] = str(e)
+ traceback.print_exc()
+
  except Exception as e:
+ log_message(f"Error: {str(e)}", error=True)
+ current_status["status"] = "Error"
+ current_status["error"] = str(e)
+ traceback.print_exc()
+ finally:
+ save_status()
+ def check_and_process():
+ """Check for updates and process if needed"""
+ log_message("Running scheduled check for updates")
+ if check_for_updates():
+ log_message("Updates detected, starting processing")
+ threading.Thread(target=process_model).start()
+ else:
+ log_message("No updates detected")
+
+ def create_ui():
+ """Create the Gradio interface"""
+ with gr.Blocks(css="body { margin: 0; padding: 0; }") as demo:
+ gr.Markdown("# 🦙 Automatic GGUF Quantization for Ursa_Minor")
+ gr.Markdown(f"This space automatically creates quantized GGUF versions of the [Sculptor-AI/Ursa_Minor](https://huggingface.co/{SOURCE_MODEL_REPO}) model whenever it's updated.")
  with gr.Row():
  with gr.Column(scale=2):
+ status_info = gr.HTML(label="Status", value="<p>Loading status...</p>")
+ with gr.Column(scale=1):
  with gr.Row():
  check_button = gr.Button("Check for Updates", variant="primary")
+ process_button = gr.Button("Force Processing", variant="secondary")
+
+ progress_bar = gr.Progress(label="Progress")
+
+ with gr.Tab("Quantization Status"):
+ quant_status = gr.DataFrame(
+ headers=["Type", "Size (GB)", "Notes", "Status"],
+ value=lambda: [[q["type"], q["size_gb"], q["notes"], current_status["quant_status"].get(q["type"], "Not processed")] for q in QUANT_CONFIGS],
+ label="Quantization Status"
+ )
+
+ with gr.Tab("Logs"):
+ logs = gr.HTML(label="Logs", value="<p>Loading logs...</p>")
  def update_status():
+ """Update the status display"""
+ status_html = f"""
+ <div style="border: 1px solid #ddd; padding: 15px; border-radius: 5px;">
+ <h3>Current Status: <span style="color: {'green' if current_status['status'] == 'Up to date' else 'blue' if current_status['status'] == 'Processing' else 'red' if 'Error' in current_status['status'] else 'orange'}">{current_status['status']}</span></h3>
+ <p><strong>Last Checked:</strong> {current_status.get('last_check', 'Never').replace('T', ' ').split('.')[0] if current_status.get('last_check') else 'Never'}</p>
+ <p><strong>Last Updated:</strong> {current_status.get('last_updated', 'Never').replace('T', ' ').split('.')[0] if current_status.get('last_updated') else 'Never'}</p>
+ <p><strong>Current Quantization:</strong> {current_status.get('current_quant', 'None')}</p>
+ {f'<p style="color: red;"><strong>Error:</strong> {current_status["error"]}</p>' if current_status.get('error') else ''}
+ </div>
+ """
+ return status_html
+ def update_logs():
+ """Update the logs display"""
+ logs_html = "<div style='height: 400px; overflow-y: auto; background-color: #f9f9f9; padding: 10px; font-family: monospace; white-space: pre-wrap;'>"
+ for log in current_status["log"]:
+ if "Error" in log or "error" in log:
+ logs_html += f"<div style='color: red;'>{log}</div>"
+ else:
+ logs_html += f"<div>{log}</div>"
+ logs_html += "</div>"
+ return logs_html
+ def on_check_button():
+ """Handle check button click"""
+ if check_for_updates():
+ threading.Thread(target=process_model).start()
+ return update_status(), [[q["type"], q["size_gb"], q["notes"], current_status["quant_status"].get(q["type"], "Not processed")] for q in QUANT_CONFIGS], update_logs()
+ def on_process_button():
+ """Handle process button click"""
+ threading.Thread(target=process_model).start()
+ return update_status(), [[q["type"], q["size_gb"], q["notes"], current_status["quant_status"].get(q["type"], "Not processed")] for q in QUANT_CONFIGS], update_logs()
+ check_button.click(on_check_button, outputs=[status_info, quant_status, logs])
+ process_button.click(on_process_button, outputs=[status_info, quant_status, logs])
+ # Set up periodic refresh
+ demo.load(update_status, outputs=[status_info])
+ demo.load(lambda: [[q["type"], q["size_gb"], q["notes"], current_status["quant_status"].get(q["type"], "Not processed")] for q in QUANT_CONFIGS], outputs=[quant_status])
+ demo.load(update_logs, outputs=[logs])
+ refresh_interval = 5 # seconds
+ gr.HTML("<script>setInterval(function(){ Array.from(document.querySelectorAll('button[id*=Refresh-Button]')).forEach(b => b.click()); }, " + str(refresh_interval * 1000) + ");</script>")
+
+ return demo
+ # Initialize
+ def initialize():
+ """Initialize the application"""
+ # Load status from file
+ load_status()
+
+ # Schedule regular checks for updates
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(check_and_process, 'interval', minutes=60) # Check every hour
+ scheduler.start()
+
+ # Run initial check
+ threading.Thread(target=check_and_process).start()
+ if __name__ == "__main__":
+ initialize()
+ demo = create_ui()
+ demo.queue(concurrency_count=1).launch()
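Because `log_message()` persists the state via `save_status()` after every entry, progress can also be checked from a shell without the UI; a minimal sketch reading the `status.json` file defined by `STATUS_FILE`:

```bash
# Print the overall status and the quantization currently being processed
python -c "import json; s = json.load(open('status.json')); print(s['status'], s.get('current_quant'))"
```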
groups_merged.txt ADDED
The diff for this file is too large to render. See raw diff
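This is the calibration text that start.sh copies into `llama.cpp/` and that `generate_importance_matrix()` in app.py feeds to `llama-imatrix`. The equivalent manual invocation, mirroring the command app.py builds (the fp16 file name follows app.py's `{model_name}.fp16.gguf` pattern), is roughly:

```bash
./llama.cpp/llama-imatrix \
  -m outputs/Ursa_Minor.fp16.gguf \
  -f llama.cpp/groups_merged.txt \
  -ngl 99 --output-frequency 10 \
  -o outputs/imatrix.dat
```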
 
requirements.txt CHANGED
@@ -1,4 +1,3 @@
- gradio>=4.12.0
- huggingface_hub>=0.19.0
- pandas>=2.0.0
- numpy>=1.24.0
+ gradio>=3.50.2
+ huggingface_hub>=0.17.1
+ apscheduler>=3.10.1
 
setup.sh DELETED
@@ -1,48 +0,0 @@
- #!/bin/bash
- set -e
-
- echo "Setting up for real GGUF quantization..."
-
- # Clone llama.cpp
- if [ ! -d "llama.cpp" ]; then
-     echo "Cloning llama.cpp repository..."
-     git clone --depth=1 https://github.com/ggerganov/llama.cpp
- fi
-
- cd llama.cpp
-
- # Get conversion script
- echo "Setting up conversion script..."
- if [ -f "convert.py" ]; then
-     echo "Found existing convert.py script"
- elif [ -f "convert-hf-to-gguf.py" ]; then
-     echo "Found convert-hf-to-gguf.py"
-     cp convert-hf-to-gguf.py convert.py
- elif [ -f "examples/convert-hf-to-gguf.py" ]; then
-     echo "Found examples/convert-hf-to-gguf.py"
-     cp examples/convert-hf-to-gguf.py convert.py
- else
-     echo "Cannot find conversion script. Using Python alternative."
-     # Install required packages
-     pip install -q transformers torch
- fi
-
- # Install required packages for the conversion script
- pip install -q transformers torch
-
- # Initialize state file
- cd ..
- if [ ! -f "state.json" ]; then
-     echo "Initializing state file..."
-     echo '{"last_checked": null, "last_commit_hash": null, "is_up_to_date": true, "is_processing": false, "current_quant": null, "progress": 0, "total_quants": 12, "completed_quants": [], "failed_quants": [], "out_of_memory": false, "last_error": null, "status_message": "Ready to check for updates"}' > state.json
- fi
-
- # Create necessary directories
- echo "Creating directories..."
- mkdir -p model_cache
- mkdir -p temp_outputs
-
- echo "Setup completed successfully"
start.sh ADDED
@@ -0,0 +1,31 @@
+ #!/bin/bash
+
+ # Clone llama.cpp if not exists
+ if [ ! -d "llama.cpp" ]; then
+     echo "Cloning llama.cpp repository..."
+     git clone https://github.com/ggerganov/llama.cpp
+ fi
+
+ # Copy calibration data if not exists
+ if [ ! -f "llama.cpp/groups_merged.txt" ]; then
+     echo "Copying calibration data..."
+     cp groups_merged.txt llama.cpp/groups_merged.txt
+ fi
+
+ # Disable CUDA for HF spaces (not supported in free tier)
+ # We should still build with optimizations for CPU
+ export GGML_CUDA=OFF
+ export GGML_AVX=1
+ export GGML_AVX2=1
+
+ cd llama.cpp
+ echo "Building llama.cpp tools..."
+ cmake -B build -DBUILD_SHARED_LIBS=OFF
+ cmake --build build --config Release -j --target llama-quantize llama-gguf-split llama-imatrix
+ echo "Copying built binaries..."
+ cp ./build/bin/llama-* ./ 2>/dev/null || cp ./build/llama-* ./ 2>/dev/null
+ rm -rf build
+
+ cd ..
+ echo "Starting Gradio app..."
+ python app.py
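After this script has built and copied the binaries, app.py drives them with commands equivalent to the sketch below (shown for the Q4_K_M preset from QUANT_CONFIGS; paths and file names follow app.py's `downloads/`, `outputs/` and `{model_name.lower()}-{quant_type.lower()}.gguf` conventions):

```bash
# Convert the downloaded checkpoint to an fp16 GGUF
python ./llama.cpp/convert_hf_to_gguf.py downloads/Ursa_Minor \
  --outtype f16 --outfile outputs/Ursa_Minor.fp16.gguf

# Quantize the fp16 GGUF (app.py adds --imatrix outputs/imatrix.dat for IQ* types)
./llama.cpp/llama-quantize \
  outputs/Ursa_Minor.fp16.gguf outputs/ursa_minor-q4_k_m.gguf Q4_K_M
```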