Kaileh57 committed on
Commit
8df3b1d
·
1 Parent(s): 0a23172
Files changed (5)
  1. README.md +38 -1
  2. app.py +99 -345
  3. monitor.py +259 -0
  4. quantize.py +197 -0
  5. requirements.txt +6 -5
README.md CHANGED
@@ -10,4 +10,41 @@ pinned: false
10
  short_description: Automatically quantizes Sculptor models
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
13
+ # Ursa Minor Quantization Monitor
14
+
15
+ This Space automatically generates quantized versions of the [Sculptor-AI/Ursa_Minor](https://huggingface.co/Sculptor-AI/Ursa_Minor) model and uploads them to the [Sculptor-AI/Ursa_Minor_Quantized](https://huggingface.co/Sculptor-AI/Ursa_Minor_Quantized) repository.
16
+
17
+ ## Features
18
+
19
+ - Monitors the source repository for updates
20
+ - Automatically generates quantized versions when the source model is updated
21
+ - Displays a progress bar during quantization
22
+ - Shows an "up to date" indicator when all quantizations are complete
23
+ - Handles out-of-memory errors gracefully
24
+
25
+ ## Quantization Types
26
+
27
+ The following quantizations are generated in order from smallest to largest:
28
+
29
+ | Type | Size (GB) | Notes |
30
+ |------|-----------|-------|
31
+ | GGUF Q2_K | 0.8 | |
32
+ | GGUF Q3_K_S | 0.9 | |
33
+ | GGUF Q3_K_M | 0.9 | lower quality |
34
+ | GGUF Q3_K_L | 1.0 | |
35
+ | GGUF IQ4_XS | 1.0 | |
36
+ | GGUF Q4_K_S | 1.0 | fast, recommended |
37
+ | GGUF Q4_K_M | 1.1 | fast, recommended |
38
+ | GGUF Q5_K_S | 1.2 | |
39
+ | GGUF Q5_K_M | 1.2 | |
40
+ | GGUF Q6_K | 1.4 | very good quality |
41
+ | GGUF Q8_0 | 1.7 | fast, best quality |
42
+ | GGUF f16 | 3.2 | 16 bpw, overkill |
43
+
44
+ ## Setup
45
+
46
+ To run this Space, you need to set an `HF_TOKEN` environment variable with write access to the destination repository.
47
+
48
+ ## Note About Free Compute Tier
49
+
50
+ The Hugging Face free compute tier has limited memory. This Space is designed to handle out-of-memory errors gracefully, but larger quantizations may fail due to memory constraints. If you need to generate larger quantizations, consider upgrading to a paid compute tier.
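
The Setup section above requires an `HF_TOKEN` with write access but does not show how to confirm it before the monitor starts. A minimal sketch of such a check with `huggingface_hub` (this helper is not part of the commit; the repository ID matches the destination repo used in monitor.py):

```python
import os
import sys
from huggingface_hub import HfApi

# Read the same environment variable the Space uses.
token = os.environ.get("HF_TOKEN")
if not token:
    sys.exit("HF_TOKEN is not set; uploads to the destination repo will fail.")

api = HfApi(token=token)
# whoami() raises if the token is invalid; the payload names the account.
print(f"Authenticated as {api.whoami().get('name')}")

# create_repo with exist_ok=True doubles as a cheap write-permission check:
# it only succeeds if the token can write to (or create) the destination repo.
api.create_repo(repo_id="Sculptor-AI/Ursa_Minor_Quantized", exist_ok=True)
```
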
app.py CHANGED
@@ -1,360 +1,114 @@
1
- import os
2
- import sys
3
  import gradio as gr
4
- import subprocess
5
- import tempfile
6
- import shutil
7
- from huggingface_hub import HfApi, login, Repository
8
  import time
 
9
  import threading
10
 
11
- # Initialize Hugging Face API
12
- hf_token = os.environ.get("HF_TOKEN")
13
- api = HfApi(token=hf_token)
14
- if hf_token:
15
- login(token=hf_token)
16
- else:
17
- print("WARNING: HF_TOKEN not set. You'll be limited to public repositories.")
18
-
19
- # Define quantization options
20
- QUANT_TYPES = {
21
- "Q4_K_M": "q4_k_m", # 4-bit, good quality and size
22
- "Q5_K_M": "q5_k_m", # 5-bit, better quality
23
- "Q8_0": "q8_0" # 8-bit, high quality
24
- }
25
-
26
- def install_llama_cpp():
27
- """Install llama.cpp if not already installed"""
28
- if not os.path.exists("llama.cpp"):
29
- print("Installing llama.cpp...")
30
- # Clone llama.cpp
31
- subprocess.run(["git", "clone", "https://github.com/ggerganov/llama.cpp.git", "--depth=1"], check=True)
32
- # Build llama.cpp (minimal build for conversion only)
33
- os.chdir("llama.cpp")
34
- subprocess.run(["make", "clean"], check=True)
35
- subprocess.run(["make", "convert"], check=True)
36
- os.chdir("..")
37
- print("llama.cpp installed successfully")
38
- else:
39
- print("llama.cpp already installed")
40
-
41
- def clone_repo_shallow(repo_id, target_dir):
42
- """Clone only the necessary files from a repo to save space"""
43
- print(f"Cloning {repo_id} to {target_dir}...")
44
-
45
- # Create a sparse checkout to save space
46
- cmd = [
47
- "git", "clone",
48
- "--depth=1",
49
- "--filter=blob:none",
50
- f"https://huggingface.co/{repo_id}",
51
- target_dir
52
- ]
53
 
54
- subprocess.run(cmd, check=True)
55
- print(f"Repository {repo_id} cloned successfully")
56
-
57
- def find_model_files(directory):
58
- """Find model files in the repository"""
59
- # Look for common model file patterns
60
- model_files = []
61
-
62
- # Safetensors is preferred (usually smaller)
63
- for pattern in ["*.safetensors", "consolidated.*.pt", "pytorch_model.bin", "*.bin"]:
64
- cmd = ["find", directory, "-name", pattern]
65
- result = subprocess.run(cmd, capture_output=True, text=True)
66
- if result.stdout:
67
- model_files.extend(result.stdout.strip().split('\n'))
68
-
69
- # Filter out empty strings and sort by size (prefer smaller files for HF format)
70
- model_files = [f for f in model_files if f]
71
- if not model_files:
72
- return []
73
 
74
- # Check for model configuration
75
- config_file = None
76
- cmd = ["find", directory, "-name", "config.json"]
77
- result = subprocess.run(cmd, capture_output=True, text=True)
78
- if result.stdout:
79
- config_file = result.stdout.strip().split('\n')[0]
80
 
81
- return model_files, config_file
82
-
83
- def quantize_model(repo_id, quant_types, progress=gr.Progress()):
84
- """Quantize a model with llama.cpp and push to Hugging Face"""
85
- # Install llama.cpp if needed
86
- install_llama_cpp()
87
 
88
- # Create temporary directories for processing
89
- with tempfile.TemporaryDirectory() as temp_dir:
90
- progress(0.1, "Cloning repository...")
91
- model_dir = os.path.join(temp_dir, "model")
92
- output_dir = os.path.join(temp_dir, "output")
93
- os.makedirs(model_dir, exist_ok=True)
94
- os.makedirs(output_dir, exist_ok=True)
95
-
96
- try:
97
- # Clone the source repository
98
- clone_repo_shallow(repo_id, model_dir)
99
-
100
- # Find model files
101
- progress(0.2, "Looking for model files...")
102
- model_file_info = find_model_files(model_dir)
103
- if not model_file_info:
104
- return "No model files found in the repository."
105
-
106
- model_files, config_file = model_file_info
107
- model_file = model_files[0] # Use the first model file found
108
-
109
- progress(0.3, "Determining model type...")
110
- # Try to determine model type
111
- model_type = "llama" # Default model type
112
- if config_file:
113
- with open(config_file, 'r') as f:
114
- import json
115
- config = json.load(f)
116
- if 'model_type' in config:
117
- config_model_type = config['model_type'].lower()
118
- # Map model type to llama.cpp supported types
119
- type_mapping = {
120
- 'llama': 'llama',
121
- 'mistral': 'llama',
122
- 'mixtral': 'llama',
123
- 'falcon': 'falcon',
124
- 'mpt': 'mpt',
125
- 'gpt_neox': 'gptneox',
126
- 'gptj': 'gptj',
127
- 'bloom': 'bloom'
128
- }
129
- model_type = type_mapping.get(config_model_type, 'llama')
130
-
131
- # Create output repository name
132
- repo_name = repo_id.split('/')[-1]
133
- target_repo_id = f"{repo_id}-gguf"
134
-
135
- # Create the output repository if it doesn't exist
136
- progress(0.4, "Creating target repository...")
137
- try:
138
- api.create_repo(repo_id=target_repo_id, exist_ok=True)
139
- except Exception as e:
140
- return f"Error creating repository: {str(e)}"
141
-
142
- success_count = 0
143
- progress_step = 0.5 / len(quant_types)
144
- progress_value = 0.4
145
-
146
- # Process each quantization type
147
- for quant_name, quant_type in quant_types.items():
148
- progress_value += progress_step
149
- progress(progress_value, f"Processing {quant_name} quantization...")
150
-
151
- output_file = os.path.join(output_dir, f"{repo_name}-{quant_name}.gguf")
152
-
153
- # Convert to GGUF format
154
- print(f"Converting to {quant_name}...")
155
- convert_cmd = [
156
- "python3",
157
- os.path.join("llama.cpp", "convert.py"),
158
- f"--model-type", model_type,
159
- f"--outtype", "f16",
160
- f"--outfile", output_file
161
- ]
162
-
163
- # Add model path
164
- convert_cmd.append(model_file)
165
-
166
- try:
167
- # First convert to GGUF format (without quantization)
168
- subprocess.run(convert_cmd, check=True)
169
-
170
- # Then quantize if needed
171
- if quant_type != "f16":
172
- quant_output = output_file.replace(".gguf", f"-{quant_type}.gguf")
173
- quantize_cmd = [
174
- os.path.join("llama.cpp", "quantize"),
175
- output_file,
176
- quant_output,
177
- quant_type
178
- ]
179
- subprocess.run(quantize_cmd, check=True)
180
- # Replace the output file with the quantized version
181
- os.remove(output_file)
182
- os.rename(quant_output, output_file)
183
-
184
- # Upload to HF
185
- progress(progress_value + (progress_step * 0.7), f"Uploading {quant_name}...")
186
- api.upload_file(
187
- path_or_fileobj=output_file,
188
- path_in_repo=f"{repo_name}-{quant_name}.gguf",
189
- repo_id=target_repo_id,
190
- commit_message=f"Add {quant_name} quantized version"
191
- )
192
-
193
- success_count += 1
194
- except Exception as e:
195
- print(f"Error processing {quant_name}: {str(e)}")
196
-
197
- progress(1.0, "Completed!")
198
- if success_count > 0:
199
- return f"Successfully created {success_count} quantized versions in {target_repo_id}"
200
  else:
201
- return "Failed to create any quantized versions."
202
 
203
- except Exception as e:
204
- return f"Error: {str(e)}"
205
-
206
- # Webhook handler - this will be called when the repo is updated
207
- def setup_webhook(repo_id, target_repo=None, webhook_url=None):
208
- """Set up a webhook for repository updates"""
209
- if not hf_token:
210
- return "HF_TOKEN not set. Cannot set up webhook."
211
-
212
- if not target_repo:
213
- target_repo = f"{repo_id}-gguf"
214
 
215
- # Create the webhook URL for this space
216
- if not webhook_url:
217
- # Get the current space name from HF_SPACE_ID
218
- space_id = os.environ.get("HF_SPACE_ID")
219
- if not space_id:
220
- return "Cannot determine current Space ID. Please specify webhook_url manually."
221
-
222
- webhook_url = f"https://huggingface.co/spaces/{space_id}/webhook"
223
-
224
- try:
225
- # Add webhook to the source repository
226
- api.add_webhook(
227
- repo_id=repo_id,
228
- webhook_url=webhook_url,
229
- webhook_type="repo-update"
230
- )
231
- return f"Webhook set up for {repo_id} -> {webhook_url}"
232
- except Exception as e:
233
- return f"Error setting up webhook: {str(e)}"
234
-
235
- # Create Gradio interface
236
- with gr.Blocks() as interface:
237
- gr.Markdown("# GGUF Quantizer (Free Tier)")
238
- gr.Markdown("Automatically create GGUF quantized versions of Hugging Face models")
239
-
240
- with gr.Tab("Quantize Model"):
241
- with gr.Row():
242
- repo_id = gr.Textbox(label="Model Repository ID (e.g., 'mistralai/Mistral-7B-v0.1')")
243
-
244
- with gr.Row():
245
- q4_k_m = gr.Checkbox(label="Q4_K_M (4-bit, balanced quality/size)", value=True)
246
- q5_k_m = gr.Checkbox(label="Q5_K_M (5-bit, higher quality)", value=False)
247
- q8_0 = gr.Checkbox(label="Q8_0 (8-bit, highest quality)", value=False)
248
-
249
- quantize_btn = gr.Button("Quantize Model")
250
- output = gr.Textbox(label="Status")
251
-
252
- def process_quantize(repo_id, q4_k_m, q5_k_m, q8_0, progress=gr.Progress()):
253
- selected_types = {}
254
- if q4_k_m:
255
- selected_types["Q4_K_M"] = "q4_k_m"
256
- if q5_k_m:
257
- selected_types["Q5_K_M"] = "q5_k_m"
258
- if q8_0:
259
- selected_types["Q8_0"] = "q8_0"
260
-
261
- if not selected_types:
262
- return "Please select at least one quantization type"
263
-
264
- return quantize_model(repo_id, selected_types, progress)
265
-
266
- quantize_btn.click(
267
- process_quantize,
268
- inputs=[repo_id, q4_k_m, q5_k_m, q8_0],
269
- outputs=output
270
- )
271
 
272
- with gr.Tab("Setup Webhook"):
273
- gr.Markdown("""
274
- ## Set up automatic quantization
275
-
276
- This will set up a webhook to trigger quantization whenever the source repository is updated.
277
- Note: This requires HF_TOKEN to be set in Space secrets.
278
- """)
279
-
280
- webhook_repo_id = gr.Textbox(label="Source Repository ID")
281
- webhook_btn = gr.Button("Set Up Webhook")
282
- webhook_output = gr.Textbox(label="Webhook Status")
283
-
284
- webhook_btn.click(
285
- setup_webhook,
286
- inputs=[webhook_repo_id],
287
- outputs=webhook_output
288
- )
289
 
290
- with gr.Tab("Instructions"):
291
- gr.Markdown("""
292
- ## Instructions
293
-
294
- ### How to use this Space:
295
-
296
- 1. **Manual Quantization**: Enter a model repository ID and select quantization types
297
- 2. **Automatic Quantization**: Set up a webhook to trigger quantization when the model is updated
298
-
299
- ### Adding HF_TOKEN to Space Secrets:
300
-
301
- 1. Go to your Space Settings
302
- 2. Click on "Repository Secrets"
303
- 3. Add a new secret with key `HF_TOKEN` and your Hugging Face API token as value
304
-
305
- ### Limitations (Free Tier):
306
-
307
- - Limited memory: Very large models may fail to process
308
- - Limited storage: Files are processed in streaming mode, but temp files still need space
309
- - Limited compute: Quantization may take longer than on paid tiers
310
- - Jobs might be interrupted if they run too long
311
- """)
312
-
313
- # Start Flask server to handle webhooks
314
- from flask import Flask, request, jsonify
315
- import threading
316
-
317
- app = Flask(__name__)
318
-
319
- @app.route('/webhook', methods=['POST'])
320
- def handle_webhook():
321
- try:
322
- payload = request.json
323
-
324
- # Check if this is a repo update event
325
- event_type = payload.get('event')
326
- if event_type == 'repo-update':
327
- repo_id = payload.get('repo', {}).get('name')
328
-
329
- if repo_id:
330
- # Run quantization in background
331
- threading.Thread(target=lambda: quantize_model(
332
- repo_id,
333
- {"Q4_K_M": "q4_k_m"} # Default to just Q4_K_M to save resources
334
- )).start()
335
-
336
- return jsonify({"status": "quantization scheduled"})
337
-
338
- return jsonify({"status": "event ignored"})
339
- except Exception as e:
340
- return jsonify({"status": "error", "message": str(e)})
341
-
342
- # Launch both the Gradio and Flask apps
343
- import nest_asyncio
344
- import uvicorn
345
- from threading import Thread
346
-
347
- nest_asyncio.apply()
348
-
349
- # Launch the Gradio interface
350
- def launch_gradio():
351
- interface.launch(debug=False)
352
-
353
- # Launch the Flask webhook handler
354
- def launch_flask():
355
- uvicorn.run(app, host="0.0.0.0", port=7860)
356
 
357
- # Use the main Gradio interface as primary
358
  if __name__ == "__main__":
359
- Thread(target=launch_flask).start()
360
- launch_gradio()
 
 
 
1
  import gradio as gr
2
+ import json
3
+ import os
 
 
4
  import time
5
+ from monitor import setup_monitor, check_repo_updates, get_status
6
  import threading
7
 
8
+ # Initialize status
9
+ if not os.path.exists("status.json"):
10
+ status = {
11
+ "last_checked": None,
12
+ "is_up_to_date": False,
13
+ "current_quantization": None,
14
+ "completed_quantizations": [],
15
+ "failed_quantizations": [],
16
+ "progress": 0,
17
+ "status_message": "Initializing...",
18
+ "out_of_memory": False,
19
+ "last_successful_quant": None
20
+ }
21
+ with open("status.json", "w") as f:
22
+ json.dump(status, f)
23
+
24
+ # Start the monitoring thread
25
+ monitor_thread = threading.Thread(target=setup_monitor, daemon=True)
26
+ monitor_thread.start()
27
+
28
+ # Define the Gradio interface
29
+ with gr.Blocks() as demo:
30
+ gr.Markdown("# Ursa Minor Quantization Monitor")
31
+ gr.Markdown("This Space automatically generates quantized versions of the [Sculptor-AI/Ursa_Minor](https://huggingface.co/Sculptor-AI/Ursa_Minor) model.")
32
 
33
+ with gr.Row():
34
+ with gr.Column():
35
+ status_indicator = gr.Markdown("Loading status...")
36
+ last_checked = gr.Markdown("Last checked: Never")
37
+
38
+ with gr.Column():
39
+ check_button = gr.Button("Check for updates now")
40
 
41
+ with gr.Row():
42
+ progress_bar = gr.Slider(0, 100, value=0, label="Quantization Progress", interactive=False)
43
 
44
+ with gr.Row():
45
+ completed_box = gr.Dataframe(
46
+ headers=["Quantization", "Size (GB)", "Status", "Notes"],
47
+ datatype=["str", "str", "str", "str"],
48
+ label="Quantization Status"
49
+ )
50
 
51
+ # Function to update the UI
52
+ def update_ui():
53
+ status = get_status()
54
+
55
+ # Update status indicator
56
+ if status["out_of_memory"]:
57
+ status_text = f"⚠️ **Out of Memory Error** - The Space ran out of memory during quantization (last successful: {status['last_successful_quant']}). Try a paid compute tier for larger quantizations."
58
+ elif status["is_up_to_date"]:
59
+ status_text = "✅ **Up to date** - All quantizations are complete."
60
+ elif status["current_quantization"]:
61
+ status_text = f"🔄 **Processing** - Currently quantizing {status['current_quantization']}."
62
+ else:
63
+ status_text = "⏳ **Waiting** - Checking for updates..."
64
+
65
+ # Update last checked time
66
+ last_checked_text = f"Last checked: {status['last_checked'] if status['last_checked'] else 'Never'}"
67
+
68
+ # Update progress bar
69
+ progress_value = status["progress"] or 0
70
+
71
+ # Update quantization status table
72
+ quantization_types = [
73
+ ["GGUF Q2_K", "0.8", "", ""],
74
+ ["GGUF Q3_K_S", "0.9", "", ""],
75
+ ["GGUF Q3_K_M", "0.9", "", "lower quality"],
76
+ ["GGUF Q3_K_L", "1.0", "", ""],
77
+ ["GGUF IQ4_XS", "1.0", "", ""],
78
+ ["GGUF Q4_K_S", "1.0", "", "fast, recommended"],
79
+ ["GGUF Q4_K_M", "1.1", "", "fast, recommended"],
80
+ ["GGUF Q5_K_S", "1.2", "", ""],
81
+ ["GGUF Q5_K_M", "1.2", "", ""],
82
+ ["GGUF Q6_K", "1.4", "", "very good quality"],
83
+ ["GGUF Q8_0", "1.7", "", "fast, best quality"],
84
+ ["GGUF f16", "3.2", "", "16 bpw, overkill"]
85
+ ]
86
+
87
+ # Update status for each quantization
88
+ for quant in quantization_types:
89
+ quant_name = quant[0]
90
+ if quant_name in status["completed_quantizations"]:
91
+ quant[2] = "✅ Complete"
92
+ elif quant_name in status["failed_quantizations"]:
93
+ quant[2] = "❌ Failed"
94
+ elif quant_name == status["current_quantization"]:
95
+ quant[2] = "🔄 In progress"
96
  else:
97
+ quant[2] = "⏳ Waiting"
98
 
99
+ return status_text, last_checked_text, progress_value, quantization_types
100
 
101
+ # Function to handle manual update check
102
+ def check_updates():
103
+ check_repo_updates(force=True)
104
+ return update_ui()
105
 
106
+ # Connect buttons and set up timed refresh
107
+ check_button.click(check_updates, outputs=[status_indicator, last_checked, progress_bar, completed_box])
108
 
109
+ # Auto-refresh every 10 seconds
110
+ demo.load(update_ui, outputs=[status_indicator, last_checked, progress_bar, completed_box], every=10)
111
 
112
+ # Launch the app
113
  if __name__ == "__main__":
114
+ demo.launch()
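
app.py and monitor.py coordinate through plain reads and writes of `status.json` from separate threads, so a poll can occasionally land on a half-written file. A small sketch of an atomic write (the helper name is hypothetical; it relies on `os.replace` being atomic within one filesystem) that either side could use instead of the bare `json.dump` calls:

```python
import json
import os
import tempfile

def write_status_atomically(status, path="status.json"):
    """Write the status dict so a concurrent reader never sees a partial file."""
    directory = os.path.dirname(os.path.abspath(path))
    fd, tmp_path = tempfile.mkstemp(dir=directory, suffix=".tmp")
    try:
        with os.fdopen(fd, "w") as f:
            json.dump(status, f)
        os.replace(tmp_path, path)  # atomic rename on the same filesystem
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
```
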
 
monitor.py ADDED
@@ -0,0 +1,259 @@
1
+ import json
2
+ import os
3
+ import time
4
+ import requests
5
+ from datetime import datetime
6
+ from apscheduler.schedulers.background import BackgroundScheduler
7
+ from quantize import quantize_model
8
+
9
+ # Define the quantization types in order from smallest to largest
10
+ QUANTIZATION_TYPES = [
11
+ "GGUF Q2_K",
12
+ "GGUF Q3_K_S",
13
+ "GGUF Q3_K_M",
14
+ "GGUF Q3_K_L",
15
+ "GGUF IQ4_XS",
16
+ "GGUF Q4_K_S",
17
+ "GGUF Q4_K_M",
18
+ "GGUF Q5_K_S",
19
+ "GGUF Q5_K_M",
20
+ "GGUF Q6_K",
21
+ "GGUF Q8_0",
22
+ "GGUF f16"
23
+ ]
24
+
25
+ # Mapping of quantization types to llama.cpp quantization parameters
26
+ QUANT_PARAMS = {
27
+ "GGUF Q2_K": "q2_k",
28
+ "GGUF Q3_K_S": "q3_k_s",
29
+ "GGUF Q3_K_M": "q3_k_m",
30
+ "GGUF Q3_K_L": "q3_k_l",
31
+ "GGUF IQ4_XS": "iq4_xs",
32
+ "GGUF Q4_K_S": "q4_k_s",
33
+ "GGUF Q4_K_M": "q4_k_m",
34
+ "GGUF Q5_K_S": "q5_k_s",
35
+ "GGUF Q5_K_M": "q5_k_m",
36
+ "GGUF Q6_K": "q6_k",
37
+ "GGUF Q8_0": "q8_0",
38
+ "GGUF f16": "f16"
39
+ }
40
+
41
+ # Source and destination repositories
42
+ SOURCE_REPO = "Sculptor-AI/Ursa_Minor"
43
+ DESTINATION_REPO = "Sculptor-AI/Ursa_Minor_Quantized" # This should be created in advance
44
+
45
+ def get_status():
46
+ """Read the current status from the status file"""
47
+ try:
48
+ with open("status.json", "r") as f:
49
+ return json.load(f)
50
+ except Exception as e:
51
+ print(f"Error reading status: {e}")
52
+ return {
53
+ "last_checked": None,
54
+ "is_up_to_date": False,
55
+ "current_quantization": None,
56
+ "completed_quantizations": [],
57
+ "failed_quantizations": [],
58
+ "progress": 0,
59
+ "status_message": "Error reading status",
60
+ "out_of_memory": False,
61
+ "last_successful_quant": None
62
+ }
63
+
64
+ def update_status(updates):
65
+ """Update the status file with the provided updates"""
66
+ try:
67
+ status = get_status()
68
+ status.update(updates)
69
+ with open("status.json", "w") as f:
70
+ json.dump(status, f)
71
+ except Exception as e:
72
+ print(f"Error updating status: {e}")
73
+
74
+ def get_repo_last_modified(repo_id):
75
+ """Get the last modified date of the repository"""
76
+ try:
77
+ url = f"https://huggingface.co/api/models/{repo_id}"
78
+ response = requests.get(url)
79
+ response.raise_for_status()
80
+ data = response.json()
81
+ return data.get("lastModified")
82
+ except Exception as e:
83
+ print(f"Error checking repository: {e}")
84
+ return None
85
+
86
+ def check_repo_updates(force=False):
87
+ """Check if the source repository has been updated and start quantization if needed"""
88
+ now = datetime.now().isoformat()
89
+ update_status({"last_checked": now})
90
+ print(f"Checking for updates to {SOURCE_REPO}...")
91
+
92
+ # Get current status
93
+ status = get_status()
94
+
95
+ # If we're already processing, don't check for updates
96
+ if status["current_quantization"] and not force:
97
+ print("Already processing, skipping update check")
98
+ return
99
+
100
+ # If we had an out of memory error and this isn't a forced check, skip
101
+ if status["out_of_memory"] and not force:
102
+ print("Previous run had an out of memory error, skipping automatic update check")
103
+ return
104
+
105
+ # Check if the source repo has been updated
106
+ last_modified = get_repo_last_modified(SOURCE_REPO)
107
+
108
+ if not last_modified:
109
+ print("Couldn't get repository information, skipping update")
110
+ return
111
+
112
+ # Determine if we need to process quantizations
113
+ need_to_process = False
114
+ if force:
115
+ print("Forced update check, processing quantizations")
116
+ need_to_process = True
117
+ elif "source_last_modified" not in status or status["source_last_modified"] != last_modified:
118
+ print("Source repository has been updated, processing quantizations")
119
+ need_to_process = True
120
+ update_status({"source_last_modified": last_modified})
121
+ else:
122
+ print("Source repository hasn't changed, no processing needed")
123
+
124
+ # Check if all quantizations are complete
125
+ all_completed = all(quant in status["completed_quantizations"] for quant in QUANTIZATION_TYPES)
126
+ if all_completed:
127
+ update_status({"is_up_to_date": True})
128
+
129
+ return
130
+
131
+ # Reset status for a new processing run
132
+ if need_to_process:
133
+ update_status({
134
+ "is_up_to_date": False,
135
+ "progress": 0,
136
+ "out_of_memory": False,
137
+ "status_message": "Starting quantization process...",
138
+ "completed_quantizations": [],
139
+ "failed_quantizations": [],
140
+ "current_quantization": None
141
+ })
142
+
143
+ # Start the first quantization
144
+ start_next_quantization()
145
+
146
+ def start_next_quantization():
147
+ """Start the next quantization in the queue"""
148
+ status = get_status()
149
+
150
+ # Check if we had an out of memory error
151
+ if status["out_of_memory"]:
152
+ print("Previous run had an out of memory error, not starting next quantization")
153
+ return
154
+
155
+ # Find the next quantization to process
156
+ completed = set(status["completed_quantizations"])
157
+ failed = set(status["failed_quantizations"])
158
+ processed = completed.union(failed)
159
+
160
+ next_quant = None
161
+ for quant in QUANTIZATION_TYPES:
162
+ if quant not in processed:
163
+ next_quant = quant
164
+ break
165
+
166
+ if not next_quant:
167
+ # All quantizations are complete
168
+ update_status({
169
+ "is_up_to_date": True,
170
+ "current_quantization": None,
171
+ "progress": 100,
172
+ "status_message": "All quantizations complete"
173
+ })
174
+ print("All quantizations complete!")
175
+ return
176
+
177
+ # Start the next quantization
178
+ update_status({
179
+ "current_quantization": next_quant,
180
+ "progress": 0,
181
+ "status_message": f"Starting {next_quant} quantization..."
182
+ })
183
+
184
+ print(f"Starting quantization: {next_quant}")
185
+
186
+ try:
187
+ # Run the quantization
188
+ success = quantize_model(
189
+ SOURCE_REPO,
190
+ DESTINATION_REPO,
191
+ next_quant,
192
+ QUANT_PARAMS[next_quant]
193
+ )
194
+
195
+ if success:
196
+ # Quantization completed successfully
197
+ print(f"Quantization {next_quant} completed successfully")
198
+ status = get_status()
199
+ completed = status["completed_quantizations"]
200
+ completed.append(next_quant)
201
+
202
+ update_status({
203
+ "completed_quantizations": completed,
204
+ "current_quantization": None,
205
+ "last_successful_quant": next_quant,
206
+ "progress": 100,
207
+ "status_message": f"Completed {next_quant} quantization"
208
+ })
209
+
210
+ # Start the next quantization
211
+ start_next_quantization()
212
+ else:
213
+ # Quantization failed
214
+ print(f"Quantization {next_quant} failed")
215
+ status = get_status()
216
+ failed = status["failed_quantizations"]
217
+ failed.append(next_quant)
218
+
219
+ update_status({
220
+ "failed_quantizations": failed,
221
+ "current_quantization": None,
222
+ "progress": 0,
223
+ "status_message": f"Failed {next_quant} quantization"
224
+ })
225
+
226
+ # Try the next quantization
227
+ start_next_quantization()
228
+
229
+ except MemoryError:
230
+ # Handle out of memory error
231
+ print(f"Out of memory error during {next_quant} quantization")
232
+ status = get_status()
233
+ failed = status["failed_quantizations"]
234
+ failed.append(next_quant)
235
+
236
+ update_status({
237
+ "failed_quantizations": failed,
238
+ "current_quantization": None,
239
+ "out_of_memory": True,
240
+ "progress": 0,
241
+ "status_message": f"Out of memory during {next_quant} quantization"
242
+ })
243
+
244
+ def setup_monitor():
245
+ """Set up the scheduled monitoring"""
246
+ scheduler = BackgroundScheduler()
247
+ # Check for updates every hour
248
+ scheduler.add_job(check_repo_updates, 'interval', hours=1)
249
+ scheduler.start()
250
+
251
+ # Do an initial check
252
+ check_repo_updates()
253
+
254
+ try:
255
+ # Keep the thread alive
256
+ while True:
257
+ time.sleep(60)
258
+ except (KeyboardInterrupt, SystemExit):
259
+ scheduler.shutdown()
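
`get_repo_last_modified` hits the REST endpoint with `requests`. Since `huggingface_hub` is already a dependency, the same timestamp can be read through `HfApi.model_info`; a sketch (the attribute is spelled `lastModified` in older `huggingface_hub` releases and `last_modified` in newer ones, so both are tried):

```python
from huggingface_hub import HfApi

def repo_last_modified(repo_id="Sculptor-AI/Ursa_Minor"):
    """Return the repo's last-modified timestamp, or None if the lookup fails."""
    try:
        info = HfApi().model_info(repo_id)
    except Exception as e:  # network errors, missing repo, etc.
        print(f"Error checking repository: {e}")
        return None
    # Attribute spelling differs across huggingface_hub versions.
    return getattr(info, "last_modified", None) or getattr(info, "lastModified", None)
```
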
quantize.py ADDED
@@ -0,0 +1,197 @@
1
+ import os
2
+ import subprocess
3
+ import tempfile
4
+ import time
5
+ import json
6
+ import shutil
7
+ from huggingface_hub import HfApi, Repository, snapshot_download
8
+ from tqdm import tqdm
9
+
10
+ def update_progress(progress):
11
+ """Update the progress in the status file"""
12
+ try:
13
+ with open("status.json", "r") as f:
14
+ status = json.load(f)
15
+
16
+ status["progress"] = progress
17
+
18
+ with open("status.json", "w") as f:
19
+ json.dump(status, f)
20
+ except Exception as e:
21
+ print(f"Error updating progress: {e}")
22
+
23
+ def quantize_model(source_repo, dest_repo, quant_name, quant_type):
24
+ """
25
+ Download the model, quantize it, and upload to the destination repo
26
+
27
+ Args:
28
+ source_repo: HF repo ID for the source model
29
+ dest_repo: HF repo ID for the destination repo
30
+ quant_name: Name of the quantization (for display)
31
+ quant_type: llama.cpp quantization parameter
32
+
33
+ Returns:
34
+ bool: True if successful, False otherwise
35
+ """
36
+ try:
37
+ update_progress(5)
38
+
39
+ # Create temporary directories
40
+ with tempfile.TemporaryDirectory() as temp_dir:
41
+ model_dir = os.path.join(temp_dir, "model")
42
+ output_dir = os.path.join(temp_dir, "output")
43
+ os.makedirs(output_dir, exist_ok=True)
44
+
45
+ # Update status
46
+ with open("status.json", "r") as f:
47
+ status = json.load(f)
48
+ status["status_message"] = f"Downloading {source_repo}..."
49
+ with open("status.json", "w") as f:
50
+ json.dump(status, f)
51
+
52
+ # Download the model
53
+ print(f"Downloading {source_repo}...")
54
+ snapshot_download(
55
+ repo_id=source_repo,
56
+ local_dir=model_dir,
57
+ local_dir_use_symlinks=False
58
+ )
59
+
60
+ update_progress(30)
61
+
62
+ # Find the model file (assuming it's a .bin file)
63
+ model_files = [f for f in os.listdir(model_dir) if f.endswith(".bin")]
64
+ if not model_files:
65
+ print("No model file found")
66
+ return False
67
+
68
+ model_file = os.path.join(model_dir, model_files[0])
69
+ output_file = os.path.join(output_dir, f"Ursa_Minor-{quant_type}.gguf")
70
+
71
+ # Update status
72
+ with open("status.json", "r") as f:
73
+ status = json.load(f)
74
+ status["status_message"] = f"Quantizing to {quant_name}..."
75
+ with open("status.json", "w") as f:
76
+ json.dump(status, f)
77
+
78
+ # Run quantization
79
+ print(f"Quantizing to {quant_type}...")
80
+ command = [
81
+ "python", "-m", "llama_cpp.quantize",
82
+ model_file,
83
+ output_file,
84
+ f"--{quant_type}"
85
+ ]
86
+
87
+ try:
88
+ # Start the quantization process
89
+ process = subprocess.Popen(
90
+ command,
91
+ stdout=subprocess.PIPE,
92
+ stderr=subprocess.STDOUT,
93
+ universal_newlines=True
94
+ )
95
+
96
+ # Monitor output for progress
97
+ for line in process.stdout:
98
+ print(line, end="")
99
+ if "Quantizing tensors" in line and ":" in line:
100
+ try:
101
+ # Parse progress from output
102
+ parts = line.split(":")
103
+ if len(parts) >= 2:
104
+ progress_str = parts[1].strip()
105
+ if "/" in progress_str:
106
+ current, total = map(int, progress_str.split("/"))
107
+ progress = 30 + int(60 * current / total)
108
+ update_progress(progress)
109
+ except Exception as e:
110
+ print(f"Error parsing progress: {e}")
111
+
112
+ # Wait for process to complete
113
+ process.wait()
114
+
115
+ if process.returncode != 0:
116
+ print(f"Quantization failed with return code {process.returncode}")
117
+ return False
118
+
119
+ except MemoryError:
120
+ print("Out of memory during quantization")
121
+ raise
122
+ except Exception as e:
123
+ print(f"Error during quantization: {e}")
124
+ return False
125
+
126
+ update_progress(90)
127
+
128
+ # Upload to Hugging Face
129
+ print(f"Uploading {quant_name} to {dest_repo}...")
130
+
131
+ # Update status
132
+ with open("status.json", "r") as f:
133
+ status = json.load(f)
134
+ status["status_message"] = f"Uploading {quant_name} to Hugging Face..."
135
+ with open("status.json", "w") as f:
136
+ json.dump(status, f)
137
+
138
+ # Login to HF if token is available
139
+ token = os.environ.get("HF_TOKEN")
140
+ if not token:
141
+ print("HF_TOKEN environment variable not set")
142
+ return False
143
+
144
+ api = HfApi(token=token)
145
+
146
+ # Create the repo if it doesn't exist
147
+ try:
148
+ api.create_repo(
149
+ repo_id=dest_repo,
150
+ exist_ok=True,
151
+ private=False
152
+ )
153
+ except Exception as e:
154
+ print(f"Error creating repo: {e}")
155
+ return False
156
+
157
+ # Clone the repo
158
+ repo_dir = os.path.join(temp_dir, "repo")
159
+ repo = Repository(
160
+ local_dir=repo_dir,
161
+ clone_from=dest_repo,
162
+ token=token
163
+ )
164
+
165
+ # Copy the quantized model to the repo
166
+ output_file_name = os.path.basename(output_file)
167
+ shutil.copy(output_file, os.path.join(repo_dir, output_file_name))
168
+
169
+ # Create or update README.md
170
+ readme_path = os.path.join(repo_dir, "README.md")
171
+ if os.path.exists(readme_path):
172
+ with open(readme_path, "r") as f:
173
+ readme_content = f.read()
174
+ else:
175
+ readme_content = f"# Ursa Minor Quantized Models\n\nThis repository contains quantized versions of the [Sculptor-AI/Ursa_Minor](https://huggingface.co/Sculptor-AI/Ursa_Minor) model.\n\n## Available Quantizations\n\n"
176
+
177
+ # Add or update the quantization entry in the README
178
+ quant_entry = f"- **{quant_name}**: [{output_file_name}](/{dest_repo}/blob/main/{output_file_name})\n"
179
+ if quant_entry not in readme_content:
180
+ readme_content += quant_entry
181
+ with open(readme_path, "w") as f:
182
+ f.write(readme_content)
183
+
184
+ # Commit and push
185
+ repo.git_add()
186
+ repo.git_commit(f"Add {quant_name} quantization")
187
+ repo.git_push()
188
+
189
+ update_progress(100)
190
+ return True
191
+
192
+ except MemoryError:
193
+ # Special handling for memory errors
194
+ raise
195
+ except Exception as e:
196
+ print(f"Error in quantization process: {e}")
197
+ return False
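
`quantize_model` publishes each file by cloning the destination repository with `Repository`, which re-downloads everything already pushed. The `HfApi.upload_file` call that the old app.py (removed above) relied on avoids that; a sketch of the equivalent upload step (function name and defaults are illustrative, not part of this commit):

```python
import os
from huggingface_hub import HfApi

def upload_quantized_file(output_file, quant_name, dest_repo="Sculptor-AI/Ursa_Minor_Quantized"):
    """Push a single GGUF file to the destination repo without a local clone."""
    api = HfApi(token=os.environ["HF_TOKEN"])
    # Same create-if-missing behaviour as quantize_model.
    api.create_repo(repo_id=dest_repo, exist_ok=True, private=False)
    api.upload_file(
        path_or_fileobj=output_file,
        path_in_repo=os.path.basename(output_file),
        repo_id=dest_repo,
        commit_message=f"Add {quant_name} quantization",
    )
```
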
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
- gradio>=3.41.0
2
- huggingface_hub>=0.16.0
3
- flask>=2.0.0
4
- nest_asyncio>=1.5.6
5
- uvicorn>=0.22.0
 
 
1
+ gradio>=3.40.1
2
+ huggingface_hub>=0.16.4
3
+ requests>=2.31.0
4
+ apscheduler>=3.10.1
5
+ tqdm>=4.66.1
6
+ llama-cpp-python>=0.2.10