Kaileh57 committed
Commit 0a23172 · 1 Parent(s): 0b6befa

Setup app files

Files changed (2):
  1. app.py +360 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,360 @@
import os
import sys
import gradio as gr
import subprocess
import tempfile
import shutil
from huggingface_hub import HfApi, login, Repository
import time
import threading

# Initialize Hugging Face API
hf_token = os.environ.get("HF_TOKEN")
api = HfApi(token=hf_token)
if hf_token:
    login(token=hf_token)
else:
    print("WARNING: HF_TOKEN not set. You'll be limited to public repositories.")

# Define quantization options
QUANT_TYPES = {
    "Q4_K_M": "q4_k_m",  # 4-bit, good quality and size
    "Q5_K_M": "q5_k_m",  # 5-bit, better quality
    "Q8_0": "q8_0"       # 8-bit, high quality
}

def install_llama_cpp():
    """Install llama.cpp if not already installed"""
    if not os.path.exists("llama.cpp"):
        print("Installing llama.cpp...")
        # Clone llama.cpp
        subprocess.run(["git", "clone", "https://github.com/ggerganov/llama.cpp.git", "--depth=1"], check=True)
        # Build only the quantize tool; conversion itself is done by the Python
        # convert script, which has its own Python dependencies (see llama.cpp's
        # requirements file).
        os.chdir("llama.cpp")
        subprocess.run(["make", "clean"], check=True)
        subprocess.run(["make", "quantize"], check=True)
        os.chdir("..")
        print("llama.cpp installed successfully")
    else:
        print("llama.cpp already installed")

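# Added sketch (not part of the original commit): a minimal sanity check that the
# tools the rest of this file relies on actually exist after install_llama_cpp().
# The paths assume the convert.py script and the quantize binary used below; both
# have been renamed in newer llama.cpp revisions, so adjust if the check fails.
def check_llama_cpp_tools():
    expected = [os.path.join("llama.cpp", "convert.py"),
                os.path.join("llama.cpp", "quantize")]
    missing = [p for p in expected if not os.path.exists(p)]
    if missing:
        print(f"WARNING: expected llama.cpp tools not found: {missing}")
    return not missing
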
def clone_repo_shallow(repo_id, target_dir):
    """Clone only the necessary files from a repo to save space"""
    print(f"Cloning {repo_id} to {target_dir}...")

    # Shallow, blobless clone to reduce the amount of data transferred
    cmd = [
        "git", "clone",
        "--depth=1",
        "--filter=blob:none",
        f"https://huggingface.co/{repo_id}",
        target_dir
    ]

    subprocess.run(cmd, check=True)
    print(f"Repository {repo_id} cloned successfully")

def find_model_files(directory):
    """Find model weight files and config.json in the repository"""
    model_files = []

    # Safetensors is preferred; fall back to common PyTorch weight filenames
    for pattern in ["*.safetensors", "consolidated.*.pt", "pytorch_model.bin", "*.bin"]:
        cmd = ["find", directory, "-name", pattern]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.stdout:
            model_files.extend(result.stdout.strip().split('\n'))

    # Filter out empty strings
    model_files = [f for f in model_files if f]
    if not model_files:
        return []

    # Check for model configuration
    config_file = None
    cmd = ["find", directory, "-name", "config.json"]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.stdout:
        config_file = result.stdout.strip().split('\n')[0]

    return model_files, config_file

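# Added note (not part of the original commit): find_model_files() returns an empty
# list when nothing matches and a (model_files, config_file) tuple otherwise, so
# callers should test the result for truthiness before unpacking, as
# quantize_model() does below.
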
def quantize_model(repo_id, quant_types, progress=gr.Progress()):
    """Quantize a model with llama.cpp and push to Hugging Face"""
    # Install llama.cpp if needed
    install_llama_cpp()

    # Create temporary directories for processing
    with tempfile.TemporaryDirectory() as temp_dir:
        progress(0.1, "Cloning repository...")
        model_dir = os.path.join(temp_dir, "model")
        output_dir = os.path.join(temp_dir, "output")
        os.makedirs(model_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)

        try:
            # Clone the source repository
            clone_repo_shallow(repo_id, model_dir)

            # Find model files
            progress(0.2, "Looking for model files...")
            model_file_info = find_model_files(model_dir)
            if not model_file_info:
                return "No model files found in the repository."

            model_files, config_file = model_file_info
            model_file = model_files[0]  # Use the first model file found

            progress(0.3, "Determining model type...")
            # Try to determine model type from config.json
            model_type = "llama"  # Default model type
            if config_file:
                import json
                with open(config_file, 'r') as f:
                    config = json.load(f)
                if 'model_type' in config:
                    config_model_type = config['model_type'].lower()
                    # Map model type to llama.cpp supported types
                    type_mapping = {
                        'llama': 'llama',
                        'mistral': 'llama',
                        'mixtral': 'llama',
                        'falcon': 'falcon',
                        'mpt': 'mpt',
                        'gpt_neox': 'gptneox',
                        'gptj': 'gptj',
                        'bloom': 'bloom'
                    }
                    model_type = type_mapping.get(config_model_type, 'llama')

            # Create output repository name
            repo_name = repo_id.split('/')[-1]
            target_repo_id = f"{repo_id}-gguf"

            # Create the output repository if it doesn't exist
            progress(0.4, "Creating target repository...")
            try:
                api.create_repo(repo_id=target_repo_id, exist_ok=True)
            except Exception as e:
                return f"Error creating repository: {str(e)}"

            success_count = 0
            progress_step = 0.5 / len(quant_types)
            progress_value = 0.4

            # Process each quantization type
            for quant_name, quant_type in quant_types.items():
                progress_value += progress_step
                progress(progress_value, f"Processing {quant_name} quantization...")

                output_file = os.path.join(output_dir, f"{repo_name}-{quant_name}.gguf")

                # Convert to GGUF format
                print(f"Converting to {quant_name}...")
                # NOTE: flag names for the conversion script vary across llama.cpp
                # releases; adjust these to match the checked-out revision.
                convert_cmd = [
                    "python3",
                    os.path.join("llama.cpp", "convert.py"),
                    "--model-type", model_type,
                    "--outtype", "f16",
                    "--outfile", output_file
                ]

                # Add model path
                convert_cmd.append(model_file)

                try:
                    # First convert to GGUF format (without quantization)
                    subprocess.run(convert_cmd, check=True)

                    # Then quantize if needed
                    if quant_type != "f16":
                        quant_output = output_file.replace(".gguf", f"-{quant_type}.gguf")
                        quantize_cmd = [
                            os.path.join("llama.cpp", "quantize"),
                            output_file,
                            quant_output,
                            quant_type
                        ]
                        subprocess.run(quantize_cmd, check=True)
                        # Replace the output file with the quantized version
                        os.remove(output_file)
                        os.rename(quant_output, output_file)

                    # Upload to HF
                    progress(progress_value + (progress_step * 0.7), f"Uploading {quant_name}...")
                    api.upload_file(
                        path_or_fileobj=output_file,
                        path_in_repo=f"{repo_name}-{quant_name}.gguf",
                        repo_id=target_repo_id,
                        commit_message=f"Add {quant_name} quantized version"
                    )

                    success_count += 1
                except Exception as e:
                    print(f"Error processing {quant_name}: {str(e)}")

            progress(1.0, "Completed!")
            if success_count > 0:
                return f"Successfully created {success_count} quantized versions in {target_repo_id}"
            else:
                return "Failed to create any quantized versions."

        except Exception as e:
            return f"Error: {str(e)}"

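# Added usage sketch (not part of the original commit): calling quantize_model()
# directly, outside the Gradio UI. It assumes HF_TOKEN has write access and that
# "your-username/some-small-model" is a hypothetical repo you own; a plain callable
# stands in for gr.Progress() so progress updates simply print.
def _example_direct_run():
    result = quantize_model(
        "your-username/some-small-model",
        {"Q4_K_M": QUANT_TYPES["Q4_K_M"]},
        progress=lambda value, desc=None: print(f"[{value:.0%}] {desc}"),
    )
    print(result)
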
# Webhook setup - registers a webhook so this Space is notified when the repo is updated
def setup_webhook(repo_id, target_repo=None, webhook_url=None):
    """Set up a webhook for repository updates"""
    if not hf_token:
        return "HF_TOKEN not set. Cannot set up webhook."

    if not target_repo:
        target_repo = f"{repo_id}-gguf"

    # Create the webhook URL for this space
    if not webhook_url:
        # Get the current space name from the environment (SPACE_ID on current
        # Spaces; HF_SPACE_ID kept as a fallback)
        space_id = os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID")
        if not space_id:
            return "Cannot determine current Space ID. Please specify webhook_url manually."

        webhook_url = f"https://huggingface.co/spaces/{space_id}/webhook"

    try:
        # Add webhook to the source repository
        # NOTE: webhook management in huggingface_hub has changed between releases
        # (newer versions expose HfApi.create_webhook); adjust this call to match
        # the installed version.
        api.add_webhook(
            repo_id=repo_id,
            webhook_url=webhook_url,
            webhook_type="repo-update"
        )
        return f"Webhook set up for {repo_id} -> {webhook_url}"
    except Exception as e:
        return f"Error setting up webhook: {str(e)}"

# Create Gradio interface
with gr.Blocks() as interface:
    gr.Markdown("# GGUF Quantizer (Free Tier)")
    gr.Markdown("Automatically create GGUF quantized versions of Hugging Face models")

    with gr.Tab("Quantize Model"):
        with gr.Row():
            repo_id = gr.Textbox(label="Model Repository ID (e.g., 'mistralai/Mistral-7B-v0.1')")

        with gr.Row():
            q4_k_m = gr.Checkbox(label="Q4_K_M (4-bit, balanced quality/size)", value=True)
            q5_k_m = gr.Checkbox(label="Q5_K_M (5-bit, higher quality)", value=False)
            q8_0 = gr.Checkbox(label="Q8_0 (8-bit, highest quality)", value=False)

        quantize_btn = gr.Button("Quantize Model")
        output = gr.Textbox(label="Status")

        def process_quantize(repo_id, q4_k_m, q5_k_m, q8_0, progress=gr.Progress()):
            selected_types = {}
            if q4_k_m:
                selected_types["Q4_K_M"] = "q4_k_m"
            if q5_k_m:
                selected_types["Q5_K_M"] = "q5_k_m"
            if q8_0:
                selected_types["Q8_0"] = "q8_0"

            if not selected_types:
                return "Please select at least one quantization type"

            return quantize_model(repo_id, selected_types, progress)

        quantize_btn.click(
            process_quantize,
            inputs=[repo_id, q4_k_m, q5_k_m, q8_0],
            outputs=output
        )

    with gr.Tab("Setup Webhook"):
        gr.Markdown("""
        ## Set up automatic quantization

        This will set up a webhook to trigger quantization whenever the source repository is updated.
        Note: This requires HF_TOKEN to be set in Space secrets.
        """)

        webhook_repo_id = gr.Textbox(label="Source Repository ID")
        webhook_btn = gr.Button("Set Up Webhook")
        webhook_output = gr.Textbox(label="Webhook Status")

        webhook_btn.click(
            setup_webhook,
            inputs=[webhook_repo_id],
            outputs=webhook_output
        )

    with gr.Tab("Instructions"):
        gr.Markdown("""
        ## Instructions

        ### How to use this Space:

        1. **Manual Quantization**: Enter a model repository ID and select quantization types
        2. **Automatic Quantization**: Set up a webhook to trigger quantization when the model is updated

        ### Adding HF_TOKEN to Space Secrets:

        1. Go to your Space Settings
        2. Click on "Repository Secrets"
        3. Add a new secret with key `HF_TOKEN` and your Hugging Face API token as value

        ### Limitations (Free Tier):

        - Limited memory: Very large models may fail to process
        - Limited storage: models are downloaded and converted in a temporary directory, which still needs disk space
        - Limited compute: Quantization may take longer than on paid tiers
        - Jobs might be interrupted if they run too long
        """)

# Flask app to handle incoming webhooks
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/webhook', methods=['POST'])
def handle_webhook():
    try:
        payload = request.json

        # Check if this is a repo update event
        event_type = payload.get('event')
        if event_type == 'repo-update':
            repo_id = payload.get('repo', {}).get('name')

            if repo_id:
                # Run quantization in background
                threading.Thread(target=lambda: quantize_model(
                    repo_id,
                    {"Q4_K_M": "q4_k_m"}  # Default to just Q4_K_M to save resources
                )).start()

                return jsonify({"status": "quantization scheduled"})

        return jsonify({"status": "event ignored"})
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)})

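# Added sketch (not part of the original commit): simulating the webhook call locally
# with the payload shape this handler reads (the real Hugging Face webhook payload may
# differ). Assumes the Flask app is reachable on the port chosen in the launch code
# below and that "your-username/some-small-model" is a hypothetical repo:
#
#   import requests
#   resp = requests.post(
#       "http://localhost:5000/webhook",
#       json={"event": "repo-update", "repo": {"name": "your-username/some-small-model"}},
#   )
#   print(resp.json())  # -> {"status": "quantization scheduled"}
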
# Launch both the Gradio and Flask apps
import nest_asyncio
import uvicorn
from threading import Thread

nest_asyncio.apply()

# Launch the Gradio interface
def launch_gradio():
    interface.launch(debug=False)

# Launch the Flask webhook handler on a separate port so it does not clash with
# Gradio's default port (7860). Note that a Space exposes only one port externally,
# so an external webhook may not reach this handler without extra routing.
def launch_flask():
    uvicorn.run(app, host="0.0.0.0", port=5000, interface="wsgi")

# Use the main Gradio interface as primary
if __name__ == "__main__":
    Thread(target=launch_flask).start()
    launch_gradio()
requirements.txt ADDED
@@ -0,0 +1,5 @@
gradio>=3.41.0
huggingface_hub>=0.16.0
flask>=2.0.0
nest_asyncio>=1.5.6
uvicorn>=0.22.0