Setup app files

Files changed:
- app.py
- requirements.txt

app.py
ADDED
import os
import subprocess
import tempfile
import threading

import gradio as gr
from huggingface_hub import HfApi, login

# Initialize Hugging Face API
hf_token = os.environ.get("HF_TOKEN")
api = HfApi(token=hf_token)
if hf_token:
    login(token=hf_token)
else:
    print("WARNING: HF_TOKEN not set. You'll be limited to public repositories.")

# Define quantization options
QUANT_TYPES = {
    "Q4_K_M": "q4_k_m",  # 4-bit, good quality and size
    "Q5_K_M": "q5_k_m",  # 5-bit, better quality
    "Q8_0": "q8_0",      # 8-bit, high quality
}

def install_llama_cpp():
    """Install llama.cpp if not already installed"""
    if not os.path.exists("llama.cpp"):
        print("Installing llama.cpp...")
        # Clone llama.cpp
        subprocess.run(
            ["git", "clone", "https://github.com/ggerganov/llama.cpp.git", "--depth=1"],
            check=True,
        )
        # Build only the quantize tool; conversion itself is a Python script.
        # (This assumes the older llama.cpp layout with a Makefile `quantize`
        # target and a top-level convert.py.)
        os.chdir("llama.cpp")
        subprocess.run(["make", "clean"], check=True)
        subprocess.run(["make", "quantize"], check=True)
        os.chdir("..")
        print("llama.cpp installed successfully")
    else:
        print("llama.cpp already installed")

def clone_repo_shallow(repo_id, target_dir):
    """Clone only the necessary files from a repo to save space"""
    print(f"Cloning {repo_id} to {target_dir}...")

    # Create a shallow, blob-filtered checkout to save space
    cmd = [
        "git", "clone",
        "--depth=1",
        "--filter=blob:none",
        f"https://huggingface.co/{repo_id}",
        target_dir
    ]

    subprocess.run(cmd, check=True)
    print(f"Repository {repo_id} cloned successfully")

def find_model_files(directory):
    """Find model files in the repository"""
    # Look for common model file patterns
    model_files = []

    # Safetensors is preferred (usually smaller)
    for pattern in ["*.safetensors", "consolidated.*.pt", "pytorch_model.bin", "*.bin"]:
        cmd = ["find", directory, "-name", pattern]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.stdout:
            model_files.extend(result.stdout.strip().split('\n'))

    # Filter out empty strings
    model_files = [f for f in model_files if f]
    if not model_files:
        return []

    # Check for model configuration
    config_file = None
    cmd = ["find", directory, "-name", "config.json"]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.stdout:
        config_file = result.stdout.strip().split('\n')[0]

    return model_files, config_file

def quantize_model(repo_id, quant_types, progress=gr.Progress()):
    """Quantize a model with llama.cpp and push to Hugging Face"""
    # Install llama.cpp if needed
    install_llama_cpp()

    # Create temporary directories for processing
    with tempfile.TemporaryDirectory() as temp_dir:
        progress(0.1, "Cloning repository...")
        model_dir = os.path.join(temp_dir, "model")
        output_dir = os.path.join(temp_dir, "output")
        os.makedirs(model_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)

        try:
            # Clone the source repository
            clone_repo_shallow(repo_id, model_dir)

            # Find model files
            progress(0.2, "Looking for model files...")
            model_file_info = find_model_files(model_dir)
            if not model_file_info:
                return "No model files found in the repository."

            model_files, config_file = model_file_info
            model_file = model_files[0]  # Use the first model file found

            progress(0.3, "Determining model type...")
            # Try to determine model type
            model_type = "llama"  # Default model type
            if config_file:
                with open(config_file, 'r') as f:
                    import json
                    config = json.load(f)
                    if 'model_type' in config:
                        config_model_type = config['model_type'].lower()
                        # Map model type to llama.cpp supported types
                        type_mapping = {
                            'llama': 'llama',
                            'mistral': 'llama',
                            'mixtral': 'llama',
                            'falcon': 'falcon',
                            'mpt': 'mpt',
                            'gpt_neox': 'gptneox',
                            'gptj': 'gptj',
                            'bloom': 'bloom'
                        }
                        model_type = type_mapping.get(config_model_type, 'llama')

            # Create output repository name
            repo_name = repo_id.split('/')[-1]
            target_repo_id = f"{repo_id}-gguf"

            # Create the output repository if it doesn't exist
            progress(0.4, "Creating target repository...")
            try:
                api.create_repo(repo_id=target_repo_id, exist_ok=True)
            except Exception as e:
                return f"Error creating repository: {str(e)}"

            success_count = 0
            progress_step = 0.5 / len(quant_types)
            progress_value = 0.4

            # Process each quantization type
            for quant_name, quant_type in quant_types.items():
                progress_value += progress_step
                progress(progress_value, f"Processing {quant_name} quantization...")

                output_file = os.path.join(output_dir, f"{repo_name}-{quant_name}.gguf")

                # Convert to GGUF format
                print(f"Converting to {quant_name}...")
                convert_cmd = [
                    "python3",
                    os.path.join("llama.cpp", "convert.py"),
                    "--model-type", model_type,
                    "--outtype", "f16",
                    "--outfile", output_file
                ]

                # Add model path
                convert_cmd.append(model_file)

                try:
                    # First convert to GGUF format (without quantization)
                    subprocess.run(convert_cmd, check=True)

                    # Then quantize if needed
                    if quant_type != "f16":
                        quant_output = output_file.replace(".gguf", f"-{quant_type}.gguf")
                        quantize_cmd = [
                            os.path.join("llama.cpp", "quantize"),
                            output_file,
                            quant_output,
                            quant_type
                        ]
                        subprocess.run(quantize_cmd, check=True)
                        # Replace the output file with the quantized version
                        os.remove(output_file)
                        os.rename(quant_output, output_file)

                    # Upload to HF
                    progress(progress_value + (progress_step * 0.7), f"Uploading {quant_name}...")
                    api.upload_file(
                        path_or_fileobj=output_file,
                        path_in_repo=f"{repo_name}-{quant_name}.gguf",
                        repo_id=target_repo_id,
                        commit_message=f"Add {quant_name} quantized version"
                    )

                    success_count += 1
                except Exception as e:
                    print(f"Error processing {quant_name}: {str(e)}")

            progress(1.0, "Completed!")
            if success_count > 0:
                return f"Successfully created {success_count} quantized versions in {target_repo_id}"
            else:
                return "Failed to create any quantized versions."

        except Exception as e:
            return f"Error: {str(e)}"

# Webhook setup - registers a webhook that fires when the source repo is updated
def setup_webhook(repo_id, target_repo=None, webhook_url=None):
    """Set up a webhook for repository updates"""
    if not hf_token:
        return "HF_TOKEN not set. Cannot set up webhook."

    if not target_repo:
        target_repo = f"{repo_id}-gguf"

    # Create the webhook URL for this Space
    if not webhook_url:
        # Spaces expose their public hostname via the SPACE_HOST env var
        space_host = os.environ.get("SPACE_HOST")
        if not space_host:
            return "Cannot determine current Space host. Please specify webhook_url manually."

        webhook_url = f"https://{space_host}/webhook"

    try:
        # Watch the source repository via the Hub webhooks API
        # (HfApi.create_webhook, available in recent huggingface_hub releases)
        api.create_webhook(
            watched=[{"type": "model", "name": repo_id}],
            url=webhook_url,
            domains=["repo"],
        )
        return f"Webhook set up for {repo_id} -> {webhook_url}"
    except Exception as e:
        return f"Error setting up webhook: {str(e)}"

# Create Gradio interface
with gr.Blocks() as interface:
    gr.Markdown("# GGUF Quantizer (Free Tier)")
    gr.Markdown("Automatically create GGUF quantized versions of Hugging Face models")

    with gr.Tab("Quantize Model"):
        with gr.Row():
            repo_id = gr.Textbox(label="Model Repository ID (e.g., 'mistralai/Mistral-7B-v0.1')")

        with gr.Row():
            q4_k_m = gr.Checkbox(label="Q4_K_M (4-bit, balanced quality/size)", value=True)
            q5_k_m = gr.Checkbox(label="Q5_K_M (5-bit, higher quality)", value=False)
            q8_0 = gr.Checkbox(label="Q8_0 (8-bit, highest quality)", value=False)

        quantize_btn = gr.Button("Quantize Model")
        output = gr.Textbox(label="Status")

        def process_quantize(repo_id, q4_k_m, q5_k_m, q8_0, progress=gr.Progress()):
            selected_types = {}
            if q4_k_m:
                selected_types["Q4_K_M"] = "q4_k_m"
            if q5_k_m:
                selected_types["Q5_K_M"] = "q5_k_m"
            if q8_0:
                selected_types["Q8_0"] = "q8_0"

            if not selected_types:
                return "Please select at least one quantization type"

            return quantize_model(repo_id, selected_types, progress)

        quantize_btn.click(
            process_quantize,
            inputs=[repo_id, q4_k_m, q5_k_m, q8_0],
            outputs=output
        )

    with gr.Tab("Setup Webhook"):
        gr.Markdown("""
## Set up automatic quantization

This will set up a webhook to trigger quantization whenever the source repository is updated.
Note: This requires HF_TOKEN to be set in Space secrets.
""")

        webhook_repo_id = gr.Textbox(label="Source Repository ID")
        webhook_btn = gr.Button("Set Up Webhook")
        webhook_output = gr.Textbox(label="Webhook Status")

        webhook_btn.click(
            setup_webhook,
            inputs=[webhook_repo_id],
            outputs=webhook_output
        )

    with gr.Tab("Instructions"):
        gr.Markdown("""
## Instructions

### How to use this Space:

1. **Manual Quantization**: Enter a model repository ID and select quantization types
2. **Automatic Quantization**: Set up a webhook to trigger quantization when the model is updated

### Adding HF_TOKEN to Space Secrets:

1. Go to your Space Settings
2. Click on "Repository Secrets"
3. Add a new secret with key `HF_TOKEN` and your Hugging Face API token as value

### Limitations (Free Tier):

- Limited memory: Very large models may fail to process
- Limited storage: Files are processed in streaming mode, but temp files still need space
- Limited compute: Quantization may take longer than on paid tiers
- Jobs might be interrupted if they run too long
""")

# Start Flask server to handle webhooks
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/webhook', methods=['POST'])
def handle_webhook():
    try:
        payload = request.get_json(silent=True) or {}

        # Hub webhook payloads carry an "event" object with "action" and
        # "scope" fields; treat repo-scoped updates as the trigger.
        event = payload.get('event') or {}
        is_repo_update = (
            isinstance(event, dict)
            and event.get('action') == 'update'
            and str(event.get('scope', '')).startswith('repo')
        )
        if is_repo_update:
            repo_name = payload.get('repo', {}).get('name')

            if repo_name:
                # Run quantization in background
                threading.Thread(target=lambda: quantize_model(
                    repo_name,
                    {"Q4_K_M": "q4_k_m"}  # Default to just Q4_K_M to save resources
                )).start()

                return jsonify({"status": "quantization scheduled"})

        return jsonify({"status": "event ignored"})
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)})

# Launch both the Gradio and Flask apps
from threading import Thread

# Launch the Gradio interface (it keeps the default Spaces port, 7860)
def launch_gradio():
    interface.launch(debug=False)

# Launch the Flask webhook handler on a secondary port. Flask is a WSGI app,
# so use its built-in server here. Note that a Space exposes only one port,
# so this endpoint is reachable externally only when running outside Spaces.
def launch_flask():
    app.run(host="0.0.0.0", port=8000)

# Use the main Gradio interface as primary
if __name__ == "__main__":
    Thread(target=launch_flask, daemon=True).start()
    launch_gradio()
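
As a quick sanity check of the webhook route above, a local request shaped like the sketch below should come back as "quantization scheduled". This is only a sketch under the assumptions made in app.py: the Flask thread is bound to port 8000, the payload shape matches what handle_webhook checks for, and the `requests` package is available in the test environment (it is not listed in requirements.txt). Note that a matching request really does start a background quantization run for the named repo.

# Local smoke test for the /webhook route defined in app.py
# (assumes app.py is running and its Flask thread listens on port 8000).
import requests

payload = {
    "event": {"action": "update", "scope": "repo.content"},
    "repo": {"type": "model", "name": "mistralai/Mistral-7B-v0.1"},
}
resp = requests.post("http://localhost:8000/webhook", json=payload, timeout=10)
print(resp.json())  # expected: {"status": "quantization scheduled"}
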
requirements.txt
ADDED
gradio>=3.41.0
huggingface_hub>=0.23.0
flask>=2.0.0