Setup app files

Files changed:
- app.py
- requirements.txt

app.py
ADDED
import os
import subprocess
import tempfile
import threading

import gradio as gr
from huggingface_hub import HfApi, login

# Initialize Hugging Face API
hf_token = os.environ.get("HF_TOKEN")
api = HfApi(token=hf_token)
if hf_token:
    login(token=hf_token)
else:
    print("WARNING: HF_TOKEN not set. You'll be limited to public repositories.")

# Define quantization options
QUANT_TYPES = {
    "Q4_K_M": "q4_k_m",  # 4-bit, good quality and size
    "Q5_K_M": "q5_k_m",  # 5-bit, better quality
    "Q8_0": "q8_0",      # 8-bit, high quality
}

def install_llama_cpp():
    """Install llama.cpp if not already installed"""
    if not os.path.exists("llama.cpp"):
        print("Installing llama.cpp...")
        # Clone llama.cpp
        subprocess.run(
            ["git", "clone", "https://github.com/ggerganov/llama.cpp.git", "--depth=1"],
            check=True,
        )
        # Build only the quantize tool; conversion itself is a Python script.
        # (This assumes the older llama.cpp layout with a Makefile `quantize`
        # target and a top-level convert.py.)
        os.chdir("llama.cpp")
        subprocess.run(["make", "clean"], check=True)
        subprocess.run(["make", "quantize"], check=True)
        os.chdir("..")
        print("llama.cpp installed successfully")
    else:
        print("llama.cpp already installed")

def clone_repo_shallow(repo_id, target_dir):
    """Clone only the necessary files from a repo to save space"""
    print(f"Cloning {repo_id} to {target_dir}...")

    # Create a shallow, blob-filtered checkout to save space
    cmd = [
        "git", "clone",
        "--depth=1",
        "--filter=blob:none",
        f"https://huggingface.co/{repo_id}",
        target_dir
    ]

    subprocess.run(cmd, check=True)
    print(f"Repository {repo_id} cloned successfully")

def find_model_files(directory):
    """Find model files in the repository"""
    # Look for common model file patterns
    model_files = []

    # Safetensors is preferred (usually smaller)
    for pattern in ["*.safetensors", "consolidated.*.pt", "pytorch_model.bin", "*.bin"]:
        cmd = ["find", directory, "-name", pattern]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.stdout:
            model_files.extend(result.stdout.strip().split('\n'))

    # Filter out empty strings
    model_files = [f for f in model_files if f]
    if not model_files:
        return []

    # Check for model configuration
    config_file = None
    cmd = ["find", directory, "-name", "config.json"]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.stdout:
        config_file = result.stdout.strip().split('\n')[0]

    return model_files, config_file

def quantize_model(repo_id, quant_types, progress=gr.Progress()):
    """Quantize a model with llama.cpp and push to Hugging Face"""
    # Install llama.cpp if needed
    install_llama_cpp()

    # Create temporary directories for processing
    with tempfile.TemporaryDirectory() as temp_dir:
        progress(0.1, "Cloning repository...")
        model_dir = os.path.join(temp_dir, "model")
        output_dir = os.path.join(temp_dir, "output")
        os.makedirs(model_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)

        try:
            # Clone the source repository
            clone_repo_shallow(repo_id, model_dir)

            # Find model files
            progress(0.2, "Looking for model files...")
            model_file_info = find_model_files(model_dir)
            if not model_file_info:
                return "No model files found in the repository."

            model_files, config_file = model_file_info
            model_file = model_files[0]  # Use the first model file found

            progress(0.3, "Determining model type...")
            # Try to determine model type
            model_type = "llama"  # Default model type
            if config_file:
                with open(config_file, 'r') as f:
                    import json
                    config = json.load(f)
                    if 'model_type' in config:
                        config_model_type = config['model_type'].lower()
                        # Map model type to llama.cpp supported types
                        type_mapping = {
                            'llama': 'llama',
                            'mistral': 'llama',
                            'mixtral': 'llama',
                            'falcon': 'falcon',
                            'mpt': 'mpt',
                            'gpt_neox': 'gptneox',
                            'gptj': 'gptj',
                            'bloom': 'bloom'
                        }
                        model_type = type_mapping.get(config_model_type, 'llama')

            # Create output repository name
            repo_name = repo_id.split('/')[-1]
            target_repo_id = f"{repo_id}-gguf"

            # Create the output repository if it doesn't exist
            progress(0.4, "Creating target repository...")
            try:
                api.create_repo(repo_id=target_repo_id, exist_ok=True)
            except Exception as e:
                return f"Error creating repository: {str(e)}"

            success_count = 0
            progress_step = 0.5 / len(quant_types)
            progress_value = 0.4

            # Process each quantization type
            for quant_name, quant_type in quant_types.items():
                progress_value += progress_step
                progress(progress_value, f"Processing {quant_name} quantization...")

                output_file = os.path.join(output_dir, f"{repo_name}-{quant_name}.gguf")

                # Convert to GGUF format
                print(f"Converting to {quant_name}...")
                convert_cmd = [
                    "python3",
                    os.path.join("llama.cpp", "convert.py"),
                    "--model-type", model_type,
                    "--outtype", "f16",
                    "--outfile", output_file
                ]

                # Add model path
                convert_cmd.append(model_file)

                try:
                    # First convert to GGUF format (without quantization)
                    subprocess.run(convert_cmd, check=True)

                    # Then quantize if needed
                    if quant_type != "f16":
                        quant_output = output_file.replace(".gguf", f"-{quant_type}.gguf")
                        quantize_cmd = [
                            os.path.join("llama.cpp", "quantize"),
                            output_file,
                            quant_output,
                            quant_type
                        ]
                        subprocess.run(quantize_cmd, check=True)
                        # Replace the output file with the quantized version
                        os.remove(output_file)
                        os.rename(quant_output, output_file)

                    # Upload to HF
                    progress(progress_value + (progress_step * 0.7), f"Uploading {quant_name}...")
                    api.upload_file(
                        path_or_fileobj=output_file,
                        path_in_repo=f"{repo_name}-{quant_name}.gguf",
                        repo_id=target_repo_id,
                        commit_message=f"Add {quant_name} quantized version"
                    )

                    success_count += 1
                except Exception as e:
                    print(f"Error processing {quant_name}: {str(e)}")

            progress(1.0, "Completed!")
            if success_count > 0:
                return f"Successfully created {success_count} quantized versions in {target_repo_id}"
            else:
                return "Failed to create any quantized versions."

        except Exception as e:
            return f"Error: {str(e)}"

# Webhook setup - registers a webhook that fires when the source repo is updated
def setup_webhook(repo_id, target_repo=None, webhook_url=None):
    """Set up a webhook for repository updates"""
    if not hf_token:
        return "HF_TOKEN not set. Cannot set up webhook."

    if not target_repo:
        target_repo = f"{repo_id}-gguf"

    # Create the webhook URL for this Space
    if not webhook_url:
        # Spaces expose their public hostname via the SPACE_HOST env var
        space_host = os.environ.get("SPACE_HOST")
        if not space_host:
            return "Cannot determine current Space host. Please specify webhook_url manually."

        webhook_url = f"https://{space_host}/webhook"

    try:
        # Watch the source repository via the Hub webhooks API
        # (HfApi.create_webhook, available in recent huggingface_hub releases)
        api.create_webhook(
            watched=[{"type": "model", "name": repo_id}],
            url=webhook_url,
            domains=["repo"],
        )
        return f"Webhook set up for {repo_id} -> {webhook_url}"
    except Exception as e:
        return f"Error setting up webhook: {str(e)}"

# Create Gradio interface
with gr.Blocks() as interface:
    gr.Markdown("# GGUF Quantizer (Free Tier)")
    gr.Markdown("Automatically create GGUF quantized versions of Hugging Face models")

    with gr.Tab("Quantize Model"):
        with gr.Row():
            repo_id = gr.Textbox(label="Model Repository ID (e.g., 'mistralai/Mistral-7B-v0.1')")

        with gr.Row():
            q4_k_m = gr.Checkbox(label="Q4_K_M (4-bit, balanced quality/size)", value=True)
            q5_k_m = gr.Checkbox(label="Q5_K_M (5-bit, higher quality)", value=False)
            q8_0 = gr.Checkbox(label="Q8_0 (8-bit, highest quality)", value=False)

        quantize_btn = gr.Button("Quantize Model")
        output = gr.Textbox(label="Status")

        def process_quantize(repo_id, q4_k_m, q5_k_m, q8_0, progress=gr.Progress()):
            selected_types = {}
            if q4_k_m:
                selected_types["Q4_K_M"] = "q4_k_m"
            if q5_k_m:
                selected_types["Q5_K_M"] = "q5_k_m"
            if q8_0:
                selected_types["Q8_0"] = "q8_0"

            if not selected_types:
                return "Please select at least one quantization type"

            return quantize_model(repo_id, selected_types, progress)

        quantize_btn.click(
            process_quantize,
            inputs=[repo_id, q4_k_m, q5_k_m, q8_0],
            outputs=output
        )

    with gr.Tab("Setup Webhook"):
        gr.Markdown("""
## Set up automatic quantization

This will set up a webhook to trigger quantization whenever the source repository is updated.
Note: This requires HF_TOKEN to be set in Space secrets.
""")

        webhook_repo_id = gr.Textbox(label="Source Repository ID")
        webhook_btn = gr.Button("Set Up Webhook")
        webhook_output = gr.Textbox(label="Webhook Status")

        webhook_btn.click(
            setup_webhook,
            inputs=[webhook_repo_id],
            outputs=webhook_output
        )

    with gr.Tab("Instructions"):
        gr.Markdown("""
## Instructions

### How to use this Space:

1. **Manual Quantization**: Enter a model repository ID and select quantization types
2. **Automatic Quantization**: Set up a webhook to trigger quantization when the model is updated

### Adding HF_TOKEN to Space Secrets:

1. Go to your Space Settings
2. Click on "Repository Secrets"
3. Add a new secret with key `HF_TOKEN` and your Hugging Face API token as value

### Limitations (Free Tier):

- Limited memory: Very large models may fail to process
- Limited storage: Files are processed in streaming mode, but temp files still need space
- Limited compute: Quantization may take longer than on paid tiers
- Jobs might be interrupted if they run too long
""")

# Start Flask server to handle webhooks
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/webhook', methods=['POST'])
def handle_webhook():
    try:
        payload = request.get_json(silent=True) or {}

        # Hub webhook payloads carry an "event" object with "action" and
        # "scope" fields; treat repo-scoped updates as the trigger.
        event = payload.get('event') or {}
        is_repo_update = (
            isinstance(event, dict)
            and event.get('action') == 'update'
            and str(event.get('scope', '')).startswith('repo')
        )
        if is_repo_update:
            repo_name = payload.get('repo', {}).get('name')

            if repo_name:
                # Run quantization in background
                threading.Thread(target=lambda: quantize_model(
                    repo_name,
                    {"Q4_K_M": "q4_k_m"}  # Default to just Q4_K_M to save resources
                )).start()

                return jsonify({"status": "quantization scheduled"})

        return jsonify({"status": "event ignored"})
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)})

# Launch both the Gradio and Flask apps
from threading import Thread

# Launch the Gradio interface (it keeps the default Spaces port, 7860)
def launch_gradio():
    interface.launch(debug=False)

# Launch the Flask webhook handler on a secondary port. Flask is a WSGI app,
# so use its built-in server here. Note that a Space exposes only one port,
# so this endpoint is reachable externally only when running outside Spaces.
def launch_flask():
    app.run(host="0.0.0.0", port=8000)

# Use the main Gradio interface as primary
if __name__ == "__main__":
    Thread(target=launch_flask, daemon=True).start()
    launch_gradio()
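
As a quick sanity check of the webhook route above, a local request shaped like the sketch below should come back as "quantization scheduled". This is only a sketch under the assumptions made in app.py: the Flask thread is bound to port 8000, the payload shape matches what handle_webhook checks for, and the `requests` package is available in the test environment (it is not listed in requirements.txt). Note that a matching request really does start a background quantization run for the named repo.

# Local smoke test for the /webhook route defined in app.py
# (assumes app.py is running and its Flask thread listens on port 8000).
import requests

payload = {
    "event": {"action": "update", "scope": "repo.content"},
    "repo": {"type": "model", "name": "mistralai/Mistral-7B-v0.1"},
}
resp = requests.post("http://localhost:8000/webhook", json=payload, timeout=10)
print(resp.json())  # expected: {"status": "quantization scheduled"}
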
requirements.txt
ADDED
gradio>=3.41.0
huggingface_hub>=0.23.0
flask>=2.0.0