# auto-gguf-quant / app.py
import os
import subprocess
import signal
import time
import json
from datetime import datetime
from pathlib import Path
import threading
import traceback
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
import gradio as gr
from huggingface_hub import HfApi, list_repo_files, hf_hub_download, login, whoami
from apscheduler.schedulers.background import BackgroundScheduler
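# This Space watches SOURCE_MODEL_REPO for new commits, converts the model to
# GGUF with llama.cpp, produces every quantization in QUANT_CONFIGS, and uploads
# each result to its own "<model>-<quant>-GGUF" repo under the logged-in user.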
# MODEL_REPO to monitor
SOURCE_MODEL_REPO = "Sculptor-AI/Ursa_Minor"
CONVERSION_SCRIPT = "./llama.cpp/convert-hf-to-gguf.py"  # may be overridden in check_llama_cpp() if the converter lives under a different name/path
STATUS_FILE = "status.json"
# Quantization configurations in order of processing
QUANT_CONFIGS = [
{"type": "Q2_K", "size_gb": 0.8, "notes": ""},
{"type": "Q3_K_S", "size_gb": 0.9, "notes": ""},
{"type": "Q3_K_M", "size_gb": 0.9, "notes": "lower quality"},
{"type": "Q3_K_L", "size_gb": 1.0, "notes": ""},
{"type": "IQ4_XS", "size_gb": 1.0, "notes": ""},
{"type": "Q4_K_S", "size_gb": 1.0, "notes": "fast, recommended"},
{"type": "Q4_K_M", "size_gb": 1.1, "notes": "fast, recommended"},
{"type": "Q5_K_S", "size_gb": 1.2, "notes": ""},
{"type": "Q5_K_M", "size_gb": 1.2, "notes": ""},
{"type": "Q6_K", "size_gb": 1.4, "notes": "very good quality"},
{"type": "Q8_0", "size_gb": 1.7, "notes": "fast, best quality"},
{"type": "f16", "size_gb": 3.2, "notes": "16 bpw, overkill"}
]
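# size_gb values are rough estimates for this source model; they drive the
# smallest-first ordering applied in initialize() and appear in the UI table
# and the generated READMEs.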
# Global variables for process state
processing_lock = threading.Lock()
current_status = {
"status": "Not started",
"last_check": None,
"last_updated": None,
"last_commit_hash": None,
"current_quant": None,
"quant_status": {},
"progress": 0,
"error": None,
"log": []
}
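# current_status is persisted to STATUS_FILE by save_status() and restored at
# startup by load_status(), so progress information survives Space restarts.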
def escape(s: str) -> str:
"""Escape HTML for logging"""
s = s.replace("&", "&amp;")
s = s.replace("<", "&lt;")
s = s.replace(">", "&gt;")
s = s.replace('"', "&quot;")
s = s.replace("\n", "<br/>")
return s
def log_message(message: str, error: bool = False):
"""Add message to log with timestamp"""
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
log_entry = f"[{timestamp}] {message}"
print(log_entry)
current_status["log"].append(log_entry)
if error:
current_status["error"] = message
# Keep log size manageable
if len(current_status["log"]) > 100:
current_status["log"] = current_status["log"][-100:]
# Save current status to file
save_status()
def save_status():
"""Save current status to file"""
with open(STATUS_FILE, 'w') as f:
json.dump(current_status, f)
def load_status():
"""Load status from file if it exists"""
global current_status
if os.path.exists(STATUS_FILE):
try:
with open(STATUS_FILE, 'r') as f:
current_status = json.load(f)
except Exception as e:
log_message(f"Error loading status file: {str(e)}", error=True)
def generate_importance_matrix(model_path: str, train_data_path: str, output_path: str):
"""Generate importance matrix for a model"""
imatrix_command = [
"./llama.cpp/llama-imatrix",
"-m", model_path,
"-f", train_data_path,
"-ngl", "99",
"--output-frequency", "10",
"-o", output_path,
]
if not os.path.isfile(model_path):
raise Exception(f"Model file not found: {model_path}")
log_message(f"Running imatrix command for {model_path}...")
process = subprocess.Popen(imatrix_command, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
try:
# Monitor the process for output to provide updates
for line in process.stdout:
log_message(f"imatrix: {line.strip()}")
process.wait(timeout=3600) # 1 hour timeout
except subprocess.TimeoutExpired:
log_message("Imatrix computation timed out. Sending SIGINT to allow graceful termination...", error=True)
process.send_signal(signal.SIGINT)
try:
process.wait(timeout=60) # 1 minute grace period
except subprocess.TimeoutExpired:
log_message("Imatrix process still didn't terminate. Forcefully terminating process...", error=True)
process.kill()
stderr = process.stderr.read()
if stderr:
log_message(f"Imatrix stderr: {stderr}")
log_message("Importance matrix generation completed.")
def get_last_commit(repo_id: str):
"""Get the last commit hash of a repository"""
try:
api = HfApi()
# model_info() returns repo metadata; its .sha field is the latest commit hash
info = api.model_info(repo_id)
return info.sha
except Exception as e:
log_message(f"Error getting commit info: {str(e)}", error=True)
return None
def check_for_updates():
"""Check if the source model has been updated"""
if processing_lock.locked():
log_message("Already processing, skipping update check")
return False
current_status["status"] = "Checking for updates"
current_status["last_check"] = datetime.now().isoformat()
try:
# Get the latest commit hash
latest_commit = get_last_commit(SOURCE_MODEL_REPO)
if latest_commit is None:
current_status["status"] = "Error checking for updates"
return False
log_message(f"Latest commit hash: {latest_commit}")
log_message(f"Previous commit hash: {current_status.get('last_commit_hash')}")
if current_status.get("last_commit_hash") != latest_commit:
current_status["status"] = "Update detected"
current_status["last_commit_hash"] = latest_commit
save_status()
return True
else:
current_status["status"] = "Up to date"
save_status()
return False
except Exception as e:
log_message(f"Error checking for updates: {str(e)}", error=True)
current_status["status"] = "Error checking for updates"
save_status()
return False
def check_llama_cpp():
"""Check if llama.cpp is properly set up and build if needed"""
global CONVERSION_SCRIPT # Declare global at the beginning of the function
try:
if not os.path.exists("llama.cpp"):
log_message("llama.cpp directory not found, cloning repository...")
subprocess.run(["git", "clone", "https://github.com/ggerganov/llama.cpp"], check=True)
# Check for critical files
converter_path = os.path.join("llama.cpp", "convert-hf-to-gguf.py")
if not os.path.exists(converter_path):
# Try alternative path
old_converter_path = os.path.join("llama.cpp", "convert_hf_to_gguf.py")
if os.path.exists(old_converter_path):
log_message(f"Found converter at {old_converter_path}, using this path")
CONVERSION_SCRIPT = old_converter_path
else:
log_message("Converter script not found, listing files in llama.cpp...")
files = os.listdir("llama.cpp")
log_message(f"Files in llama.cpp: {files}")
# Search for any converter script
for file in files:
if file.startswith("convert") and file.endswith(".py"):
log_message(f"Found alternative converter: {file}")
CONVERSION_SCRIPT = os.path.join("llama.cpp", file)
break
# Build the tools
log_message("Building llama.cpp tools...")
os.chdir("llama.cpp")
# Check if build directory exists
if not os.path.exists("build"):
os.makedirs("build")
# Configure and build
subprocess.run(["cmake", "-B", "build", "-DBUILD_SHARED_LIBS=OFF"], check=True)
subprocess.run(["cmake", "--build", "build", "--config", "Release", "-j", "--target", "llama-quantize", "llama-gguf-split", "llama-imatrix"], check=True)
# Copy binaries
log_message("Copying built binaries...")
try:
# Different builds may put binaries in different places
if os.path.exists(os.path.join("build", "bin")):
for binary in ["llama-quantize", "llama-gguf-split", "llama-imatrix"]:
src = os.path.join("build", "bin", binary)
if os.path.exists(src):
subprocess.run(["cp", src, "./"], check=True)
else:
for binary in ["llama-quantize", "llama-gguf-split", "llama-imatrix"]:
src = os.path.join("build", binary)
if os.path.exists(src):
subprocess.run(["cp", src, "./"], check=True)
except Exception as e:
log_message(f"Error copying binaries: {str(e)}", error=True)
# Return to the original directory
os.chdir("..")
# Make sure we have the calibration data
if not os.path.exists(os.path.join("llama.cpp", "groups_merged.txt")):
log_message("Copying calibration data...")
if os.path.exists("groups_merged.txt"):
subprocess.run(["cp", "groups_merged.txt", "llama.cpp/"], check=True)
log_message("llama.cpp setup completed successfully")
return True
except Exception as e:
log_message(f"Error setting up llama.cpp: {str(e)}", error=True)
traceback.print_exc()
return False
def process_model():
"""Process the model to create all quantized versions"""
global CONVERSION_SCRIPT # Declare global at the beginning of the function
if processing_lock.locked():
log_message("Already processing, cannot start another process")
return
with processing_lock:
try:
# Check llama.cpp is set up
if not check_llama_cpp():
log_message("Failed to set up llama.cpp, aborting", error=True)
current_status["status"] = "Error setting up llama.cpp"
save_status()
return
# Validate authentication
try:
user_info = whoami()
log_message(f"Processing as user: {user_info['name']}")
except Exception as e:
log_message(f"Authentication error: {str(e)}. Please make sure you're logged in.", error=True)
current_status["status"] = "Authentication error"
save_status()
return
api = HfApi()
model_name = SOURCE_MODEL_REPO.split('/')[-1]
current_status["status"] = "Processing"
current_status["progress"] = 0
save_status()
# Prepare directories
if not os.path.exists("downloads"):
os.makedirs("downloads")
if not os.path.exists("outputs"):
os.makedirs("outputs")
log_message(f"Starting model processing for {SOURCE_MODEL_REPO}")
# Resolve the persistent output directory (pathlib.Path is not a context
# manager on current Python versions, so no "with" block is used here)
outdir = Path("outputs").resolve()
log_message(f"Output directory: {outdir}")
# Download the model
log_message(f"Downloading model from {SOURCE_MODEL_REPO}")
try:
local_dir = Path("downloads") / model_name
log_message(f"Local directory: {local_dir}")
# Check and download pattern
dl_pattern = ["*.md", "*.json", "*.model"]
try:
files = list_repo_files(SOURCE_MODEL_REPO)
has_safetensors = any(file.endswith(".safetensors") for file in files)
pattern = "*.safetensors" if has_safetensors else "*.bin"
dl_pattern.append(pattern)
log_message(f"Using download pattern: {dl_pattern}")
except Exception as e:
log_message(f"Error checking repo files: {str(e)}", error=True)
dl_pattern.append("*.safetensors")
dl_pattern.append("*.bin")
# Download the model
api.snapshot_download(
repo_id=SOURCE_MODEL_REPO,
local_dir=local_dir,
local_dir_use_symlinks=False,
allow_patterns=dl_pattern
)
log_message("Model downloaded successfully!")
# Check for adapter config - if it's a LoRA adapter, this won't work
config_dir = local_dir / "config.json"
adapter_config_dir = local_dir / "adapter_config.json"
if os.path.exists(adapter_config_dir) and not os.path.exists(config_dir):
raise Exception('adapter_config.json is present. If you are converting a LoRA adapter to GGUF, please use a different tool.')
# Convert to FP16 first
fp16_path = str(outdir / f"{model_name}.fp16.gguf")
log_message(f"Converting model to FP16: {fp16_path}")
# Check if the converter script exists
if not os.path.exists(CONVERSION_SCRIPT):
log_message(f"Converter script not found at {CONVERSION_SCRIPT}, searching for alternatives", error=True)
# Take the first convert*.py found in the llama.cpp tree (a plain "break" in a
# nested loop would only exit the inner loop and keep overwriting the match)
found = next(
(os.path.join(root, f) for root, _, files in os.walk("llama.cpp") for f in files if f.startswith("convert") and f.endswith(".py")),
None,
)
if found:
CONVERSION_SCRIPT = found
log_message(f"Found converter at {CONVERSION_SCRIPT}")
log_message(f"Using converter script: {CONVERSION_SCRIPT}")
result = subprocess.run([
"python", CONVERSION_SCRIPT, str(local_dir), "--outtype", "f16", "--outfile", fp16_path
], shell=False, capture_output=True, text=True)
if result.returncode != 0:
log_message(f"Converter stderr: {result.stderr}")
log_message(f"Converter stdout: {result.stdout}")
raise Exception(f"Error converting to fp16: {result.stderr}")
log_message("Model converted to fp16 successfully!")
# Generate importance matrix for IQ quantizations
imatrix_path = str(outdir / "imatrix.dat")
train_data_path = "llama.cpp/groups_merged.txt" # Default calibration dataset
if not os.path.isfile(train_data_path):
log_message(f"Warning: Training data file not found at {train_data_path}, searching alternatives...")
# Try to find it elsewhere
if os.path.exists("groups_merged.txt"):
train_data_path = "groups_merged.txt"
log_message(f"Found training data at {train_data_path}")
else:
log_message("Calibration data not found. Some quantizations may not work.", error=True)
try:
if os.path.isfile(train_data_path):
generate_importance_matrix(fp16_path, train_data_path, imatrix_path)
else:
imatrix_path = None
except Exception as e:
log_message(f"Error generating importance matrix: {str(e)}", error=True)
imatrix_path = None
# Process each quantization type
total_quants = len(QUANT_CONFIGS)
for i, quant_config in enumerate(QUANT_CONFIGS):
quant_type = quant_config["type"]
current_status["current_quant"] = quant_type
current_status["progress"] = int((i / total_quants) * 100)
save_status()
log_message(f"Processing quantization {i+1}/{total_quants}: {quant_type}")
try:
# Check if this is an IQ quantization
is_iq_quant = quant_type.startswith("IQ")
# Skip if we don't have imatrix and this is an IQ quant
if is_iq_quant and (imatrix_path is None or not os.path.exists(imatrix_path)):
log_message(f"Skipping {quant_type} as importance matrix is not available", error=True)
current_status["quant_status"][quant_type] = "Skipped - No imatrix"
continue
# Set up the repo name
username = user_info["name"]
repo_name = f"{model_name}-{quant_type}-GGUF"
repo_id = f"{username}/{repo_name}"
# Set up output path
quant_file_name = f"{model_name.lower()}-{quant_type.lower()}.gguf"
if is_iq_quant and quant_type != "f16":
quant_file_name = f"{model_name.lower()}-{quant_type.lower()}-imat.gguf"
quant_file_path = str(outdir / quant_file_name)
# Run quantization
if is_iq_quant and quant_type != "f16":
quantize_cmd = [
"./llama.cpp/llama-quantize",
"--imatrix", imatrix_path, fp16_path, quant_file_path, quant_type
]
else:
quantize_cmd = [
"./llama.cpp/llama-quantize",
fp16_path, quant_file_path, quant_type
]
log_message(f"Running quantization command: {' '.join(quantize_cmd)}")
result = subprocess.run(quantize_cmd, shell=False, capture_output=True, text=True)
if result.returncode != 0:
if "out of memory" in result.stderr.lower():
log_message(f"Out of memory error quantizing {quant_type}. Skipping larger models.", error=True)
current_status["quant_status"][quant_type] = "Failed - Out of memory"
# Break the loop to skip larger models
break
else:
raise Exception(f"Error quantizing {quant_type}: {result.stderr}")
log_message(f"Quantized successfully with {quant_type}!")
# Create the repo if it doesn't exist
log_message(f"Creating/updating repo {repo_id}")
try:
repo_url = api.create_repo(repo_id=repo_id, exist_ok=True)
log_message(f"Repo URL: {repo_url}")
except Exception as e:
log_message(f"Error creating repo: {str(e)}", error=True)
current_status["quant_status"][quant_type] = "Failed - Repo creation error"
continue
# Create README with model info
log_message("Creating README")
readme_content = f"""# {repo_name}
This model was converted to GGUF format from [`{SOURCE_MODEL_REPO}`](https://huggingface.co/{SOURCE_MODEL_REPO}) using llama.cpp.
## Quantization: {quant_type}
Approximate size: {quant_config['size_gb']} GB
Notes: {quant_config['notes']}
## Use with llama.cpp
Install llama.cpp through brew (works on Mac and Linux)
```bash
brew install llama.cpp
```
Invoke the llama.cpp server or the CLI.
### CLI:
```bash
llama-cli --hf-repo {repo_id} --hf-file {quant_file_name} -p "The meaning to life and the universe is"
```
### Server:
```bash
llama-server --hf-repo {repo_id} --hf-file {quant_file_name} -c 2048
```
Note: You can also use this checkpoint directly by following the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) in the llama.cpp repo.
Step 1: Clone llama.cpp from GitHub.
```
git clone https://github.com/ggerganov/llama.cpp
```
Step 2: Move into the llama.cpp folder and build it with the `LLAMA_CURL=1` flag, along with any hardware-specific flags (for example, `LLAMA_CUDA=1` for NVIDIA GPUs on Linux).
```
cd llama.cpp && LLAMA_CURL=1 make
```
Step 3: Run inference through the main binary.
```
./llama-cli --hf-repo {repo_id} --hf-file {quant_file_name} -p "The meaning to life and the universe is"
```
or
```
./llama-server --hf-repo {repo_id} --hf-file {quant_file_name} -c 2048
```
## Auto-generated
This model version was automatically generated when updates were detected in the source repository.
Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
"""
readme_path = outdir / "README.md"
with open(readme_path, 'w') as f:
f.write(readme_content)
# Upload the quantized model and README
log_message(f"Uploading quantized model: {quant_file_path}")
try:
api.upload_file(
path_or_fileobj=quant_file_path,
path_in_repo=quant_file_name,
repo_id=repo_id,
)
api.upload_file(
path_or_fileobj=str(readme_path),
path_in_repo="README.md",
repo_id=repo_id,
)
if is_iq_quant and imatrix_path and os.path.isfile(imatrix_path):
log_message("Uploading imatrix.dat")
api.upload_file(
path_or_fileobj=imatrix_path,
path_in_repo="imatrix.dat",
repo_id=repo_id,
)
log_message(f"Successfully uploaded {quant_type} quantization!")
current_status["quant_status"][quant_type] = "Success"
except Exception as e:
log_message(f"Error uploading files: {str(e)}", error=True)
current_status["quant_status"][quant_type] = f"Failed - Upload error: {str(e)}"
except Exception as e:
log_message(f"Error processing {quant_type}: {str(e)}", error=True)
current_status["quant_status"][quant_type] = f"Failed: {str(e)}"
# Continue with the next quantization
# Update status after completion
current_status["status"] = "Completed"
current_status["progress"] = 100
current_status["last_updated"] = datetime.now().isoformat()
log_message("Model processing completed!")
except Exception as e:
log_message(f"Error during model processing: {str(e)}", error=True)
current_status["status"] = "Error"
current_status["error"] = str(e)
traceback.print_exc()
except Exception as e:
log_message(f"Error: {str(e)}", error=True)
current_status["status"] = "Error"
current_status["error"] = str(e)
traceback.print_exc()
finally:
save_status()
def check_and_process():
"""Check for updates and process if needed"""
log_message("Running scheduled check for updates")
if check_for_updates():
log_message("Updates detected, starting processing")
threading.Thread(target=process_model).start()
else:
log_message("No updates detected")
def create_ui():
"""Create the Gradio interface"""
with gr.Blocks(css="body { margin: 0; padding: 0; }") as demo:
gr.Markdown("# 🦙 Automatic GGUF Quantization for Ursa_Minor")
gr.Markdown(f"This space automatically creates quantized GGUF versions of the [Sculptor-AI/Ursa_Minor](https://huggingface.co/{SOURCE_MODEL_REPO}) model whenever it's updated.")
with gr.Row():
with gr.Column(scale=2):
status_info = gr.HTML(label="Status", value="<p>Loading status...</p>")
with gr.Column(scale=1):
with gr.Row():
check_button = gr.Button("Check for Updates", variant="primary")
process_button = gr.Button("Force Processing", variant="secondary")
# gr.Progress() only has an effect when used as an event-handler argument, so
# overall progress is reported through the status panel instead.
with gr.Tab("Quantization Status"):
quant_status = gr.DataFrame(
headers=["Type", "Size (GB)", "Notes", "Status"],
value=lambda: [[q["type"], q["size_gb"], q["notes"], current_status["quant_status"].get(q["type"], "Not processed")] for q in QUANT_CONFIGS],
label="Quantization Status"
)
with gr.Tab("Logs"):
logs = gr.HTML(label="Logs", value="<p>Loading logs...</p>")
def update_status():
"""Update the status display"""
status_html = f"""
<div style="border: 1px solid #ddd; padding: 15px; border-radius: 5px;">
<h3>Current Status: <span style="color: {'green' if current_status['status'] == 'Up to date' else 'blue' if current_status['status'] == 'Processing' else 'red' if 'Error' in current_status['status'] else 'orange'}">{current_status['status']}</span></h3>
<p><strong>Last Checked:</strong> {current_status.get('last_check', 'Never').replace('T', ' ').split('.')[0] if current_status.get('last_check') else 'Never'}</p>
<p><strong>Last Updated:</strong> {current_status.get('last_updated', 'Never').replace('T', ' ').split('.')[0] if current_status.get('last_updated') else 'Never'}</p>
<p><strong>Current Quantization:</strong> {current_status.get('current_quant', 'None')}</p>
{f'<p style="color: red;"><strong>Error:</strong> {current_status["error"]}</p>' if current_status.get('error') else ''}
</div>
"""
return status_html
def update_logs():
"""Update the logs display"""
logs_html = "<div style='height: 400px; overflow-y: auto; background-color: #f9f9f9; padding: 10px; font-family: monospace; white-space: pre-wrap;'>"
for log in current_status["log"]:
if "Error" in log or "error" in log:
logs_html += f"<div style='color: red;'>{log}</div>"
else:
logs_html += f"<div>{log}</div>"
logs_html += "</div>"
return logs_html
def on_check_button():
"""Handle check button click"""
if check_for_updates():
threading.Thread(target=process_model).start()
return update_status(), [[q["type"], q["size_gb"], q["notes"], current_status["quant_status"].get(q["type"], "Not processed")] for q in QUANT_CONFIGS], update_logs()
def on_process_button():
"""Handle process button click"""
threading.Thread(target=process_model).start()
return update_status(), [[q["type"], q["size_gb"], q["notes"], current_status["quant_status"].get(q["type"], "Not processed")] for q in QUANT_CONFIGS], update_logs()
check_button.click(on_check_button, outputs=[status_info, quant_status, logs])
process_button.click(on_process_button, outputs=[status_info, quant_status, logs])
# Periodic refresh: poll the status/table/log callbacks with the `every`
# parameter (supported in Gradio 3.x/4.x; newer releases provide gr.Timer).
# A <script> injected through gr.HTML is not a reliable way to trigger
# refreshes, so it is not used here. Polling needs the queue enabled in
# __main__.
refresh_interval = 5  # seconds
demo.load(update_status, outputs=[status_info], every=refresh_interval)
demo.load(lambda: [[q["type"], q["size_gb"], q["notes"], current_status["quant_status"].get(q["type"], "Not processed")] for q in QUANT_CONFIGS], outputs=[quant_status], every=refresh_interval)
demo.load(update_logs, outputs=[logs], every=refresh_interval)
return demo
# Initialize
def initialize():
"""Initialize the application"""
global QUANT_CONFIGS
# Sort configurations by size (smallest first)
QUANT_CONFIGS = sorted(QUANT_CONFIGS, key=lambda x: x["size_gb"])
load_status()
check_llama_cpp()
# Schedule the hourly update check; without scheduler.start() the job would
# never run.
scheduler = BackgroundScheduler()
scheduler.add_job(check_and_process, 'interval', minutes=60)
scheduler.start()
# Also run one check immediately in the background at startup
threading.Thread(target=check_and_process).start()
if __name__ == "__main__":
initialize()
demo = create_ui()
# The queue is required for the `every`-based polling set up in create_ui()
demo.queue(max_size=1).launch()