<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<script>
function strToHtml(str) {
let parser = new DOMParser();
return parser.parseFromString(str, "text/html");
}
//Short, jQuery-independent function to read an HTML table and write its rows into an array.
//Kudos to RobG at StackOverflow
function tableToObj(table) {
var rows = table.rows;
var propCells = rows[0].cells;
var propNames = [];
var results = [];
var obj, row, cells;
// Use the first row for the property names
// Could use a header section but result is the same if
// there is only one header row
for (var i = 0, iLen = propCells.length; i < iLen; i++) {
propNames.push(
(propCells[i].textContent || propCells[i].innerText).trim()
);
}
// Use the rows for data
// Could use tbody rows here to exclude header & footer
// but starting from 1 gives required result
for (var j = 1, jLen = rows.length; j < jLen; j++) {
cells = rows[j].cells;
obj = {};
for (var k = 0; k < iLen; k++) {
obj[propNames[k]] = (
cells[k].textContent || cells[k].innerText
).trim();
}
results.push(obj);
}
return results;
}
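// Format GPU rows (e.g. as produced by tableToObj) into "Product Name - Memory" strings,
// keeping only the first entry of the Memory column.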
function formatGpu(gpus) {
return gpus.map(
(g) => `${g["Product Name"]} - ${g["Memory"].split(",")[0]}`
);
}
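// Effective bits per weight for each llama.cpp GGUF quantization type,
// including quantization overhead (which is why e.g. Q8_0 is listed as 8.5 bpw).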
const gguf_quants = {
"IQ1_S": 1.56,
"IQ1_M": 1.75,
"IQ2_XXS": 2.06,
"IQ2_XS": 2.31,
"IQ2_S": 2.5,
"IQ3_XXS": 3.06,
"IQ3_XS": 3.3,
"IQ3_S": 3.44,
"IQ3_M": 3.66,
"Q2_K": 3.35,
"Q3_K_S": 3.5,
"Q3_K_M": 3.91,
"Q3_K_L": 4.27,
"IQ4_XS": 4.25,
"Q4_0": 4.55,
"Q4_K_S": 4.58,
"Q4_K_M": 4.85,
"Q5_0": 5.54,
"Q5_K_S": 5.54,
"Q5_K_M": 5.69,
"Q6_K": 6.59,
"Q8_0": 8.5,
}
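// Escape HTML special characters so untrusted text can be safely inserted into markup.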
function sanitize(string) {
const map = {
'&': '&amp;',
'<': '&lt;',
'>': '&gt;',
'"': '&quot;',
"'": '&#x27;',
"/": '&#x2F;',
};
const reg = /[&<>"'/]/ig;
return string.replace(reg, (match) => (map[match]));
}
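// Fetch a model's config.json from the Hugging Face Hub and attach its parameter count.
// The count is derived from the safetensors/pytorch index metadata assuming fp16 weights
// (total_size in bytes / 2 bytes per parameter); if neither index file exists, it falls
// back to scraping the parameter count from the model page through a CORS proxy.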
async function modelConfig(hf_model) {
let config = {}
try {
config = await fetch(
`https://huggingface.co/${hf_model}/raw/main/config.json`
).then(r => r.json());
} catch (err) {
alert(sanitize(String(err)));
return config;
}
let model_size = 0
try {
model_size = (await fetch(`https://huggingface.co/${hf_model}/resolve/main/model.safetensors.index.json`).then(r => r.json()))["metadata"]["total_size"] / 2
if (isNaN(model_size)) {
throw new Error("no size in safetensors metadata")
}
} catch (e) {
try {
model_size = (await fetch(`https://huggingface.co/${hf_model}/resolve/main/pytorch_model.bin.index.json`).then(r => r.json()))["metadata"]["total_size"] / 2
if (isNaN(model_size)) {
throw new Error("no size in pytorch metadata")
}
} catch {
let model_page = await fetch(
"https://corsproxy.io/?" + encodeURIComponent(`https://huggingface.co/${hf_model}`)
).then(r => r.text())
let el = document.createElement('html');
el.innerHTML = model_page
let params_el = el.querySelector('div[data-target="ModelSafetensorsParams"]')
if (params_el !== null) {
model_size = JSON.parse(params_el.attributes.getNamedItem("data-props").value)["safetensors"]["total"]
} else {
params_el = el.querySelector('div[data-target="ModelHeader"]')
model_size = JSON.parse(params_el.attributes.getNamedItem("data-props").value)["model"]["safetensors"]["total"]
}
}
}
config.parameters = model_size
return config
}
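// Rough size of llama.cpp's per-batch input tensors. This sums the element counts of the
// tensors quoted below and is treated as a byte estimate downstream; it is small compared
// to the KV cache and compute buffer, so the approximation barely affects the total.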
function inputBuffer(context = 8192, model_config, bsz = 512) {
/* Calculation taken from github:ggerganov/llama.cpp/llama.cpp:11248
ctx->inp_tokens = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, 1, cparams.n_batch);
n_embd is hidden size (github:ggerganov/llama.cpp/convert.py:248)
*/
const inp_tokens = bsz
const inp_embd = model_config["hidden_size"] * bsz
const inp_pos = bsz
const inp_KQ_mask = context * bsz
const inp_K_shift = context
const inp_sum = bsz
return inp_tokens + inp_embd + inp_pos + inp_KQ_mask + inp_K_shift + inp_sum
}
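// Heuristic for llama.cpp's compute (scratch) buffer in bytes, scaling with context
// length and attention head count; only tuned for a batch size of 512.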
function computeBuffer(context = 8192, model_config, bsz = 512) {
if (bsz != 512) {
alert("batch size other than 512 is currently not supported for the compute buffer, using batchsize 512 for compute buffer calculation, end result result will be an overestimatition")
}
return (context / 1024 * 2 + 0.75) * model_config["num_attention_heads"] * 1024 * 1024
}
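// KV cache size in bytes: keys and values for every layer over the full context,
// with grouped-query attention accounted for via the key/value head count.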
function kvCache(context = 8192, model_config, cache_bit = 16) {
const n_gqa = model_config["num_attention_heads"] / model_config["num_key_value_heads"]
const n_embd_gqa = model_config["hidden_size"] / n_gqa
const n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
const size = 2 * n_elements
return size * (cache_bit / 8)
}
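// Total context-dependent memory in bytes: input buffers + KV cache + compute buffer.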
function contextSize(context = 8192, model_config, bsz = 512, cache_bit = 16) {
return Number.parseFloat((inputBuffer(context, model_config, bsz) + kvCache(context, model_config, cache_bit) + computeBuffer(context, model_config, bsz)).toFixed(2))
}
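// Model weight size in bytes: parameter count * bits per weight / 8.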
function modelSize(model_config, bpw = 4.5) {
return Number.parseFloat((model_config["parameters"] * bpw / 8).toFixed(2))
}
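// Read the form inputs, estimate model/context/total memory use, and write the results
// into the page. This page only exposes GGUF, so the format argument is forced to "gguf";
// the exl2 branch is retained from the original calculator but is unreachable here.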
async function calculateSizes(format, context_loc) {
format = "gguf"
try {
const model_config = await modelConfig(document.getElementById("modelsearch").value)
const context = parseInt(document.getElementById("contextsize").value)
let bsz = 512
let cache_bit = 16
let bpw = 0
if (format === "gguf") {
bsz = parseInt(document.getElementById("batchsize").value)
bpw = gguf_quants[document.getElementById("quantsize").innerText.trim()]
} else if (format == "exl2") {
cache_bit = Number.parseInt(document.getElementById("kvCache").value)
bpw = Number.parseFloat(document.getElementById("bpw").value)
}
const model_size = modelSize(model_config, bpw)
const context_size = contextSize(context, model_config, bsz, cache_bit)
const total_size = ((model_size + context_size) / 2 ** 30)
document.getElementById("resultmodel").innerText = (model_size / 2 ** 30).toFixed(2)
document.getElementById("resultcontext").innerText = (context_size / 2 ** 30).toFixed(2)
const result_total_el = document.getElementById("resulttotal");
result_total_el.innerText = total_size.toFixed(2)
const allocated_vram = Number.parseInt(document.getElementById("maxvram").value);
const vram = allocated_vram
if (vram - total_size > 0.5) {
result_total_el.style.backgroundColor = "#bef264"
} else if (vram - total_size > 0) {
result_total_el.style.backgroundColor = "#facc15"
} else {
result_total_el.style.backgroundColor = "#ef4444"
}
const layer_size = ((model_size / 2 ** 30) / model_config["num_hidden_layers"])
const layer_size_el = document.getElementById("layersize");
layer_size_el.innerText = layer_size.toFixed(2)
const context_dealloc = context_loc === "vram" ? (context_size / 2 ** 30) : 0;
const layers_offload = Math.floor((allocated_vram - context_dealloc) / layer_size)
const layers_offload_el = document.getElementById("layersoffload");
layers_offload_el.innerText = `${layers_offload > model_config["num_hidden_layers"] ? model_config["num_hidden_layers"] : Math.max(0, layers_offload)}/${model_config["num_hidden_layers"]}`
} catch (e) {
alert(e);
}
}
</script>
<link href="./styles.css" rel="stylesheet">
<title>Can I split it? - GGUF VRAM Calculator</title>
</head>
<body class="p-8">
<div x-data="{ format: 'gguf', context_loc: 'vram' }" class="flex flex-col max-h-screen items-center mt-16 gap-10">
<div style="text-align: center;">
<h1 class="text-xl font-semibold leading-6 text-gray-900">
GGUF Model, Can I split it?
</h1>
<h3 class="font-semibold leading-6 text-gray-900">
Based on <a href="https://huggingface.co/NyxKrage" style="color: blue;">NyxKrage</a>'s <a
href="https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator" style="color: blue;">LLM VRAM
calculator</a>
</h3>
</div>
<div class="flex flex-col gap-10">
<div class="w-auto flex flex-col gap-4">
<div class="relative">
<label for="maxvram"
class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
Max Allocated VRAM
</label>
<input value="24" type="number" name="maxvram" id="maxvram" step="1"
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6" />
</div>
<!-- Model Selector -->
<div class="flex flex-row gap-4 relative">
<label for="contextsize"
class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
Model (unquantized)
</label>
<div
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
x-data="{
open: false,
value: 'Nexusflow/Starling-LM-7B-beta',
results: null,
toggle() {
if (this.open) {
return this.close()
}
this.$refs.input.focus()
this.open = true
},
close(focusAfter) {
if (! this.open) return
this.open = false
focusAfter && focusAfter.focus()
}
}" x-on:keydown.escape.prevent.stop="close($refs.input)" x-id="['model-typeahead']"
class="relative">
<!-- Input -->
<input id="modelsearch" x-ref="input" x-on:click="toggle()"
@keypress.debounce.150ms="results = (await
fetch('https://huggingface.co/api/quicksearch?type=model&q=' +
encodeURIComponent(value)).then(r => r.json())).models.filter(m => !m.id.includes('GGUF') && !m.id.includes('AWQ') && !m.id.includes('GPTQ') && !m.id.includes('exl2'));"
:aria-expanded="open" :aria-controls="$id('model-typeahead')" x-model="value"
class="flex justify-between items-center gap-2 w-full" />
<!-- Panel -->
<div x-ref="panel" x-show="open" x-transition.origin.top.left x-on:click.outside="close($refs.input)"
:id="$id('model-typeahead')" style="display: none"
class="absolute left-0 mt-4 w-full rounded-md bg-white shadow-sm ring-1 ring-inset ring-gray-300 z-10">
<template x-for="result in results">
<a @click="value = result.id; close($refs.input)" x-text="result.id"
class="flex cursor-pointer items-center gap-2 w-full first-of-type:rounded-t-md last-of-type:rounded-b-md px-4 py-2.5 text-left text-sm hover:bg-gray-500/5 disabled:text-gray-500"></a>
</template>
</div>
</div>
</div>
<!-- Context Size Selector -->
<div class="relative">
<label for="contextsize"
class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
Context Size
</label>
<input value="8192" type="number" name="contextsize" id="contextsize" step="1024"
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6" />
</div>
<div class="relative">
<label class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">Context
offloaded to</label>
<fieldset x-model="context_loc"
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6">
<legend class="sr-only">Context location</legend>
<div class="space-y-4 sm:flex sm:items-center sm:space-x-10 sm:space-y-0">
<div class="flex items-center">
<input id="context-vram" name="context-allocation" type="radio" value="vram" checked
class="h-4 w-4 border-gray-300 text-indigo-600 focus:ring-indigo-600" />
<label for="context-vram" class="ml-3 block text-sm font-medium leading-6 text-gray-900">VRAM</label>
</div>
<div class="flex items-center">
<input id="context-ram" name="context-allocation" type="radio" value="ram"
class="h-4 w-4 border-gray-300 text-indigo-600 focus:ring-indigo-600" />
<label for="context-ram" class="ml-3 block text-sm font-medium leading-6 text-gray-900">RAM</label>
</div>
</div>
</fieldset>
</div>
<!-- GGUF Options -->
<div x-show="format === 'gguf'" class="relative">
<div class="flex flex-row gap-4">
<label for="contextsize"
class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
Quantization Size
</label>
<div
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
x-data="{
open: false,
value: '',
toggle() {
if (this.open) {
return this.close()
}
this.$refs.button.focus()
this.open = true
},
close(focusAfter) {
if (! this.open) return
this.open = false
focusAfter && focusAfter.focus()
}
}" x-on:keydown.escape.prevent.stop="close($refs.button)" x-id="['dropdown-button']" class="relative">
<!-- Button -->
<button x-ref="button" x-on:click="toggle()" :aria-expanded="open" :aria-controls="$id('dropdown-button')"
type="button" id="quantsize" x-text="value.length === 0 ? 'Q4_K_S' : value"
class="flex justify-between items-center gap-2 w-full">
Q4_K_S
<!-- Heroicon: chevron-down -->
<svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 text-gray-400" viewBox="0 0 20 20"
fill="currentColor">
<path fill-rule="evenodd"
d="M5.293 7.293a1 1 0 011.414 0L10 10.586l3.293-3.293a1 1 0 111.414 1.414l-4 4a1 1 0 01-1.414 0l-4-4a1 1 0 010-1.414z"
clip-rule="evenodd" />
</svg>
</button>
<!-- Panel -->
<div x-data="{ quants: [
'IQ1_S',
'IQ1_M',
'IQ2_XXS',
'IQ2_XS',
'IQ2_S',
'IQ3_XXS',
'IQ3_XS',
'IQ3_S',
'IQ3_M',
'Q2_K',
'Q3_K_S',
'Q3_K_M',
'Q3_K_L',
'IQ4_XS',
'Q4_0',
'Q4_K_S',
'Q4_K_M',
'Q5_0',
'Q5_K_S',
'Q5_K_M',
'Q6_K',
'Q8_0'
]}" x-ref="panel" x-show="open" x-transition.origin.top.left x-on:click.outside="close($refs.button)"
:id="$id('dropdown-button')" style="display: none"
class="absolute left-0 mt-4 w-full rounded-md bg-white shadow-sm ring-1 ring-inset ring-gray-300 z-10">
<template x-for="quant in quants">
<a @click="value = quant; close($refs.button)" x-text="quant"
class="flex cursor-pointer items-center gap-2 w-full first-of-type:rounded-t-md last-of-type:rounded-b-md px-4 py-2.5 text-left text-sm hover:bg-gray-500/5 disabled:text-gray-500"></a>
</template>
</div>
</div>
<div class="relative">
<label for="batchsize"
class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
Batch Size
</label>
<input value="512" type="number" step="128" id="batchsize"
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6" />
</div>
</div>
</div>
<button type="button"
class="rounded-md bg-slate-800 px-3 py-2 text-sm font-semibold text-white shadow-sm hover:bg-slate-700 focus-visible:outline focus-visible:outline-2 focus-visible:outline-offset-2 focus-visible:outline-indigo-600"
@click="calculateSizes(format, context_loc)">
Submit
</button>
</div>
<div class="w-auto flex flex-col gap-4">
<div class="relative">
<label class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
Model Size (GB)
</label>
<div id="resultmodel"
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6">
4.20</div>
</div>
<div class="relative">
<label class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
Context Size (GB)
</label>
<div id="resultcontext"
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6">
6.90</div>
</div>
<div class="relative">
<label class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
Total Size (GB)
</label>
<div id="resulttotal"
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6">
420.69</div>
</div>
<div class="relative">
<label class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
Layer size (GB)
</label>
<div id="layersize"
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6">
42.69</div>
</div>
<div class="relative">
<label class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
Layers offloaded to GPU (out of total)
</label>
<div id="layersoffload"
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6">
42</div>
</div>
</div>
</div>
</div>
<script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/cdn.min.js"></script>
<script defer>
calculateSizes("gguf", "vram")
</script>
</body>
</html>