import gguf
import pytest
import torch

from invokeai.backend.quantization.gguf.ggml_tensor import GGMLTensor
from invokeai.backend.util.calc_tensor_size import calc_tensor_size


def quantize_tensor(data: torch.Tensor, ggml_quantization_type: gguf.GGMLQuantizationType) -> GGMLTensor:
    """Quantize a torch.Tensor to a GGMLTensor.

    Uses the gguf library's numpy implementation to quantize the tensor.
    """
    data_np = data.detach().cpu().numpy()
    quantized_np = gguf.quantize(data_np, ggml_quantization_type)
    return GGMLTensor(
        data=torch.from_numpy(quantized_np),
        ggml_quantization_type=ggml_quantization_type,
        tensor_shape=data.shape,
        compute_dtype=data.dtype,
    ).to(device=data.device)


@pytest.mark.parametrize(
    ["device", "x1_quant_type", "x2_quant_type"],
    [
        # Unquantized inputs on the CPU.
        ("cpu", None, None),
        # Q8_0 quantization on the CPU.
        ("cpu", gguf.GGMLQuantizationType.Q8_0, gguf.GGMLQuantizationType.Q8_0),
        ("cpu", None, gguf.GGMLQuantizationType.Q8_0),
        ("cpu", gguf.GGMLQuantizationType.Q8_0, None),
        # F16 quantization on the CPU.
        ("cpu", gguf.GGMLQuantizationType.F16, gguf.GGMLQuantizationType.F16),
        ("cpu", None, gguf.GGMLQuantizationType.F16),
        ("cpu", gguf.GGMLQuantizationType.F16, None),
        # Unquantized inputs on CUDA.
        ("cuda", None, None),
        # Q8_0 quantization on CUDA.
        ("cuda", gguf.GGMLQuantizationType.Q8_0, gguf.GGMLQuantizationType.Q8_0),
        ("cuda", None, gguf.GGMLQuantizationType.Q8_0),
        ("cuda", gguf.GGMLQuantizationType.Q8_0, None),
        # F16 quantization on CUDA.
        ("cuda", gguf.GGMLQuantizationType.F16, gguf.GGMLQuantizationType.F16),
        ("cuda", None, gguf.GGMLQuantizationType.F16),
        ("cuda", gguf.GGMLQuantizationType.F16, None),
    ],
)
def test_ggml_tensor_multiply(
    device: str, x1_quant_type: gguf.GGMLQuantizationType | None, x2_quant_type: gguf.GGMLQuantizationType | None
):
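    """Test elementwise multiplication of GGMLTensors across devices and quantization formats."""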
    if device == "cuda" and not torch.cuda.is_available():
        pytest.skip("CUDA is not available.")

    generator = torch.Generator().manual_seed(123)

    x1 = torch.randn(32, 64, generator=generator).to(device=device)
    x2 = torch.randn(32, 64, generator=generator).to(device=device)

    x1_quantized = quantize_tensor(x1, x1_quant_type) if x1_quant_type is not None else x1
    x2_quantized = quantize_tensor(x2, x2_quant_type) if x2_quant_type is not None else x2

    for x in [x1, x2, x1_quantized, x2_quantized]:
        assert x.device.type == device

    result = x1 * x2
    result_quantized = x1_quantized * x2_quantized

    assert result.shape == result_quantized.shape
    assert result.dtype == result_quantized.dtype
    assert torch.allclose(result, result_quantized, atol=1e-1)


def test_ggml_tensor_to_dtype_raises_error():
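    """Test that converting a GGMLTensor to a new dtype raises a ValueError."""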
    x = torch.randn(32, 64)
    x_quantized = quantize_tensor(x, gguf.GGMLQuantizationType.Q8_0)

    with pytest.raises(ValueError):
        x_quantized.to(dtype=torch.float32)

    with pytest.raises(ValueError):
        x_quantized.float()


@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA device")
def test_ggml_tensor_to_device():
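    """Test that a GGMLTensor can be moved between devices without altering its quantized data."""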
    x = torch.randn(32, 64)
    x_cpu = quantize_tensor(x, gguf.GGMLQuantizationType.Q8_0)

    x_gpu = x_cpu.to(device=torch.device("cuda"))

    assert x_cpu.device.type == "cpu"
    assert x_gpu.device.type == "cuda"

    assert torch.allclose(x_cpu.quantized_data, x_gpu.quantized_data.cpu(), atol=1e-5)


def test_ggml_tensor_shape():
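    """Test that a GGMLTensor reports the shape of the original (unquantized) tensor."""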
    x = torch.randn(32, 64)
    x_quantized = quantize_tensor(x, gguf.GGMLQuantizationType.Q8_0)

    assert x_quantized.shape == x.shape
    assert x_quantized.size() == x.size()


def test_ggml_tensor_quantized_shape():
    x = torch.randn(32, 64)
    x_quantized = quantize_tensor(x, gguf.GGMLQuantizationType.Q8_0)
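
    # The raw quantized data has a different shape than the logical tensor, because Q8_0 packs values into
    # fixed-size blocks.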
    assert x_quantized.quantized_shape != x.shape


def test_ggml_tensor_calc_size():
    """Test that the calc_tensor_size(...) utility function correctly uses the underlying quantized tensor to calculate
    size rather than the unquantized tensor.
    """
    x = torch.randn(32, 64)
    x_quantized = quantize_tensor(x, gguf.GGMLQuantizationType.Q8_0)
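
    # Q8_0 stores blocks of 32 int8 values plus a float16 scale (34 bytes per 32 float32 values), so the expected
    # compression ratio relative to float32 is roughly 128 / 34, i.e. about 3.8.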
    compression_ratio = calc_tensor_size(x) / calc_tensor_size(x_quantized)

    assert abs(compression_ratio - 4) < 0.5