# Dynamically quantize a fixed set of exported ONNX models, writing each
# result next to its input as "<name>-quantized.onnx".
from onnxruntime.quantization import quantize_dynamic, QuantType

# File extension stripped from each input name to build the output name.
ONNX_SUFFIX = ".onnx"

# Input model files, resolved relative to the current working directory.
models = ["encoder_model.onnx", "decoder_model.onnx", "decoder_with_past_model.onnx"]

for model in models:
    print(f"Quantizing model: {model}")

    # Drop the trailing ".onnx" and append the "-quantized.onnx" marker.
    output_model_name = f"{model[:-len(ONNX_SUFFIX)]}-quantized.onnx"

    quantize_dynamic(
        model_input=model,
        model_output=output_model_name,
        per_channel=True,
        reduce_range=True,
        weight_type=QuantType.QUInt8,
        # NOTE(review): newer onnxruntime releases appear to have dropped the
        # `optimize_model` keyword from quantize_dynamic -- confirm against the
        # installed version and remove this argument if it raises TypeError.
        optimize_model=False,
    )

    print(f"Quantized model: {output_model_name}")