import os
import time

import numpy as np
import onnxruntime as ort

# Configure the TensorRT execution provider through environment variables:
# enable INT8 precision, use the ONNX Runtime-generated calibration table
# (not TensorRT's native one), and cache built engines so later runs skip the rebuild.
os.environ["ORT_TENSORRT_INT8_ENABLE"] = "1"
os.environ["ORT_TENSORRT_INT8_USE_NATIVE_CALIBRATION_TABLE"] = "0"
os.environ["ORT_TENSORRT_ENGINE_CACHE_ENABLE"] = "1"

# Disable ONNX Runtime graph optimizations so TensorRT receives the original graph.
sess_opt = ort.SessionOptions()
sess_opt.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL

print("Create inference session...")
execution_provider = ["TensorrtExecutionProvider", "CUDAExecutionProvider"]
sess = ort.InferenceSession("model.onnx", sess_options=sess_opt, providers=execution_provider)
run_opt = ort.RunOptions()

# Dummy BERT-style inputs: batch size 1, sequence length 128.
sequence = 128
batch = 1
input_ids = np.ones((batch, sequence), dtype=np.int64)
attention_mask = np.ones((batch, sequence), dtype=np.int64)
token_type_ids = np.ones((batch, sequence), dtype=np.int64)

# Warm-up run: triggers the TensorRT engine build/calibration before timing starts.
print("Warm up phase...")
sess.run(
    None,
    {
        sess.get_inputs()[0].name: input_ids,
        sess.get_inputs()[1].name: attention_mask,
        sess.get_inputs()[2].name: token_type_ids,
    },
    run_options=run_opt,
)

# Timed loop: average latency over 2000 identical runs.
print("Start inference...")
start_time = time.time()
max_iters = 2000
predict = {}
for _ in range(max_iters):
    predict = sess.run(
        None,
        {
            sess.get_inputs()[0].name: input_ids,
            sess.get_inputs()[1].name: attention_mask,
            sess.get_inputs()[2].name: token_type_ids,
        },
        run_options=run_opt,
    )
print("Average Inference Time = {:.3f} ms".format((time.time() - start_time) * 1000 / max_iters))