""" Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet).""" |
|
import argparse
import logging
import os
import time
import timeit

import datasets
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import torch
from absl import logging as absl_logging
from accelerate import Accelerator
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader
from utils_qa import postprocess_qa_predictions

import transformers
from transformers import AutoTokenizer, EvalPrediction, default_data_collator, set_seed
from transformers.trainer_pt_utils import nested_concat, nested_truncate


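# Silence TensorRT and absl below the WARNING level; the script's own logger is configured further down.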
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
absl_logger = absl_logging.get_absl_logger()
absl_logger.setLevel(logging.WARNING)

logger = logging.getLogger(__name__)

parser = argparse.ArgumentParser()

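# Required parameters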
parser.add_argument(
    "--onnx_model_path",
    default=None,
    type=str,
    required=True,
    help="Path to the ONNX model.",
)
parser.add_argument(
    "--output_dir",
    default=None,
    type=str,
    required=True,
    help="The output directory where the model checkpoints and predictions will be written.",
)

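# Tokenizer, SQuAD and TensorRT options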
parser.add_argument(
    "--tokenizer_name",
    default="",
    type=str,
    required=True,
    help="Pretrained tokenizer name or path if not the same as model_name",
)
parser.add_argument(
    "--version_2_with_negative",
    action="store_true",
    help="If true, the SQuAD examples contain some that do not have an answer.",
)
parser.add_argument(
    "--null_score_diff_threshold",
    type=float,
    default=0.0,
    help="If null_score - best_non_null is greater than the threshold predict null.",
)
parser.add_argument(
    "--max_seq_length",
    default=384,
    type=int,
    help=(
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    ),
)
parser.add_argument(
    "--doc_stride",
    default=128,
    type=int,
    help="When splitting up a long document into chunks, how much stride to take between chunks.",
)
parser.add_argument("--per_device_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.")
parser.add_argument(
    "--n_best_size",
    default=20,
    type=int,
    help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
)
parser.add_argument(
    "--max_answer_length",
    default=30,
    type=int,
    help=(
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another."
    ),
)
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
parser.add_argument(
    "--dataset_name",
    type=str,
    default=None,
    required=True,
    help="The name of the dataset to use (via the datasets library).",
)
parser.add_argument(
    "--dataset_config_name",
    type=str,
    default=None,
    help="The configuration name of the dataset to use (via the datasets library).",
)
parser.add_argument(
    "--preprocessing_num_workers", type=int, default=4, help="The number of processes to use for preprocessing."
)
parser.add_argument("--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets")
parser.add_argument(
    "--fp16",
    action="store_true",
    help="Whether to use 16-bit (mixed) precision instead of 32-bit",
)
parser.add_argument(
    "--int8",
    action="store_true",
    help="Whether to use INT8",
)

args = parser.parse_args()

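# A fast tokenizer is required: the offset mappings used during preprocessing are only produced by fast tokenizers.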
if args.tokenizer_name:
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=True)
else:
    raise ValueError(
        "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
        "You can do it from another script, save it, and load it from here, using --tokenizer_name."
    )

logger.info("Training/evaluation parameters %s", args)

args.eval_batch_size = args.per_device_eval_batch_size

INPUT_SHAPE = (args.eval_batch_size, args.max_seq_length)

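# Build a TensorRT engine from the ONNX model. STRICT_TYPES asks the builder to respect the requested
# precision (FP32/FP16/INT8) rather than silently falling back to kernels in another precision.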
STRICT_TYPES = True

engine_name = "temp_engine/bert-fp32.engine"
if args.fp16:
    engine_name = "temp_engine/bert-fp16.engine"
if args.int8:
    engine_name = "temp_engine/bert-int8.engine"

if not os.path.exists("temp_engine"):
    os.makedirs("temp_engine")

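# Parse the ONNX model into a TensorRT network, build the engine, and serialize it to disk so it can be
# reloaded for evaluation below.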
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(
    network, TRT_LOGGER
) as parser:
    with open(args.onnx_model_path, "rb") as model:
        if not parser.parse(model.read()):
            for error in range(parser.num_errors):
                print(parser.get_error(error))

    network_inputs = [network.get_input(i) for i in range(network.num_inputs)]
    input_names = [_input.name for _input in network_inputs]

    with builder.create_builder_config() as config:
        config.max_workspace_size = 1 << 50
        if STRICT_TYPES:
            config.set_flag(trt.BuilderFlag.STRICT_TYPES)
        if args.fp16:
            config.set_flag(trt.BuilderFlag.FP16)
        if args.int8:
            config.set_flag(trt.BuilderFlag.INT8)
        profile = builder.create_optimization_profile()
        config.add_optimization_profile(profile)
        for i in range(len(input_names)):
            profile.set_shape(input_names[i], INPUT_SHAPE, INPUT_SHAPE, INPUT_SHAPE)
        engine = builder.build_engine(network, config)

        # Serialize the engine and store it in a file (it can be deserialized and reused later).
        with open(engine_name, "wb") as f:
            f.write(engine.serialize())


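# Run one batch through the TensorRT execution context: copy the host inputs to the GPU, launch inference
# asynchronously, copy the start/end logits back, and return the outputs together with the measured time.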
def model_infer(inputs, context, d_inputs, h_output0, h_output1, d_output0, d_output1, stream):
    input_ids = np.asarray(inputs["input_ids"], dtype=np.int32)
    attention_mask = np.asarray(inputs["attention_mask"], dtype=np.int32)
    token_type_ids = np.asarray(inputs["token_type_ids"], dtype=np.int32)

    # Copy the inputs to device memory asynchronously
    cuda.memcpy_htod_async(d_inputs[0], input_ids.ravel(), stream)
    cuda.memcpy_htod_async(d_inputs[1], attention_mask.ravel(), stream)
    cuda.memcpy_htod_async(d_inputs[2], token_type_ids.ravel(), stream)

    start_time = time.time()

    # Run inference
    context.execute_async(
        bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output0), int(d_output1)], stream_handle=stream.handle
    )

    # Transfer predictions back from the GPU
    cuda.memcpy_dtoh_async(h_output0, d_output0, stream)
    cuda.memcpy_dtoh_async(h_output1, d_output1, stream)

    # Synchronize the stream and stop the timer
    stream.synchronize()

    end_time = time.time()
    infer_time = end_time - start_time
    outputs = (h_output0, h_output1)

    return outputs, infer_time


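# The Accelerator is only used for logging control and for gathering/padding predictions across processes;
# the forward pass itself goes through TensorRT rather than through an accelerate-prepared model.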
accelerator = Accelerator()

# Make one log on every process with the configuration for debugging.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)

# Only the local main process logs at INFO level; the other processes only report errors.
logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
if accelerator.is_local_main_process:
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()
else:
    datasets.utils.logging.set_verbosity_error()
    transformers.utils.logging.set_verbosity_error()

# If passed along, set the seed for reproducibility.
if args.seed is not None:
    set_seed(args.seed)

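# Load the evaluation dataset (only the validation split is used). In distributed runs, the datasets library
# ensures that only one local process downloads the data.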
if args.dataset_name is not None:
    raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
else:
    raise ValueError("Evaluation requires a dataset name")

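# Preprocessing the dataset. Resolve the question/context/answer column names, falling back to the first three
# columns if the standard SQuAD names are not present.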
column_names = raw_datasets["validation"].column_names

question_column_name = "question" if "question" in column_names else column_names[0]
context_column_name = "context" if "context" in column_names else column_names[1]
answer_column_name = "answers" if "answers" in column_names else column_names[2]

# The padding side determines whether we build (question, context) or (context, question) pairs.
pad_on_right = tokenizer.padding_side == "right"

if args.max_seq_length > tokenizer.model_max_length:
    logger.warning(
        f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the "
        f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
    )

max_seq_length = min(args.max_seq_length, tokenizer.model_max_length)


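# Validation preprocessing: tokenize each question with its context, split long contexts into overlapping
# features (controlled by --doc_stride), and keep offset mappings plus example ids so the predicted start/end
# logits can be mapped back to answer spans in the original text.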
def prepare_validation_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and can make the
    # truncation of the context fail (the tokenized question takes up too much space), so it is removed.
    examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]

    # Tokenize with truncation on the context only, keeping the overflowing tokens so that a long context
    # produces several features, each overlapping its neighbours by `doc_stride` tokens.
    tokenized_examples = tokenizer(
        examples[question_column_name if pad_on_right else context_column_name],
        examples[context_column_name if pad_on_right else question_column_name],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_seq_length,
        stride=args.doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example can give several features, we need a map from features back to their example.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # Keep the example id of each feature so predictions can be grouped per example during post-processing.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence ids to know which tokens belong to the context and which to the question.
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offsets that are not part of the context so post-processing can ignore them.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples


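# Build the validation features; the copy handed to the DataLoader drops the columns the model does not consume.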
eval_examples = raw_datasets["validation"]
eval_dataset = eval_examples.map(
    prepare_validation_features,
    batched=True,
    num_proc=args.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not args.overwrite_cache,
    desc="Running tokenizer on validation dataset",
)

data_collator = default_data_collator

eval_dataset_for_model = eval_dataset.remove_columns(["example_id", "offset_mapping"])
eval_dataloader = DataLoader(
    eval_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
)


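# Post-processing: match the start and end logits to answer spans in the original context and format both
# predictions and references the way the SQuAD metric expects them.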
def post_processing_function(examples, features, predictions, stage="eval"):
    predictions = postprocess_qa_predictions(
        examples=examples,
        features=features,
        predictions=predictions,
        version_2_with_negative=args.version_2_with_negative,
        n_best_size=args.n_best_size,
        max_answer_length=args.max_answer_length,
        null_score_diff_threshold=args.null_score_diff_threshold,
        output_dir=args.output_dir,
        prefix=stage,
    )

    if args.version_2_with_negative:
        formatted_predictions = [
            {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
        ]
    else:
        formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]

    references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
    return EvalPrediction(predictions=formatted_predictions, label_ids=references)


metric = load_metric("squad_v2" if args.version_2_with_negative else "squad")

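# Evaluation: deserialize the TensorRT engine built above, allocate the input/output buffers once, then run
# every validation batch through the execution context.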
logger.info("Loading ONNX model %s for evaluation", args.onnx_model_path) |
|
with open(engine_name, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime, runtime.deserialize_cuda_engine( |
|
f.read() |
|
) as engine, engine.create_execution_context() as context: |
|
|
|
for i in range(len(input_names)): |
|
context.set_binding_shape(i, INPUT_SHAPE) |
|
assert context.all_binding_shapes_specified |
|
|
|
def binding_nbytes(binding): |
|
return trt.volume(engine.get_binding_shape(binding)) * engine.get_binding_dtype(binding).itemsize |
|
|
|
|
|
d_inputs = [cuda.mem_alloc(binding_nbytes(binding)) for binding in engine if engine.binding_is_input(binding)] |
|
|
|
|
|
h_output0 = cuda.pagelocked_empty(tuple(context.get_binding_shape(3)), dtype=np.float32) |
|
h_output1 = cuda.pagelocked_empty(tuple(context.get_binding_shape(4)), dtype=np.float32) |
|
d_output0 = cuda.mem_alloc(h_output0.nbytes) |
|
d_output1 = cuda.mem_alloc(h_output1.nbytes) |
|
|
|
|
|
stream = cuda.Stream() |
|
|
|
|
|
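    # Run the evaluation loop, gathering the (padded) start/end logits from every process.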
logger.info("***** Running Evaluation *****") |
|
logger.info(f" Num examples = {len(eval_dataset)}") |
|
logger.info(f" Batch size = {args.per_device_eval_batch_size}") |
|
|
|
total_time = 0.0 |
|
niter = 0 |
|
start_time = timeit.default_timer() |
|
|
|
all_preds = None |
|
for step, batch in enumerate(eval_dataloader): |
|
outputs, infer_time = model_infer(batch, context, d_inputs, h_output0, h_output1, d_output0, d_output1, stream) |
|
total_time += infer_time |
|
niter += 1 |
|
|
|
start_logits, end_logits = outputs |
|
start_logits = torch.tensor(start_logits) |
|
end_logits = torch.tensor(end_logits) |
|
|
|
|
|
start_logits = accelerator.pad_across_processes(start_logits, dim=1, pad_index=-100) |
|
end_logits = accelerator.pad_across_processes(end_logits, dim=1, pad_index=-100) |
|
|
|
logits = (accelerator.gather(start_logits).cpu().numpy(), accelerator.gather(end_logits).cpu().numpy()) |
|
all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) |
|
|
|
if all_preds is not None: |
|
all_preds = nested_truncate(all_preds, len(eval_dataset)) |
|
|
|
    evalTime = timeit.default_timer() - start_time
    logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(eval_dataset))

    # Inference time as measured around the TensorRT execute_async calls only (excludes data loading and
    # post-processing).
    logger.info("Average Inference Time = {:.3f} ms".format(total_time * 1000 / niter))
    logger.info("Total Inference Time = {:.3f} ms".format(total_time * 1000))
    logger.info("Total Number of Inference = %d", niter)

prediction = post_processing_function(eval_examples, eval_dataset, all_preds)
eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
logger.info(f"Evaluation metrics: {eval_metric}")