JoshuaChak
/

bmodel-qwen1.5-1.8b

Model card Files Files and versions Community

bmodel-qwen1.5-1.8b / Yi /compile /export_onnx.py

JoshuaChak

Upload folder using huggingface_hub

7c071a8 verified 5 months ago

raw

history blame

No virus

10.9 kB

	#!/usr/bin/env python3
	# ==============================================================================
	#
	# Copyright (C) 2023 Sophgo Technologies Inc. All rights reserved.
	#
	# TPU-MLIR is licensed under the 2-Clause BSD License except for the
	# third-party components.
	#
	# ==============================================================================

	import os
	import torch
	import argparse
	from tqdm import tqdm
	from transformers import AutoTokenizer, AutoModelForCausalLM
	torch.set_grad_enabled(False)

	# Command-line interface: location of the torch checkpoint and the sequence
	# length used for the dummy inputs of the exported graphs.
	parser = argparse.ArgumentParser(description='export onnx.')
	parser.add_argument(
	    '--model_path',
	    type=str,
	    default="../Yi-6B-Chat",
	    help='path to the torch model.',
	)
	parser.add_argument(
	    '--seq_length',
	    type=int,
	    default=512,
	    help="sequence length",
	)
	args = parser.parse_args()

	model_path = args.model_path
	# Every exported ONNX file is written below this directory.
	folder = "./tmp/onnx"

	# Load the checkpoint, switch it to eval mode and mark all of its
	# parameters as not requiring gradients.
	origin_model = AutoModelForCausalLM.from_pretrained(
	    model_path,
	    trust_remote_code=True,
	)
	origin_model = origin_model.eval()
	origin_model.requires_grad_(False)

	# Handles and sizes shared by the wrapper modules and export helpers below.
	config = origin_model.config
	transformer = origin_model.model
	layers = transformer.layers

	SEQ_LENGTH = args.seq_length
	NUM_LAYERS = config.num_hidden_layers
	HIDDEN_SIZE = config.hidden_size
	NUM_ATTENTION_HEADS = config.num_attention_heads
	HEAD_DIM = HIDDEN_SIZE // NUM_ATTENTION_HEADS
	VOCAB_SIZE = config.vocab_size
	print(f'Layers: {NUM_LAYERS}\nHidden size: {HIDDEN_SIZE}\n')

	tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

	class Embedding(torch.nn.Module):
	    """Export wrapper around the token-embedding lookup of the loaded model."""

	    def __init__(self):
	        super().__init__()

	    def forward(self, input_ids):
	        """Return ``transformer.embed_tokens`` applied to ``input_ids``."""
	        embedded = transformer.embed_tokens(input_ids)
	        return embedded


	class Block(torch.nn.Module):
	    """Export wrapper for one decoder layer called without a KV history."""

	    def __init__(self, layer_id):
	        super().__init__()
	        self.layer_id = layer_id
	        self.layer = layers[layer_id]

	    def forward(self, hidden_states, position_ids, attention_mask):
	        """Run the wrapped layer and return ``(hidden_states, k, v)``.

	        The layer is called with ``use_cache=True`` and its second output is
	        unpacked into the key and value tensors returned here.
	        """
	        # The wrapped layer is given the mask before the positions.
	        layer_out = self.layer(hidden_states,
	                               attention_mask,
	                               position_ids,
	                               use_cache=True)
	        next_states, kv_pair = layer_out
	        key_out, value_out = kv_pair
	        return next_states, key_out, value_out


	class BlockCache(torch.nn.Module):
	    """Export wrapper for one decoder layer called with an existing KV history."""

	    def __init__(self, layer_id):
	        super().__init__()
	        self.layer_id = layer_id
	        self.layer = layers[layer_id]

	    def forward(self, hidden_states, position_ids, attention_mask, past_k,
	                past_v):
	        """Run the wrapped layer with ``(past_k, past_v)`` as its history.

	        Returns ``(hidden_states, k, v)``, where ``k`` and ``v`` are the two
	        members of the cache tuple the layer hands back.
	        """
	        history = (past_k, past_v)
	        layer_out = self.layer(hidden_states,
	                               attention_mask,
	                               position_ids=position_ids,
	                               past_key_value=history,
	                               use_cache=True)
	        next_states, kv_pair = layer_out
	        key_out, value_out = kv_pair
	        return next_states, key_out, value_out


	class LmHead(torch.nn.Module):
	    """Export wrapper that applies the final norm and then the LM head."""

	    def __init__(self):
	        super().__init__()

	    def forward(self, hidden_states):
	        """Return ``origin_model.lm_head(transformer.norm(hidden_states))``."""
	        normed = transformer.norm(hidden_states)
	        return origin_model.lm_head(normed)


	class GreedyHead(torch.nn.Module):
	    """Pick the index of the largest logit along the last dimension."""

	    def __init__(self):
	        super().__init__()

	    def forward(self, m_logits):
	        """Return the top-1 index of ``m_logits`` (cast to float first).

	        The result keeps a trailing dimension of size 1, as produced by
	        ``torch.topk(..., 1)``.
	        """
	        top = torch.topk(m_logits.float(), 1)
	        # topk yields (values, indices); only the indices are needed.
	        return top[1]


	# refs:https://github.com/huggingface/transformers/blob/main/src/transformers/generation/logits_process.py
	class PenaltySampleHead(torch.nn.Module):
	    """Sampling head: repetition penalty, top-k, temperature, then top-p.

	    ``keep_matrix`` is a ``(1, top_k)`` boolean row whose first
	    ``min_tokens_to_keep`` entries are True; those candidates are never
	    filtered out by the top-p step.
	    """

	    def __init__(self, top_k = 50, min_tokens_to_keep = 5):
	        super().__init__()
	        self.top_k = top_k
	        self.min_tokens_to_keep = min_tokens_to_keep
	        always_keep = torch.zeros((1, top_k), dtype=torch.bool)
	        always_keep[0, :min_tokens_to_keep] = True
	        self.keep_matrix = always_keep

	    def forward(self, m_logits, input_ids, top_p, temperature, penalty):
	        """Return ``(probs, token)`` for the ``top_k`` best candidates.

	        ``m_logits`` is modified in place: the entries addressed by
	        ``input_ids`` are overwritten with their penalised values.
	        ``token`` holds the candidate indices from ``torch.topk`` and
	        ``probs`` the matching probabilities after top-p filtering.
	        """
	        # Repetition penalty: negative logits are multiplied by the penalty,
	        # the others are divided by it.
	        seen = torch.gather(m_logits, 1, input_ids)
	        penalised = torch.where(seen < 0, seen * penalty, seen / penalty)
	        m_logits.scatter_(1, input_ids, penalised)

	        # Top-k selection, followed by temperature scaling of the survivors.
	        top_logits, token = torch.topk(m_logits.float(), self.top_k)
	        scaled = top_logits / temperature

	        # Top-p: keep candidates whose cumulative probability is below top_p,
	        # plus the always-kept leading entries (bool + bool acts as OR).
	        cumulative = scaled.softmax(dim=1).cumsum(dim=1)
	        keep = (cumulative < top_p) + self.keep_matrix
	        floor = torch.FloatTensor([-1000.])
	        probs = torch.where(keep, scaled, floor).softmax(dim=1)
	        return probs, token

	def convert_block(layer_id):
	    """Export decoder layer ``layer_id`` (no KV history) to ONNX.

	    The graph is written to ``{folder}/block_{layer_id}.onnx`` using dummy
	    inputs sized by ``SEQ_LENGTH`` and ``HIDDEN_SIZE``.
	    """
	    seq = SEQ_LENGTH
	    dummy_states = torch.randn((1, seq, HIDDEN_SIZE))
	    dummy_positions = torch.tensor([range(seq)], dtype=torch.long)
	    # Causal mask: -1000 strictly above the diagonal, zero elsewhere.
	    upper = torch.ones((1, 1, seq, seq), dtype=torch.float32).triu(diagonal=1)
	    dummy_mask = -1000 * upper

	    torch.onnx.export(
	        Block(layer_id),
	        (dummy_states, dummy_positions, dummy_mask),
	        f'{folder}/block_{layer_id}.onnx',
	        verbose=False,
	        input_names=['input_states', 'position_ids', 'attention_mask'],
	        output_names=['hidden_states', 'past_k', 'past_v'],
	        do_constant_folding=True,
	        opset_version=15,
	    )


	def convert_block_cache(layer_id):
	    """Export decoder layer ``layer_id`` (with KV history inputs) to ONNX.

	    The graph is written to ``{folder}/block_cache_{layer_id}.onnx``. The
	    dummy inputs describe a single new position plus a history of
	    ``SEQ_LENGTH`` entries shaped
	    ``(1, SEQ_LENGTH, config.num_key_value_heads, HEAD_DIM)``.
	    """
	    model = BlockCache(layer_id)
	    hidden_states = torch.randn((1, 1, HIDDEN_SIZE))
	    position_ids = torch.tensor([range(1)], dtype=torch.long)
	    # One query row over SEQ_LENGTH + 1 key slots; triu(diagonal=1) on a
	    # single row leaves only column 0 at zero. These are placeholder values.
	    attention_mask = -1000 * torch.ones(
	        (1, 1, 1, SEQ_LENGTH + 1), dtype=torch.float32).triu(diagonal=1)
	    kv_shape = (1, SEQ_LENGTH, config.num_key_value_heads, HEAD_DIM)
	    past_k = torch.randn(kv_shape)
	    past_v = torch.randn(kv_shape)

	    # No separate eager forward pass is needed: the export call itself
	    # executes the model on these inputs.
	    torch.onnx.export(
	        model,
	        (hidden_states, position_ids, attention_mask, past_k, past_v),
	        f'{folder}/block_cache_{layer_id}.onnx',
	        verbose=False,
	        input_names=[
	            'input_states', 'position_ids', 'attention_mask', 'history_k',
	            'history_v',
	        ],
	        output_names=['hidden_states', 'past_k', 'past_v'],
	        do_constant_folding=True,
	        opset_version=15,
	    )


	def convert_embedding():
	    """Export the embedding wrapper to ``{folder}/embedding.onnx``.

	    The dummy input is the id sequence ``0 .. SEQ_LENGTH - 1`` in one row.
	    """
	    dummy_ids = torch.tensor([range(SEQ_LENGTH)])
	    target = f'{folder}/embedding.onnx'

	    torch.onnx.export(
	        Embedding(),
	        (dummy_ids),
	        target,
	        verbose=False,
	        input_names=['input_ids'],
	        output_names=['input_embed'],
	        do_constant_folding=True,
	        opset_version=15,
	    )


	def convert_lm_head():
	    """Export the norm + LM-head wrapper to ``{folder}/lm_head.onnx``.

	    The dummy input is a random ``(1, HIDDEN_SIZE)`` tensor.
	    """
	    dummy_hidden = torch.randn(1, HIDDEN_SIZE)
	    target = f'{folder}/lm_head.onnx'

	    torch.onnx.export(
	        LmHead(),
	        (dummy_hidden),
	        target,
	        verbose=False,
	        input_names=['hidden_states'],
	        output_names=['m_logits'],
	        do_constant_folding=True,
	        opset_version=15,
	    )

	def convert_greedy_head():
	    """Export the greedy (top-1) head to ``{folder}/greedy_head.onnx``.

	    The dummy input is a random ``(1, VOCAB_SIZE)`` logits tensor.
	    """
	    dummy_logits = torch.randn(1, VOCAB_SIZE)
	    target = f'{folder}/greedy_head.onnx'

	    torch.onnx.export(
	        GreedyHead(),
	        (dummy_logits),
	        target,
	        verbose=False,
	        input_names=['m_logits'],
	        output_names=['token'],
	        do_constant_folding=True,
	        opset_version=15,
	    )


	def convert_penalty_sample_head():
	    """Export the penalty/top-k/top-p head to ``{folder}/penalty_sample_head.onnx``.

	    Dummy inputs: random ``(1, VOCAB_SIZE)`` logits, the id row
	    ``0 .. SEQ_LENGTH - 1``, and one-element tensors for ``top_p``,
	    ``temperature`` and ``penalty``.
	    """
	    dummy_logits = torch.randn(1, VOCAB_SIZE)
	    dummy_ids = torch.tensor([range(SEQ_LENGTH)])
	    # One-element tensors standing in for the runtime sampling parameters.
	    dummy_top_p = torch.tensor([0.8])
	    dummy_temperature = torch.tensor([0.98])
	    dummy_penalty = torch.tensor([0.98])
	    example_inputs = (dummy_logits, dummy_ids, dummy_top_p,
	                      dummy_temperature, dummy_penalty)

	    torch.onnx.export(
	        PenaltySampleHead(),
	        example_inputs,
	        f'{folder}/penalty_sample_head.onnx',
	        verbose=False,
	        input_names=[
	            'm_logits', 'input_ids', 'top_p', 'temperature',
	            'penalty',
	        ],
	        output_names=['probs', 'token'],
	        do_constant_folding=True,
	        opset_version=15,
	    )

	# Create the output folder for the ONNX files. exist_ok=True makes this a
	# single call with no separate existence check beforehand.
	os.makedirs(folder, exist_ok=True)

	# Export every decoder layer in both forms (with and without KV history).
	print('Convert block & block_cache')
	for i in tqdm(range(NUM_LAYERS)):
	    convert_block_cache(i)
	    convert_block(i)

	print('Convert embedding')
	convert_embedding()

	# The LM head and both sampling heads are exported together.
	print('Convert lm_head')
	convert_lm_head()
	convert_greedy_head()
	convert_penalty_sample_head()
	print("Done")

	def test_net_with_mask():
	    """Run the export wrappers eagerly on a fixed prompt and print the result.

	    The prompt is padded to ``SEQ_LENGTH`` and pushed through every ``Block``;
	    the last prompt position is fed to ``LmHead`` + ``GreedyHead`` for the
	    first token. Five more tokens are then produced one at a time through the
	    ``BlockCache`` wrappers, writing each new key/value into the per-layer
	    caches. Decoded pieces and the final id list are printed; nothing is
	    returned.
	    """
	    num_layers = NUM_LAYERS
	    MAX_LEN = SEQ_LENGTH
	    embed = Embedding()
	    blocks = [Block(i) for i in range(num_layers)]
	    block_kvs = [BlockCache(i) for i in range(num_layers)]
	    lm = LmHead()
	    greedy_head = GreedyHead()

	    query = '你好'
	    print(query)
	    # Hard-coded prompt ids are used instead of tokenizer output --
	    # presumably the chat-templated form of `query`; verify against the
	    # tokenizer of the model being tested.
	    ids = [6, 3903, 144, 25902, 7, 144, 6, 765, 13611, 144]
	    print("input ids:{}".format(ids))
	    token_len = len(ids)

	    # Pad ids and positions with zeros up to the fixed sequence length.
	    ids = ids + (MAX_LEN - token_len) * [0]
	    input_ids = torch.tensor(ids).view(MAX_LEN)
	    out = embed(input_ids).view(1, MAX_LEN, HIDDEN_SIZE)
	    position_ids = list(range(token_len)) + (MAX_LEN - token_len) * [0]
	    position_ids = torch.tensor([position_ids])

	    # Causal mask over the real tokens only; padded rows stay fully masked.
	    attention_mask = torch.ones((MAX_LEN, MAX_LEN)) * -10000.0
	    for row in range(token_len):
	        attention_mask[row, :row + 1] = 0
	    attention_mask = attention_mask.view((1, 1, MAX_LEN, MAX_LEN))

	    k_cache = []
	    v_cache = []
	    for layer_idx in tqdm(range(num_layers)):
	        out, k, v = blocks[layer_idx](out, position_ids, attention_mask)
	        # Zero the cache entries that belong to padding positions.
	        k[:, token_len:, :, :] = 0
	        v[:, token_len:, :, :] = 0
	        k_cache.append(k)
	        v_cache.append(v)

	    # First generated token comes from the last real prompt position.
	    out = out[0, token_len - 1:token_len, :].view(1, HIDDEN_SIZE)
	    m_logits = lm(out)
	    token = greedy_head(m_logits)
	    out_ids = [int(token)]
	    word = tokenizer._convert_id_to_token(int(token[0]))
	    print(word, end="")

	    for _ in tqdm(range(5)):
	        token_len += 1
	        input_ids = torch.tensor([token])
	        out = embed(input_ids).view(1, 1, HIDDEN_SIZE)
	        position_ids = torch.tensor([[token_len - 1]])
	        attention_mask = torch.ones((1, 1, 1, MAX_LEN + 1)) * -10000.0
	        # NOTE(review): this also unmasks cache slot token_len - 1, which is
	        # only written after the layer call below -- confirm whether
	        # `:token_len - 1` was intended.
	        attention_mask[:, :, :, :token_len] = 0
	        # The last mask column is always left open.
	        attention_mask[:, :, :, -1] = 0

	        for layer_idx in range(num_layers):
	            out, k_cache_present, v_cache_present = block_kvs[layer_idx](
	                out, position_ids, attention_mask,
	                k_cache[layer_idx], v_cache[layer_idx])
	            # Store the new key/value at the slot of the token just processed.
	            k_cache[layer_idx][:, token_len - 1:token_len, :, :] = k_cache_present
	            v_cache[layer_idx][:, token_len - 1:token_len, :, :] = v_cache_present
	        m_logits = lm(out)
	        token = greedy_head(m_logits)
	        out_ids.append(int(token))
	        word = tokenizer._convert_id_to_token(int(token[0]))
	        print(word, end="")
	    print("\noutput_ids:{}".format(out_ids))

	# test_net_with_mask()