
Command

Export onnx

pip install transformers_stream_generator einops tiktoken accelerate transformers==4.32.0

Export the basic ONNX model

python export_onnx.py --model CodeFuse-DevOps-Model-7B-Chat/ --seq_length 2048
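
For reference, a rough sketch of what an export script like export_onnx.py presumably does: load the checkpoint with trust_remote_code, fix the sequence length, and run torch.onnx.export with static shapes. The real script may instead split the model into per-block graphs with explicit kv-cache inputs; the wrapper, file name, and opset below are assumptions.

    # Hypothetical sketch of the ONNX export step; the real export_onnx.py
    # may export per-layer graphs with explicit kv-cache inputs instead.
    import torch
    from transformers import AutoModelForCausalLM

    SEQ_LENGTH = 2048  # must match the --seq_length passed on the command line

    model = AutoModelForCausalLM.from_pretrained(
        "CodeFuse-DevOps-Model-7B-Chat/",
        trust_remote_code=True,
        torch_dtype=torch.float32,
    ).eval()

    class Wrapper(torch.nn.Module):
        """Return a plain logits tensor so the exported graph has simple outputs."""
        def __init__(self, m):
            super().__init__()
            self.m = m

        def forward(self, input_ids):
            return self.m(input_ids=input_ids, use_cache=False).logits

    # Static shapes (no dynamic axes) so tpu-mlir can constant-fold everything
    # that depends only on the sequence length.
    dummy_ids = torch.zeros((1, SEQ_LENGTH), dtype=torch.long)
    torch.onnx.export(
        Wrapper(model),
        (dummy_ids,),
        "codefuse-7b.onnx",
        input_names=["input_ids"],
        output_names=["logits"],
        opset_version=15,
    )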

Compile bmodel

tpu-mlir needs to be downloaded ahead of time:

pushd /path_to/tpu-mlir
source envsetup.sh
popd

Compile the basic bmodel

./compile.sh --mode int4 --name codefuse-7b --addr_mode io_alone --seq_length 2048

Adjustments to modeling_qwen.py

  1. First change (enables constant folding by preventing rotary_emb from being recomputed inside the graph; see the rotary (cos, sin) sketch after this list):

    rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha).to(
        hidden_states.device
    )
    

    Change to

    # rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha).to(
    #     hidden_states.device
    # )
    
  2. Second change (enables constant folding):

    if rotary_pos_emb is not None:
        q_pos_emb, k_pos_emb = rotary_pos_emb
        # Slice the pos emb for current inference
        cur_len = query.shape[1]
        q_pos_emb = q_pos_emb[:, -cur_len:, :, :]
        k_pos_emb = k_pos_emb[:, -cur_len:, :, :]
        query = apply_rotary_pos_emb(query, q_pos_emb)
        key = apply_rotary_pos_emb(key, k_pos_emb)
    

    Change to

    if rotary_pos_emb is not None:
        # q_pos_emb, k_pos_emb = rotary_pos_emb
        # Slice the pos emb for current inference
        # cur_len = query.shape[1]
        # q_pos_emb = q_pos_emb[:, -cur_len:, :, :]
        # k_pos_emb = k_pos_emb[:, -cur_len:, :, :]
        # query = apply_rotary_pos_emb(query, q_pos_emb)
        # key = apply_rotary_pos_emb(key, k_pos_emb)
        query = apply_rotary_pos_emb(query, rotary_pos_emb)
        key = apply_rotary_pos_emb(key, rotary_pos_emb)
    
  3. Third change (enables constant folding; tpu-mlir's built-in cos and sin would lose precision, so precomputed (cos, sin) tables are used instead; see the sketch after this list):

    def apply_rotary_pos_emb(t, freqs):
        if apply_rotary_emb_func is not None:
            t_ = t.float()
            freqs = freqs.squeeze(0).squeeze(1)
            cos = freqs[:, : freqs.shape[-1] // 2].cos()
            sin = freqs[:, : freqs.shape[-1] // 2].sin()
            output = apply_rotary_emb_func(t_, cos, sin).type_as(t)
            return output
        else:
            rot_dim = freqs.shape[-1]
            t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
            t_ = t_.float()
            t_pass_ = t_pass_.float()
            t_ = (t_ * freqs.cos()) + (_rotate_half(t_) * freqs.sin())
            return torch.cat((t_, t_pass_), dim=-1).type_as(t)
    

    Change to

    def apply_rotary_pos_emb(t, freqs):
        cos, sin = freqs
        if apply_rotary_emb_func is not None and t.is_cuda:
            t_ = t.float()
            cos = cos.squeeze(0).squeeze(1)[:, : cos.shape[-1] // 2]
            sin = sin.squeeze(0).squeeze(1)[:, : sin.shape[-1] // 2]
            output = apply_rotary_emb_func(t_, cos, sin).type_as(t)
            return output
        else:
            rot_dim = freqs[0].shape[-1]
            cos, sin = freqs
            t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
            t_ = t_.float()
            t_pass_ = t_pass_.float()
            t_ = (t_ * cos) + (_rotate_half(t_) * sin)
            return torch.cat((t_, t_pass_), dim=-1).type_as(t)
    
  4. Fourth change (speeds up inference by avoiding redundant kv-cache handling: present now holds only the newly generated key/value, taken before the concatenation with layer_past, instead of the full accumulated cache):

    if layer_past is not None:
        past_key, past_value = layer_past[0], layer_past[1]
        key = torch.cat((past_key, key), dim=1)
        value = torch.cat((past_value, value), dim=1)
    
    if use_cache:
        present = (key, value)
    else:
        present = None
    

    Change to

    if use_cache:
        present = (key, value)
    else:
        present = None
    
    if layer_past is not None:
        past_key, past_value = layer_past[0], layer_past[1]
        key = torch.cat((past_key, key), dim=1)
        value = torch.cat((past_value, value), dim=1)
    
  5. Fifth change (avoids softmax overflow; see the attention-mask sketch after this list):

    query_length, key_length = query.size(-2), key.size(-2)
    causal_mask = self.bias[
        :, :, key_length - query_length : key_length, :key_length
    ]
    mask_value = torch.finfo(attn_weights.dtype).min
    mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(
        attn_weights.device
    )
    attn_weights = torch.where(
        causal_mask, attn_weights.to(attn_weights.dtype), mask_value
    )
    
    if attention_mask is not None:
        # Apply the attention mask
        attn_weights = attn_weights + attention_mask
    

    Change to

    # query_length, key_length = query.size(-2), key.size(-2)
    # causal_mask = self.bias[
    #     :, :, key_length - query_length : key_length, :key_length
    # ]
    # mask_value = torch.finfo(attn_weights.dtype).min
    # mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(
    #     attn_weights.device
    # )
    # attn_weights = torch.where(
    #     causal_mask, attn_weights.to(attn_weights.dtype), mask_value
    # )
    
    if attention_mask is not None:
        # Apply the attention mask
        attn_weights = attn_weights + attention_mask
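
The first three changes all point in the same direction: the rotary position embedding is no longer computed or sliced inside the traced graph, but precomputed once for the full seq_length and fed into the blocks as a constant (cos, sin) pair that tpu-mlir can fold. A minimal sketch of that precomputation is below; the head dimension, base, and tensor layout are assumptions and have to match what the export script actually feeds in.

    # Hypothetical precomputation of the (cos, sin) tables that the modified
    # apply_rotary_pos_emb expects as `freqs`; shapes are assumptions.
    import torch

    SEQ_LENGTH = 2048
    HEAD_DIM = 128      # assumption: Qwen-7B uses 128-dim attention heads
    BASE = 10000.0      # assumption: default RoPE base, no NTK scaling

    inv_freq = 1.0 / (BASE ** (torch.arange(0, HEAD_DIM, 2).float() / HEAD_DIM))
    t = torch.arange(SEQ_LENGTH).float()
    freqs = torch.outer(t, inv_freq)          # (seq, head_dim / 2)
    emb = torch.cat((freqs, freqs), dim=-1)   # (seq, head_dim)

    # Shape (1, seq, 1, head_dim) so the tables broadcast over batch and heads,
    # matching the (batch, seq, heads, head_dim) layout used in modeling_qwen.py.
    cos = emb.cos().view(1, SEQ_LENGTH, 1, HEAD_DIM)
    sin = emb.sin().view(1, SEQ_LENGTH, 1, HEAD_DIM)
    rotary_pos_emb = (cos, sin)  # passed into every block as a constant input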
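
The fifth change removes the in-graph causal masking, presumably because torch.finfo(dtype).min sits at the edge of the representable range and overflows in low-precision arithmetic. The causal structure then has to be folded into the attention_mask supplied from outside, using a large but finite negative value. A rough sketch, assuming the usual (1, 1, seq, seq) additive-mask layout:

    # Hypothetical construction of an additive attention mask that already
    # encodes causality, using a finite constant instead of torch.finfo(...).min.
    import torch

    SEQ_LENGTH = 2048
    MASK_VALUE = -10000.0  # assumption: large negative, but safely inside fp16 range

    causal = torch.tril(torch.ones(SEQ_LENGTH, SEQ_LENGTH, dtype=torch.bool))
    attention_mask = torch.where(
        causal,
        torch.zeros(SEQ_LENGTH, SEQ_LENGTH),
        torch.full((SEQ_LENGTH, SEQ_LENGTH), MASK_VALUE),
    ).view(1, 1, SEQ_LENGTH, SEQ_LENGTH)

    # With the in-graph torch.where removed, this mask is simply added to
    # attn_weights, so masked positions contribute exp(-10000) ≈ 0 after softmax.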