# File size: 12,721 Bytes

# f3a1217

import numpy as np
# import onnxruntime as ort
import ztu_somemodelruntime_rknnlite2 as ort
import sentencepiece as spm
import soundfile as sf

# Set the runtime's default logger verbosity to 0 once, when this module is loaded.
# NOTE(review): what level 0 means is defined by the module imported as `ort` — confirm.
ort.set_default_logger_verbosity(0)

def load_onnx_model(model_path):
    """Create an inference session for the model file at ``model_path``.

    Args:
        model_path: Path handed to ``ort.InferenceSession``.

    Returns:
        The session object built by the runtime module imported as ``ort``.
    """
    # NOTE(review): presumably tried in the listed order, as in onnxruntime;
    # confirm how the runtime actually in use treats these names.
    execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
    session = ort.InferenceSession(model_path, providers=execution_providers)
    return session

class SimpleT5Tokenizer:
    """Minimal tokenizer built on a SentencePiece model.

    Encodes text to ids, appends the EOS id, and optionally truncates and
    pads so a batch can be returned as rectangular numpy arrays.
    """

    def __init__(self, model_path, max_length=128):
        """Load the SentencePiece model and set the special-token ids.

        Args:
            model_path: Path to the SentencePiece model file.
            max_length: Default maximum sequence length (EOS included).
        """
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(model_path)

        # Special-token ids (the T5 values, per the original comment).
        self.pad_token_id = 0
        self.eos_token_id = 1
        self.max_length = max_length

    def __call__(self, texts, padding=True, truncation=True, max_length=None, return_tensors="np"):
        """Tokenize one text or a list of texts.

        Args:
            texts: A string or a list of strings.
            padding: If true, pad every row with ``pad_token_id``. Rows are
                padded to ``max_length``; if truncation is disabled and a row
                is longer than that, all rows are padded to the longest row so
                the batch stays rectangular.
            truncation: If true, cut each sequence so that it fits in
                ``max_length`` once the EOS id has been appended.
            max_length: Optional override of the instance's ``max_length``.
            return_tensors: Accepted for call compatibility; it is not read
                and numpy arrays are always returned.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` as int64 arrays.
            With ``padding=False`` all rows must already have equal length,
            otherwise building the array fails.
        """
        if isinstance(texts, str):
            texts = [texts]

        max_len = max_length if max_length is not None else self.max_length
        # Reserve one slot for EOS; clamp so a tiny max_len cannot turn into a
        # negative slice bound (which would cut from the wrong end).
        body_len = max(max_len - 1, 0)

        encoded = []
        for text in texts:
            # Copy so the list returned by the processor is never mutated.
            ids = list(self.sp.EncodeAsIds(text))
            if truncation and len(ids) > body_len:
                ids = ids[:body_len]
            ids.append(self.eos_token_id)
            encoded.append(ids)

        # Without truncation a row may exceed max_len; widen the target so
        # every row still gets the same length.
        target_len = max([max_len] + [len(ids) for ids in encoded])

        input_ids = []
        attention_mask = []
        for ids in encoded:
            mask = [1] * len(ids)
            if padding:
                pad_length = target_len - len(ids)
                ids = ids + [self.pad_token_id] * pad_length
                mask = mask + [0] * pad_length
            input_ids.append(ids)
            attention_mask.append(mask)

        return {
            "input_ids": np.array(input_ids, dtype=np.int64),
            "attention_mask": np.array(attention_mask, dtype=np.int64),
        }

def encode_text(prompt, negative_prompt, tokenizer, text_encoder_onnx, guidance_scale=None):
    """Tokenize and encode the prompt, optionally together with a negative prompt.

    Args:
        prompt: A prompt string or a list of prompt strings.
        negative_prompt: Text encoded as the unconditional entry; only used
            when ``guidance_scale`` is greater than 1.0.
        tokenizer: Callable returning a dict with ``input_ids`` and
            ``attention_mask`` arrays.
        text_encoder_onnx: Session whose ``run`` yields ``last_hidden_state``.
        guidance_scale: Guidance factor; ``None`` or a value <= 1.0 disables
            the unconditional pass.

    Returns:
        With guidance enabled:
        ``((uncond_hidden, uncond_mask), (cond_hidden, cond_mask))``, where the
        unconditional part is the first batch row and the conditional part is
        the rest. Otherwise ``(hidden_states, attention_mask)``.
    """
    prompts = prompt if isinstance(prompt, list) else [prompt]
    use_guidance = guidance_scale is not None and guidance_scale > 1.0

    # The negative prompt, when used, always occupies batch row 0.
    texts = [negative_prompt] + prompts if use_guidance else prompts

    batch = tokenizer(texts, padding=True, truncation=True, return_tensors="np")
    feeds = {
        'input_ids': batch['input_ids'].astype(np.int64),
        'attention_mask': batch['attention_mask'].astype(np.int64),
    }
    hidden = text_encoder_onnx.run(['last_hidden_state'], feeds)[0]
    mask = batch['attention_mask']

    if not use_guidance:
        return hidden, mask

    unconditional = (hidden[0:1], mask[0:1])
    conditional = (hidden[1:], mask[1:])
    return unconditional, conditional

def retrieve_timesteps(scheduler, num_inference_steps, device, timesteps=None, sigmas=None):
    """Configure ``scheduler`` and return its timesteps with their count.

    Args:
        scheduler: Object exposing ``set_timesteps`` and ``timesteps``.
        num_inference_steps: Step count used when ``sigmas`` is not given.
        device: Not read by this function.
        timesteps: Incoming value is not read by this function.
        sigmas: Optional custom sigmas; when given they are forwarded as
            ``set_timesteps(sigmas=...)`` and the step count is taken from
            the length of the resulting schedule.

    Returns:
        ``(timesteps, num_inference_steps)``.
    """
    if sigmas is None:
        scheduler.set_timesteps(num_inference_steps)
        return scheduler.timesteps, num_inference_steps

    scheduler.set_timesteps(sigmas=sigmas)
    schedule = scheduler.timesteps
    return schedule, len(schedule)

# A simple FlowMatch scheduler class.
class SimpleFlowMatchScheduler:
    """Small flow-matching scheduler that advances samples with Euler steps.

    ``sigmas`` holds the noise levels followed by a terminal 0.0, and
    ``timesteps`` holds ``sigma * num_train_timesteps`` for each level.
    Both are kept in float32 so float32 samples are not promoted to float64
    when combined with them.
    """

    def __init__(self, num_train_timesteps=1000, shift=1.0):
        """Build the full training schedule.

        Args:
            num_train_timesteps: Number of training timesteps.
            shift: Shift applied to the training sigmas as
                ``shift * s / (1 + (shift - 1) * s)``.
        """
        # Stored explicitly: set_timesteps() overwrites self.timesteps, so its
        # length cannot serve as the training-step count after the first call.
        self.num_train_timesteps = num_train_timesteps
        self.shift = shift

        # Linear timesteps from num_train_timesteps down to 1.
        timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32)[::-1].copy()

        sigmas = timesteps / num_train_timesteps
        sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)

        self._store_schedule(sigmas)
        self.step_index = None

    def _store_schedule(self, sigmas):
        """Store ``sigmas`` (plus a terminal 0.0) and the matching timesteps."""
        sigmas = np.asarray(sigmas, dtype=np.float32)
        # Append a float32 zero so the array is not upcast to float64.
        self.sigmas = np.append(sigmas, np.float32(0.0))
        self.timesteps = (sigmas * self.num_train_timesteps).astype(np.float32)

    def set_timesteps(self, num_inference_steps=None, sigmas=None):
        """Set the inference schedule and reset the step counter.

        Args:
            num_inference_steps: Number of inference steps; timesteps are
                spaced linearly from ``num_train_timesteps`` down to 1.
            sigmas: Optional explicit sigma values; when given they are used
                instead of the linear schedule.

        Raises:
            ValueError: If neither argument is provided.

        NOTE(review): ``shift`` is not applied here, which matches the
        previous behaviour; confirm whether inference sigmas should be shifted.
        """
        if sigmas is None:
            if num_inference_steps is None:
                raise ValueError("Either num_inference_steps or sigmas must be provided")
            timesteps = np.linspace(
                1, self.num_train_timesteps, num_inference_steps, dtype=np.float32
            )[::-1].copy()
            sigmas = timesteps / self.num_train_timesteps

        self._store_schedule(sigmas)
        self.step_index = 0

    def step(self, model_output, timestep, sample):
        """Perform one Euler update and advance the internal step counter.

        Args:
            model_output: Model prediction for the current step.
            timestep: Current timestep; not read, the internal counter
                selects the sigma pair.
            sample: Current sample.

        Returns:
            ``sample + (sigma_next - sigma) * model_output``.

        Raises:
            IndexError: If called more times than the schedule has steps.
        """
        if self.step_index is None:
            # step() called before set_timesteps(): start at the first sigma.
            self.step_index = 0

        sigma = self.sigmas[self.step_index]
        sigma_next = self.sigmas[self.step_index + 1]

        # A Python float does not force a wider dtype onto the arrays.
        dt = float(sigma_next - sigma)
        prev_sample = sample + dt * model_output

        self.step_index += 1
        return prev_sample

def generate_audio_onnx(
    prompt="",
    negative_prompt="",
    duration=10,
    steps=50,
    guidance_scale=4.5,
    onnx_dir="./onnx_models",
    output_path="output_onnx.wav",
    seed=None
):
    """Generate audio for a text prompt with the exported models and save it.

    Args:
        prompt: Text prompt.
        negative_prompt: Text used for the unconditional pass when
            ``guidance_scale`` is greater than 1.0.
        duration: Audio length in seconds; fed to the duration embedder and
            used to crop the decoded waveform.
        steps: Number of scheduler steps.
        guidance_scale: Classifier-free guidance factor; values <= 1.0
            disable guidance.
        onnx_dir: Directory holding ``spiece.model`` and the model files.
        output_path: Path of the audio file to write.
        seed: Optional seed for ``np.random``.

    Returns:
        The cropped waveform with the batch dimension removed. Its transpose
        is what gets written to ``output_path``.
    """
    if seed is not None:
        np.random.seed(seed)

    use_cfg = guidance_scale > 1.0

    # Load the tokenizer (fixed sequence length) and the text encoder.
    tokenizer = SimpleT5Tokenizer(f"{onnx_dir}/spiece.model", max_length=63)
    text_encoder_onnx = load_onnx_model(f"{onnx_dir}/text_encoder_nf4.onnx")

    # Load the remaining models.
    vae_decoder = load_onnx_model(f"{onnx_dir}/vae_decoder.onnx")
    duration_embedder = load_onnx_model(f"{onnx_dir}/duration_embedder.onnx")
    transformer = load_onnx_model(f"{onnx_dir}/transformer.onnx")
    proj_layer = load_onnx_model(f"{onnx_dir}/proj.onnx")

    # 1. Duration embedding.
    duration_input = np.array([[duration]], dtype=np.float32)
    print(f"[Shape] duration输入: {duration_input.shape}")

    duration_hidden_states = duration_embedder.run(
        ['embedding'],
        {'duration': duration_input}
    )[0]
    print(f"[Shape] duration embedding: {duration_hidden_states.shape}")

    if use_cfg:
        # Duplicate along the batch axis for the unconditional + conditional rows.
        duration_hidden_states = np.concatenate([duration_hidden_states] * 2, axis=0)
        print(f"[Shape] 复制后的duration embedding: {duration_hidden_states.shape}")

    # 2. Text encoder.
    if use_cfg:
        (uncond_hidden_states, uncond_mask), (cond_hidden_states, cond_mask) = encode_text(
            prompt, negative_prompt, tokenizer, text_encoder_onnx, guidance_scale=guidance_scale
        )
        encoder_hidden_states = np.concatenate([uncond_hidden_states, cond_hidden_states])
        attention_mask = np.concatenate([uncond_mask, cond_mask])
    else:
        # negative_prompt must be passed positionally even though it is unused
        # here; leaving it out shifts every following argument by one.
        encoder_hidden_states, attention_mask = encode_text(
            prompt, negative_prompt, tokenizer, text_encoder_onnx
        )

    # 3. Pooled text: mean of the hidden states over tokens whose mask is 1.
    boolean_encoder_mask = (attention_mask == 1)
    masked_data = np.where(boolean_encoder_mask[..., None], encoder_hidden_states, np.nan)
    pooled = np.nanmean(masked_data, axis=1)

    # Run the pooled embedding through the projection layer.
    pooled_text = proj_layer.run(
        ['projected'],
        {'text_embedding': pooled.astype(np.float32)}
    )[0]

    # 4. Append the duration embedding to the text sequence (axis 1).
    encoder_hidden_states = np.concatenate(
        [encoder_hidden_states, duration_hidden_states],
        axis=1
    )

    # 5. Remaining transformer inputs.
    txt_ids = np.zeros((1, encoder_hidden_states.shape[1], 3), dtype=np.int64)
    img_ids = np.tile(
        np.arange(645, dtype=np.int64)[None, :, None],
        (1, 1, 3)
    )

    # 6. Scheduler.
    scheduler = SimpleFlowMatchScheduler(num_train_timesteps=1000)
    scheduler.set_timesteps(steps)

    # Initial latents.
    latents = np.random.randn(1, 645, 64).astype(np.float32)

    # 7. Sampling loop.
    # NOTE(review): with guidance enabled the text inputs have batch 2 while
    # latents and timestep have batch 1; the code below expects a batch-2
    # output — confirm the exported transformer handles this.
    for i in range(steps):
        timestep = scheduler.timesteps[i]

        noise_pred = transformer.run(
            ['output'],
            {
                'hidden_states': latents,
                'timestep': np.array([timestep/1000], dtype=np.float32),
                'pooled_text': pooled_text,
                'encoder_hidden_states': encoder_hidden_states,
                'txt_ids': txt_ids,
                'img_ids': img_ids
            }
        )[0]

        if i == 0:  # only print on the first step
            print(f"[Shape] noise预测输出: {noise_pred.shape}")

        # Classifier-free guidance: row 0 is unconditional, row 1 conditional.
        if use_cfg:
            noise_pred_uncond, noise_pred_text = noise_pred[0:1], noise_pred[1:2]
            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

        # Keep latents float32: float64 sigmas in the scheduler would otherwise
        # promote them under NumPy >= 2 and change the next input's dtype.
        latents = scheduler.step(noise_pred, timestep, latents).astype(np.float32, copy=False)

        if i % 10 == 0:
            print(f"生成进度: {i}/{steps}")

    # 8. Pre-decode processing; float() keeps the division in float32.
    latents = latents / float(scheduler.sigmas[0])
    latents = np.transpose(latents, (0, 2, 1))

    # 9. VAE decode.
    wave = vae_decoder.run(['audio'], {'latent': latents})[0]

    # 10. Crop to the requested duration.
    sample_rate = 44100
    waveform_end = int(duration * sample_rate)
    wave = wave[:, :, :waveform_end]
    print(f"[Shape] 裁剪后的最终波形: {wave.shape}")

    # 11. Save; the transpose puts samples on the first axis for soundfile.
    wave = wave[0]  # drop the batch dimension
    sf.write(output_path, wave.T, sample_rate)

    return wave

if __name__ == "__main__":
    import argparse

    # (flag, type, default, help) for each option, in help-output order.
    cli_options = [
        ("--prompt", str, "What does the fox say?", "文本提示"),
        ("--negative_prompt", str, "", "负文本提示"),
        ("--onnx_dir", str, ".", "ONNX模型目录"),
        ("--duration", float, 10.0, "生成音频时长（秒）"),
        ("--steps", int, 30, "推理步数"),
        ("--guidance_scale", float, 4.5, "引导系数"),
        ("--output", str, "output_onnx.wav", "输出音频路径"),
        ("--seed", int, 42, "随机种子"),
    ]

    parser = argparse.ArgumentParser(description="测试ONNX模型推理")
    for flag, value_type, default_value, help_text in cli_options:
        parser.add_argument(flag, type=value_type, default=default_value, help=help_text)

    args = parser.parse_args()

    # Run the pipeline with the parsed command-line options.
    wave = generate_audio_onnx(
        prompt=args.prompt,
        negative_prompt=args.negative_prompt,
        duration=args.duration,
        steps=args.steps,
        guidance_scale=args.guidance_scale,
        onnx_dir=args.onnx_dir,
        output_path=args.output,
        seed=args.seed
    )

    print(f"生成的音频shape为: {wave.shape}")
    print(f"音频已保存到: {args.output}")