Spaces:

amphion
/

DeepfakeDetection

Running on Zero

File size: 16,240 Bytes

import torch
import torch.nn as nn
from transformers import Wav2Vec2BertModel
from llama_nar import LlamaNAREmb
from transformers import LlamaConfig
import time
import torch.nn.functional as F
from huggingface_hub import hf_hub_download


class Wav2Vec2BERT_Llama(nn.Module):
    def __init__(self):
        super().__init__()

        # 1. 加载预训练模型
        self.wav2vec2bert = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0", output_hidden_states=True)
        
        # 2. 选择性冻结参数
        for name, param in self.wav2vec2bert.named_parameters():
            # 冻结所有FFN1 (保留FFN2的适应能力)
            if 'ffn1' in name:
                param.requires_grad = False
            
            # 冻结多头注意力中的K,V投影
            if any(proj in name for proj in ['linear_k', 'linear_v']):
                param.requires_grad = False
            
            # 冻结distance_embedding
            if 'distance_embedding' in name:
                param.requires_grad = False
            
            # 冻结所有卷积相关模块
            if any(conv_name in name for conv_name in [
                'conv_module', 'pointwise_conv', 'depthwise_conv',
                'feature_extractor', 'pos_conv_embed', 'conv_layers'
            ]):
                param.requires_grad = False
        
        # 3. 减小Llama模型规模
        self.llama_nar = LlamaNAREmb(
            config=LlamaConfig(
                hidden_size=512,
                num_attention_heads=8,
                num_hidden_layers=8,
            ), 
            num_heads=8, 
            num_layers=8,
            hidden_size=512
        )
        
        # 4. 降维投影层
        self.projection = nn.Sequential(
            nn.Linear(1024, 512),
            nn.LayerNorm(512)
        )
        
        # 5. 简化分类头
        self.classifier = nn.Sequential(
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, 2)
        )
        
        # 6. 减小embedding维度
        self.label_embedding = nn.Embedding(num_embeddings=2, embedding_dim=512)
        
        # 7. 简化特征处理层
        self.feature_processor = nn.Sequential(
            nn.Linear(512, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(0.1)
        )
        
        # 8. 减小特殊token的维度
        self.special_tokens = nn.Parameter(torch.randn(4, 512))

    def _fuse_layers(self, hidden_states):
        # 修改特征融合方法
        def downsample_sequence(sequence, factor=10):
            """对序列进行下采样"""
            batch_size, seq_len, hidden_size = sequence.shape
            # 确保序列长度可以被因子整除
            new_len = seq_len // factor
            padded_len = new_len * factor
            
            if seq_len > padded_len:
                sequence = sequence[:, :padded_len, :]
            
            # 重塑张量并进行平均池化 [batch_size, new_len, factor, hidden_size]
            reshaped = sequence.reshape(batch_size, new_len, factor, hidden_size)
            downsampled = torch.mean(reshaped, dim=2)  # [batch_size, new_len, hidden_size]
            return downsampled

        # 1. 获取最后一层特征并进行下采样
        last_layer = hidden_states[-1]  # [batch_size, seq_len, 1024]
        downsampled_features = downsample_sequence(last_layer)  # [batch_size, seq_len//10, 1024]
        
        # 2. 投影到512维度
        projected_features = self.projection(downsampled_features)  # [batch_size, seq_len//10, 512]
        
        return projected_features  # 不再需要unsqueeze，因为已经保留了序列维度

    def forward(self, batch):
        main_output = self.wav2vec2bert(
            **batch['main_features']
        )
        
        fused_features = self._fuse_layers(main_output.hidden_states)
        fused_features = self.feature_processor(fused_features)
        
        if ('prompt_labels' in batch and 
            batch['prompt_labels'] is not None and 
            'prompt_features' in batch and 
            batch['prompt_features'] and 
            len(batch['prompt_features']) > 0):
            
            batch_size, num_prompts = batch['prompt_labels'].shape
            
            # 重塑特征以批量处理
            prompt_features = batch['prompt_features']
            all_prompt_outputs = []
            
            for i in range(num_prompts):
                prompt_output = self.wav2vec2bert(
                    **prompt_features[i]
                )
                all_prompt_outputs.append(self._fuse_layers(prompt_output.hidden_states))
            
            if all_prompt_outputs:
                fused_prompts = torch.stack([
                    self.feature_processor(p) for p in all_prompt_outputs
                ], dim=1)  # [batch_size, num_prompts, seq_len, hidden_size]
                
                # 获取label embeddings并扩展到对应序列长度
                label_embs = self.label_embedding(batch['prompt_labels'])  # [batch_size, num_prompts, 512]
                
                prompt_embeddings = []
                for i in range(batch_size):
                    sequence = []
                    
                    # 添加示例prompts
                    for j in range(num_prompts):
                        prompt_seq_len = fused_prompts[i, j].size(0)  # 获取当前prompt的序列长度
                        
                        sequence.append(self.special_tokens[1].expand(1, -1))  # [PROMPT]
                        sequence.append(self.special_tokens[2].expand(1, -1))  # [AUDIO]
                        sequence.append(fused_prompts[i, j])  # [seq_len, hidden_size]
                        sequence.append(self.special_tokens[3].expand(1, -1))  # [LABEL]
                        
                        # 扩展label embedding到与音频特征相同的长度
                        expanded_label = label_embs[i, j].unsqueeze(0).expand(prompt_seq_len, -1)
                        sequence.append(expanded_label)  # [seq_len, hidden_size]
                        
                        sequence.append(self.special_tokens[0].expand(1, -1))  # [SEP]
                    
                    # 添加待预测的主特征
                    main_seq_len = fused_features[i].size(0)  # 获取主特征的序列长度
                    sequence.append(self.special_tokens[1].expand(1, -1))  # [PROMPT]
                    sequence.append(self.special_tokens[2].expand(1, -1))  # [AUDIO]
                    sequence.append(fused_features[i])  # [main_seq_len, hidden_size]
                    sequence.append(self.special_tokens[3].expand(1, -1))  # [LABEL]
                    # 预测位置使用零向量，长度与主特征相同
                    sequence.append(torch.zeros(main_seq_len, fused_features.size(-1)).to(fused_features.device))
                    
                    prompt_embeddings.append(torch.cat(sequence, dim=0))
                
                prompt_embeddings = torch.stack(prompt_embeddings, dim=0)
            
        else:
            # 简化无prompt情况的处理
            batch_size = fused_features.size(0)
            main_seq_len = fused_features.size(1)  # 直接获取主特征序列长度
            
            # 构建序列 [batch_size, total_len, hidden_size]
            prompt_embeddings = torch.cat([
                self.special_tokens[1].expand(batch_size, 1, -1),  # [PROMPT]
                self.special_tokens[2].expand(batch_size, 1, -1),  # [AUDIO]
                fused_features,  # [batch_size, main_seq_len, hidden_size]
                self.special_tokens[3].expand(batch_size, 1, -1),  # [LABEL]
                torch.zeros(batch_size, main_seq_len, fused_features.size(-1)).to(fused_features.device)  # 预测位置
            ], dim=1)
        
        # 输入到llama_nar
        output = self.llama_nar(inputs_embeds=prompt_embeddings)
        
        # 获取所有预测位置的输出（即最后main_seq_len个位置）
        pred_pos_embeddings = output[:, -main_seq_len:, :]  # [batch_size, main_seq_len, hidden_size]
        # 对每一帧进行分类
        frame_logits = self.classifier(pred_pos_embeddings)  # [batch_size, main_seq_len, 2]
        
        # 同时返回帧级别的logits和整体的logits（通过平均得到）
        avg_embedding = torch.mean(pred_pos_embeddings, dim=1)  # [batch_size, hidden_size]
        avg_logits = self.classifier(avg_embedding)  # [batch_size, 2]
        
        return {
            'frame_logits': frame_logits,  # 每一帧的预测分数
            'avg_logits': avg_logits       # 整体的预测分数
        }


if __name__ == '__main__':
    import torch
    from torch.utils.data import DataLoader
    from dataset.train_MultiDataset import train_MultiDataset, collate_fn
    from tqdm import tqdm
    import time
    
    # 设置设备
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"\n=== 使用设备: {device} ===")
    
    # 初始化模型
    print("\n=== 初始化模型 ===")
    model = Wav2Vec2BERT_Llama().to(device)
    model.eval()  # 设置为评估模式
    
    # 打印wav2vec2bert的参数结构
    print("\n=== Wav2Vec2BERT 参数结构 ===")
    w2v_params_by_layer = {}
    total_trainable = 0
    total_frozen = 0
    
    for name, param in model.wav2vec2bert.named_parameters():
        # 获取主要层名称
        layer_name = name.split('.')[0]
        if layer_name not in w2v_params_by_layer:
            w2v_params_by_layer[layer_name] = {
                'trainable_params': 0,
                'frozen_params': 0,
                'parameter_names': []
            }
        
        # 统计参数
        if param.requires_grad:
            w2v_params_by_layer[layer_name]['trainable_params'] += param.numel()
            total_trainable += param.numel()
        else:
            w2v_params_by_layer[layer_name]['frozen_params'] += param.numel()
            total_frozen += param.numel()
        
        w2v_params_by_layer[layer_name]['parameter_names'].append(name)
    
    # 打印每层的详细信息
    print("\n各层参数统计:")
    for layer_name, info in w2v_params_by_layer.items():
        trainable_mb = info['trainable_params'] / 1024 / 1024
        frozen_mb = info['frozen_params'] / 1024 / 1024
        total_mb = (info['trainable_params'] + info['frozen_params']) / 1024 / 1024
        
        print(f"\n{layer_name}:")
        print(f"  - 总参数量: {total_mb:.2f}MB")
        print(f"  - 可训练参数: {trainable_mb:.2f}MB")
        print(f"  - 冻结参数: {frozen_mb:.2f}MB")
        print(f"  - 参数名称:")
        for param_name in info['parameter_names']:
            print(f"    * {param_name}")
    
    # 打印总体统计
    print("\n=== 总体统计 ===")
    print(f"可训练参数总量: {total_trainable/1024/1024:.2f}MB")
    print(f"冻结参数总量: {total_frozen/1024/1024:.2f}MB")
    print(f"参数总量: {(total_trainable + total_frozen)/1024/1024:.2f}MB")
    print(f"可训练参数占比: {total_trainable/(total_trainable + total_frozen)*100:.2f}%")
    
    # 分别统计各个模块的参数量
    wav2vec2bert_params = sum(p.numel() for p in model.wav2vec2bert.parameters())
    llama_params = sum(p.numel() for p in model.llama_nar.parameters())
    other_params = sum(p.numel() for name, p in model.named_parameters() 
                      if not name.startswith('wav2vec2bert.') and not name.startswith('llama_nar.'))
    
    total_params = wav2vec2bert_params + llama_params + other_params
    
    print(f"\n=== 参数量统计 ===")
    print(f"Wav2Vec2BERT参数量: {wav2vec2bert_params:,} ({wav2vec2bert_params/1024/1024:.2f}MB)")
    print(f"LlamaNAR参数量: {llama_params:,} ({llama_params/1024/1024:.2f}MB)")
    print(f"其他模块参数量: {other_params:,} ({other_params/1024/1024:.2f}MB)")
    print(f"总参数量: {total_params:,} ({total_params/1024/1024:.2f}MB)")
    
    # 计算百分比
    print(f"\n=== 参数量占比 ===")
    print(f"Wav2Vec2BERT: {wav2vec2bert_params/total_params*100:.2f}%")
    print(f"LlamaNAR: {llama_params/total_params*100:.2f}%")
    print(f"其他模块: {other_params/total_params*100:.2f}%")
    
    # 测试运行时间和内存使用
    print("\n=== 测试运行时间和内存使用 (batch_size=4) ===")
    batch_size = 4
    total_samples = 600000
    
    # 清空GPU缓存
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        initial_memory = torch.cuda.memory_allocated() / 1024 / 1024
        print(f"初始GPU内存使用: {initial_memory:.2f}MB")
    
    # 初始化数据集
    print("\n初始化数据集...")
    ds = train_MultiDataset(max_prompts=3)
    
    # 创建DataLoader
    dl = DataLoader(ds, 
                   batch_size=batch_size,
                   shuffle=True,
                   collate_fn=collate_fn,
                   num_workers=4)
    
    print(f"\n数据集大小: {len(ds)}")
    print(f"批次数量: {len(dl)}")
    
    # 计算一个batch的平均时间
    num_test_batches = 10
    total_time = 0
    max_memory = 0
    
    print(f"\n测试{num_test_batches}个batch的平均运行时间...")
    with torch.no_grad():
        for i, batch in enumerate(tqdm(dl, total=num_test_batches)):
            if i >= num_test_batches:
                break
            
            # 正确处理字典类型的特征
            main_features = {
                'input_features': batch['main_features']['input_features'].to(device),
                'attention_mask': batch['main_features']['attention_mask'].to(device)
            }
            
            prompt_features = [{
                'input_features': pf['input_features'].to(device),
                'attention_mask': pf['attention_mask'].to(device)
            } for pf in batch['prompt_features']]
            
            labels = batch['labels'].to(device)
            prompt_labels = batch['prompt_labels'].to(device)
            
            # 记录开始时间
            start_time = time.time()
            
            # 前向传播
            outputs = model({
                'main_features': main_features,
                'prompt_features': prompt_features,
                'prompt_labels': prompt_labels
            })
            
            # 确保GPU运算完成
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            
            # 记录结束时间和内存使用
            end_time = time.time()
            total_time += (end_time - start_time)
            
            if torch.cuda.is_available():
                current_memory = torch.cuda.memory_allocated() / 1024 / 1024
                max_memory = max(max_memory, current_memory)
            
            # 打印第一个batch的详细信息
            if i == 0:
                print("\n=== 第一个Batch的详细信息 ===")
                print(f"主特征形状: {main_features['input_features'].shape}")
                print(f"主掩码形状: {main_features['attention_mask'].shape}")
                print(f"Prompt特征形状: {prompt_features[0]['input_features'].shape}")
                print(f"Prompt掩码形状: {prompt_features[0]['attention_mask'].shape}")
                print(f"标签形状: {labels.shape}")
                print(f"Prompt标签形状: {prompt_labels.shape}")
                print(f"模型输出形状: {outputs.shape}")
                print(f"输出logits范围: [{outputs.min().item():.3f}, {outputs.max().item():.3f}]")
    
    # 计算和打印统计信息
    avg_time = total_time / num_test_batches
    print(f"\n=== 性能统计 ===")
    print(f"平均每个batch处理时间: {avg_time:.4f}秒")
    print(f"估计处理{total_samples}个样本需要: {(total_samples/batch_size*avg_time/3600):.2f}小时")
    if torch.cuda.is_available():
        print(f"最大GPU内存使用: {max_memory:.2f}MB")
        print(f"GPU内存增长: {max_memory - initial_memory:.2f}MB")
    
    print("\n测试完成!")