Spaces:

amphion
/

DeepfakeDetection

Running on Zero

App Files Files Community

DeepfakeDetection / model.py

wli3221134

Update model.py

46689af verified 7 days ago

raw

history blame contribute delete

16.2 kB

	import torch
	import torch.nn as nn
	from transformers import Wav2Vec2BertModel
	from llama_nar import LlamaNAREmb
	from transformers import LlamaConfig
	import time
	import torch.nn.functional as F
	from huggingface_hub import hf_hub_download


	class Wav2Vec2BERT_Llama(nn.Module):
	def __init__(self):
	super().__init__()

	# 1. 加载预训练模型
	self.wav2vec2bert = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0", output_hidden_states=True)

	# 2. 选择性冻结参数
	for name, param in self.wav2vec2bert.named_parameters():
	# 冻结所有FFN1 (保留FFN2的适应能力)
	if 'ffn1' in name:
	param.requires_grad = False

	# 冻结多头注意力中的K,V投影
	if any(proj in name for proj in ['linear_k', 'linear_v']):
	param.requires_grad = False

	# 冻结distance_embedding
	if 'distance_embedding' in name:
	param.requires_grad = False

	# 冻结所有卷积相关模块
	if any(conv_name in name for conv_name in [
	'conv_module', 'pointwise_conv', 'depthwise_conv',
	'feature_extractor', 'pos_conv_embed', 'conv_layers'
	]):
	param.requires_grad = False

	# 3. 减小Llama模型规模
	self.llama_nar = LlamaNAREmb(
	config=LlamaConfig(
	hidden_size=512,
	num_attention_heads=8,
	num_hidden_layers=8,
	),
	num_heads=8,
	num_layers=8,
	hidden_size=512
	)

	# 4. 降维投影层
	self.projection = nn.Sequential(
	nn.Linear(1024, 512),
	nn.LayerNorm(512)
	)

	# 5. 简化分类头
	self.classifier = nn.Sequential(
	nn.Linear(512, 128),
	nn.ReLU(),
	nn.Dropout(0.1),
	nn.Linear(128, 2)
	)

	# 6. 减小embedding维度
	self.label_embedding = nn.Embedding(num_embeddings=2, embedding_dim=512)

	# 7. 简化特征处理层
	self.feature_processor = nn.Sequential(
	nn.Linear(512, 512),
	nn.LayerNorm(512),
	nn.ReLU(),
	nn.Dropout(0.1)
	)

	# 8. 减小特殊token的维度
	self.special_tokens = nn.Parameter(torch.randn(4, 512))

	def _fuse_layers(self, hidden_states):
	# 修改特征融合方法
	def downsample_sequence(sequence, factor=10):
	"""对序列进行下采样"""
	batch_size, seq_len, hidden_size = sequence.shape
	# 确保序列长度可以被因子整除
	new_len = seq_len // factor
	padded_len = new_len * factor

	if seq_len > padded_len:
	sequence = sequence[:, :padded_len, :]

	# 重塑张量并进行平均池化 [batch_size, new_len, factor, hidden_size]
	reshaped = sequence.reshape(batch_size, new_len, factor, hidden_size)
	downsampled = torch.mean(reshaped, dim=2) # [batch_size, new_len, hidden_size]
	return downsampled

	# 1. 获取最后一层特征并进行下采样
	last_layer = hidden_states[-1] # [batch_size, seq_len, 1024]
	downsampled_features = downsample_sequence(last_layer) # [batch_size, seq_len//10, 1024]

	# 2. 投影到512维度
	projected_features = self.projection(downsampled_features) # [batch_size, seq_len//10, 512]

	return projected_features # 不再需要unsqueeze，因为已经保留了序列维度

	def forward(self, batch):
	main_output = self.wav2vec2bert(
	**batch['main_features']
	)

	fused_features = self._fuse_layers(main_output.hidden_states)
	fused_features = self.feature_processor(fused_features)

	if ('prompt_labels' in batch and
	batch['prompt_labels'] is not None and
	'prompt_features' in batch and
	batch['prompt_features'] and
	len(batch['prompt_features']) > 0):

	batch_size, num_prompts = batch['prompt_labels'].shape

	# 重塑特征以批量处理
	prompt_features = batch['prompt_features']
	all_prompt_outputs = []

	for i in range(num_prompts):
	prompt_output = self.wav2vec2bert(
	**prompt_features[i]
	)
	all_prompt_outputs.append(self._fuse_layers(prompt_output.hidden_states))

	if all_prompt_outputs:
	fused_prompts = torch.stack([
	self.feature_processor(p) for p in all_prompt_outputs
	], dim=1) # [batch_size, num_prompts, seq_len, hidden_size]

	# 获取label embeddings并扩展到对应序列长度
	label_embs = self.label_embedding(batch['prompt_labels']) # [batch_size, num_prompts, 512]

	prompt_embeddings = []
	for i in range(batch_size):
	sequence = []

	# 添加示例prompts
	for j in range(num_prompts):
	prompt_seq_len = fused_prompts[i, j].size(0) # 获取当前prompt的序列长度

	sequence.append(self.special_tokens[1].expand(1, -1)) # [PROMPT]
	sequence.append(self.special_tokens[2].expand(1, -1)) # [AUDIO]
	sequence.append(fused_prompts[i, j]) # [seq_len, hidden_size]
	sequence.append(self.special_tokens[3].expand(1, -1)) # [LABEL]

	# 扩展label embedding到与音频特征相同的长度
	expanded_label = label_embs[i, j].unsqueeze(0).expand(prompt_seq_len, -1)
	sequence.append(expanded_label) # [seq_len, hidden_size]

	sequence.append(self.special_tokens[0].expand(1, -1)) # [SEP]

	# 添加待预测的主特征
	main_seq_len = fused_features[i].size(0) # 获取主特征的序列长度
	sequence.append(self.special_tokens[1].expand(1, -1)) # [PROMPT]
	sequence.append(self.special_tokens[2].expand(1, -1)) # [AUDIO]
	sequence.append(fused_features[i]) # [main_seq_len, hidden_size]
	sequence.append(self.special_tokens[3].expand(1, -1)) # [LABEL]
	# 预测位置使用零向量，长度与主特征相同
	sequence.append(torch.zeros(main_seq_len, fused_features.size(-1)).to(fused_features.device))

	prompt_embeddings.append(torch.cat(sequence, dim=0))

	prompt_embeddings = torch.stack(prompt_embeddings, dim=0)

	else:
	# 简化无prompt情况的处理
	batch_size = fused_features.size(0)
	main_seq_len = fused_features.size(1) # 直接获取主特征序列长度

	# 构建序列 [batch_size, total_len, hidden_size]
	prompt_embeddings = torch.cat([
	self.special_tokens[1].expand(batch_size, 1, -1), # [PROMPT]
	self.special_tokens[2].expand(batch_size, 1, -1), # [AUDIO]
	fused_features, # [batch_size, main_seq_len, hidden_size]
	self.special_tokens[3].expand(batch_size, 1, -1), # [LABEL]
	torch.zeros(batch_size, main_seq_len, fused_features.size(-1)).to(fused_features.device) # 预测位置
	], dim=1)

	# 输入到llama_nar
	output = self.llama_nar(inputs_embeds=prompt_embeddings)

	# 获取所有预测位置的输出（即最后main_seq_len个位置）
	pred_pos_embeddings = output[:, -main_seq_len:, :] # [batch_size, main_seq_len, hidden_size]
	# 对每一帧进行分类
	frame_logits = self.classifier(pred_pos_embeddings) # [batch_size, main_seq_len, 2]

	# 同时返回帧级别的logits和整体的logits（通过平均得到）
	avg_embedding = torch.mean(pred_pos_embeddings, dim=1) # [batch_size, hidden_size]
	avg_logits = self.classifier(avg_embedding) # [batch_size, 2]

	return {
	'frame_logits': frame_logits, # 每一帧的预测分数
	'avg_logits': avg_logits # 整体的预测分数
	}


	if __name__ == '__main__':
	import torch
	from torch.utils.data import DataLoader
	from dataset.train_MultiDataset import train_MultiDataset, collate_fn
	from tqdm import tqdm
	import time

	# 设置设备
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	print(f"\n=== 使用设备: {device} ===")

	# 初始化模型
	print("\n=== 初始化模型 ===")
	model = Wav2Vec2BERT_Llama().to(device)
	model.eval() # 设置为评估模式

	# 打印wav2vec2bert的参数结构
	print("\n=== Wav2Vec2BERT 参数结构 ===")
	w2v_params_by_layer = {}
	total_trainable = 0
	total_frozen = 0

	for name, param in model.wav2vec2bert.named_parameters():
	# 获取主要层名称
	layer_name = name.split('.')[0]
	if layer_name not in w2v_params_by_layer:
	w2v_params_by_layer[layer_name] = {
	'trainable_params': 0,
	'frozen_params': 0,
	'parameter_names': []
	}

	# 统计参数
	if param.requires_grad:
	w2v_params_by_layer[layer_name]['trainable_params'] += param.numel()
	total_trainable += param.numel()
	else:
	w2v_params_by_layer[layer_name]['frozen_params'] += param.numel()
	total_frozen += param.numel()

	w2v_params_by_layer[layer_name]['parameter_names'].append(name)

	# 打印每层的详细信息
	print("\n各层参数统计:")
	for layer_name, info in w2v_params_by_layer.items():
	trainable_mb = info['trainable_params'] / 1024 / 1024
	frozen_mb = info['frozen_params'] / 1024 / 1024
	total_mb = (info['trainable_params'] + info['frozen_params']) / 1024 / 1024

	print(f"\n{layer_name}:")
	print(f" - 总参数量: {total_mb:.2f}MB")
	print(f" - 可训练参数: {trainable_mb:.2f}MB")
	print(f" - 冻结参数: {frozen_mb:.2f}MB")
	print(f" - 参数名称:")
	for param_name in info['parameter_names']:
	print(f" * {param_name}")

	# 打印总体统计
	print("\n=== 总体统计 ===")
	print(f"可训练参数总量: {total_trainable/1024/1024:.2f}MB")
	print(f"冻结参数总量: {total_frozen/1024/1024:.2f}MB")
	print(f"参数总量: {(total_trainable + total_frozen)/1024/1024:.2f}MB")
	print(f"可训练参数占比: {total_trainable/(total_trainable + total_frozen)*100:.2f}%")

	# 分别统计各个模块的参数量
	wav2vec2bert_params = sum(p.numel() for p in model.wav2vec2bert.parameters())
	llama_params = sum(p.numel() for p in model.llama_nar.parameters())
	other_params = sum(p.numel() for name, p in model.named_parameters()
	if not name.startswith('wav2vec2bert.') and not name.startswith('llama_nar.'))

	total_params = wav2vec2bert_params + llama_params + other_params

	print(f"\n=== 参数量统计 ===")
	print(f"Wav2Vec2BERT参数量: {wav2vec2bert_params:,} ({wav2vec2bert_params/1024/1024:.2f}MB)")
	print(f"LlamaNAR参数量: {llama_params:,} ({llama_params/1024/1024:.2f}MB)")
	print(f"其他模块参数量: {other_params:,} ({other_params/1024/1024:.2f}MB)")
	print(f"总参数量: {total_params:,} ({total_params/1024/1024:.2f}MB)")

	# 计算百分比
	print(f"\n=== 参数量占比 ===")
	print(f"Wav2Vec2BERT: {wav2vec2bert_params/total_params*100:.2f}%")
	print(f"LlamaNAR: {llama_params/total_params*100:.2f}%")
	print(f"其他模块: {other_params/total_params*100:.2f}%")

	# 测试运行时间和内存使用
	print("\n=== 测试运行时间和内存使用 (batch_size=4) ===")
	batch_size = 4
	total_samples = 600000

	# 清空GPU缓存
	if torch.cuda.is_available():
	torch.cuda.empty_cache()
	initial_memory = torch.cuda.memory_allocated() / 1024 / 1024
	print(f"初始GPU内存使用: {initial_memory:.2f}MB")

	# 初始化数据集
	print("\n初始化数据集...")
	ds = train_MultiDataset(max_prompts=3)

	# 创建DataLoader
	dl = DataLoader(ds,
	batch_size=batch_size,
	shuffle=True,
	collate_fn=collate_fn,
	num_workers=4)

	print(f"\n数据集大小: {len(ds)}")
	print(f"批次数量: {len(dl)}")

	# 计算一个batch的平均时间
	num_test_batches = 10
	total_time = 0
	max_memory = 0

	print(f"\n测试{num_test_batches}个batch的平均运行时间...")
	with torch.no_grad():
	for i, batch in enumerate(tqdm(dl, total=num_test_batches)):
	if i >= num_test_batches:
	break

	# 正确处理字典类型的特征
	main_features = {
	'input_features': batch['main_features']['input_features'].to(device),
	'attention_mask': batch['main_features']['attention_mask'].to(device)
	}

	prompt_features = [{
	'input_features': pf['input_features'].to(device),
	'attention_mask': pf['attention_mask'].to(device)
	} for pf in batch['prompt_features']]

	labels = batch['labels'].to(device)
	prompt_labels = batch['prompt_labels'].to(device)

	# 记录开始时间
	start_time = time.time()

	# 前向传播
	outputs = model({
	'main_features': main_features,
	'prompt_features': prompt_features,
	'prompt_labels': prompt_labels
	})

	# 确保GPU运算完成
	if torch.cuda.is_available():
	torch.cuda.synchronize()

	# 记录结束时间和内存使用
	end_time = time.time()
	total_time += (end_time - start_time)

	if torch.cuda.is_available():
	current_memory = torch.cuda.memory_allocated() / 1024 / 1024
	max_memory = max(max_memory, current_memory)

	# 打印第一个batch的详细信息
	if i == 0:
	print("\n=== 第一个Batch的详细信息 ===")
	print(f"主特征形状: {main_features['input_features'].shape}")
	print(f"主掩码形状: {main_features['attention_mask'].shape}")
	print(f"Prompt特征形状: {prompt_features[0]['input_features'].shape}")
	print(f"Prompt掩码形状: {prompt_features[0]['attention_mask'].shape}")
	print(f"标签形状: {labels.shape}")
	print(f"Prompt标签形状: {prompt_labels.shape}")
	print(f"模型输出形状: {outputs.shape}")
	print(f"输出logits范围: [{outputs.min().item():.3f}, {outputs.max().item():.3f}]")

	# 计算和打印统计信息
	avg_time = total_time / num_test_batches
	print(f"\n=== 性能统计 ===")
	print(f"平均每个batch处理时间: {avg_time:.4f}秒")
	print(f"估计处理{total_samples}个样本需要: {(total_samples/batch_size*avg_time/3600):.2f}小时")
	if torch.cuda.is_available():
	print(f"最大GPU内存使用: {max_memory:.2f}MB")
	print(f"GPU内存增长: {max_memory - initial_memory:.2f}MB")

	print("\n测试完成!")