wli3221134 committed
Commit 34146f0 · verified · 1 Parent(s): 851ce73

Upload 5 files

Files changed (5)
  1. app.py +64 -15
  2. dataset.py +130 -0
  3. inference.py +2 -0
  4. llama_nar.py +571 -0
  5. model.py +379 -0
app.py CHANGED
@@ -1,15 +1,56 @@
 import gradio as gr
 import os
-# import inference
+import dataset
+import torch
+from model import Wav2Vec2BERT_Llama
 
+# init device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# init model
+model = Wav2Vec2BERT_Llama().to(device)
+checkpoint_path = "ckpt/model_checkpoint.pth"
+if os.path.exists(checkpoint_path):
+    checkpoint = torch.load(checkpoint_path, map_location=device)
+    model_state_dict = checkpoint['model_state_dict']
+
+    # Reconcile the 'module.' prefix between DataParallel and plain checkpoints
+    if hasattr(model, 'module') and not any(key.startswith('module.') for key in model_state_dict.keys()):
+        model_state_dict = {'module.' + key: value for key, value in model_state_dict.items()}
+    elif not hasattr(model, 'module') and any(key.startswith('module.') for key in model_state_dict.keys()):
+        model_state_dict = {key.replace('module.', ''): value for key, value in model_state_dict.items()}
+
+    model.load_state_dict(model_state_dict)
+    model.eval()
+else:
+    raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")
+
+
+def detect(dataset, model):
+    # Move the collated features to the target device (no prediction is produced here yet)
+    with torch.no_grad():
+        for batch in dataset:
+            main_features = {
+                'input_features': batch['main_features']['input_features'].to(device),
+                'attention_mask': batch['main_features']['attention_mask'].to(device)
+            }
+            prompt_features = [{
+                'input_features': pf['input_features'].to(device),
+                'attention_mask': pf['attention_mask'].to(device)
+            } for pf in batch['prompt_features']]
 
-def audio_deepfake_detection(demonstration_paths, audio_path):
+
+def audio_deepfake_detection(demonstration_paths, demonstration_type, audio_path):
     """Audio deepfake detection function"""
     # Replace with your actual detection logic
     print("Demonstration audio paths: {}".format(demonstration_paths))
     print("Query audio path: {}".format(audio_path))
-
+
+    # Build the demo dataset (renamed to avoid shadowing the imported `dataset` module)
+    demo_dataset = dataset.DemoDataset(demonstration_paths, audio_path)
     # Example return value, modify according to your model
-    result = inference.detect(demonstration_paths, audio_path)
+    result = detect(demo_dataset, model)
 
     # Return detection results and confidence scores
     return {
@@ -33,14 +74,22 @@ with gr.Blocks() as demo:
     """
     )
 
-    # Demonstration audio input component
-    demonstration_audio_input = gr.Audio(
-        sources=["upload"],
-        label="Demonstration Audios",
-        type="filepath",
-    )
+    # Create container for demonstration audio
+    with gr.Row():
+        # Demonstration audio file upload
+        demonstration_audio_input = gr.File(
+            file_count="multiple",
+            file_types=["audio"],
+            label="Demonstration Audios",
+        )
+        # Add demonstration type selection
+        demonstration_type = gr.Dropdown(
+            choices=["bonafide", "spoof"],
+            value="bonafide",
+            label="Demonstration Label",
+        )
 
-    # Audio input component
+    # Query audio input component
     query_audio_input = gr.Audio(
         sources=["upload"],
         label="Query Audio (Audio for Detection)",
@@ -56,7 +105,7 @@ with gr.Blocks() as demo:
     # Set click event
     submit_btn.click(
         fn=audio_deepfake_detection,
-        inputs=[demonstration_audio_input, query_audio_input],
+        inputs=[demonstration_audio_input, demonstration_type, query_audio_input],
         outputs=[output_labels]
     )
 
@@ -64,10 +113,10 @@ with gr.Blocks() as demo:
     gr.Markdown("## Test Examples")
     gr.Examples(
         examples=[
-            ["examples/real_audio.wav", "examples/query_audio.wav"],
-            ["examples/fake_audio.wav", "examples/query_audio.wav"],
+            ["examples/real_audio.wav", "bonafide", "examples/query_audio.wav"],
+            ["examples/fake_audio.wav", "spoof", "examples/query_audio.wav"],
         ],
-        inputs=[demonstration_audio_input, query_audio_input],
+        inputs=[demonstration_audio_input, demonstration_type, query_audio_input],
     )
 
 if __name__ == "__main__":
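A note on the new inputs: `gr.File(file_count="multiple")` passes the callback a list with one entry per uploaded file, while the query `gr.Audio` (presumably `type="filepath"`) passes a single path. A small hedged sketch (assuming Gradio 4.x behavior; the helper name is illustrative and not part of this commit) of normalizing that list before building `DemoDataset`:

# Hypothetical helper: normalize the gr.File output to plain path strings.
def to_paths(files):
    if files is None:
        return []
    if not isinstance(files, list):
        files = [files]
    # Entries may be path strings or objects exposing a .name path, depending on the Gradio version.
    return [f if isinstance(f, str) else f.name for f in files]

# Possible usage inside audio_deepfake_detection:
#   demo_dataset = dataset.DemoDataset(to_paths(demonstration_paths), audio_path)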
dataset.py ADDED
@@ -0,0 +1,130 @@
import torch
from torch.utils.data import Dataset
from transformers import AutoFeatureExtractor
import os
import librosa
import numpy as np


class DemoDataset(Dataset):
    def __init__(self, demonstration_paths, query_path, sample_rate=16000):
        self.sample_rate = sample_rate
        self.query_path = query_path

        # Convert to list if single path
        if isinstance(demonstration_paths, str):
            self.demonstration_paths = [demonstration_paths]
        else:
            self.demonstration_paths = demonstration_paths

        # Load feature extractor
        self.feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

        print(f'Number of demonstration audios: {len(self.demonstration_paths)}')
        print(f'Query audio: {self.query_path}')

    def load_pad(self, path, max_length=64000):
        """Load and pad audio file"""
        X, sr = librosa.load(path, sr=self.sample_rate)
        X = self.pad(X, max_length)
        return X

    def pad(self, x, max_len=64000):
        """Pad audio to fixed length"""
        x_len = x.shape[0]
        if x_len >= max_len:
            return x[:max_len]
        pad_length = max_len - x_len
        return np.concatenate([x, np.zeros(pad_length)], axis=0)

    def __len__(self):
        return 1  # Only one query audio

    def __getitem__(self, idx):
        # Load query audio
        query_waveform = self.load_pad(self.query_path)
        query_waveform = torch.from_numpy(query_waveform).float()
        if len(query_waveform.shape) == 1:
            query_waveform = query_waveform.unsqueeze(0)

        # Extract features for query audio
        main_features = self.feature_extractor(
            query_waveform,
            sampling_rate=self.sample_rate,
            padding=True,
            return_attention_mask=True,
            return_tensors="pt"
        )

        # Process demonstration audios
        prompt_features = []
        for demo_path in self.demonstration_paths:
            # Load demonstration audio
            demo_waveform = self.load_pad(demo_path)
            demo_waveform = torch.from_numpy(demo_waveform).float()
            if len(demo_waveform.shape) == 1:
                demo_waveform = demo_waveform.unsqueeze(0)

            # Extract features
            prompt_feature = self.feature_extractor(
                demo_waveform,
                sampling_rate=self.sample_rate,
                padding=True,
                return_attention_mask=True,
                return_tensors="pt"
            )
            prompt_features.append(prompt_feature)

        return {
            'main_features': main_features,
            'prompt_features': prompt_features,
            'file_name': os.path.basename(self.query_path),
            'file_path': self.query_path
        }


def collate_fn(batch):
    """
    Collate function for dataloader
    Args:
        batch: List containing dictionaries with:
            - main_features: feature extractor output
            - prompt_features: list of feature extractor outputs
            - file_name: file name
            - file_path: file path
    """
    batch_size = len(batch)

    # Process main features
    main_features_keys = batch[0]['main_features'].keys()
    main_features = {}
    for key in main_features_keys:
        main_features[key] = torch.cat([item['main_features'][key] for item in batch], dim=0)

    # Get number of prompts
    num_prompts = len(batch[0]['prompt_features'])

    # Process prompt features
    prompt_features = []
    for i in range(num_prompts):
        prompt_feature = {}
        for key in main_features_keys:
            prompt_feature[key] = torch.cat([item['prompt_features'][i][key] for item in batch], dim=0)
        prompt_features.append(prompt_feature)

    # Collect file names and paths
    file_names = [item['file_name'] for item in batch]
    file_paths = [item['file_path'] for item in batch]

    return {
        'main_features': main_features,
        'prompt_features': prompt_features,
        'file_names': file_names,
        'file_paths': file_paths
    }


if __name__ == '__main__':
    # Test the dataset
    demo_paths = ["examples/demo1.wav", "examples/demo2.wav"]
    query_path = "examples/query.wav"

    dataset = DemoDataset(demo_paths, query_path)
    print(dataset[0])
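Since `__getitem__` already returns `return_tensors="pt"` features, `collate_fn` only concatenates them along the batch dimension. A minimal sketch of wiring `DemoDataset` into a `DataLoader` with that collate function (the example paths are placeholders):

from torch.utils.data import DataLoader
from dataset import DemoDataset, collate_fn

# Placeholder paths; replace with real audio files.
demo_paths = ["examples/demo1.wav", "examples/demo2.wav"]
query_path = "examples/query.wav"

loader = DataLoader(
    DemoDataset(demo_paths, query_path),
    batch_size=1,            # the dataset exposes a single query item
    shuffle=False,
    collate_fn=collate_fn,   # concatenates main/prompt features along the batch dim
)

for batch in loader:
    print(batch['file_names'])
    print(list(batch['main_features'].keys()))
    print(len(batch['prompt_features']), "prompt feature dict(s)")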
inference.py ADDED
@@ -0,0 +1,2 @@
def detect(dataset):
    pass
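`inference.py` is only a stub at this commit. A hedged sketch of what `detect` could look like, assuming the batch layout produced by `dataset.collate_fn` and the output keys (`avg_logits`) defined by `Wav2Vec2BERT_Llama` in `model.py`; the class-index meaning is an assumption:

import torch
import torch.nn.functional as F


def detect(dataloader, model):
    """Illustrative only: run the model over the collated batches and
    return softmax scores for each query file."""
    device = next(model.parameters()).device
    results = []
    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            main_features = {
                'input_features': batch['main_features']['input_features'].to(device),
                'attention_mask': batch['main_features']['attention_mask'].to(device),
            }
            prompt_features = [{
                'input_features': pf['input_features'].to(device),
                'attention_mask': pf['attention_mask'].to(device),
            } for pf in batch['prompt_features']]

            outputs = model({
                'main_features': main_features,
                'prompt_features': prompt_features,
                'prompt_labels': None,  # the demo path carries no prompt labels
            })
            probs = F.softmax(outputs['avg_logits'], dim=-1)  # [batch, 2]
            # Which index means "spoof" depends on how the checkpoint was trained.
            results.append({
                'file': batch['file_names'][0],
                'scores': probs[0].tolist(),
            })
    return results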
llama_nar.py ADDED
@@ -0,0 +1,571 @@
from transformers import LlamaConfig, LlamaForCausalLM, LlamaModel
import torch
import torch.nn.functional as F
import numpy as np
import os
import torch.nn as nn
from typing import List, Optional, Tuple, Union
import math
import warnings  # used by LlamaNAREmb.forward

from transformers.models.llama.modeling_llama import LlamaDecoderLayer
from torchmetrics.classification import MulticlassAccuracy
from transformers.models.llama.modeling_llama import BaseModelOutputWithPast


# sinusoidal positional encoding
class SinusoidalPosEmb(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        device = x.device
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
        emb = x[:, None] * emb[None, :] * 1.0
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb


class LlamaAdaptiveRMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        # The gamma parameter
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x: torch.Tensor):
        # (B, Seq_Len, Dim) * (B, Seq_Len, 1) = (B, Seq_Len, Dim)
        # rsqrt: 1 / sqrt(x)
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x: torch.Tensor):
        # (Dim) * (B, Seq_Len, Dim) = (B, Seq_Len, Dim)
        return self.weight * self._norm(x.float()).type_as(x)


class MultiEmbedding(nn.Module):
    """Embedding for multiple quantization layers, summing up the embeddings of each layer."""

    def __init__(
        self,
        num_embeddings=1028,
        embedding_dim=1024,
        num_quantization_layers=8,
    ):
        super().__init__()
        self.embeddings = nn.ModuleList(
            [
                nn.Embedding(num_embeddings, embedding_dim)
                for _ in range(num_quantization_layers)
            ]
        )

        # initialize embeddings
        for i in range(num_quantization_layers):
            self.embeddings[i].weight.data.normal_(mean=0.0, std=0.02)
        self._is_hf_initialized = True  # disable automatic init

    def forward(self, input_ids):
        """Input: [num_quant, B, T] -> Output: [B, T, H]"""
        num_quant, B, T = input_ids.shape
        summed_embeddings = torch.zeros(
            B, T, self.embeddings[0].embedding_dim, device=input_ids.device
        )
        for i in range(num_quant):
            summed_embeddings += self.embeddings[i](input_ids[i])
        return summed_embeddings


class LlamaNARDecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: LlamaConfig, layer_idx: int):
        """Override to adaptive layer norm"""
        super().__init__(config, layer_idx)  # init attention, mlp, etc.
        self.input_layernorm = LlamaAdaptiveRMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = LlamaAdaptiveRMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
    ) -> Tuple[
        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
    ]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        """

        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


class LlamaNAR(LlamaModel):
    def __init__(
        self,
        hidden_size=1024,
        num_heads=16,
        num_layers=16,
        config=LlamaConfig(0, 256, 1024, 1, 1),
    ):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [
                LlamaNARDecoderLayer(
                    config=LlamaConfig(
                        hidden_size=hidden_size,
                        num_attention_heads=num_heads,
                        max_position_embeddings=4096,
                        intermediate_size=hidden_size * 4,
                    ),
                    layer_idx=i,
                )
                for i in range(num_layers)
            ]
        )

        self.norm = LlamaAdaptiveRMSNorm(hidden_size)

        self.multi_embedding = MultiEmbedding(
            num_quantization_layers=8, embedding_dim=hidden_size
        )

        self.post_init()

    def _prepare_decoder_attention_mask(
        self, attention_mask, input_shape, inputs_embeds, past_key_values_length
    ):
        # create noncausal mask
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        combined_attention_mask = None

        def _expand_mask(
            mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None
        ):
            """
            Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
            """
            bsz, src_len = mask.size()
            tgt_len = tgt_len if tgt_len is not None else src_len

            expanded_mask = (
                mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
            )

            inverted_mask = 1.0 - expanded_mask

            return inverted_mask.masked_fill(
                inverted_mask.to(torch.bool), torch.finfo(dtype).min
            )

        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            expanded_attn_mask = _expand_mask(
                attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
            ).to(inputs_embeds.device)
            combined_attention_mask = (
                expanded_attn_mask
                if combined_attention_mask is None
                else expanded_attn_mask + combined_attention_mask
            )

        return combined_attention_mask

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        length: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size, seq_length, num_quant = input_ids.shape
        input_ids = input_ids.permute(2, 0, 1)  # [num_quant, B, T]
        inputs_embeds = self.multi_embedding(input_ids)

        seq_length_with_past = seq_length
        past_key_values_length = 0

        if past_key_values is not None:
            past_key_values_length = past_key_values[0][0].shape[2]
            seq_length_with_past = seq_length_with_past + past_key_values_length

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(
                past_key_values_length,
                seq_length + past_key_values_length,
                dtype=torch.long,
                device=device,
            )
            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
        else:
            position_ids = position_ids.view(-1, seq_length).long()

        # embed positions
        if attention_mask is None:
            attention_mask = torch.ones(
                (batch_size, seq_length_with_past),
                dtype=torch.bool,
                device=inputs_embeds.device,
            )
        attention_mask = self._prepare_decoder_attention_mask(
            attention_mask,
            (batch_size, seq_length),
            inputs_embeds,
            past_key_values_length,
        )

        hidden_states = inputs_embeds

        if self.gradient_checkpointing and self.training:
            if use_cache:
                use_cache = False

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = () if use_cache else None

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            past_key_value = (
                past_key_values[idx] if past_key_values is not None else None
            )

            if self.gradient_checkpointing and self.training:
                raise NotImplementedError

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        # None for past_key_value
                        return module(*inputs, output_attentions, None)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(decoder_layer),
                    hidden_states,
                    attention_mask,
                    position_ids,
                    None,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None

        return hidden_states


class LlamaNAREmb(LlamaModel):
    """LlamaNAR model that works directly with embeddings input.

    This variant of LlamaNAR takes pre-computed embeddings as input
    instead of token IDs that need to be embedded.
    """

    def __init__(
        self,
        hidden_size=1024,
        num_heads=16,
        num_layers=16,
        config=LlamaConfig(0, 256, 1024, 1, 1),
    ):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [
                LlamaNARDecoderLayer(
                    config=LlamaConfig(
                        hidden_size=hidden_size,
                        num_attention_heads=num_heads,
                        max_position_embeddings=4096,
                        intermediate_size=hidden_size * 4,
                    ),
                    layer_idx=i,
                )
                for i in range(num_layers)
            ]
        )

        self.norm = LlamaAdaptiveRMSNorm(hidden_size)

        self.post_init()

    def _prepare_decoder_attention_mask(
        self, attention_mask, input_shape, inputs_embeds, past_key_values_length
    ):
        # create noncausal mask
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        combined_attention_mask = None

        def _expand_mask(
            mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None
        ):
            """
            Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
            """
            bsz, src_len = mask.size()
            tgt_len = tgt_len if tgt_len is not None else src_len

            expanded_mask = (
                mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
            )

            inverted_mask = 1.0 - expanded_mask

            return inverted_mask.masked_fill(
                inverted_mask.to(torch.bool), torch.finfo(dtype).min
            )

        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            expanded_attn_mask = _expand_mask(
                attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
            ).to(inputs_embeds.device)
            combined_attention_mask = (
                expanded_attn_mask
                if combined_attention_mask is None
                else expanded_attn_mask + combined_attention_mask
            )

        return combined_attention_mask

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> torch.Tensor:
        """
        Returns:
            hidden_states: Tensor of shape (batch_size, sequence_length, hidden_size)
        """

        if inputs_embeds is None:
            raise ValueError("inputs_embeds must be provided for LlamaNAREmb")

        if input_ids is not None:
            warnings.warn("input_ids is ignored in LlamaNAREmb, use inputs_embeds instead")

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size, seq_length, hidden_size = inputs_embeds.shape

        seq_length_with_past = seq_length
        past_key_values_length = 0

        if past_key_values is not None:
            past_key_values_length = past_key_values[0][0].shape[2]
            seq_length_with_past = seq_length_with_past + past_key_values_length

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(
                past_key_values_length,
                seq_length + past_key_values_length,
                dtype=torch.long,
                device=device,
            )
            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
        else:
            position_ids = position_ids.view(-1, seq_length).long()

        # embed positions
        if attention_mask is None:
            attention_mask = torch.ones(
                (batch_size, seq_length_with_past),
                dtype=torch.bool,
                device=inputs_embeds.device,
            )
        attention_mask = self._prepare_decoder_attention_mask(
            attention_mask,
            (batch_size, seq_length),
            inputs_embeds,
            past_key_values_length,
        )

        hidden_states = inputs_embeds

        if self.gradient_checkpointing and self.training:
            if use_cache:
                use_cache = False

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = () if use_cache else None

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            past_key_value = (
                past_key_values[idx] if past_key_values is not None else None
            )

            if self.gradient_checkpointing and self.training:
                raise NotImplementedError

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        # None for past_key_value
                        return module(*inputs, output_attentions, None)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(decoder_layer),
                    hidden_states,
                    attention_mask,
                    position_ids,
                    None,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None

        return hidden_states


if __name__ == '__main__':
    config = LlamaConfig(hidden_size=1024, num_attention_heads=8, num_hidden_layers=8)

    model = LlamaNAR(config=config)

    # Mock input data
    batch_size = 2
    seq_length = 10
    n_q = 8
    input_ids = torch.randint(0, 1028, (batch_size, seq_length, n_q))  # random input IDs
    inputs_embeds = torch.randn(batch_size, seq_length, config.hidden_size)  # random input embeddings
    attention_mask = torch.ones(batch_size, seq_length)  # all positions visible
    length = torch.tensor([4, 10])  # input lengths

    # Forward pass (LlamaNAR.forward returns the final hidden states)
    hidden_states = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        output_attentions=True,
        output_hidden_states=True,
        length=length,
    )

    # Print output shape
    print("Hidden States Shape:", hidden_states.shape)
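For reference, a quick shape check of the two standalone helper modules above (the sizes are illustrative, not the ones used in `model.py`):

import torch
from llama_nar import SinusoidalPosEmb, MultiEmbedding

# Sinusoidal embedding: a 1-D tensor of positions -> [len, dim]
pos = SinusoidalPosEmb(dim=8)
print(pos(torch.arange(4)).shape)        # torch.Size([4, 8])

# MultiEmbedding sums one embedding table per quantization layer:
# input [num_quant, B, T] -> output [B, T, H]
emb = MultiEmbedding(num_embeddings=1028, embedding_dim=16, num_quantization_layers=8)
ids = torch.randint(0, 1028, (8, 2, 5))
print(emb(ids).shape)                    # torch.Size([2, 5, 16])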
model.py ADDED
@@ -0,0 +1,379 @@
import torch
import torch.nn as nn
from transformers import Wav2Vec2BertModel
from llama_nar import LlamaNAREmb
from transformers import LlamaConfig
import time
import torch.nn.functional as F


class Wav2Vec2BERT_Llama(nn.Module):
    def __init__(self):
        super().__init__()

        # 1. Load the pretrained model
        self.wav2vec2bert = Wav2Vec2BertModel.from_pretrained("/mntcephfs/lab_data/wangli/pretrain/w2v-bert-2.0/", output_hidden_states=True)

        # 2. Selectively freeze parameters
        for name, param in self.wav2vec2bert.named_parameters():
            # Freeze all FFN1 blocks (keep FFN2 adaptable)
            if 'ffn1' in name:
                param.requires_grad = False

            # Freeze the K/V projections in multi-head attention
            if any(proj in name for proj in ['linear_k', 'linear_v']):
                param.requires_grad = False

            # Freeze the distance_embedding
            if 'distance_embedding' in name:
                param.requires_grad = False

            # Freeze all convolution-related modules
            if any(conv_name in name for conv_name in [
                'conv_module', 'pointwise_conv', 'depthwise_conv',
                'feature_extractor', 'pos_conv_embed', 'conv_layers'
            ]):
                param.requires_grad = False

        # 3. Use a smaller Llama model
        self.llama_nar = LlamaNAREmb(
            config=LlamaConfig(
                hidden_size=512,
                num_attention_heads=8,
                num_hidden_layers=8,
            ),
            num_heads=8,
            num_layers=8,
            hidden_size=512
        )

        # 4. Down-projection layer
        self.projection = nn.Sequential(
            nn.Linear(1024, 512),
            nn.LayerNorm(512)
        )

        # 5. Simplified classification head
        self.classifier = nn.Sequential(
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, 2)
        )

        # 6. Smaller label-embedding dimension
        self.label_embedding = nn.Embedding(num_embeddings=2, embedding_dim=512)

        # 7. Simplified feature-processing layer
        self.feature_processor = nn.Sequential(
            nn.Linear(512, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(0.1)
        )

        # 8. Smaller special-token dimension
        self.special_tokens = nn.Parameter(torch.randn(4, 512))

    def _fuse_layers(self, hidden_states):
        # Feature fusion
        def downsample_sequence(sequence, factor=10):
            """Downsample the sequence along the time axis"""
            batch_size, seq_len, hidden_size = sequence.shape
            # Make sure the sequence length is divisible by the factor
            new_len = seq_len // factor
            padded_len = new_len * factor

            if seq_len > padded_len:
                sequence = sequence[:, :padded_len, :]

            # Reshape and average-pool: [batch_size, new_len, factor, hidden_size]
            reshaped = sequence.reshape(batch_size, new_len, factor, hidden_size)
            downsampled = torch.mean(reshaped, dim=2)  # [batch_size, new_len, hidden_size]
            return downsampled

        # 1. Take the last hidden layer and downsample it
        last_layer = hidden_states[-1]  # [batch_size, seq_len, 1024]
        downsampled_features = downsample_sequence(last_layer)  # [batch_size, seq_len//10, 1024]

        # 2. Project down to 512 dimensions
        projected_features = self.projection(downsampled_features)  # [batch_size, seq_len//10, 512]

        return projected_features  # no unsqueeze needed; the sequence dimension is preserved

    def forward(self, batch):
        main_output = self.wav2vec2bert(
            **batch['main_features']
        )

        fused_features = self._fuse_layers(main_output.hidden_states)
        fused_features = self.feature_processor(fused_features)

        if ('prompt_labels' in batch and
                batch['prompt_labels'] is not None and
                'prompt_features' in batch and
                batch['prompt_features'] and
                len(batch['prompt_features']) > 0):

            batch_size, num_prompts = batch['prompt_labels'].shape

            # Reshape features for batched processing
            prompt_features = batch['prompt_features']
            all_prompt_outputs = []

            for i in range(num_prompts):
                prompt_output = self.wav2vec2bert(
                    **prompt_features[i]
                )
                all_prompt_outputs.append(self._fuse_layers(prompt_output.hidden_states))

            if all_prompt_outputs:
                fused_prompts = torch.stack([
                    self.feature_processor(p) for p in all_prompt_outputs
                ], dim=1)  # [batch_size, num_prompts, seq_len, hidden_size]

                # Get label embeddings and expand them to the prompt sequence length
                label_embs = self.label_embedding(batch['prompt_labels'])  # [batch_size, num_prompts, 512]

                prompt_embeddings = []
                for i in range(batch_size):
                    sequence = []

                    # Add the demonstration prompts
                    for j in range(num_prompts):
                        prompt_seq_len = fused_prompts[i, j].size(0)  # sequence length of the current prompt

                        sequence.append(self.special_tokens[1].expand(1, -1))  # [PROMPT]
                        sequence.append(self.special_tokens[2].expand(1, -1))  # [AUDIO]
                        sequence.append(fused_prompts[i, j])  # [seq_len, hidden_size]
                        sequence.append(self.special_tokens[3].expand(1, -1))  # [LABEL]

                        # Expand the label embedding to the same length as the audio features
                        expanded_label = label_embs[i, j].unsqueeze(0).expand(prompt_seq_len, -1)
                        sequence.append(expanded_label)  # [seq_len, hidden_size]

                        sequence.append(self.special_tokens[0].expand(1, -1))  # [SEP]

                    # Add the main features to be predicted
                    main_seq_len = fused_features[i].size(0)  # sequence length of the main features
                    sequence.append(self.special_tokens[1].expand(1, -1))  # [PROMPT]
                    sequence.append(self.special_tokens[2].expand(1, -1))  # [AUDIO]
                    sequence.append(fused_features[i])  # [main_seq_len, hidden_size]
                    sequence.append(self.special_tokens[3].expand(1, -1))  # [LABEL]
                    # Use zero vectors at the prediction positions, same length as the main features
                    sequence.append(torch.zeros(main_seq_len, fused_features.size(-1)).to(fused_features.device))

                    prompt_embeddings.append(torch.cat(sequence, dim=0))

                prompt_embeddings = torch.stack(prompt_embeddings, dim=0)

        else:
            # Simplified handling for the no-prompt case
            batch_size = fused_features.size(0)
            main_seq_len = fused_features.size(1)  # main feature sequence length

            # Build the sequence [batch_size, total_len, hidden_size]
            prompt_embeddings = torch.cat([
                self.special_tokens[1].expand(batch_size, 1, -1),  # [PROMPT]
                self.special_tokens[2].expand(batch_size, 1, -1),  # [AUDIO]
                fused_features,  # [batch_size, main_seq_len, hidden_size]
                self.special_tokens[3].expand(batch_size, 1, -1),  # [LABEL]
                torch.zeros(batch_size, main_seq_len, fused_features.size(-1)).to(fused_features.device)  # prediction positions
            ], dim=1)

        # Feed into llama_nar
        output = self.llama_nar(inputs_embeds=prompt_embeddings)

        # Take the outputs at the prediction positions (the last main_seq_len positions)
        pred_pos_embeddings = output[:, -main_seq_len:, :]  # [batch_size, main_seq_len, hidden_size]
        # Classify each frame
        frame_logits = self.classifier(pred_pos_embeddings)  # [batch_size, main_seq_len, 2]

        # Also return utterance-level logits obtained by averaging the prediction positions
        avg_embedding = torch.mean(pred_pos_embeddings, dim=1)  # [batch_size, hidden_size]
        avg_logits = self.classifier(avg_embedding)  # [batch_size, 2]

        return {
            'frame_logits': frame_logits,  # per-frame prediction scores
            'avg_logits': avg_logits  # utterance-level prediction scores
        }


if __name__ == '__main__':
    import torch
    from torch.utils.data import DataLoader
    from dataset.train_MultiDataset import train_MultiDataset, collate_fn
    from tqdm import tqdm
    import time

    # Select device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"\n=== Device: {device} ===")

    # Initialize the model
    print("\n=== Initializing model ===")
    model = Wav2Vec2BERT_Llama().to(device)
    model.eval()  # evaluation mode

    # Print the parameter structure of wav2vec2bert
    print("\n=== Wav2Vec2BERT parameter structure ===")
    w2v_params_by_layer = {}
    total_trainable = 0
    total_frozen = 0

    for name, param in model.wav2vec2bert.named_parameters():
        # Top-level layer name
        layer_name = name.split('.')[0]
        if layer_name not in w2v_params_by_layer:
            w2v_params_by_layer[layer_name] = {
                'trainable_params': 0,
                'frozen_params': 0,
                'parameter_names': []
            }

        # Count parameters
        if param.requires_grad:
            w2v_params_by_layer[layer_name]['trainable_params'] += param.numel()
            total_trainable += param.numel()
        else:
            w2v_params_by_layer[layer_name]['frozen_params'] += param.numel()
            total_frozen += param.numel()

        w2v_params_by_layer[layer_name]['parameter_names'].append(name)

    # Per-layer details
    print("\nPer-layer parameter statistics:")
    for layer_name, info in w2v_params_by_layer.items():
        trainable_m = info['trainable_params'] / 1e6
        frozen_m = info['frozen_params'] / 1e6
        total_m = (info['trainable_params'] + info['frozen_params']) / 1e6

        print(f"\n{layer_name}:")
        print(f"  - total parameters: {total_m:.2f}M")
        print(f"  - trainable parameters: {trainable_m:.2f}M")
        print(f"  - frozen parameters: {frozen_m:.2f}M")
        print(f"  - parameter names:")
        for param_name in info['parameter_names']:
            print(f"    * {param_name}")

    # Overall statistics
    print("\n=== Overall statistics ===")
    print(f"Total trainable parameters: {total_trainable/1e6:.2f}M")
    print(f"Total frozen parameters: {total_frozen/1e6:.2f}M")
    print(f"Total parameters: {(total_trainable + total_frozen)/1e6:.2f}M")
    print(f"Trainable parameter ratio: {total_trainable/(total_trainable + total_frozen)*100:.2f}%")

    # Parameter counts per module
    wav2vec2bert_params = sum(p.numel() for p in model.wav2vec2bert.parameters())
    llama_params = sum(p.numel() for p in model.llama_nar.parameters())
    other_params = sum(p.numel() for name, p in model.named_parameters()
                       if not name.startswith('wav2vec2bert.') and not name.startswith('llama_nar.'))

    total_params = wav2vec2bert_params + llama_params + other_params

    print(f"\n=== Parameter counts ===")
    print(f"Wav2Vec2BERT parameters: {wav2vec2bert_params:,} ({wav2vec2bert_params/1e6:.2f}M)")
    print(f"LlamaNAR parameters: {llama_params:,} ({llama_params/1e6:.2f}M)")
    print(f"Other module parameters: {other_params:,} ({other_params/1e6:.2f}M)")
    print(f"Total parameters: {total_params:,} ({total_params/1e6:.2f}M)")

    # Percentages
    print(f"\n=== Parameter share ===")
    print(f"Wav2Vec2BERT: {wav2vec2bert_params/total_params*100:.2f}%")
    print(f"LlamaNAR: {llama_params/total_params*100:.2f}%")
    print(f"Other modules: {other_params/total_params*100:.2f}%")

    # Runtime and memory test
    print("\n=== Runtime and memory test (batch_size=4) ===")
    batch_size = 4
    total_samples = 600000

    # Clear the GPU cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        initial_memory = torch.cuda.memory_allocated() / 1024 / 1024
        print(f"Initial GPU memory usage: {initial_memory:.2f}MB")

    # Initialize the dataset
    print("\nInitializing dataset...")
    ds = train_MultiDataset(max_prompts=3)

    # Create the DataLoader
    dl = DataLoader(ds,
                    batch_size=batch_size,
                    shuffle=True,
                    collate_fn=collate_fn,
                    num_workers=4)

    print(f"\nDataset size: {len(ds)}")
    print(f"Number of batches: {len(dl)}")

    # Average time per batch
    num_test_batches = 10
    total_time = 0
    max_memory = 0

    print(f"\nMeasuring the average runtime over {num_test_batches} batches...")
    with torch.no_grad():
        for i, batch in enumerate(tqdm(dl, total=num_test_batches)):
            if i >= num_test_batches:
                break

            # Handle the dict-typed features correctly
            main_features = {
                'input_features': batch['main_features']['input_features'].to(device),
                'attention_mask': batch['main_features']['attention_mask'].to(device)
            }

            prompt_features = [{
                'input_features': pf['input_features'].to(device),
                'attention_mask': pf['attention_mask'].to(device)
            } for pf in batch['prompt_features']]

            labels = batch['labels'].to(device)
            prompt_labels = batch['prompt_labels'].to(device)

            # Record the start time
            start_time = time.time()

            # Forward pass
            outputs = model({
                'main_features': main_features,
                'prompt_features': prompt_features,
                'prompt_labels': prompt_labels
            })

            # Make sure GPU work has finished
            if torch.cuda.is_available():
                torch.cuda.synchronize()

            # Record the end time and memory usage
            end_time = time.time()
            total_time += (end_time - start_time)

            if torch.cuda.is_available():
                current_memory = torch.cuda.memory_allocated() / 1024 / 1024
                max_memory = max(max_memory, current_memory)

            # Print details of the first batch
            if i == 0:
                print("\n=== First batch details ===")
                print(f"Main feature shape: {main_features['input_features'].shape}")
                print(f"Main mask shape: {main_features['attention_mask'].shape}")
                print(f"Prompt feature shape: {prompt_features[0]['input_features'].shape}")
                print(f"Prompt mask shape: {prompt_features[0]['attention_mask'].shape}")
                print(f"Label shape: {labels.shape}")
                print(f"Prompt label shape: {prompt_labels.shape}")
                print(f"Frame logits shape: {outputs['frame_logits'].shape}")
                print(f"Avg logits shape: {outputs['avg_logits'].shape}")
                print(f"Avg logits range: [{outputs['avg_logits'].min().item():.3f}, {outputs['avg_logits'].max().item():.3f}]")

    # Compute and print statistics
    avg_time = total_time / num_test_batches
    print(f"\n=== Performance statistics ===")
    print(f"Average time per batch: {avg_time:.4f}s")
    print(f"Estimated time for {total_samples} samples: {(total_samples/batch_size*avg_time/3600):.2f}h")
    if torch.cuda.is_available():
        print(f"Peak GPU memory usage: {max_memory:.2f}MB")
        print(f"GPU memory growth: {max_memory - initial_memory:.2f}MB")

    print("\nTest finished!")
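The model returns both frame-level and utterance-level logits. A short hedged sketch of turning them into probabilities (the tensors below are placeholders with the documented shapes, and which class index means "spoof" depends on how the checkpoint was trained):

import torch
import torch.nn.functional as F

# Placeholder for the dict returned by Wav2Vec2BERT_Llama.forward(batch)
outputs = {
    'frame_logits': torch.randn(2, 30, 2),  # [batch, main_seq_len, 2]
    'avg_logits': torch.randn(2, 2),        # [batch, 2]
}

frame_probs = F.softmax(outputs['frame_logits'], dim=-1)  # per-frame scores
utt_probs = F.softmax(outputs['avg_logits'], dim=-1)      # utterance-level scores

# Utterance-level decision from the pooled head; the frame scores can help
# localize suspicious regions within the query audio.
print(utt_probs.argmax(dim=-1), frame_probs.mean(dim=1))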