alefiury Salman11223 committed on
Commit
c0fab16
·
1 Parent(s): 86366c6

Update README.md (#9)

Browse files

- Update README.md (fc858efea9fac4aed1b0122c9e039b65a34c31b8)


Co-authored-by: Salman Zafar <[email protected]>

Files changed (1) hide show
  1. README.md +20 -14
README.md CHANGED
@@ -78,32 +78,37 @@ class CustomDataset(torch.utils.data.Dataset):
78
  return audio
79
 
80
 
81
- def __getitem__(self, index) -> torch.Tensor:
82
- """
83
- Return the audio and the sampling rate
84
- """
85
  if self.basedir is None:
86
  filepath = self.dataset[index]
87
  else:
88
  filepath = os.path.join(self.basedir, self.dataset[index])
89
-
90
  speech_array, sr = torchaudio.load(filepath)
91
-
92
- # Transform to mono
93
  if speech_array.shape[0] > 1:
94
  speech_array = torch.mean(speech_array, dim=0, keepdim=True)
95
-
96
  if sr != self.sampling_rate:
97
  transform = torchaudio.transforms.Resample(sr, self.sampling_rate)
98
  speech_array = transform(speech_array)
99
  sr = self.sampling_rate
100
-
 
 
 
 
 
 
 
 
 
 
 
101
  speech_array = speech_array.squeeze().numpy()
 
 
102
 
103
- # Cut or pad audio
104
- speech_array = self._cutorpad(speech_array)
105
-
106
- return speech_array
107
 
108
  class CollateFunc:
109
  def __init__(
@@ -172,7 +177,8 @@ def get_gender(model_name_or_path: str, audio_paths: List[str], label2id: Dict,
172
  id2label=id2label,
173
  )
174
 
175
- test_dataset = CustomDataset(audio_paths)
 
176
  data_collator = CollateFunc(
177
  processor=feature_extractor,
178
  padding=True,
 
78
  return audio
79
 
80
 
81
+ def __getitem__(self, index):
 
 
 
82
  if self.basedir is None:
83
  filepath = self.dataset[index]
84
  else:
85
  filepath = os.path.join(self.basedir, self.dataset[index])
86
+
87
  speech_array, sr = torchaudio.load(filepath)
88
+
 
89
  if speech_array.shape[0] > 1:
90
  speech_array = torch.mean(speech_array, dim=0, keepdim=True)
91
+
92
  if sr != self.sampling_rate:
93
  transform = torchaudio.transforms.Resample(sr, self.sampling_rate)
94
  speech_array = transform(speech_array)
95
  sr = self.sampling_rate
96
+
97
+ len_audio = speech_array.shape[1]
98
+
99
+ # Pad or truncate the audio to match the desired length
100
+ if len_audio < self.max_audio_len * self.sampling_rate:
101
+ # Pad the audio if it's shorter than the desired length
102
+ padding = torch.zeros(1, self.max_audio_len * self.sampling_rate - len_audio)
103
+ speech_array = torch.cat([speech_array, padding], dim=1)
104
+ else:
105
+ # Truncate the audio if it's longer than the desired length
106
+ speech_array = speech_array[:, :self.max_audio_len * self.sampling_rate]
107
+
108
  speech_array = speech_array.squeeze().numpy()
109
+
110
+ return {"input_values": speech_array, "attention_mask": None}
111
 
 
 
 
 
112
 
113
  class CollateFunc:
114
  def __init__(
 
177
  id2label=id2label,
178
  )
179
 
180
+ test_dataset = CustomDataset(audio_paths, max_audio_len=300) # for 5-minute audio
181
+
182
  data_collator = CollateFunc(
183
  processor=feature_extractor,
184
  padding=True,