README.md (CHANGED)
# Usage
```python
from transformers import AutoModelForAudioClassification
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel
)
import torch
import types
import torch.nn as nn

signal = torch.rand((1, 16000))  # audio signal, 16 kHz
device = 'cpu'


class RegressionHead(nn.Module):
    r"""A/D/V (arousal / dominance / valence) regression head."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, x):
        x = self.dense(x)
        x = torch.tanh(x)
        return self.out_proj(x)


class Dawn(Wav2Vec2PreTrainedModel):
    r"""https://arxiv.org/abs/2203.07378"""

    def __init__(self, config):
        super().__init__(config)
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)

    def forward(self, x):
        '''x: (batch, audio-samples-16KHz)'''
        x = x - x.mean(1, keepdim=True)
        variance = (x * x).mean(1, keepdim=True) + 1e-7
        out = self.wav2vec2(x / variance.sqrt())
        return self.classifier(out[0].mean(1)).clip(0, 1)


def _infer(self, x):
    '''x: (batch, audio-samples-16KHz)'''
    x = (x + self.config.mean) / self.config.std  # plus
    x = self.ssl_model(x, attention_mask=None).last_hidden_state
    # pool
    h = self.pool_model.sap_linear(x).tanh()
    # ... (the remaining attentive-pooling and output lines are unchanged
    #      and not shown in this diff)


# WavLM

base = AutoModelForAudioClassification.from_pretrained(
    '3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes',
    trust_remote_code=True).to(device).eval()
base.forward = types.MethodType(_infer, base)

# Wav2Vec2

dawn = Dawn.from_pretrained(
    'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
).to(device).eval()


def wav2small(x):
    # average the Dawn and WavLM A/D/V predictions
    return .5 * dawn(x) + .5 * base(x)


with torch.no_grad():
    pred = wav2small(signal.to(device))
print(f'\nArousal = {pred[0, 0]} Dominance = {pred[0, 1]}',
      f' Valence = {pred[0, 2]}')
```
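The example above runs on a random tensor. To try it on a real recording, one possible sketch is shown below; it is not part of the original README, assumes `torchaudio` is installed, and uses `speech.wav` as a placeholder path. It reuses `wav2small` and `device` from the snippet above and resamples the file to the 16 kHz mono input both models expect.

```python
# Hedged sketch, not from the original README: load a real file instead of
# the random `signal`. Assumes torchaudio is installed; 'speech.wav' is a
# placeholder path. Reuses wav2small() and device defined above.
import torch
import torchaudio

wav, sr = torchaudio.load('speech.wav')               # (channels, samples)
wav = torchaudio.functional.resample(wav, sr, 16000)  # models expect 16 kHz
wav = wav.mean(0, keepdim=True)                       # mixdown to mono -> (1, samples)

with torch.no_grad():
    pred = wav2small(wav.to(device))                  # shape (1, 3)
print(pred)
```

Each row of `pred` holds arousal, dominance and valence, in that order, matching the `print` call in the README snippet.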