do not use clip() for distillation
README.md CHANGED
````diff
@@ -19,7 +19,7 @@ tags:
 # Arousal - Dominance - Valence
 
 Dimensional Speech Emotion Recognition model of simultaneous use of [WavLM](https://huggingface.co/3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes) / [Wav2Vec2.0](https://hf.rst.im/audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim).
-Achieves `0.6760566` valence CCC on [MSP Podcast Test 1](https://paperswithcode.com/sota/speech-emotion-recognition-on-msp-podcast). Used as teacher for [
+Achieves `0.6760566` valence CCC on [MSP Podcast Test 1](https://paperswithcode.com/sota/speech-emotion-recognition-on-msp-podcast). Used as teacher for [Wav2Small](https://arxiv.org/abs/2408.13920).
 
 
 
````
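For context on the score above: CCC (Concordance Correlation Coefficient) is the standard metric for dimensional speech emotion recognition. A minimal NumPy sketch of the formula, with illustrative variable names (not code from this repo):

```python
import numpy as np

def ccc(pred, gold):
    '''Concordance Correlation Coefficient over per-utterance scores.'''
    mu_p, mu_g = pred.mean(), gold.mean()
    cov = ((pred - mu_p) * (gold - mu_g)).mean()
    return 2 * cov / (pred.var() + gold.var() + (mu_p - mu_g) ** 2)
```

Unlike plain Pearson correlation, CCC also penalises scale and mean shifts, so a teacher has to match the label distribution, not just its ordering.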
````diff
@@ -48,9 +48,8 @@ import torch
 import types
 import torch.nn as nn
 from transformers import AutoModelForAudioClassification
-from transformers.models.wav2vec2.modeling_wav2vec2 import (
-    Wav2Vec2Model,
-    Wav2Vec2PreTrainedModel)
+from transformers.models.wav2vec2.modeling_wav2vec2 import (Wav2Vec2Model,
+                                                            Wav2Vec2PreTrainedModel)
 
 
 signal = torch.from_numpy(
````
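The rewritten import pulls in two distinct things: `Wav2Vec2PreTrainedModel` supplies `from_pretrained()` and weight initialisation for a custom class, while `Wav2Vec2Model` is the backbone that class wraps. A sketch of the `Dawn` skeleton these imports support, assuming the README's `ADV` head (its definition falls outside the hunks shown here):

```python
class Dawn(Wav2Vec2PreTrainedModel):
    '''Wav2Vec2 backbone with an arousal/dominance/valence regression head.'''
    def __init__(self, config):
        super().__init__(config)
        self.wav2vec2 = Wav2Vec2Model(config)  # pretrained speech encoder
        self.classifier = ADV(config)          # A/D/V head from the README
```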
````diff
@@ -85,22 +84,19 @@ class Dawn(Wav2Vec2PreTrainedModel):
         self.classifier = ADV(config)
 
     def forward(self, x):
-        '''x: (batch, audio-samples-16KHz)'''
         x -= x.mean(1, keepdim=True)
         variance = (x * x).mean(1, keepdim=True) + 1e-7
-        x = self.wav2vec2(x / variance.sqrt()
-                          ).last_hidden_state
-        return self.classifier(x.mean(1))
+        x = self.wav2vec2(x / variance.sqrt())
+        return self.classifier(x.last_hidden_state.mean(1))
 
 
-def _infer(self, x):
+def _forward(self, x):
     '''x: (batch, audio-samples-16KHz)'''
-    x = (x + self.config.mean) / self.config.std  #
+    x = (x + self.config.mean) / self.config.std  # sgn
     x = self.ssl_model(x, attention_mask=None).last_hidden_state
     # pool
     h = self.pool_model.sap_linear(x).tanh()
-    w = torch.matmul(h, self.pool_model.attention)
-    w = w.softmax(1)
+    w = torch.matmul(h, self.pool_model.attention).softmax(1)
     mu = (x * w).sum(1)
     x = torch.cat(
         [
````
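The patched `_forward` replaces the WavLM checkpoint's stock head with explicit self-attentive pooling: a linear layer plus `tanh` projects each frame, a learned attention vector scores it, and the softmax-normalised scores form a weighted mean over time (the `torch.cat` at the end of the hunk then stacks this with further statistics). A self-contained sketch of that pooling step, with a hypothetical hidden size:

```python
import torch
import torch.nn as nn

class SelfAttentivePool(nn.Module):
    '''Attention-weighted mean over time frames, as in pool_model above.'''
    def __init__(self, hidden=1024):
        super().__init__()
        self.sap_linear = nn.Linear(hidden, hidden)
        self.attention = nn.Parameter(torch.randn(hidden, 1))

    def forward(self, x):                               # x: (batch, frames, hidden)
        h = self.sap_linear(x).tanh()                   # frame-wise projection
        w = torch.matmul(h, self.attention).softmax(1)  # (batch, frames, 1) weights
        return (x * w).sum(1)                           # (batch, hidden) pooled mean

pool = SelfAttentivePool()
print(pool(torch.randn(2, 99, 1024)).shape)             # torch.Size([2, 1024])
```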
````diff
@@ -115,7 +111,7 @@ def _infer(self, x):
 base = AutoModelForAudioClassification.from_pretrained(
     '3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes',
     trust_remote_code=True).to(device).eval()
-base.forward = types.MethodType(_infer, base)
+base.forward = types.MethodType(_forward, base)
 
 # Wav2Vec2
 
````
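`types.MethodType` is what makes the patch take effect: it binds the free function `_forward` to this one model instance, and because PyTorch's `Module.__call__` looks up `self.forward`, the instance attribute shadows the class method on the next `base(x)` call. A toy illustration of the mechanism:

```python
import types

class Toy:
    scale = 10

def doubled(self, x):          # plain function; 'self' is bound below
    return self.scale * x

t = Toy()
t.forward = types.MethodType(doubled, t)  # patch this instance only
print(t.forward(3))            # 30; other Toy instances are unaffected
```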
````diff
@@ -128,6 +124,7 @@ def wav2small(x):
     return .5 * dawn(x) + .5 * base(x)
 
 pred = wav2small(signal.to(device))
-print(f'
-      f'
+print(f'Arousal={pred[0, 0]} '
+      f'Dominance={pred[0, 1]} ',
+      f'Valence={pred[0, 2]}')
 ```
````
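The commit title suggests the rationale, though the removed lines are cut off in this view: `clip()` is flat outside its range, so a clipped value carries no gradient (if student predictions are clipped inside a distillation loss) and no ranking information near the boundaries (if teacher targets are clipped). A small illustration of the gradient side, with made-up values:

```python
import torch

x = torch.tensor([1.3], requires_grad=True)
x.clip(0, 1).backward()  # saturates at 1.0
print(x.grad)            # tensor([0.]) -- no gradient flows through the clip
```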