hagenw committed on
Commit
c6c2fd0
·
1 Parent(s): e079ec8

Add first version of app

Browse files
Files changed (3) hide show
  1. README.md +4 -0
  2. app.py +144 -0
  3. requirements.txt +3 -0
README.md CHANGED
@@ -8,6 +8,10 @@ sdk_version: 4.42.0
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-nc-4.0
 
 
 
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-nc-4.0
11
+ tags:
12
+ - age
13
+ - gender
14
+ - audio
15
  ---
16
 
17
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import spaces
4
+ import torch
5
+ import torch.nn as nn
6
+ from transformers import Wav2Vec2Processor
7
+ from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model
8
+ from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2PreTrainedModel
9
+
10
+ import audiofile
11
+
12
+
13
class ModelHead(nn.Module):
    r"""Two-layer projection head applied to pooled encoder features.

    Args:
        config: model configuration; ``hidden_size`` and ``final_dropout``
            are read from it
        num_labels: size of the output dimension

    """

    def __init__(self, config, num_labels):
        super().__init__()

        width = config.hidden_size
        # The attribute names double as parameter names in the state dict,
        # so they are kept exactly as they are.
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(width, num_labels)

    def forward(self, features, **kwargs):
        r"""Project ``features`` to ``num_labels`` outputs.

        The same dropout layer is applied twice: once on the input
        and once after the tanh non-linearity.
        Additional keyword arguments are accepted and ignored.

        """
        hidden = self.dense(self.dropout(features))
        hidden = self.dropout(torch.tanh(hidden))
        return self.out_proj(hidden)
34
+
35
+
36
class AgeGenderModel(Wav2Vec2PreTrainedModel):
    r"""Age and gender classifier on top of a wav2vec 2.0 encoder.

    The encoder output is mean-pooled
    and fed to two independent heads:
    ``age`` with a single output
    and ``gender`` with three outputs.

    Args:
        config: model configuration,
            passed on to the encoder and both heads

    """

    def __init__(self, config):
        super().__init__(config)

        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.age = ModelHead(config, 1)
        self.gender = ModelHead(config, 3)
        self.init_weights()

    def forward(
        self,
        input_values,
    ):
        r"""Run the encoder and both heads on ``input_values``.

        Args:
            input_values: input passed unchanged to the wav2vec 2.0 encoder

        Returns:
            tuple ``(hidden_states, logits_age, probs_gender)``:
            the mean-pooled encoder output,
            the raw output of the age head,
            and the softmax-normalized output of the gender head

        """
        outputs = self.wav2vec2(input_values)
        # Pool the first encoder output over dimension 1
        # (presumably the time axis of a (batch, time, hidden) tensor).
        hidden_states = torch.mean(outputs[0], dim=1)
        logits_age = self.age(hidden_states)
        # Softmax is applied here,
        # so the gender output holds probabilities, not logits.
        probs_gender = torch.softmax(self.gender(hidden_states), dim=1)

        return hidden_states, logits_age, probs_gender
61
+
62
+
63
+
64
# Load processor and model from the hub.
# ``device`` is CUDA device 0 when CUDA is available, otherwise the CPU.
device = 0 if torch.cuda.is_available() else "cpu"
model_name = "audeering/wav2vec2-large-robust-24-ft-age-gender"
processor = Wav2Vec2Processor.from_pretrained(model_name)
# ``process_func`` moves its input tensor to ``device``,
# so the model has to be placed on the same device
# to avoid a device mismatch when CUDA is available.
model = AgeGenderModel.from_pretrained(model_name).to(device)
69
+
70
+
71
def process_func(x: np.ndarray, sampling_rate: int) -> list:
    r"""Predict age and gender from a raw audio signal.

    Args:
        x: raw audio signal
        sampling_rate: sampling rate of ``x``

    Returns:
        list of four ``{"score": ..., "label": ...}`` entries
        with the labels ``"age"``, ``"female"``, ``"male"``, ``"child"``
        in that order.
        The age score is the age output of the model multiplied by 100,
        the other scores are the softmax outputs of the gender head.
        All scores are built-in floats.

    """
    # run through processor to normalize signal
    # always returns a batch, so we just get the first entry
    # then we put it on the device
    features = processor(x, sampling_rate=sampling_rate)
    features = features["input_values"][0].reshape(1, -1)
    features = torch.from_numpy(features).to(device)

    # run through model,
    # the first entry of the output (pooled hidden states) is not needed
    with torch.no_grad():
        _, age, gender = model(features)
        scores = torch.hstack([age, gender])

    # convert the single batch entry to numpy
    scores = scores.cpu().numpy()[0]

    # convert to list of score/label entries,
    # casting numpy scalars to built-in floats
    return [
        {"score": float(100 * scores[0]), "label": "age"},
        {"score": float(scores[1]), "label": "female"},
        {"score": float(scores[2]), "label": "male"},
        {"score": float(scores[3]), "label": "child"},
    ]
98
+
99
+
100
@spaces.GPU
def recognize(file):
    r"""Predict age and gender for an audio file.

    Args:
        file: path to the audio file,
            or ``None`` if nothing was submitted

    Returns:
        tuple of the age formatted as text
        and a dictionary mapping the remaining labels to their scores,
        matching the ``Textbox`` and ``Label`` outputs of the interfaces

    Raises:
        gr.Error: if ``file`` is ``None``

    """
    if file is None:
        raise gr.Error(
            "No audio file submitted! "
            "Please upload or record an audio file "
            "before submitting your request."
        )
    # NOTE(review): the signal is passed on at its native sampling rate;
    # the processor presumably expects 16 kHz input --
    # confirm whether resampling is required here.
    signal, sampling_rate = audiofile.read(file)
    predictions = process_func(signal, sampling_rate)
    # A ``Label`` output needs a mapping from label to score,
    # a list of score/label entries is rejected by it.
    scores = {entry["label"]: float(entry["score"]) for entry in predictions}
    # Age is not a confidence value, so it is shown as plain text instead.
    age = scores.pop("age")
    return f"{round(age)} years", scores


def _build_outputs():
    r"""Create a fresh pair of output components for one interface."""
    # ``gr.outputs`` is not available in Gradio 4,
    # components are created directly from the ``gr`` namespace.
    return [gr.Textbox(label="Age"), gr.Label(label="Gender")]


demo = gr.Blocks()

title = "audEERING age and gender recognition"
# f-string, so that the model name is inserted into the link
description = (
    "Recognize age and gender of a microphone recording or audio file. "
    f"Demo uses the checkpoint [{model_name}](https://huggingface.co/{model_name})."
)
allow_flagging = "never"

microphone = gr.Interface(
    fn=recognize,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=_build_outputs(),
    title=title,
    description=description,
    allow_flagging=allow_flagging,
)

file = gr.Interface(
    fn=recognize,
    inputs=gr.Audio(sources="upload", type="filepath", label="Audio file"),
    outputs=_build_outputs(),
    title=title,
    description=description,
    allow_flagging=allow_flagging,
)

with demo:
    gr.TabbedInterface([microphone, file], ["Microphone", "Audio file"])

demo.queue().launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ audiofile
2
+ # torch
3
+ transformers