waiv committed on
Commit f684a69 · verified · 1 Parent(s): 221e661

Upload 4 files

Files changed (4)
  1. README.md +162 -3
  2. __init__.py +4 -0
  3. preprocessor_config.json +9 -0
  4. requirements.txt +28 -0
README.md CHANGED
@@ -1,3 +1,162 @@
- ---
- license: cc-by-nc-4.0
- ---
+ ---
+ base_model:
+ - facebook/wav2vec2-large-xlsr-53
+ language:
+ - en
+ license: cc-by-nc-4.0
+ pipeline_tag: audio-classification
+ tags:
+ - audio
+ - classification
+ - Wav2Vec2
+ - sentiment
+ - earnings conference calls
+ ---
+
+ # FinVoc2Vec
+ We introduce FinVoc2Vec, a vocal tone classifier designed for real-world corporate disclosures.
+ In the first stage, we apply a self-supervised pre-training procedure that adapts the base model to the acoustic characteristics of disclosure environments, using a sample of 500,000 unlabeled sentences of conference call speech. In the second stage, we apply a supervised fine-tuning procedure that enables the model to learn representations of human-labeled vocal tone. To this end, we construct a speech corpus of 5,000 audio recordings of linguistically neutral sentences from conference calls and manually label each sentence with its perceived vocal tone: positive, negative, or neutral.
+
+
+ ## Example Usage
+ ```python
+ import torch
+ import torch.nn.functional as F
+ from torch.utils.data import DataLoader
+ from datasets import load_dataset
+ from dataclasses import dataclass
+ from typing import Dict, List, Optional, Union
+ from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor, AutoModel
+ import torchaudio
+
+ @dataclass
+ class DataCollatorWithPadding:
+     """Pads a batch of raw waveforms to a common length and builds the attention mask."""
+
+     processor: Union[Wav2Vec2Processor, Wav2Vec2FeatureExtractor]
+     padding: Union[bool, str] = True
+     max_length: Optional[int] = None
+     pad_to_multiple_of: Optional[int] = None
+
+     def __call__(self,
+                  features: List[Dict[str, Union[List[float], torch.Tensor]]]
+                  ) -> Dict[str, torch.Tensor]:
+
+         input_features = [{"input_values": feature["input_values"]} for feature in features]
+
+         # truncate and pad to a common length, return the attention mask
+         batch = self.processor.pad(
+             input_features,
+             padding=self.padding,
+             max_length=self.max_length,
+             pad_to_multiple_of=self.pad_to_multiple_of,
+             return_tensors="pt")
+         return batch
+
+
+ def preprocess_audio(batch: Dict,
+                      feature_extractor: Wav2Vec2FeatureExtractor,
+                      max_duration: Optional[float] = 20.0):
+
+     target_sr = feature_extractor.sampling_rate  # 16 kHz
+     audio_arrays = []
+
+     for path in batch['path']:
+         audio_array, sampling_rate = torchaudio.load(path)
+
+         # downmix to mono if multiple channels exist
+         if audio_array.shape[0] > 1:
+             audio_array = torch.mean(audio_array, dim=0, keepdim=True)
+
+         # resample audio to the feature extractor's sampling rate
+         resampler = torchaudio.transforms.Resample(sampling_rate, target_sr)
+         audio_array = resampler(audio_array).squeeze().numpy()
+         audio_arrays.append(audio_array)
+
+     # maximum input length in samples (None disables truncation)
+     max_length = int(target_sr * max_duration) if max_duration is not None else None
+
+     # use the feature extractor to normalize inputs and truncate data
+     result = feature_extractor(
+         audio_arrays,
+         sampling_rate=target_sr,
+         max_length=max_length,
+         truncation=max_length is not None)
+     return result
+
+ # load model
+ model = AutoModel.from_pretrained("waiv/FinVoc2Vec", trust_remote_code=True)
+
+ # load feature extractor
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("waiv/FinVoc2Vec")
+
+ # load dataset
+ # NOTE: the dataset needs a 'path' feature pointing to the audio files;
+ # pick the split you want to score (here assumed to be 'test')
+ test_dataset = load_dataset(r'path/to/dataset', split='test')
+
+ # preprocess audio data
+ test_dataset = test_dataset.map(
+     preprocess_audio,
+     batch_size=1000,
+     batched=True,
+     num_proc=4,
+     fn_kwargs={'feature_extractor': feature_extractor,
+                'max_duration': 20.0})
+
+ data_collator = DataCollatorWithPadding(feature_extractor)
+
+ data_loader = DataLoader(
+     test_dataset,
+     batch_size=16,
+     shuffle=False,
+     collate_fn=data_collator,
+     num_workers=4)
+
+ with torch.no_grad():
+     for batch in data_loader:
+         attention_mask, inputs = batch['attention_mask'], batch['input_values']
+         model_output = model(inputs, attention_mask=attention_mask)
+
+         logits = model_output['logits'].to(torch.float32)
+         probs = F.softmax(logits, dim=1).numpy()
+
+         label_to_id = model.config.label2id
+         dict_probs = {'prob_negative': probs[:, label_to_id['negative']],
+                       'prob_neutral': probs[:, label_to_id['neutral']],
+                       'prob_positive': probs[:, label_to_id['positive']]}
+ ```
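+
+ The class probabilities can be mapped back to label names for readability. A minimal follow-up sketch, assuming the config also carries the standard `id2label` mapping (the inverse of the `label2id` used above):
+ ```python
+ # hypothetical follow-up to the loop above (assumes model.config.id2label exists)
+ pred_ids = probs.argmax(axis=1)  # most likely class per clip
+ pred_labels = [model.config.id2label[int(i)] for i in pred_ids]
+ print(list(zip(pred_labels, probs.max(axis=1))))  # label with its probability
+ ```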
+
+ ## Register for autoclass
+ To register the model with your local Auto classes, use the following code:
+ ```python
+ from transformers import AutoConfig, AutoModel
+ # the classes ship with this repository (see __init__.py); adjust the import
+ # path to wherever the module files live locally
+ from finvoc2vec_config import FinVoc2VecConfig
+ from finvoc2vec_model import FinVoc2Vec
+
+ # download model and config
+ finvoc2vec_config = AutoConfig.from_pretrained("waiv/FinVoc2Vec", trust_remote_code=True)
+ finvoc2vec_model = AutoModel.from_pretrained("waiv/FinVoc2Vec", trust_remote_code=True)
+
+ # register model and config for the automodel class
+ AutoConfig.register("finvoc2vec", FinVoc2VecConfig)
+ AutoModel.register(FinVoc2VecConfig, FinVoc2Vec)
+ ```
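+
+ Once registered, the Auto classes can construct the model directly from a config object, without `trust_remote_code`. A minimal sketch under that assumption:
+ ```python
+ # after registration, AutoModel resolves FinVoc2VecConfig to FinVoc2Vec
+ model = AutoModel.from_config(finvoc2vec_config)  # fresh, randomly initialized instance
+ ```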
+
+ ## Further resources
+ Check the 🤗 Hugging Face [Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2) model description for additional resources and configurations.
+
+ ## License
+ - This model is a derivative work based on Wav2Vec2 (Apache-2.0).
+ - This model is licensed under the Creative Commons Attribution Non-Commercial 4.0 license (CC-BY-NC-4.0).
+
+ ## Paper
+ - [Listen Closely: Measuring Vocal Tone in Corporate Disclosures](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4307178)
+
+ ## BibTeX
+ ```
+ @article{ewertz2024,
+     title={Listen Closely: Measuring Vocal Tone in Corporate Disclosures},
+     author={Ewertz, Jonas and Knickrehm, Charlotte and Nienhaus, Martin and Reichmann, Doron},
+     year={2024},
+     note={Available at SSRN: \url{https://ssrn.com/abstract=4307178}}
+ }
+ ```
__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .finvoc2vec_config import FinVoc2VecConfig
+ from .finvoc2vec_model import FinVoc2Vec
+
+ __all__ = ['FinVoc2VecConfig', 'FinVoc2Vec']
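With these exports, both classes can be imported in one line when the repository is used as a local Python package. A minimal sketch; the package directory name `finvoc2vec` is an assumption:

```python
# assumes the repo was cloned into a local package directory named `finvoc2vec`
from finvoc2vec import FinVoc2Vec, FinVoc2VecConfig

config = FinVoc2VecConfig()  # default configuration (assumed constructible without arguments)
model = FinVoc2Vec(config)   # randomly initialized; load trained weights via from_pretrained
```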
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "do_normalize": true,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0,
+   "return_attention_mask": true,
+   "sampling_rate": 16000
+ }
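This configuration drives the feature extractor used in the usage example: single-channel 16 kHz input, zero-padding on the right, normalization, and an attention mask in the output. A minimal sketch of loading and applying it (the dummy input is illustrative only):

```python
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

# reads preprocessor_config.json from the model repo
fe = Wav2Vec2FeatureExtractor.from_pretrained("waiv/FinVoc2Vec")

dummy = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
out = fe(dummy, sampling_rate=fe.sampling_rate, return_tensors="pt")
print(out.input_values.shape, out.attention_mask.shape)  # normalized input + mask
```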
requirements.txt ADDED
@@ -0,0 +1,28 @@
+ datasets==2.20.0
+ numba==0.59.1
+ numpy==1.26.2
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvcc-cu12==12.2.140
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==8.9.2.26
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.20.5
+ nvidia-nvjitlink-cu12==12.2.140
+ nvidia-nvtx-cu12==12.1.105
+ pandas==2.1.4
+ torch==2.3.1
+ torch-audiomentations==0.11.0
+ torch-pitch-shift==1.2.4
+ torch-tb-profiler==0.4.3
+ torchaudio==2.3.1
+ torchinfo==1.8.0
+ torchmetrics==1.2.1
+ torchsummary==1.5.1
+ torchvision==0.18.1
+ transformers==4.42.4
+ tqdm==4.66.4