Upload 4 files
- README.md +162 -3
- __init__.py +4 -0
- preprocessor_config.json +9 -0
- requirements.txt +28 -0
README.md
CHANGED
---
base_model:
- facebook/wav2vec2-large-xlsr-53
language:
- en
license: cc-by-nc-4.0
pipeline_tag: audio-classification
tags:
- audio
- classification
- Wav2Vec2
- sentiment
- earnings conference calls
---

# FinVoc2Vec

We introduce FinVoc2Vec, a vocal tone classifier designed for real-world corporate disclosures. In the first stage, we apply a self-supervised pre-training procedure that allows the base model to adapt to the acoustic characteristics of disclosure environments, using a sample of 500,000 unlabeled sentences of conference-call speech. In the second stage, we apply a supervised fine-tuning procedure that enables the model to learn representations of human-labeled vocal tone. For this, we construct a speech corpus containing 5,000 audio recordings of linguistically neutral sentences from conference calls and manually label each sentence with its perceived vocal tone: positive, negative, or neutral.

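Before running inference, it can help to check how the three tone classes are indexed. A quick sketch (it assumes the hosted config carries the usual `label2id`/`id2label` mappings, as the usage example below does):

```python
from transformers import AutoConfig

# the class indices used in the usage example below come from the model config
config = AutoConfig.from_pretrained("waiv/FinVoc2Vec", trust_remote_code=True)
print(config.label2id)  # e.g. {'negative': 0, 'neutral': 1, 'positive': 2}; the actual order may differ
```
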
## Example Usage
```python
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from datasets import load_dataset
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor, AutoModel
import torchaudio

@dataclass
class DataCollatorWithPadding:

    processor: Union[Wav2Vec2Processor, Wav2Vec2FeatureExtractor]
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self,
                 features: List[Dict[str, Union[List[int], torch.Tensor]]]
                 ) -> Dict[str, torch.Tensor]:

        input_features = [{"input_values": feature["input_values"]} for feature in features]

        # pad to a common length (truncating at max_length) and build the attention mask
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt")
        return batch


def preprocess_audio(batch: Dict,
                     feature_extractor: Wav2Vec2FeatureExtractor = None,
                     max_duration: Optional[float] = 20.0):

    target_sr = feature_extractor.sampling_rate  # 16 kHz
    audio_arrays = []

    for path in batch['path']:
        audio_array, sampling_rate = torchaudio.load(path)

        # downmix to mono if multiple channels exist
        if audio_array.shape[0] > 1:
            audio_array = torch.mean(audio_array, dim=0, keepdim=True)

        # resample audio to the feature extractor's sampling rate
        resampler = torchaudio.transforms.Resample(sampling_rate, target_sr)
        audio_array = resampler(audio_array).squeeze().numpy()
        audio_arrays.append(audio_array)

    # set params for the feature extractor
    max_length = int(target_sr * max_duration) if max_duration is not None else None

    # use the feature extractor to normalize inputs and truncate data
    result = feature_extractor(
        audio_arrays,
        sampling_rate=target_sr,
        max_length=max_length,
        truncation=bool(max_length))
    return result

# load model
model = AutoModel.from_pretrained("waiv/FinVoc2Vec", trust_remote_code=True)

# load feature extractor
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("waiv/FinVoc2Vec")

# load dataset
# NOTE: required feature: 'path' -> path to the audio data
# pick the split you need; without `split`, load_dataset returns a DatasetDict
test_dataset = load_dataset(r'path/to/dataset', split='test')

# preprocess audio data
test_dataset = test_dataset.map(
    preprocess_audio,
    batch_size=1000,
    batched=True,
    num_proc=4,
    fn_kwargs={'feature_extractor': feature_extractor,
               'max_duration': 20.0})

data_collator = DataCollatorWithPadding(feature_extractor)

data_loader = DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=False,
    collate_fn=data_collator,
    num_workers=4)

model.eval()
with torch.no_grad():
    for batch in data_loader:

        attention_mask, inputs = batch['attention_mask'], batch['input_values']
        model_output = model(inputs, attention_mask=attention_mask)

        logits = model_output['logits'].to(torch.float32)
        probs = F.softmax(logits, dim=1).numpy()

        label_to_id = model.config.label2id
        dict_probs = {'prob_negative': probs[:, label_to_id['negative']],
                      'prob_neutral': probs[:, label_to_id['neutral']],
                      'prob_positive': probs[:, label_to_id['positive']]}
```
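
The loop above leaves one `dict_probs` per batch. If a single table of per-sentence scores is more convenient, here is one way to collect them (a sketch, not part of the original example; it assumes `pandas`, which is pinned in `requirements.txt`, and relies on `shuffle=False` keeping rows in dataset order):

```python
import pandas as pd

results = []
model.eval()
with torch.no_grad():
    for batch in data_loader:
        out = model(batch['input_values'], attention_mask=batch['attention_mask'])
        probs = F.softmax(out['logits'].to(torch.float32), dim=1).numpy()
        results.append(pd.DataFrame({
            'prob_negative': probs[:, model.config.label2id['negative']],
            'prob_neutral': probs[:, model.config.label2id['neutral']],
            'prob_positive': probs[:, model.config.label2id['positive']]}))

# rows follow the dataset order because shuffle=False
probs_table = pd.concat(results, ignore_index=True)
probs_table['pred_label'] = probs_table.idxmax(axis=1).str.removeprefix('prob_')
```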

## Register for autoclass
To register the model for your local autoclass, use the following code:
```python
from transformers import AutoConfig, AutoModel
# FinVoc2VecConfig and FinVoc2Vec are exported by this repository's __init__.py;
# the import below assumes the repository files are available locally as a package named `finvoc2vec`
from finvoc2vec import FinVoc2VecConfig, FinVoc2Vec

# download model and config
finvoc2vec_config = AutoConfig.from_pretrained("waiv/FinVoc2Vec", trust_remote_code=True)
finvoc2vec_model = AutoModel.from_pretrained("waiv/FinVoc2Vec", trust_remote_code=True)

# register model and config for the automodel class
AutoConfig.register("finvoc2vec", FinVoc2VecConfig)
AutoModel.register(FinVoc2VecConfig, FinVoc2Vec)
```
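
A quick way to confirm the registration took effect (a sketch; `AutoConfig.for_model` instantiates the config class registered for a given model type):

```python
from transformers import AutoConfig
from finvoc2vec import FinVoc2VecConfig  # hypothetical local package, as above

# after registration, the model type resolves to the custom config class
assert isinstance(AutoConfig.for_model("finvoc2vec"), FinVoc2VecConfig)
```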

## Further resources
Check the 🤗 Hugging Face [Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2) model description for additional resources and configurations.

## License
- This model is a derivative work based on Wav2Vec2 (Apache-2.0).
- This model is licensed under the Creative Commons Attribution Non-Commercial 4.0 license (CC-BY-NC-4.0).

## Paper
- [Listen Closely: Measuring Vocal Tone in Corporate Disclosures](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4307178)

## BibTeX
```
@article{ewertz2024,
  title={Listen Closely: Measuring Vocal Tone in Corporate Disclosures},
  author={Ewertz, Jonas and Knickrehm, Charlotte and Nienhaus, Martin and Reichmann, Doron},
  year={2024},
  note={Available at SSRN: \url{https://ssrn.com/abstract=4307178}}
}
```
__init__.py
ADDED

```python
from .finvoc2vec_config import FinVoc2VecConfig
from .finvoc2vec_model import FinVoc2Vec

__all__ = ['FinVoc2VecConfig', 'FinVoc2Vec']
```
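
These exports are what the registration snippet in the README imports. A minimal check, assuming the repository was cloned into a local folder named `finvoc2vec` that is on the Python path:

```python
# hypothetical local layout: ./finvoc2vec/__init__.py (this file) plus the config/model modules
from finvoc2vec import FinVoc2VecConfig, FinVoc2Vec

print(FinVoc2Vec.__name__, FinVoc2VecConfig.__name__)
```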
preprocessor_config.json
ADDED

```json
{
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}
```
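
These are the settings that `Wav2Vec2FeatureExtractor.from_pretrained("waiv/FinVoc2Vec")` reads; a quick sanity check:

```python
from transformers import Wav2Vec2FeatureExtractor

# the values below are loaded from preprocessor_config.json
fe = Wav2Vec2FeatureExtractor.from_pretrained("waiv/FinVoc2Vec")
print(fe.sampling_rate)          # 16000
print(fe.do_normalize)           # True
print(fe.return_attention_mask)  # True
```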
requirements.txt
ADDED

```
datasets==2.20.0
numba==0.59.1
numpy==1.26.2
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvcc-cu12==12.2.140
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==8.9.2.26
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.2.140
nvidia-nvtx-cu12==12.1.105
pandas==2.1.4
torch==2.3.1
torch-audiomentations==0.11.0
torch-pitch-shift==1.2.4
torch-tb-profiler==0.4.3
torchaudio==2.3.1
torchinfo==1.8.0
torchmetrics==1.2.1
torchsummary==1.5.1
torchvision==0.18.1
transformers==4.42.4
tqdm==4.66.4
```