File size: 2,473 Bytes
102da0f
bb52bb9
102da0f
bb52bb9
 
 
102da0f
 
 
 
 
 
 
bb52bb9
102da0f
bb52bb9
 
 
 
 
6fbf399
102da0f
 
eac28d8
102da0f
 
 
 
 
 
bb52bb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102da0f
 
bb52bb9
 
 
 
102da0f
 
bb52bb9
 
102da0f
 
bb52bb9
102da0f
 
bb52bb9
102da0f
 
bb52bb9
102da0f
 
 
 
 
 
bb52bb9
102da0f
 
bb52bb9
 
 
 
102da0f
 
bb52bb9
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# ################################
# Model: wav2vec2 + DNN + CTC
# Augmentation: SpecAugment
# Authors: 
# Sung-Lin Yeh 2021
# Pooneh Mousavi 2023
# ################################

# BPE parameters
# Tokenizer settings. Neither key is referenced by another entry in this file as shown.
token_type: unigram  # ["unigram", "bpe", "char"]
character_coverage: 1.0

# Model parameters
# activation: !name:torch.nn.LeakyReLU
# dnn_neurons: width of every Linear layer in `enc` and the input size of `ctc_lin`.
dnn_neurons: 1024
# wav2vec_output_dim: last dimension of the `input_shape` given to `enc`.
wav2vec_output_dim: 1024
# dropout: probability shared by both Dropout layers in `enc`.
dropout: 0.15

# Not referenced by another entry in this file as shown.
sample_rate: 16000

# Passed to the `wav2vec2` entry as its `source`.
wav2vec2_hub: facebook/wav2vec2-large-it-voxpopuli

# Outputs
output_neurons: 1000  # BPE vocabulary size; sets the output width of `ctc_lin`

# Decoding parameters
# Make sure the blank, bos and eos indices match the ones used by the BPE tokenizer.
# Only blank_index is referenced elsewhere in this file as shown
# (by `ctc_cost` and `decoding_function`).
blank_index: 0
bos_index: 1
eos_index: 2

# DNN head fed with the wav2vec2 output: three Linear -> BatchNorm1d -> LeakyReLU
# stages, with Dropout after the first two stages only.
enc: !new:speechbrain.nnet.containers.Sequential
  # Only the last dimension is fixed; the first two are left unspecified (null).
  input_shape:
    - null
    - null
    - !ref <wav2vec_output_dim>

  # Stage 1
  linear1: !name:speechbrain.nnet.linear.Linear
    n_neurons: !ref <dnn_neurons>
    bias: True
  bn1: !name:speechbrain.nnet.normalization.BatchNorm1d
  activation: !new:torch.nn.LeakyReLU
  drop: !new:torch.nn.Dropout
    p: !ref <dropout>

  # Stage 2
  linear2: !name:speechbrain.nnet.linear.Linear
    n_neurons: !ref <dnn_neurons>
    bias: True
  bn2: !name:speechbrain.nnet.normalization.BatchNorm1d
  activation2: !new:torch.nn.LeakyReLU
  drop2: !new:torch.nn.Dropout
    p: !ref <dropout>

  # Stage 3 (no dropout after the final activation)
  linear3: !name:speechbrain.nnet.linear.Linear
    n_neurons: !ref <dnn_neurons>
    bias: True
  bn3: !name:speechbrain.nnet.normalization.BatchNorm1d
  activation3: !new:torch.nn.LeakyReLU

# wav2vec2 wrapper built from the hub id in `wav2vec2_hub`. It is the first stage
# of `encoder` and is also listed in the pretrainer's loadables.
wav2vec2: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
  source: !ref <wav2vec2_hub>
  output_norm: True
  # NOTE(review): presumably keeps the wav2vec2 weights fixed — confirm against the class docs.
  freeze: True
  # NOTE(review): presumably the local directory for the downloaded model — confirm.
  save_path: wav2vec2_checkpoint

# Output projection from the DNN width (dnn_neurons) to output_neurons.
ctc_lin: !new:speechbrain.nnet.linear.Linear
  input_size: !ref <dnn_neurons>
  n_neurons: !ref <output_neurons>

# Softmax configured with apply_log: True.
# Not referenced by `encoder` or `modules` in this file as shown.
log_softmax: !new:speechbrain.nnet.activations.Softmax
  apply_log: True

# CTC loss declared with !name and blank_index pre-set from the value above.
# Not referenced by another entry in this file as shown.
ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
  blank_index: !ref <blank_index>

# The DNN head and the CTC projection grouped into a single ModuleList; this is
# the object the pretrainer registers under its `asr` key.
asr_model: !new:torch.nn.ModuleList
  - [!ref <enc>, !ref <ctc_lin>]

# SentencePiece processor created with no arguments here; it is listed in the
# pretrainer's loadables under `tokenizer`.
tokenizer: !new:sentencepiece.SentencePieceProcessor

# Full inference chain, declared in the order wav2vec2 -> enc -> ctc_lin.
# NOTE(review): log_softmax is not part of this chain, so the chain ends at the
# ctc_lin output — confirm that is what the decoding function is meant to consume.
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
  wav2vec2: !ref <wav2vec2>
  enc: !ref <enc>
  ctc_lin: !ref <ctc_lin>

# Module table of this config; only the combined `encoder` chain is listed.
modules:
  encoder: !ref <encoder>

# Greedy CTC decoder declared with !name, with blank_id pre-set to blank_index.
decoding_function: !name:speechbrain.decoders.ctc_greedy_decode
  blank_id: !ref <blank_index>

# Pretrainer configuration: each key under `loadables` is the name an object is
# registered under, and each value is the object defined earlier in this file.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    wav2vec2: !ref <wav2vec2>
    asr: !ref <asr_model>
    tokenizer: !ref <tokenizer>