aniltrkkn committed on
Commit
c8cdc96
·
1 Parent(s): 98e2fbd

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +27 -26
README.md CHANGED
@@ -1,3 +1,4 @@
 
1
  language: tr
2
  datasets:
3
  - common_voice
@@ -51,15 +52,15 @@ resampler = torchaudio.transforms.Resample(48_000, 16_000)
51
  # Preprocessing the datasets.
52
  # We need to read the audio files as arrays
53
  def speech_file_to_array_fn(batch):
54
- speech_array, sampling_rate = torchaudio.load(batch["path"])
55
- batch["speech"] = resampler(speech_array).squeeze().numpy()
56
- return batch
57
 
58
  test_dataset = test_dataset.map(speech_file_to_array_fn)
59
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
60
 
61
  with torch.no_grad():
62
- logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
63
 
64
  predicted_ids = torch.argmax(logits, dim=-1)
65
 
@@ -87,30 +88,30 @@ processor = Wav2Vec2Processor.from_pretrained("aniltrkkn/wav2vec2-large-xlsr-53-
87
  model = Wav2Vec2ForCTC.from_pretrained("aniltrkkn/wav2vec2-large-xlsr-53-turkish")
88
  model.to("cuda")
89
 
90
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]'
91
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
92
 
93
  # Preprocessing the datasets.
94
  # We need to read the audio files as arrays
95
  def speech_file_to_array_fn(batch):
96
- batch["sentence"] = str(unicode_tr(re.sub(chars_to_ignore_regex, "", batch["sentence"])).lower())
97
- speech_array, sampling_rate = torchaudio.load(batch["path"])
98
- batch["speech"] = resampler(speech_array).squeeze().numpy()
99
- return batch
100
 
101
  test_dataset = test_dataset.map(speech_file_to_array_fn)
102
 
103
  # Preprocessing the datasets.
104
  # We need to read the audio files as arrays
105
  def evaluate(batch):
106
- inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
107
 
108
- with torch.no_grad():
109
- logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
110
 
111
- pred_ids = torch.argmax(logits, dim=-1)
112
- batch["pred_strings"] = processor.batch_decode(pred_ids)
113
- return batch
114
 
115
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
116
 
@@ -123,16 +124,16 @@ print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"],
123
  The unicode_tr package is used to convert sentences to lower case, since the regular lower() does not work well with Turkish.
124
 
125
  Since training data is very limited for Turkish, all of the data is used with a K-Fold (k=5) training approach. The best model out of the 5 training runs is uploaded. Training arguments:
126
- --num_train_epochs="30" \
127
- --per_device_train_batch_size="32" \
128
- --evaluation_strategy="steps" \
129
- --activation_dropout="0.055" \
130
- --attention_dropout="0.094" \
131
- --feat_proj_dropout="0.04" \
132
- --hidden_dropout="0.047" \
133
- --layerdrop="0.041" \
134
- --learning_rate="2.34e-4" \
135
- --mask_time_prob="0.082" \
136
- --warmup_steps="250" \
137
 
138
  All training runs took ~20 hours on a GeForce RTX 3090 graphics card.
 
1
+ ---
2
  language: tr
3
  datasets:
4
  - common_voice
 
52
  # Preprocessing the datasets.
53
  # We need to read the audio files as arrays
54
  def speech_file_to_array_fn(batch):
55
+ \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
56
+ \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
57
+ \treturn batch
58
 
59
  test_dataset = test_dataset.map(speech_file_to_array_fn)
60
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
61
 
62
  with torch.no_grad():
63
+ \tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
64
 
65
  predicted_ids = torch.argmax(logits, dim=-1)
66
 
 
88
  model = Wav2Vec2ForCTC.from_pretrained("aniltrkkn/wav2vec2-large-xlsr-53-turkish")
89
  model.to("cuda")
90
 
91
+ chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“]'
92
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
93
 
94
  # Preprocessing the datasets.
95
  # We need to read the audio files as arrays
96
  def speech_file_to_array_fn(batch):
97
+ \tbatch["sentence"] = str(unicode_tr(re.sub(chars_to_ignore_regex, "", batch["sentence"])).lower())
98
+ \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
99
+ \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
100
+ \treturn batch
101
 
102
  test_dataset = test_dataset.map(speech_file_to_array_fn)
103
 
104
  # Preprocessing the datasets.
105
  # We need to read the audio files as arrays
106
  def evaluate(batch):
107
+ \tinputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
108
 
109
+ \twith torch.no_grad():
110
+ \t\tlogits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
111
 
112
+ \tpred_ids = torch.argmax(logits, dim=-1)
113
+ \tbatch["pred_strings"] = processor.batch_decode(pred_ids)
114
+ \treturn batch
115
 
116
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
117
 
 
124
  The unicode_tr package is used to convert sentences to lower case, since the regular lower() does not work well with Turkish.
125
 
126
  Since training data is very limited for Turkish, all of the data is used with a K-Fold (k=5) training approach. The best model out of the 5 training runs is uploaded. Training arguments:
127
+ --num_train_epochs="30" \\
128
+ --per_device_train_batch_size="32" \\
129
+ --evaluation_strategy="steps" \\
130
+ --activation_dropout="0.055" \\
131
+ --attention_dropout="0.094" \\
132
+ --feat_proj_dropout="0.04" \\
133
+ --hidden_dropout="0.047" \\
134
+ --layerdrop="0.041" \\
135
+ --learning_rate="2.34e-4" \\
136
+ --mask_time_prob="0.082" \\
137
+ --warmup_steps="250" \\
138
 
139
  All training runs took ~20 hours on a GeForce RTX 3090 graphics card.