vitouphy committed on
Commit
75e6bf2
·
1 Parent(s): 4c2d6b6

add some files

Browse files
Files changed (6) hide show
  1. Untitled.ipynb +0 -0
  2. eval.py +137 -0
  3. eval.sh +6 -0
  4. inference.ipynb +479 -0
  5. train_tr.ipynb +0 -0
  6. vocab.json +1 -1
Untitled.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
eval.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import re
4
+ from typing import Dict
5
+
6
+ import torch
7
+ from datasets import Audio, Dataset, load_dataset, load_metric
8
+
9
+ from transformers import AutoFeatureExtractor, pipeline
10
+
11
+
12
def log_results(result: Dataset, args: argparse.Namespace):
    """Compute WER and CER for the predictions, print them and write them to disk.

    The metric computation itself follows the shared evaluation template and is
    deliberately left untouched so that reported scores stay comparable.

    Args:
        result: Dataset holding one ``"prediction"`` and one ``"target"``
            string per row.
        args: Parsed command-line arguments. Reads ``dataset``, ``config``,
            ``split`` and ``log_outputs``.

    Side effects:
        Writes ``<dataset_id>_eval_results.txt`` and, when ``args.log_outputs``
        is truthy, ``log_<dataset_id>_predictions.txt`` and
        ``log_<dataset_id>_targets.txt`` in the current working directory.
    """
    # "org/name" + config + split -> "org_name_config_split" (safe as a file name)
    dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])

    # load metric
    wer = load_metric("wer")
    cer = load_metric("cer")

    # Pull the columns out once; they are reused for both metrics and the logs.
    targets = result["target"]
    predictions = result["prediction"]

    # compute metrics
    wer_result = wer.compute(references=targets, predictions=predictions)
    cer_result = cer.compute(references=targets, predictions=predictions)

    # print & log results
    result_str = f"WER: {wer_result}\nCER: {cer_result}"
    print(result_str)

    # Explicit UTF-8: transcripts are frequently non-ASCII and the platform
    # default encoding is not guaranteed to be able to represent them.
    with open(f"{dataset_id}_eval_results.txt", "w", encoding="utf-8") as f:
        f.write(result_str)

    # `--log_outputs` is a store_true flag, i.e. False (never None) when absent,
    # so test truthiness rather than `is not None`.
    if not args.log_outputs:
        return

    # log all results in text file. Possibly interesting for analysis
    pred_file = f"log_{dataset_id}_predictions.txt"
    target_file = f"log_{dataset_id}_targets.txt"

    with open(pred_file, "w", encoding="utf-8") as p, open(target_file, "w", encoding="utf-8") as t:
        # Plain loop instead of Dataset.map: this is pure side-effect I/O.
        for i, (prediction, target) in enumerate(zip(predictions, targets)):
            p.write(f"{i}\n{prediction}\n")
            t.write(f"{i}\n{target}\n")
48
+
49
+
50
def normalize_text(text: str) -> str:
    """Normalize a reference transcript before it is scored.

    DO ADAPT FOR YOUR USE CASE. The steps are: lower-case the text, strip the
    punctuation characters listed below, then replace newlines and short runs
    of spaces with a single space.

    Args:
        text: Raw target sentence.

    Returns:
        The normalized sentence.
    """
    # IMPORTANT: this should correspond to the chars that were ignored during training.
    # Raw string so that `\-`, `\;` and `\:` reach the regex engine without
    # triggering Python's invalid-escape-sequence warning.
    chars_to_ignore_regex = r'[,?.!\-\;\:"“%‘”�—’…–]'

    text = re.sub(chars_to_ignore_regex, "", text.lower())

    # In addition, we can normalize the target text, e.g. removing new lines characters etc...
    # note that order is important here: longer sequences must be replaced
    # before the shorter ones they contain.
    token_sequences_to_ignore = ["\n\n", "\n", "   ", "  "]

    for sequence in token_sequences_to_ignore:
        text = text.replace(sequence, " ")

    return text
65
+
66
+
67
def main(args):
    """Run speech recognition over a dataset split and report WER/CER.

    Args:
        args: Parsed command-line arguments. Reads ``dataset``, ``config``,
            ``split``, ``model_id``, ``device``, ``chunk_length_s`` and
            ``stride_length_s`` here, and is forwarded to ``log_results``.
            ``args`` itself is not modified.
    """
    # load dataset
    dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)

    # for testing: only process the first ten examples
    # dataset = dataset.select(range(10))

    # load processor
    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
    sampling_rate = feature_extractor.sampling_rate

    # resample audio to the rate the model's feature extractor expects
    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

    # load eval pipeline
    # Resolve the device into a local so the caller's namespace is left untouched:
    # first GPU when CUDA is available, otherwise CPU (-1).
    device = args.device
    if device is None:
        device = 0 if torch.cuda.is_available() else -1
    asr = pipeline("automatic-speech-recognition", model=args.model_id, device=device)

    # map function to decode audio
    def map_to_pred(batch):
        """Transcribe one example and attach the normalized reference text."""
        prediction = asr(
            batch["audio"]["array"],
            chunk_length_s=args.chunk_length_s,
            stride_length_s=args.stride_length_s,
        )

        batch["prediction"] = prediction["text"]
        batch["target"] = normalize_text(batch["sentence"])
        return batch

    # run inference on all examples; only "prediction" and "target" are kept
    result = dataset.map(map_to_pred, remove_columns=dataset.column_names)

    # compute and log_results
    # do not change function below
    log_results(result, args)
102
+
103
+
104
if __name__ == "__main__":
    # Command-line entry point: parse the evaluation options and hand them to main().
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets",
    )
    parser.add_argument(
        "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
    )
    parser.add_argument("--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`")
    # The help texts below previously advertised defaults (5 s / 1 s / CPU) that
    # the code never applied; they now describe what actually happens.
    parser.add_argument(
        "--chunk_length_s",
        type=float,
        default=None,
        help="Chunk length in seconds. Defaults to None, which leaves the pipeline's own default in effect.",
    )
    parser.add_argument(
        "--stride_length_s",
        type=float,
        default=None,
        help="Stride of the audio chunks in seconds. Defaults to None, which leaves the pipeline's own default in effect.",
    )
    parser.add_argument(
        "--log_outputs", action="store_true", help="If defined, write outputs to log file for analysis."
    )
    parser.add_argument(
        "--device",
        type=int,
        default=None,
        help="The device to run the pipeline on. -1 for CPU, 0 for the first GPU and so on. "
        "Defaults to the first GPU if CUDA is available, otherwise CPU.",
    )
    args = parser.parse_args()

    main(args)
eval.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Evaluate the model stored in the current directory with eval.py and write
# the per-example prediction/target logs (--log_outputs).
#
# NOTE(review): the dataset config is `ja`; confirm this matches the language
# the model in this repository was trained on.
set -euo pipefail

./eval.py \
    --model_id ./ \
    --dataset "mozilla-foundation/common_voice_8_0" \
    --config ja \
    --split test \
    --log_outputs
inference.ipynb ADDED
@@ -0,0 +1,479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "3eace62e",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from transformers import AutoModelForCTC, Wav2Vec2Processor\n",
11
+ "from datasets import load_dataset, load_metric, Audio\n",
12
+ "import torch"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 3,
18
+ "id": "47d5c062",
19
+ "metadata": {},
20
+ "outputs": [
21
+ {
22
+ "name": "stderr",
23
+ "output_type": "stream",
24
+ "text": [
25
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
26
+ ]
27
+ }
28
+ ],
29
+ "source": [
30
+ "# model = AutoModelForCTC.from_pretrained(\".\").to('cuda')\n",
31
+ "processor = Wav2Vec2Processor.from_pretrained(\".\")"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": 47,
37
+ "id": "1ffed05d",
38
+ "metadata": {},
39
+ "outputs": [
40
+ {
41
+ "name": "stderr",
42
+ "output_type": "stream",
43
+ "text": [
44
+ "Using custom data configuration default-f6158d05a859ae5c\n",
45
+ "Reusing dataset csv (/workspace/.cache/huggingface/datasets/csv/default-f6158d05a859ae5c/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)\n"
46
+ ]
47
+ }
48
+ ],
49
+ "source": [
50
+ "common_voice_test = load_dataset('csv', data_files='km_kh_male/line_index_test.csv', split = 'train')"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": 48,
56
+ "id": "bb365941",
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "common_voice_test = (common_voice_test\n",
61
+ " .remove_columns([\"Unnamed: 0\", \"drop\"])\n",
62
+ " .rename_column('text', 'sentence'))"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": null,
68
+ "id": "34979efb",
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": [
72
+ "common_voice_test = common_voice_test.cast_column(\"path\", Audio(sampling_rate=16_000)).rename_column('path', 'audio')"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": null,
78
+ "id": "66ac6b14",
79
+ "metadata": {},
80
+ "outputs": [],
81
+ "source": []
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": 17,
86
+ "id": "e135b397",
87
+ "metadata": {},
88
+ "outputs": [
89
+ {
90
+ "name": "stderr",
91
+ "output_type": "stream",
92
+ "text": [
93
+ "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd)\n"
94
+ ]
95
+ }
96
+ ],
97
+ "source": [
98
+ "common_voice_test = load_dataset(\"common_voice\", \"tr\", split=\"test\")"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": 18,
104
+ "id": "9dd4cfd4",
105
+ "metadata": {},
106
+ "outputs": [
107
+ {
108
+ "data": {
109
+ "text/plain": [
110
+ "{'client_id': 'b8fffa3c4745500cd2c5f40a82b65bf1fb2d4c4f8638209a33fe1886fbfffdbd2f93aa43e0bd2026c4643e08aada408165138f75787cee501c4d735aa555a61c',\n",
111
+ " 'path': 'common_voice_tr_17343551.mp3',\n",
112
+ " 'audio': {'path': 'cv-corpus-6.1-2020-12-11/tr/clips/common_voice_tr_17343551.mp3',\n",
113
+ " 'array': array([0. , 0. , 0. , ..., 0.00157976, 0.00167614,\n",
114
+ " 0.00091976], dtype=float32),\n",
115
+ " 'sampling_rate': 48000},\n",
116
+ " 'sentence': 'Aşırı derecede kapalı bir ortamımız var.',\n",
117
+ " 'up_votes': 2,\n",
118
+ " 'down_votes': 0,\n",
119
+ " 'age': 'thirties',\n",
120
+ " 'gender': 'male',\n",
121
+ " 'accent': 'other',\n",
122
+ " 'locale': 'tr',\n",
123
+ " 'segment': \"''\"}"
124
+ ]
125
+ },
126
+ "execution_count": 18,
127
+ "metadata": {},
128
+ "output_type": "execute_result"
129
+ }
130
+ ],
131
+ "source": [
132
+ "common_voice_test[3]"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": 19,
138
+ "id": "f36c3bcd",
139
+ "metadata": {},
140
+ "outputs": [],
141
+ "source": [
142
+ "# remove unnecessary attributes\n",
143
+ "common_voice_test = common_voice_test.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": 20,
149
+ "id": "142cffaa",
150
+ "metadata": {},
151
+ "outputs": [],
152
+ "source": [
153
+ "common_voice_test = common_voice_test.cast_column(\"audio\", Audio(sampling_rate=16_000))"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": 29,
159
+ "id": "b1103455",
160
+ "metadata": {},
161
+ "outputs": [
162
+ {
163
+ "data": {
164
+ "text/plain": [
165
+ "Dataset({\n",
166
+ " features: ['path', 'audio', 'sentence'],\n",
167
+ " num_rows: 1647\n",
168
+ "})"
169
+ ]
170
+ },
171
+ "execution_count": 29,
172
+ "metadata": {},
173
+ "output_type": "execute_result"
174
+ }
175
+ ],
176
+ "source": [
177
+ "common_voice_test"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": 30,
183
+ "id": "e2f9be66",
184
+ "metadata": {},
185
+ "outputs": [
186
+ {
187
+ "data": {
188
+ "text/plain": [
189
+ "'Pek çoğu da Roman toplumundan geliyor.'"
190
+ ]
191
+ },
192
+ "execution_count": 30,
193
+ "metadata": {},
194
+ "output_type": "execute_result"
195
+ }
196
+ ],
197
+ "source": [
198
+ "common_voice_test[0]['sentence']"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": 26,
204
+ "id": "94a0e9c5",
205
+ "metadata": {},
206
+ "outputs": [],
207
+ "source": [
208
+ "import numpy as np"
209
+ ]
210
+ },
211
+ {
212
+ "cell_type": "code",
213
+ "execution_count": 33,
214
+ "id": "c2bcce8a",
215
+ "metadata": {},
216
+ "outputs": [
217
+ {
218
+ "data": {
219
+ "text/plain": [
220
+ "{'path': 'cv-corpus-6.1-2020-12-11/tr/clips/common_voice_tr_17341269.mp3',\n",
221
+ " 'array': array([ 0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 8.288735e-06,\n",
222
+ " -1.994405e-03, -7.770515e-03], dtype=float32),\n",
223
+ " 'sampling_rate': 16000}"
224
+ ]
225
+ },
226
+ "execution_count": 33,
227
+ "metadata": {},
228
+ "output_type": "execute_result"
229
+ }
230
+ ],
231
+ "source": [
232
+ "common_voice_test[0]['audio']"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": 34,
238
+ "id": "47d9dd9c",
239
+ "metadata": {},
240
+ "outputs": [
241
+ {
242
+ "ename": "ValueError",
243
+ "evalue": "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).",
244
+ "output_type": "error",
245
+ "traceback": [
246
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
247
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
248
+ "Input \u001b[0;32mIn [34]\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mprocessor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marray\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcommon_voice_test\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43maudio\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43marray\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msampling_rate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m16000\u001b[39;49m\u001b[43m)\u001b[49m\n",
249
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/models/wav2vec2/processing_wav2vec2.py:138\u001b[0m, in \u001b[0;36mWav2Vec2Processor.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 131\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 132\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 133\u001b[0m \u001b[38;5;124;03m When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's\u001b[39;00m\n\u001b[1;32m 134\u001b[0m \u001b[38;5;124;03m [`~Wav2Vec2FeatureExtractor.__call__`] and returns its output. If used in the context\u001b[39;00m\n\u001b[1;32m 135\u001b[0m \u001b[38;5;124;03m [`~Wav2Vec2Processor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's\u001b[39;00m\n\u001b[1;32m 136\u001b[0m \u001b[38;5;124;03m [`~PreTrainedTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.\u001b[39;00m\n\u001b[1;32m 137\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 138\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcurrent_processor\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
250
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2417\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.__call__\u001b[0;34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[0m\n\u001b[1;32m 2414\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 2416\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m _is_valid_text_input(text):\n\u001b[0;32m-> 2417\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 2418\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext input must of type `str` (single example), `List[str]` (batch or single pretokenized example) \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2419\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mor `List[List[str]]` (batch of pretokenized examples).\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2420\u001b[0m )\n\u001b[1;32m 2422\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m text_pair \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m _is_valid_text_input(text_pair):\n\u001b[1;32m 2423\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 2424\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext input must of type `str` (single example), `List[str]` (batch or single pretokenized example) \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2425\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mor `List[List[str]]` (batch of pretokenized 
examples).\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2426\u001b[0m )\n",
251
+ "\u001b[0;31mValueError\u001b[0m: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples)."
252
+ ]
253
+ }
254
+ ],
255
+ "source": [
256
+ "processor(np.array(common_voice_test[0]['audio'][\"array\"]), sampling_rate=16000)"
257
+ ]
258
+ },
259
+ {
260
+ "cell_type": "code",
261
+ "execution_count": 27,
262
+ "id": "5f0e5342",
263
+ "metadata": {},
264
+ "outputs": [],
265
+ "source": [
266
+ "def prepare_dataset(batch):\n",
267
+ " audio = batch[\"audio\"]\n",
268
+ " \n",
269
+ " # batched output is \"un-batched\"\n",
270
+ " batch[\"input_values\"] = processor(np.array(audio[\"array\"]), sampling_rate=audio[\"sampling_rate\"]).input_values[0]\n",
271
+ " batch[\"input_length\"] = len(batch[\"input_values\"])\n",
272
+ " \n",
273
+ " with processor.as_target_processor():\n",
274
+ " batch[\"labels\"] = processor(batch[\"sentence\"]).input_ids\n",
275
+ " return batch"
276
+ ]
277
+ },
278
+ {
279
+ "cell_type": "code",
280
+ "execution_count": 28,
281
+ "id": "6786ed7e",
282
+ "metadata": {},
283
+ "outputs": [
284
+ {
285
+ "data": {
286
+ "application/vnd.jupyter.widget-view+json": {
287
+ "model_id": "b74fe324f3bd4d98b6366c614fec7991",
288
+ "version_major": 2,
289
+ "version_minor": 0
290
+ },
291
+ "text/plain": [
292
+ "0ex [00:00, ?ex/s]"
293
+ ]
294
+ },
295
+ "metadata": {},
296
+ "output_type": "display_data"
297
+ },
298
+ {
299
+ "ename": "ValueError",
300
+ "evalue": "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).",
301
+ "output_type": "error",
302
+ "traceback": [
303
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
304
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
305
+ "Input \u001b[0;32mIn [28]\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0m common_voice_test \u001b[38;5;241m=\u001b[39m \u001b[43mcommon_voice_test\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprepare_dataset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mremove_columns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcommon_voice_test\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumn_names\u001b[49m\u001b[43m)\u001b[49m\n",
306
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:2107\u001b[0m, in \u001b[0;36mDataset.map\u001b[0;34m(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)\u001b[0m\n\u001b[1;32m 2104\u001b[0m disable_tqdm \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mbool\u001b[39m(logging\u001b[38;5;241m.\u001b[39mget_verbosity() \u001b[38;5;241m==\u001b[39m logging\u001b[38;5;241m.\u001b[39mNOTSET) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m utils\u001b[38;5;241m.\u001b[39mis_progress_bar_enabled()\n\u001b[1;32m 2106\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m num_proc \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m num_proc \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[0;32m-> 2107\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_map_single\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2108\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunction\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfunction\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2109\u001b[0m \u001b[43m \u001b[49m\u001b[43mwith_indices\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwith_indices\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2110\u001b[0m \u001b[43m \u001b[49m\u001b[43mwith_rank\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwith_rank\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2111\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_columns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_columns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2112\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mbatched\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbatched\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2113\u001b[0m \u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbatch_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2114\u001b[0m \u001b[43m \u001b[49m\u001b[43mdrop_last_batch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdrop_last_batch\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2115\u001b[0m \u001b[43m \u001b[49m\u001b[43mremove_columns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mremove_columns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2116\u001b[0m \u001b[43m \u001b[49m\u001b[43mkeep_in_memory\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkeep_in_memory\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2117\u001b[0m \u001b[43m \u001b[49m\u001b[43mload_from_cache_file\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mload_from_cache_file\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2118\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_file_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_file_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2119\u001b[0m \u001b[43m \u001b[49m\u001b[43mwriter_batch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwriter_batch_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2120\u001b[0m \u001b[43m \u001b[49m\u001b[43mfeatures\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfeatures\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2121\u001b[0m \u001b[43m \u001b[49m\u001b[43mdisable_nullable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdisable_nullable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2122\u001b[0m \u001b[43m \u001b[49m\u001b[43mfn_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfn_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2123\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mnew_fingerprint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnew_fingerprint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2124\u001b[0m \u001b[43m \u001b[49m\u001b[43mdisable_tqdm\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdisable_tqdm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2125\u001b[0m \u001b[43m \u001b[49m\u001b[43mdesc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdesc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2126\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2127\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 2129\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mformat_cache_file_name\u001b[39m(cache_file_name, rank):\n",
307
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:519\u001b[0m, in \u001b[0;36mtransmit_tasks.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 517\u001b[0m \u001b[38;5;28mself\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDataset\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mself\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 518\u001b[0m \u001b[38;5;66;03m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 519\u001b[0m out: Union[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDataset\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDatasetDict\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 520\u001b[0m datasets: List[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDataset\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(out\u001b[38;5;241m.\u001b[39mvalues()) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(out, \u001b[38;5;28mdict\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m [out]\n\u001b[1;32m 521\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m dataset \u001b[38;5;129;01min\u001b[39;00m datasets:\n\u001b[1;32m 522\u001b[0m \u001b[38;5;66;03m# Remove task templates if a column mapping of the template is no longer valid\u001b[39;00m\n",
308
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:486\u001b[0m, in \u001b[0;36mtransmit_format.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 479\u001b[0m self_format \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 480\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_format_type,\n\u001b[1;32m 481\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mformat_kwargs\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_format_kwargs,\n\u001b[1;32m 482\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcolumns\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_format_columns,\n\u001b[1;32m 483\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput_all_columns\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_output_all_columns,\n\u001b[1;32m 484\u001b[0m }\n\u001b[1;32m 485\u001b[0m \u001b[38;5;66;03m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 486\u001b[0m out: Union[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDataset\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDatasetDict\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 487\u001b[0m datasets: List[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDataset\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(out\u001b[38;5;241m.\u001b[39mvalues()) \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;28misinstance\u001b[39m(out, \u001b[38;5;28mdict\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m [out]\n\u001b[1;32m 488\u001b[0m \u001b[38;5;66;03m# re-apply format to the output\u001b[39;00m\n",
309
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/fingerprint.py:413\u001b[0m, in \u001b[0;36mfingerprint_transform.<locals>._fingerprint.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 407\u001b[0m kwargs[fingerprint_name] \u001b[38;5;241m=\u001b[39m update_fingerprint(\n\u001b[1;32m 408\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fingerprint, transform, kwargs_for_fingerprint\n\u001b[1;32m 409\u001b[0m )\n\u001b[1;32m 411\u001b[0m \u001b[38;5;66;03m# Call actual function\u001b[39;00m\n\u001b[0;32m--> 413\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 415\u001b[0m \u001b[38;5;66;03m# Update fingerprint of in-place transforms + update in-place history of transforms\u001b[39;00m\n\u001b[1;32m 417\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inplace: \u001b[38;5;66;03m# update after calling func so that the fingerprint doesn't change if the function fails\u001b[39;00m\n",
310
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:2465\u001b[0m, in \u001b[0;36mDataset._map_single\u001b[0;34m(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset, disable_tqdm, desc, cache_only)\u001b[0m\n\u001b[1;32m 2463\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m batched:\n\u001b[1;32m 2464\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, example \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(pbar):\n\u001b[0;32m-> 2465\u001b[0m example \u001b[38;5;241m=\u001b[39m \u001b[43mapply_function_on_filtered_inputs\u001b[49m\u001b[43m(\u001b[49m\u001b[43mexample\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moffset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moffset\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2466\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m update_data:\n\u001b[1;32m 2467\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m i \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
311
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:2372\u001b[0m, in \u001b[0;36mDataset._map_single.<locals>.apply_function_on_filtered_inputs\u001b[0;34m(inputs, indices, check_same_num_examples, offset)\u001b[0m\n\u001b[1;32m 2370\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m with_rank:\n\u001b[1;32m 2371\u001b[0m additional_args \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m (rank,)\n\u001b[0;32m-> 2372\u001b[0m processed_inputs \u001b[38;5;241m=\u001b[39m \u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfn_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43madditional_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfn_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2373\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m update_data \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 2374\u001b[0m \u001b[38;5;66;03m# Check if the function returns updated examples\u001b[39;00m\n\u001b[1;32m 2375\u001b[0m update_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28misinstance\u001b[39m(processed_inputs, (Mapping, pa\u001b[38;5;241m.\u001b[39mTable))\n",
312
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:2067\u001b[0m, in \u001b[0;36mDataset.map.<locals>.decorate.<locals>.decorated\u001b[0;34m(item, *args, **kwargs)\u001b[0m\n\u001b[1;32m 2063\u001b[0m decorated_item \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 2064\u001b[0m Example(item, features\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfeatures) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m batched \u001b[38;5;28;01melse\u001b[39;00m Batch(item, features\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfeatures)\n\u001b[1;32m 2065\u001b[0m )\n\u001b[1;32m 2066\u001b[0m \u001b[38;5;66;03m# Use the LazyDict internally, while mapping the function\u001b[39;00m\n\u001b[0;32m-> 2067\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdecorated_item\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2068\u001b[0m \u001b[38;5;66;03m# Return a standard dict\u001b[39;00m\n\u001b[1;32m 2069\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\u001b[38;5;241m.\u001b[39mdata \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, LazyDict) \u001b[38;5;28;01melse\u001b[39;00m result\n",
313
+ "Input \u001b[0;32mIn [27]\u001b[0m, in \u001b[0;36mprepare_dataset\u001b[0;34m(batch)\u001b[0m\n\u001b[1;32m 2\u001b[0m audio \u001b[38;5;241m=\u001b[39m batch[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maudio\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# batched output is \"un-batched\"\u001b[39;00m\n\u001b[0;32m----> 5\u001b[0m batch[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput_values\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mprocessor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marray\u001b[49m\u001b[43m(\u001b[49m\u001b[43maudio\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43marray\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msampling_rate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maudio\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msampling_rate\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39minput_values[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 6\u001b[0m batch[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput_length\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(batch[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput_values\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m processor\u001b[38;5;241m.\u001b[39mas_target_processor():\n",
314
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/models/wav2vec2/processing_wav2vec2.py:138\u001b[0m, in \u001b[0;36mWav2Vec2Processor.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 131\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 132\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 133\u001b[0m \u001b[38;5;124;03m When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's\u001b[39;00m\n\u001b[1;32m 134\u001b[0m \u001b[38;5;124;03m [`~Wav2Vec2FeatureExtractor.__call__`] and returns its output. If used in the context\u001b[39;00m\n\u001b[1;32m 135\u001b[0m \u001b[38;5;124;03m [`~Wav2Vec2Processor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's\u001b[39;00m\n\u001b[1;32m 136\u001b[0m \u001b[38;5;124;03m [`~PreTrainedTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.\u001b[39;00m\n\u001b[1;32m 137\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 138\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcurrent_processor\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
315
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2417\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.__call__\u001b[0;34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[0m\n\u001b[1;32m 2414\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 2416\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m _is_valid_text_input(text):\n\u001b[0;32m-> 2417\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 2418\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext input must of type `str` (single example), `List[str]` (batch or single pretokenized example) \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2419\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mor `List[List[str]]` (batch of pretokenized examples).\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2420\u001b[0m )\n\u001b[1;32m 2422\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m text_pair \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m _is_valid_text_input(text_pair):\n\u001b[1;32m 2423\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 2424\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext input must of type `str` (single example), `List[str]` (batch or single pretokenized example) \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2425\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mor `List[List[str]]` (batch of pretokenized 
examples).\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2426\u001b[0m )\n",
316
+ "\u001b[0;31mValueError\u001b[0m: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples)."
317
+ ]
318
+ }
319
+ ],
320
+ "source": [
321
+ "common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names)"
322
+ ]
323
+ },
324
+ {
325
+ "cell_type": "code",
326
+ "execution_count": 24,
327
+ "id": "81506c80",
328
+ "metadata": {},
329
+ "outputs": [
330
+ {
331
+ "data": {
332
+ "text/plain": [
333
+ "{'path': 'common_voice_tr_17341269.mp3',\n",
334
+ " 'audio': {'path': 'cv-corpus-6.1-2020-12-11/tr/clips/common_voice_tr_17341269.mp3',\n",
335
+ " 'array': array([ 0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 8.288735e-06,\n",
336
+ " -1.994405e-03, -7.770515e-03], dtype=float32),\n",
337
+ " 'sampling_rate': 16000},\n",
338
+ " 'sentence': 'Pek çoğu da Roman toplumundan geliyor.'}"
339
+ ]
340
+ },
341
+ "execution_count": 24,
342
+ "metadata": {},
343
+ "output_type": "execute_result"
344
+ }
345
+ ],
346
+ "source": [
347
+ "common_voice_test[0]"
348
+ ]
349
+ },
350
+ {
351
+ "cell_type": "code",
352
+ "execution_count": null,
353
+ "id": "603ecd46",
354
+ "metadata": {},
355
+ "outputs": [],
356
+ "source": []
357
+ },
358
+ {
359
+ "cell_type": "code",
360
+ "execution_count": 12,
361
+ "id": "760f0031",
362
+ "metadata": {},
363
+ "outputs": [],
364
+ "source": [
365
+ "i = 20"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "code",
370
+ "execution_count": 14,
371
+ "id": "e0355fac",
372
+ "metadata": {},
373
+ "outputs": [
374
+ {
375
+ "ename": "KeyError",
376
+ "evalue": "'input_values'",
377
+ "output_type": "error",
378
+ "traceback": [
379
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
380
+ "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
381
+ "Input \u001b[0;32mIn [14]\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0m input_dict \u001b[38;5;241m=\u001b[39m processor(\u001b[43mcommon_voice_test\u001b[49m\u001b[43m[\u001b[49m\u001b[43mi\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43minput_values\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m, return_tensors\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpt\u001b[39m\u001b[38;5;124m\"\u001b[39m, padding\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
382
+ "\u001b[0;31mKeyError\u001b[0m: 'input_values'"
383
+ ]
384
+ }
385
+ ],
386
+ "source": [
387
+ "input_dict = processor(common_voice_test[i][\"input_values\"], return_tensors=\"pt\", padding=True)"
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": 13,
393
+ "id": "c0b3603c",
394
+ "metadata": {},
395
+ "outputs": [
396
+ {
397
+ "ename": "KeyError",
398
+ "evalue": "'input_values'",
399
+ "output_type": "error",
400
+ "traceback": [
401
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
402
+ "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
403
+ "Input \u001b[0;32mIn [13]\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0m input_dict \u001b[38;5;241m=\u001b[39m processor(\u001b[43mcommon_voice_test\u001b[49m\u001b[43m[\u001b[49m\u001b[43mi\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43minput_values\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m, return_tensors\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpt\u001b[39m\u001b[38;5;124m\"\u001b[39m, padding\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 2\u001b[0m logits \u001b[38;5;241m=\u001b[39m model(input_dict\u001b[38;5;241m.\u001b[39minput_values\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcuda\u001b[39m\u001b[38;5;124m\"\u001b[39m))\u001b[38;5;241m.\u001b[39mlogits\n\u001b[1;32m 3\u001b[0m pred_ids \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39margmax(logits, dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)[\u001b[38;5;241m0\u001b[39m]\n",
404
+ "\u001b[0;31mKeyError\u001b[0m: 'input_values'"
405
+ ]
406
+ }
407
+ ],
408
+ "source": [
409
+ "input_dict = processor(common_voice_test[i][\"input_values\"], return_tensors=\"pt\", padding=True)\n",
410
+ "logits = model(input_dict.input_values.to(\"cuda\")).logits\n",
411
+ "pred_ids = torch.argmax(logits, dim=-1)[0]"
412
+ ]
413
+ },
414
+ {
415
+ "cell_type": "code",
416
+ "execution_count": 15,
417
+ "id": "23db2fe7",
418
+ "metadata": {},
419
+ "outputs": [
420
+ {
421
+ "name": "stdout",
422
+ "output_type": "stream",
423
+ "text": [
424
+ "Prediction:\n",
425
+ "ş\n",
426
+ "\n",
427
+ "Reference:\n",
428
+ "Yine de her iki grup farklı sorunlar çıkarıyor.\n"
429
+ ]
430
+ }
431
+ ],
432
+ "source": [
433
+ "print(\"Prediction:\")\n",
434
+ "print(processor.decode(pred_ids))\n",
435
+ "\n",
436
+ "print(\"\\nReference:\")\n",
437
+ "print(processor.decode(common_voice_test['labels'][i]))\n",
438
+ "# print(common_voice_test_transcription[0][\"sentence\"].lower())"
439
+ ]
440
+ },
441
+ {
442
+ "cell_type": "code",
443
+ "execution_count": null,
444
+ "id": "4da2cb6c",
445
+ "metadata": {},
446
+ "outputs": [],
447
+ "source": []
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "execution_count": null,
452
+ "id": "0f5325dd",
453
+ "metadata": {},
454
+ "outputs": [],
455
+ "source": []
456
+ }
457
+ ],
458
+ "metadata": {
459
+ "kernelspec": {
460
+ "display_name": "Python 3 (ipykernel)",
461
+ "language": "python",
462
+ "name": "python3"
463
+ },
464
+ "language_info": {
465
+ "codemirror_mode": {
466
+ "name": "ipython",
467
+ "version": 3
468
+ },
469
+ "file_extension": ".py",
470
+ "mimetype": "text/x-python",
471
+ "name": "python",
472
+ "nbconvert_exporter": "python",
473
+ "pygments_lexer": "ipython3",
474
+ "version": "3.8.8"
475
+ }
476
+ },
477
+ "nbformat": 4,
478
+ "nbformat_minor": 5
479
+ }
train_tr.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json CHANGED
@@ -1 +1 @@
1
- {"\u1780": 1, "\u1781": 2, "\u1782": 3, "\u1783": 4, "\u1784": 5, "\u1785": 6, "\u1786": 7, "\u1787": 8, "\u1788": 9, "\u1789": 10, "\u178a": 11, "\u178b": 12, "\u178c": 13, "\u178d": 14, "\u178e": 15, "\u178f": 16, "\u1790": 17, "\u1791": 18, "\u1792": 19, "\u1793": 20, "\u1794": 21, "\u1795": 22, "\u1796": 23, "\u1797": 24, "\u1798": 25, "\u1799": 26, "\u179a": 27, "\u179b": 28, "\u179c": 29, "\u179f": 30, "\u17a0": 31, "\u17a1": 32, "\u17a2": 33, "\u17a5": 34, "\u17a7": 35, "\u17aa": 36, "\u17ab": 37, "\u17ac": 38, "\u17ad": 39, "\u17ae": 40, "\u17af": 41, "\u17b1": 42, "\u17b6": 43, "\u17b7": 44, "\u17b8": 45, "\u17b9": 46, "\u17ba": 47, "\u17bb": 48, "\u17bc": 49, "\u17bd": 50, "\u17be": 51, "\u17bf": 52, "\u17c0": 53, "\u17c1": 54, "\u17c2": 55, "\u17c3": 56, "\u17c4": 57, "\u17c5": 58, "\u17c6": 59, "\u17c7": 60, "\u17c8": 61, "\u17c9": 62, "\u17ca": 63, "\u17cb": 64, "\u17cc": 65, "\u17cd": 66, "\u17ce": 67, "\u17cf": 68, "\u17d0": 69, "\u17d2": 70, "|": 0, "[UNK]": 72, "[PAD]": 73}
 
1
+ {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "H": 8, "I": 9, "J": 10, "K": 11, "L": 12, "M": 13, "N": 14, "O": 15, "P": 16, "Q": 17, "R": 18, "S": 19, "T": 20, "U": 21, "V": 22, "W": 23, "X": 24, "Y": 25, "Z": 26, "a": 27, "b": 28, "c": 29, "d": 30, "e": 31, "f": 32, "g": 33, "h": 34, "i": 35, "j": 36, "k": 37, "l": 38, "m": 39, "n": 40, "o": 41, "p": 42, "r": 43, "s": 44, "t": 45, "u": 46, "v": 47, "w": 48, "x": 49, "y": 50, "z": 51, "\u00c7": 52, "\u00d6": 53, "\u00dc": 54, "\u00e2": 55, "\u00e7": 56, "\u00eb": 57, "\u00ee": 58, "\u00f6": 59, "\u00fc": 60, "\u011f": 61, "\u0130": 62, "\u0131": 63, "\u015e": 64, "\u015f": 65, "|": 0, "[UNK]": 67, "[PAD]": 68}