"
]
},
- "execution_count": 51,
+ "execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
@@ -13900,8 +14015,8 @@
},
{
"cell_type": "code",
- "execution_count": 52,
- "id": "b10b6aa1",
+ "execution_count": 23,
+ "id": "c0af61ca",
"metadata": {},
"outputs": [],
"source": [
@@ -13922,10 +14037,33 @@
},
{
"cell_type": "code",
- "execution_count": 53,
- "id": "a1776bef",
+ "execution_count": 24,
+ "id": "527e89eb",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-e3ff506f96ec6817.arrow\n",
+ "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-00a0dacd1c387ee8.arrow\n",
+ "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-89839f1a29958c06.arrow\n",
+ "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-ea97d53e6e03248b.arrow\n",
+ "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-74c31e1ede89718b.arrow\n",
+ "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-b4485d5ec10af59a.arrow\n",
+ "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-87741a8a8705e488.arrow\n",
+ "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-2aa5c421e49dbb8a.arrow\n",
+ "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-6fa3756abc090cb1.arrow\n",
+ "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-7082faf01a7536d9.arrow\n",
+ "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-dbf56923bad5550e.arrow\n",
+ "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-cfa541d30ccf3270.arrow\n",
+ "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-9f28af78c8d178d8.arrow\n",
+ "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-4fc740b07e55a01b.arrow\n",
+ "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-ec4bd65c3d0c2b80.arrow\n",
+ "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-033c2e0fab0f0e8a.arrow\n"
+ ]
+ }
+ ],
"source": [
"common_voice_train = common_voice_train.map(prepare_dataset, remove_columns=common_voice_train.column_names, num_proc=16)\n",
"common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names, num_proc=16)"
@@ -13934,7 +14072,7 @@
{
"cell_type": "code",
"execution_count": 41,
- "id": "8eea6563",
+ "id": "b73b00a8",
"metadata": {},
"outputs": [],
"source": [
@@ -13945,8 +14083,8 @@
},
{
"cell_type": "code",
- "execution_count": 54,
- "id": "f15bec0a",
+ "execution_count": 25,
+ "id": "0d157cd6",
"metadata": {},
"outputs": [],
"source": [
@@ -13987,6 +14125,7 @@
" padding=self.padding,\n",
" return_tensors=\"pt\",\n",
" )\n",
+ "\n",
" with self.processor.as_target_processor():\n",
" labels_batch = self.processor.pad(\n",
" label_features,\n",
@@ -14004,8 +14143,8 @@
},
{
"cell_type": "code",
- "execution_count": 55,
- "id": "d30dafe4",
+ "execution_count": 26,
+ "id": "848984c4",
"metadata": {},
"outputs": [],
"source": [
@@ -14014,19 +14153,19 @@
},
{
"cell_type": "code",
- "execution_count": 56,
- "id": "408bcb4d",
+ "execution_count": 27,
+ "id": "1472ec1d",
"metadata": {},
"outputs": [],
"source": [
- "# wer_metric = load_metric(\"wer\")\n",
- "cer_metric = load_metric(\"cer\")"
+ "wer_metric = load_metric(\"wer\")\n",
+ "# cer_metric = load_metric(\"cer\")"
]
},
{
"cell_type": "code",
- "execution_count": 57,
- "id": "e5573fd8",
+ "execution_count": 28,
+ "id": "585247d7",
"metadata": {},
"outputs": [],
"source": [
@@ -14034,48 +14173,30 @@
" pred_logits = pred.predictions\n",
" pred_ids = np.argmax(pred_logits, axis=-1)\n",
"\n",
- " pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id\n",
+ " pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id\n",
"\n",
- " pred_str = processor.batch_decode(pred_ids)\n",
+ " pred_str = tokenizer.batch_decode(pred_ids)\n",
" # we do not want to group tokens when computing the metrics\n",
- " label_str = processor.batch_decode(pred.label_ids, group_tokens=False)\n",
+ " label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)\n",
"\n",
- " # wer = wer_metric.compute(predictions=pred_str, references=label_str)\n",
- " cer = cer_metric.compute(predictions=pred_str, references=label_str)\n",
+ " wer = wer_metric.compute(predictions=pred_str, references=label_str)\n",
+ "# cer = cer_metric.compute(predictions=pred_str, references=label_str)\n",
"\n",
- " # return {\"wer\": wer}\n",
- " return {\"cer\": cer}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "da8c34ad",
- "metadata": {},
- "outputs": [],
- "source": [
- "\n"
+ " return {\"wer\": wer}\n",
+ "# return {\"cer\": cer}"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "229b8ebf",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 58,
- "id": "7b36eee5",
+ "execution_count": 29,
+ "id": "674f8f32",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForCTC: ['project_hid.bias', 'quantizer.weight_proj.weight', 'quantizer.weight_proj.bias', 'project_q.weight', 'project_hid.weight', 'quantizer.codevectors', 'project_q.bias']\n",
+ "Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForCTC: ['quantizer.weight_proj.weight', 'project_q.bias', 'project_hid.bias', 'project_q.weight', 'quantizer.weight_proj.bias', 'quantizer.codevectors', 'project_hid.weight']\n",
"- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.weight', 'lm_head.bias']\n",
@@ -14094,45 +14215,27 @@
" mask_time_prob=0.05,\n",
" layerdrop=0.0,\n",
" ctc_loss_reduction=\"mean\", \n",
- " pad_token_id=processor.tokenizer.pad_token_id,\n",
+ " pad_token_id=tokenizer.pad_token_id,\n",
" vocab_size=len(processor.tokenizer),\n",
")"
]
},
{
"cell_type": "code",
- "execution_count": 59,
- "id": "7ec46eca",
+ "execution_count": 30,
+ "id": "79cab4ae",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/opt/conda/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:1700: FutureWarning: The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5.Please use the equivalent `freeze_feature_encoder` method instead.\n",
- " warnings.warn(\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "model.freeze_feature_extractor()"
+ "model.freeze_feature_encoder()"
]
},
{
"cell_type": "code",
- "execution_count": 65,
- "id": "9db5b382",
+ "execution_count": 31,
+ "id": "d463ab8f",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "PyTorch: setting up devices\n",
- "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"from transformers import TrainingArguments\n",
"\n",
@@ -14144,28 +14247,26 @@
" evaluation_strategy=\"steps\",\n",
" gradient_checkpointing=True,\n",
" fp16=True,\n",
- " num_train_epochs=30,\n",
- " save_steps=1,\n",
- " eval_steps=1,\n",
+ " num_train_epochs=25,\n",
+ " save_steps=500,\n",
+ " eval_steps=500,\n",
" logging_steps=100,\n",
- " learning_rate=3e-5,\n",
- " warmup_steps=500,\n",
- " save_total_limit=3,\n",
- " push_to_hub=True,\n",
+ " learning_rate=5e-5,\n",
+ " warmup_steps=1000,\n",
+ " save_total_limit=3\n",
")"
]
},
{
"cell_type": "code",
- "execution_count": 66,
- "id": "96c9aeb9",
+ "execution_count": 32,
+ "id": "03fbc0fa",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "/workspace/xls-r-300m-km/. is already a clone of https://huggingface.co/vitouphy/xls-r-300m-km. Make sure you pull the latest changes with `repo.git_pull()`.\n",
"Using amp half precision backend\n"
]
}
@@ -14186,8 +14287,8 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "a3a38848",
+ "execution_count": 33,
+ "id": "ddc04b2e",
"metadata": {},
"outputs": [
{
@@ -14198,12 +14299,12 @@
"/opt/conda/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use thePyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
" warnings.warn(\n",
"***** Running training *****\n",
- " Num examples = 2615\n",
- " Num Epochs = 30\n",
+ " Num examples = 3478\n",
+ " Num Epochs = 25\n",
" Instantaneous batch size per device = 8\n",
" Total train batch size (w. parallel, distributed & accumulation) = 16\n",
" Gradient Accumulation steps = 2\n",
- " Total optimization steps = 4890\n"
+ " Total optimization steps = 5425\n"
]
},
{
@@ -14212,8 +14313,8 @@
"\n",
" \n",
" \n",
- "
\n",
- " [ 2/4890 : < :, Epoch 0.01/30]\n",
+ "
\n",
+ " [5425/5425 1:31:08, Epoch 24/25]\n",
"
\n",
" \n",
" \n",
@@ -14221,15 +14322,69 @@
" Step | \n",
" Training Loss | \n",
" Validation Loss | \n",
- " Cer | \n",
+ " Wer | \n",
" \n",
" \n",
" \n",
" \n",
- " 1 | \n",
- " No log | \n",
- " 13.300326 | \n",
- " 0.989265 | \n",
+ " 500 | \n",
+ " 3.885900 | \n",
+ " 3.760785 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " 1000 | \n",
+ " 1.819300 | \n",
+ " 1.530782 | \n",
+ " 1.000613 | \n",
+ "
\n",
+ " \n",
+ " 1500 | \n",
+ " 0.598600 | \n",
+ " 0.729536 | \n",
+ " 1.005616 | \n",
+ "
\n",
+ " \n",
+ " 2000 | \n",
+ " 0.399200 | \n",
+ " 0.618558 | \n",
+ " 1.013377 | \n",
+ "
\n",
+ " \n",
+ " 2500 | \n",
+ " 0.319900 | \n",
+ " 0.597245 | \n",
+ " 1.012254 | \n",
+ "
\n",
+ " \n",
+ " 3000 | \n",
+ " 0.238800 | \n",
+ " 0.555572 | \n",
+ " 1.010109 | \n",
+ "
\n",
+ " \n",
+ " 3500 | \n",
+ " 0.188200 | \n",
+ " 0.517281 | \n",
+ " 1.014092 | \n",
+ "
\n",
+ " \n",
+ " 4000 | \n",
+ " 0.160400 | \n",
+ " 0.517009 | \n",
+ " 1.018278 | \n",
+ "
\n",
+ " \n",
+ " 4500 | \n",
+ " 0.144300 | \n",
+ " 0.526738 | \n",
+ " 1.018380 | \n",
+ "
\n",
+ " \n",
+ " 5000 | \n",
+ " 0.140400 | \n",
+ " 0.536664 | \n",
+ " 1.016747 | \n",
"
\n",
" \n",
"
"
@@ -14247,14 +14402,107 @@
"text": [
"The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
"***** Running Evaluation *****\n",
- " Num examples = 291\n",
+ " Num examples = 1647\n",
+ " Batch size = 8\n",
+ "Saving model checkpoint to ./checkpoint-500\n",
+ "Configuration saved in ./checkpoint-500/config.json\n",
+ "Model weights saved in ./checkpoint-500/pytorch_model.bin\n",
+ "Configuration saved in ./checkpoint-500/preprocessor_config.json\n",
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
+ "***** Running Evaluation *****\n",
+ " Num examples = 1647\n",
+ " Batch size = 8\n",
+ "Saving model checkpoint to ./checkpoint-1000\n",
+ "Configuration saved in ./checkpoint-1000/config.json\n",
+ "Model weights saved in ./checkpoint-1000/pytorch_model.bin\n",
+ "Configuration saved in ./checkpoint-1000/preprocessor_config.json\n",
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
+ "***** Running Evaluation *****\n",
+ " Num examples = 1647\n",
" Batch size = 8\n",
- "Saving model checkpoint to ./checkpoint-1\n",
- "Configuration saved in ./checkpoint-1/config.json\n",
- "Model weights saved in ./checkpoint-1/pytorch_model.bin\n",
- "Configuration saved in ./checkpoint-1/preprocessor_config.json\n",
- "Configuration saved in ./preprocessor_config.json\n"
+ "Saving model checkpoint to ./checkpoint-1500\n",
+ "Configuration saved in ./checkpoint-1500/config.json\n",
+ "Model weights saved in ./checkpoint-1500/pytorch_model.bin\n",
+ "Configuration saved in ./checkpoint-1500/preprocessor_config.json\n",
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
+ "***** Running Evaluation *****\n",
+ " Num examples = 1647\n",
+ " Batch size = 8\n",
+ "Saving model checkpoint to ./checkpoint-2000\n",
+ "Configuration saved in ./checkpoint-2000/config.json\n",
+ "Model weights saved in ./checkpoint-2000/pytorch_model.bin\n",
+ "Configuration saved in ./checkpoint-2000/preprocessor_config.json\n",
+ "Deleting older checkpoint [checkpoint-500] due to args.save_total_limit\n",
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
+ "***** Running Evaluation *****\n",
+ " Num examples = 1647\n",
+ " Batch size = 8\n",
+ "Saving model checkpoint to ./checkpoint-2500\n",
+ "Configuration saved in ./checkpoint-2500/config.json\n",
+ "Model weights saved in ./checkpoint-2500/pytorch_model.bin\n",
+ "Configuration saved in ./checkpoint-2500/preprocessor_config.json\n",
+ "Deleting older checkpoint [checkpoint-1000] due to args.save_total_limit\n",
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
+ "***** Running Evaluation *****\n",
+ " Num examples = 1647\n",
+ " Batch size = 8\n",
+ "Saving model checkpoint to ./checkpoint-3000\n",
+ "Configuration saved in ./checkpoint-3000/config.json\n",
+ "Model weights saved in ./checkpoint-3000/pytorch_model.bin\n",
+ "Configuration saved in ./checkpoint-3000/preprocessor_config.json\n",
+ "Deleting older checkpoint [checkpoint-1500] due to args.save_total_limit\n",
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
+ "***** Running Evaluation *****\n",
+ " Num examples = 1647\n",
+ " Batch size = 8\n",
+ "Saving model checkpoint to ./checkpoint-3500\n",
+ "Configuration saved in ./checkpoint-3500/config.json\n",
+ "Model weights saved in ./checkpoint-3500/pytorch_model.bin\n",
+ "Configuration saved in ./checkpoint-3500/preprocessor_config.json\n",
+ "Deleting older checkpoint [checkpoint-2000] due to args.save_total_limit\n",
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
+ "***** Running Evaluation *****\n",
+ " Num examples = 1647\n",
+ " Batch size = 8\n",
+ "Saving model checkpoint to ./checkpoint-4000\n",
+ "Configuration saved in ./checkpoint-4000/config.json\n",
+ "Model weights saved in ./checkpoint-4000/pytorch_model.bin\n",
+ "Configuration saved in ./checkpoint-4000/preprocessor_config.json\n",
+ "Deleting older checkpoint [checkpoint-2500] due to args.save_total_limit\n",
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
+ "***** Running Evaluation *****\n",
+ " Num examples = 1647\n",
+ " Batch size = 8\n",
+ "Saving model checkpoint to ./checkpoint-4500\n",
+ "Configuration saved in ./checkpoint-4500/config.json\n",
+ "Model weights saved in ./checkpoint-4500/pytorch_model.bin\n",
+ "Configuration saved in ./checkpoint-4500/preprocessor_config.json\n",
+ "Deleting older checkpoint [checkpoint-3000] due to args.save_total_limit\n",
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
+ "***** Running Evaluation *****\n",
+ " Num examples = 1647\n",
+ " Batch size = 8\n",
+ "Saving model checkpoint to ./checkpoint-5000\n",
+ "Configuration saved in ./checkpoint-5000/config.json\n",
+ "Model weights saved in ./checkpoint-5000/pytorch_model.bin\n",
+ "Configuration saved in ./checkpoint-5000/preprocessor_config.json\n",
+ "Deleting older checkpoint [checkpoint-3500] due to args.save_total_limit\n",
+ "\n",
+ "\n",
+ "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
+ "\n",
+ "\n"
]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "TrainOutput(global_step=5425, training_loss=1.241710463449153, metrics={'train_runtime': 5469.9405, 'train_samples_per_second': 15.896, 'train_steps_per_second': 0.992, 'total_flos': 1.0590512839529611e+19, 'train_loss': 1.241710463449153, 'epoch': 25.0})"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
@@ -14264,7 +14512,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "e27620ac",
+ "id": "f0587cc3",
"metadata": {},
"outputs": [],
"source": []