File size: 21,278 Bytes

ace3439

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "vnrUh3vuDSRN"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "# prepare the train, dev, test dataset for Turkish language\n",
    "tr_duration_df = pd.read_csv('data/tr/clip_durations.tsv', sep='\\t')\n",
    "tr_train_df = pd.read_csv('data/tr/train.tsv', sep='\\t')\n",
    "tr_dev_df = pd.read_csv('data/tr/dev.tsv', sep='\\t')\n",
    "tr_test_df = pd.read_csv('data/tr/test.tsv', sep='\\t')\n",
    "\n",
    "merged_tr_train_df = pd.merge(tr_train_df, tr_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})\n",
    "merged_tr_dev_df = pd.merge(tr_dev_df, tr_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})\n",
    "merged_tr_test_df = pd.merge(tr_test_df, tr_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-2-d0e6b5d0e689>:5: FutureWarning: The default value of regex will change from True to False in a future version.\n",
      "  merged_tr_train_df[\"audio_filepath\"] = merged_tr_train_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n",
      "<ipython-input-2-d0e6b5d0e689>:6: FutureWarning: The default value of regex will change from True to False in a future version.\n",
      "  merged_tr_dev_df[\"audio_filepath\"] = merged_tr_dev_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n",
      "<ipython-input-2-d0e6b5d0e689>:7: FutureWarning: The default value of regex will change from True to False in a future version.\n",
      "  merged_tr_test_df[\"audio_filepath\"] = merged_tr_test_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n"
     ]
    }
   ],
   "source": [
    "merged_tr_train_df['audio_filepath'] = merged_tr_train_df['path'].apply(lambda x: os.path.join('/User/en_tr_titanet_large/data/tr/clips', x))\n",
    "merged_tr_dev_df['audio_filepath'] = merged_tr_dev_df['path'].apply(lambda x: os.path.join('/User/en_tr_titanet_large/data/tr/clips', x))\n",
    "merged_tr_test_df['audio_filepath'] = merged_tr_test_df['path'].apply(lambda x: os.path.join('/User/en_tr_titanet_large/data/tr/clips', x))\n",
    "\n",
    "merged_tr_train_df[\"audio_filepath\"] = merged_tr_train_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n",
    "merged_tr_dev_df[\"audio_filepath\"] = merged_tr_dev_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n",
    "merged_tr_test_df[\"audio_filepath\"] = merged_tr_test_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n",
    "\n",
    "merged_tr_train_df['duration'] = merged_tr_train_df['duration'].apply(lambda x: x / 1000)\n",
    "merged_tr_dev_df['duration'] = merged_tr_dev_df['duration'].apply(lambda x: x / 1000)\n",
    "merged_tr_test_df['duration'] = merged_tr_test_df['duration'].apply(lambda x: x / 1000)\n",
    "\n",
    "merged_tr_train_df = merged_tr_train_df[['audio_filepath', 'duration', 'label']]\n",
    "merged_tr_dev_df = merged_tr_dev_df[['audio_filepath', 'duration', 'label']]\n",
    "merged_tr_test_df = merged_tr_test_df[['audio_filepath', 'duration', 'label']]\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_data = pd.concat([merged_tr_train_df, merged_tr_dev_df, merged_tr_test_df])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_labels = all_data[\"label\"].unique()\n",
    "train_rows = []\n",
    "dev_rows = []\n",
    "test_rows = []\n",
    "for val in unique_labels:\n",
    "    subset = all_data[all_data['label'] == val].sample(frac=1).reset_index(drop=True)  # Shuffle rows for the value\n",
    "    n = len(subset)\n",
    "    \n",
    "    train_end = int(0.8 * n)\n",
    "    dev_end = train_end + int(0.1 * n)\n",
    "    \n",
    "    train_rows.append(subset.iloc[:train_end])\n",
    "    dev_rows.append(subset.iloc[train_end:dev_end])\n",
    "    test_rows.append(subset.iloc[dev_end:])\n",
    "    \n",
    "# Create the train_df first\n",
    "train_df = pd.concat(train_rows, ignore_index=True)\n",
    "dev_df = pd.concat(dev_rows, ignore_index=True)\n",
    "test_df = pd.concat(test_rows, ignore_index=True)\n",
    "test_df = test_df[test_df['label'].isin(train_df['label'].unique())]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.to_json('data/tr/train.json', orient='records', lines=True)\n",
    "dev_df.to_json('data/tr/dev.json', orient='records', lines=True)\n",
    "test_df.to_json('data/tr/test.json', orient='records', lines=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "devices: 1\n",
      "accelerator: cpu\n",
      "max_epochs: 10\n",
      "max_steps: -1\n",
      "num_nodes: 1\n",
      "accumulate_grad_batches: 1\n",
      "enable_checkpointing: false\n",
      "logger: false\n",
      "log_every_n_steps: 1\n",
      "val_check_interval: 1.0\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "GPU available: False, used: False\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "HPU available: False, using: 0 HPUs\n",
      "`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[NeMo I 2023-09-29 17:44:57 exp_manager:381] Experiments will be logged at /v3io/users/User/en_tr_titanet_large/tb/TitaNet-Finetune/2023-09-29_17-44-57\n",
      "[NeMo I 2023-09-29 17:44:57 exp_manager:815] TensorboardLogger has been set up\n",
      "[NeMo I 2023-09-29 17:44:58 collections:301] Filtered duration for loading collection is  0.00 hours.\n",
      "[NeMo I 2023-09-29 17:44:58 collections:302] Dataset loaded with 41559 items, total duration of  41.01 hours.\n",
      "[NeMo I 2023-09-29 17:44:58 collections:304] # 41559 files loaded accounting to # 1328 labels\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[NeMo W 2023-09-29 17:44:58 label_models:187] Total number of 1328 found in all the manifest files.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[NeMo I 2023-09-29 17:44:58 collections:301] Filtered duration for loading collection is  0.00 hours.\n",
      "[NeMo I 2023-09-29 17:44:58 collections:302] Dataset loaded with 41559 items, total duration of  41.01 hours.\n",
      "[NeMo I 2023-09-29 17:44:58 collections:304] # 41559 files loaded accounting to # 1328 labels\n",
      "[NeMo I 2023-09-29 17:44:59 collections:301] Filtered duration for loading collection is  0.00 hours.\n",
      "[NeMo I 2023-09-29 17:44:59 collections:302] Dataset loaded with 4651 items, total duration of  4.47 hours.\n",
      "[NeMo I 2023-09-29 17:44:59 collections:304] # 4651 files loaded accounting to # 482 labels\n",
      "[NeMo I 2023-09-29 17:44:59 collections:301] Filtered duration for loading collection is  0.00 hours.\n",
      "[NeMo I 2023-09-29 17:44:59 collections:302] Dataset loaded with 6198 items, total duration of  6.29 hours.\n",
      "[NeMo I 2023-09-29 17:44:59 collections:304] # 6198 files loaded accounting to # 1328 labels\n",
      "[NeMo I 2023-09-29 17:44:59 features:289] PADDING: 16\n",
      "[NeMo I 2023-09-29 17:44:59 cloud:58] Found existing object /User/.cache/torch/NeMo/NeMo_1.21.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.\n",
      "[NeMo I 2023-09-29 17:44:59 cloud:64] Re-using file from: /User/.cache/torch/NeMo/NeMo_1.21.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo\n",
      "[NeMo I 2023-09-29 17:44:59 common:913] Instantiating model from pre-trained checkpoint\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[NeMo W 2023-09-29 17:45:00 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n",
      "    Train config : \n",
      "    manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json\n",
      "    sample_rate: 16000\n",
      "    labels: null\n",
      "    batch_size: 64\n",
      "    shuffle: true\n",
      "    is_tarred: false\n",
      "    tarred_audio_filepaths: null\n",
      "    tarred_shard_strategy: scatter\n",
      "    augmentor:\n",
      "      noise:\n",
      "        manifest_path: /manifests/noise/rir_noise_manifest.json\n",
      "        prob: 0.5\n",
      "        min_snr_db: 0\n",
      "        max_snr_db: 15\n",
      "      speed:\n",
      "        prob: 0.5\n",
      "        sr: 16000\n",
      "        resample_type: kaiser_fast\n",
      "        min_speed_rate: 0.95\n",
      "        max_speed_rate: 1.05\n",
      "    num_workers: 15\n",
      "    pin_memory: true\n",
      "    \n",
      "[NeMo W 2023-09-29 17:45:00 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n",
      "    Validation config : \n",
      "    manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/dev.json\n",
      "    sample_rate: 16000\n",
      "    labels: null\n",
      "    batch_size: 128\n",
      "    shuffle: false\n",
      "    num_workers: 15\n",
      "    pin_memory: true\n",
      "    \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[NeMo I 2023-09-29 17:45:00 features:289] PADDING: 16\n",
      "[NeMo I 2023-09-29 17:45:00 save_restore_connector:249] Model EncDecSpeakerLabelModel was successfully restored from /User/.cache/torch/NeMo/NeMo_1.21.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.\n",
      "[NeMo I 2023-09-29 17:45:00 modelPT:1151] Model checkpoint partially restored from pretrained checkpoint with name `titanet_large`\n",
      "[NeMo I 2023-09-29 17:45:00 modelPT:1153] The following parameters were excluded when loading from pretrained checkpoint with name `titanet_large` : ['decoder.final.weight']\n",
      "[NeMo I 2023-09-29 17:45:00 modelPT:1156] Make sure that this is what you wanted!\n",
      "[NeMo I 2023-09-29 17:45:01 modelPT:735] Optimizer config = AdamW (\n",
      "    Parameter Group 0\n",
      "        amsgrad: False\n",
      "        betas: (0.9, 0.999)\n",
      "        capturable: False\n",
      "        eps: 1e-08\n",
      "        foreach: None\n",
      "        lr: 0.0001\n",
      "        maximize: False\n",
      "        weight_decay: 0.0002\n",
      "    \n",
      "    Parameter Group 1\n",
      "        amsgrad: False\n",
      "        betas: (0.9, 0.999)\n",
      "        capturable: False\n",
      "        eps: 1e-08\n",
      "        foreach: None\n",
      "        lr: 0.001\n",
      "        maximize: False\n",
      "        weight_decay: 0.0002\n",
      "    )\n",
      "[NeMo I 2023-09-29 17:45:01 lr_scheduler:910] Scheduler \"<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7fe14b339850>\" \n",
      "    will be used during training (effective maximum steps = 41560) - \n",
      "    Parameters : \n",
      "    (warmup_ratio: 0.1\n",
      "    min_lr: 0.0\n",
      "    max_steps: 41560\n",
      "    )\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "  | Name            | Type                              | Params\n",
      "----------------------------------------------------------------------\n",
      "0 | loss            | AngularSoftmaxLoss                | 0     \n",
      "1 | eval_loss       | AngularSoftmaxLoss                | 0     \n",
      "2 | _accuracy       | TopKClassificationAccuracy        | 0     \n",
      "3 | preprocessor    | AudioToMelSpectrogramPreprocessor | 0     \n",
      "4 | encoder         | ConvASREncoder                    | 19.4 M\n",
      "5 | decoder         | SpeakerDecoder                    | 3.0 M \n",
      "6 | _macro_accuracy | MulticlassAccuracy                | 0     \n",
      "----------------------------------------------------------------------\n",
      "22.4 M    Trainable params\n",
      "0         Non-trainable params\n",
      "22.4 M    Total params\n",
      "89.509    Total estimated model params size (MB)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Sanity Checking: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[NeMo W 2023-09-29 17:45:01 nemo_logging:349] /User/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:438: PossibleUserWarning: The dataloader, val_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 16 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n",
      "      rank_zero_warn(\n",
      "    \n",
      "[NeMo W 2023-09-29 17:45:22 nemo_logging:349] /User/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:438: PossibleUserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 16 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n",
      "      rank_zero_warn(\n",
      "    \n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "45d1cf72025742e884ba3ff4a6b8e7eb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[NeMo W 2023-09-29 17:45:40 nemo_logging:349] /User/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:212: UserWarning: You called `self.log('global_step', ...)` in your `training_step` but the value needs to be floating point. Converting it to torch.float32.\n",
      "      warning_cache.warn(\n",
      "    \n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Validation: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Validation: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Validation: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Validation: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Validation: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Validation: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Validation: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f692ed8064c443afb82ad1e965778fd2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Validation: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Fine-tune the model with Portuguese language\n",
    "\n",
    "import torch\n",
    "import pytorch_lightning as pl\n",
    "import nemo\n",
    "import nemo.collections.asr as nemo_asr\n",
    "from omegaconf import OmegaConf\n",
    "from nemo.utils.exp_manager import exp_manager\n",
    "\n",
    "# Fine-tune the model with Turkish language\n",
    "tr_config = OmegaConf.load(\"conf/titanet-finetune.yaml\")\n",
    "## set up the trainer\n",
    "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n",
    "\n",
    "tr_trainer_config = OmegaConf.create(dict(\n",
    "    devices=1,\n",
    "    accelerator=accelerator,\n",
    "    #num_sanity_val_steps=0,\n",
    "    max_epochs=10,\n",
    "    max_steps=-1,  # computed at runtime if not set\n",
    "    num_nodes=1,\n",
    "    \n",
    "    accumulate_grad_batches=1,\n",
    "    enable_checkpointing=False,  # Provided by exp_manager\n",
    "    logger=False,  # Provided by exp_manager\n",
    "    log_every_n_steps=1,  # Interval of logging.\n",
    "    val_check_interval=1.0,  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations\n",
    "))\n",
    "print(OmegaConf.to_yaml(tr_trainer_config))\n",
    "\n",
    "tr_trainer_finetune = pl.Trainer(**tr_trainer_config)\n",
    "\n",
    "\n",
    "#set up the nemo experiment for logging and monitoring purpose\n",
    "log_dir_finetune = exp_manager(tr_trainer_finetune, tr_config.get(\"exp_manager\", None))\n",
    "\n",
    "\n",
    "# set up the manifest file for Turkish language\n",
    "tr_config.model.train_ds.manifest_filepath = 'data/tr/train.json'\n",
    "tr_config.model.validation_ds.manifest_filepath = 'data/tr/dev.json'\n",
    "tr_config.model.test_ds.manifest_filepath = 'data/tr/test.json'\n",
    "tr_config.model.decoder.num_classes = train_df['label'].nunique()\n",
    "\n",
    "\n",
    "# set up the model for Turkish language and train the model\n",
    "speaker_model = nemo_asr.models.EncDecSpeakerLabelModel(cfg=tr_config.model, trainer=tr_trainer_finetune)\n",
    "speaker_model.maybe_init_from_pretrained_checkpoint(tr_config)\n",
    "tr_trainer_finetune.fit(speaker_model)\n",
    "#tr_trainer_finetune.test(speaker_model)\n",
    "\n",
    "# Save the model after fine-tuning with Turkish language\n",
    "\n",
    "speaker_model.save_to('titanet_finetune_tr.nemo')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "Speaker_Recogniton_Verification.ipynb",
   "provenance": [],
   "toc_visible": true
  },
  "kernelspec": {
   "display_name": "transcribe",
   "language": "python",
   "name": "conda-env-.conda-transcribe-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}