"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ "
\n",
+ " [1500/1500 1:20:33, Epoch 4/5]\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Step | \n",
+ " Training Loss | \n",
+ " Validation Loss | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 100 | \n",
+ " 3.772000 | \n",
+ " 0.420103 | \n",
+ "
\n",
+ " \n",
+ " 200 | \n",
+ " 3.591200 | \n",
+ " 0.405249 | \n",
+ "
\n",
+ " \n",
+ " 300 | \n",
+ " 3.460400 | \n",
+ " 0.394766 | \n",
+ "
\n",
+ " \n",
+ " 400 | \n",
+ " 3.389400 | \n",
+ " 0.390611 | \n",
+ "
\n",
+ " \n",
+ " 500 | \n",
+ " 3.373700 | \n",
+ " 0.386506 | \n",
+ "
\n",
+ " \n",
+ " 600 | \n",
+ " 3.362800 | \n",
+ " 0.385102 | \n",
+ "
\n",
+ " \n",
+ " 700 | \n",
+ " 3.323600 | \n",
+ " 0.382134 | \n",
+ "
\n",
+ " \n",
+ " 800 | \n",
+ " 3.306000 | \n",
+ " 0.381117 | \n",
+ "
\n",
+ " \n",
+ " 900 | \n",
+ " 3.285900 | \n",
+ " 0.379681 | \n",
+ "
\n",
+ " \n",
+ " 1000 | \n",
+ " 3.266300 | \n",
+ " 0.376319 | \n",
+ "
\n",
+ " \n",
+ " 1100 | \n",
+ " 3.236800 | \n",
+ " 0.375682 | \n",
+ "
\n",
+ " \n",
+ " 1200 | \n",
+ " 3.210700 | \n",
+ " 0.374939 | \n",
+ "
\n",
+ " \n",
+ " 1300 | \n",
+ " 3.203500 | \n",
+ " 0.372964 | \n",
+ "
\n",
+ " \n",
+ " 1400 | \n",
+ " 3.196900 | \n",
+ " 0.372788 | \n",
+ "
\n",
+ " \n",
+ " 1500 | \n",
+ " 3.210700 | \n",
+ " 0.371661 | \n",
+ "
\n",
+ " \n",
+ "
"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:2816: UserWarning: Moving the following attributes in the config to the generation config: {'max_length': 1876}. You are seeing this warning because you've set generation parameters in the model config, as opposed to in the generation config.\n",
+ " warnings.warn(\n",
+ "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
+ " with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]\n",
+ "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
+ " with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]\n",
+ "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
+ " with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]\n",
+ "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
+ " with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]\n",
+ "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
+ " with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]\n",
+ "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
+ " with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]\n",
+ "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
+ " with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]\n",
+ "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
+ " with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]\n",
+ "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
+ " with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]\n",
+ "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
+ " with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]\n",
+ "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
+ " with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]\n",
+ "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
+ " with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]\n",
+ "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
+ " with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]\n",
+ "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
+ " with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "TrainOutput(global_step=1500, training_loss=3.39983762105306, metrics={'train_runtime': 4837.6694, 'train_samples_per_second': 9.922, 'train_steps_per_second': 0.31, 'total_flos': 5483826441583776.0, 'train_loss': 3.39983762105306, 'epoch': 4.580152671755725})"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 34
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "trainer.push_to_hub()"
+ ],
+ "metadata": {
+ "id": "T3aPr-chnqM_",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 52
+ },
+ "outputId": "288fef0f-ba56-478a-8dcc-264dd6ddd90a"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "CommitInfo(commit_url='https://huggingface.co/DeepDiveDev/speecht5_finetuned_English/commit/8bbc92b5968125e1ff51b8fea16e90aa40c5f267', commit_message='End of training', commit_description='', oid='8bbc92b5968125e1ff51b8fea16e90aa40c5f267', pr_url=None, pr_revision=None, pr_num=None)"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "string"
+ }
+ },
+ "metadata": {},
+ "execution_count": 35
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Dataset on Technical Terms"
+ ],
+ "metadata": {
+ "id": "yoOXf1BXOdg4"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%pip install -q datasets\n",
+ "import pandas as pd\n",
+ "from datasets import Dataset, load_dataset\n",
+ "\n",
+ "# Load your dataset from Excel file using pd.read_excel\n",
+ "df = pd.read_excel('/content/drive/MyDrive/TTS_Eng/TTS-English.xlsx')\n",
+ "# Convert the pandas DataFrame to a Hugging Face Dataset\n",
+ "dataset = Dataset.from_pandas(df)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "K8YzZRrPf6mz",
+ "outputId": "109e7328-faa4-4274-fef3-ae2c1e628106"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Collecting datasets\n",
+ " Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)\n",
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.16.1)\n",
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.26.4)\n",
+ "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (16.1.0)\n",
+ "Collecting dill<0.3.9,>=0.3.0 (from datasets)\n",
+ " Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n",
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n",
+ "Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.32.3)\n",
+ "Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.5)\n",
+ "Collecting xxhash (from datasets)\n",
+ " Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
+ "Collecting multiprocess<0.70.17 (from datasets)\n",
+ " Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)\n",
+ "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2024.6.1)\n",
+ "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.10.10)\n",
+ "Requirement already satisfied: huggingface-hub>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.24.7)\n",
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.1)\n",
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.2)\n",
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.3)\n",
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n",
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.1)\n",
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n",
+ "Requirement already satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.15.4)\n",
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n",
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.23.0->datasets) (4.12.2)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.4.0)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.10)\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2.2.3)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2024.8.30)\n",
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
+ "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
+ "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from yarl<2.0,>=1.12.0->aiohttp->datasets) (0.2.0)\n",
+ "Downloading datasets-3.0.2-py3-none-any.whl (472 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m472.7/472.7 kB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hInstalling collected packages: xxhash, dill, multiprocess, datasets\n",
+ "Successfully installed datasets-3.0.2 dill-0.3.8 multiprocess-0.70.16 xxhash-3.5.0\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "len(dataset)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "k7SgmYMUUSSj",
+ "outputId": "1a9f1ee6-58d5-406a-a921-a39647c5e0c3"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "133"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 11
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%pip install -q transformers\n",
+ "from transformers import AutoProcessor\n",
+ "\n",
+ "# NOTE(review): wav2vec2's CTC vocabulary is uppercase-only, so every\n",
+ "# lowercase character will appear as \"missing\" in the vocab diff below.\n",
+ "# For a SpeechT5 TTS fine-tune the matching processor is \"microsoft/speecht5_tts\".\n",
+ "processor = AutoProcessor.from_pretrained(\"facebook/wav2vec2-base-960h\")\n",
+ "\n",
+ "tokenizer = processor.tokenizer"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 527,
+ "referenced_widgets": [
+ "bf0d33ef88d6454a9a4ea5a78883d9e9",
+ "b0700ca20203401fa7e2eb9a2b60aa0f",
+ "4225f3ecaa9046519340846aaba8616a",
+ "09b941636ca64b238d3e0a761573f45e",
+ "f2c3e3edbf5040be97bc87c1711d31bc",
+ "9b6cbb6586b24fa095476410bc1d80b9",
+ "cd3f004541194be29b29c99864fcf146",
+ "d4f6744cd7ce4a57aa784fbbee4c0621",
+ "7aaba4f57f7d4effbbf66933c1ebc61b",
+ "9f8c7e1a0b5e4942855b6a2143e1eed7",
+ "3805153ecfb743ffbb0e337a77004560",
+ "36f8c5fce0fb4b93be7295042913e208",
+ "7f3217a296c04423804d8290a17e946a",
+ "0429e54da0404f4298c47efe1c6b66e2",
+ "50b8cc6f892b4a149494951839bd3993",
+ "42e0f0238e9149468a716b765dc8baff",
+ "a81d5a2e37f945fca8b0cda4bdab0b91",
+ "22c4ce4eac7542409c62d058825e761c",
+ "2294bcb50eef48e698d6723958af9c94",
+ "56dc4123de544d50a9d5dde76876e00b",
+ "151b4d2db3bc4797ad977f722806ed64",
+ "692a45c766aa43acb1b7f7ad7ee1772d",
+ "4f20b5912c994e3986d3c4d30eb99143",
+ "a1c4cf9ad32b49a4996fbb3f321745a9",
+ "60183325ec9c4bb1967e83fe3be5dd32",
+ "369ff789de224c6f910cf8758420f0c8",
+ "f42a309ff1994451b50cf9d49d1f7a6d",
+ "f115fc561c044c8c9d79f1a189766fcf",
+ "5f5a995405d84c1b80ee0b22ad955696",
+ "79c5b6542ba34375a8f44d54709bab45",
+ "95ce8dec3e7f49b4b3311e7739da70d5",
+ "0d5030b6ed24403da75989467066f260",
+ "ef78e68136b448d3b8b603040e2de130",
+ "520715b2218b45068326603a6639746c",
+ "d30d15c3d0b745eda19f46e4c9c1d6c6",
+ "9f452748eb134bc0ac00f4038fd45a88",
+ "a3d6d1ae1a764fa782c5e42bc07823ba",
+ "db4f5325442b47f68cc0dfe18a4113b1",
+ "37dc19476c7c466482cac9c4509b3ad5",
+ "d01f1c3db49845a8bdb69a2cd8613018",
+ "48b70ae7b3a346ceac5a48b36fbddffc",
+ "e7716f8922284f30bd33b605ba90940e",
+ "38c3bbe6735e4bdf99263c6251a79eeb",
+ "6e90888770ce4e919567ec299fe5c28e",
+ "ef48464237604789b200054eaa4a23d9",
+ "5bc9d68ab2a04312a5a9d4b5977fd691",
+ "7586d1a50f924338887b3d66b6a7947a",
+ "c6cbd5b7493746299e01d7b7a3a1e3be",
+ "ab76bf76b190487b96317ce4f4583ae6",
+ "b37be5cbddd94530bac98f7f2301ac6d",
+ "8f51b4c1f2b043d4b640baf4a8cec808",
+ "c62b866ecf204cb3803f0226d166d213",
+ "f0aa4b397e3c402cbfb4d1c3af8aacf6",
+ "d05fe02294354059a6ce512f94ff1b17",
+ "1397290d20c44f67b51176023c0090bb"
+ ]
+ },
+ "id": "ZyUozQcJV0w2",
+ "outputId": "44da5f87-59e3-4bc1-a64f-3582c58602af"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.44.2)\n",
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.16.1)\n",
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.23.2 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.24.7)\n",
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.26.4)\n",
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.1)\n",
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.2)\n",
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.9.11)\n",
+ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.32.3)\n",
+ "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.5)\n",
+ "Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.19.1)\n",
+ "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.5)\n",
+ "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (2024.6.1)\n",
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (4.12.2)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4.0)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.10)\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.2.3)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2024.8.30)\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "preprocessor_config.json: 0%| | 0.00/159 [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "bf0d33ef88d6454a9a4ea5a78883d9e9"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "tokenizer_config.json: 0%| | 0.00/163 [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "36f8c5fce0fb4b93be7295042913e208"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "config.json: 0%| | 0.00/1.60k [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "4f20b5912c994e3986d3c4d30eb99143"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "vocab.json: 0%| | 0.00/291 [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "520715b2218b45068326603a6639746c"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "special_tokens_map.json: 0%| | 0.00/85.0 [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "ef48464237604789b200054eaa4a23d9"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import pandas as pd\n",
+ "from datasets import Dataset, load_dataset\n",
+ "\n",
+ "# Load your dataset from Excel file using pd.read_excel\n",
+ "df = pd.read_excel('/content/drive/MyDrive/TTS_Eng/TTS-English.xlsx')\n",
+ "# Convert the pandas DataFrame to a Hugging Face Dataset\n",
+ "dataset = Dataset.from_pandas(df)\n",
+ "\n",
+ "tokenizer = processor.tokenizer\n",
+ "\n",
+ "def extract_all_chars(batch):\n",
+ " text_examples = [text for text in batch[\"Text Example\"] if text is not None]\n",
+ " all_text = \" \".join(text_examples)\n",
+ " vocab = list(set(all_text))\n",
+ " return {\"vocab\": [vocab], \"all_text\": [all_text]}\n",
+ "\n",
+ "# Remove all original columns: extract_all_chars returns only \"vocab\" and\n",
+ "# \"all_text\", and keeping a 133-row source column beside the 1-row batched\n",
+ "# result would raise a length-mismatch error in datasets.map. (The previous\n",
+ "# filter kept a \"transcript\" column that does not exist in this dataset,\n",
+ "# so it already removed everything; this states that intent explicitly.)\n",
+ "vocabs = dataset.map(\n",
+ "    extract_all_chars,\n",
+ "    batched=True,\n",
+ "    batch_size=-1,\n",
+ "    keep_in_memory=True,\n",
+ "    remove_columns=dataset.column_names,\n",
+ ")\n",
+ "\n",
+ "dataset_vocab = set(vocabs[\"vocab\"][0])\n",
+ "tokenizer_vocab = {k for k,_ in tokenizer.get_vocab().items()}"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 49,
+ "referenced_widgets": [
+ "3659e5c8996d42ccbd6b165960b6ebcb",
+ "221fa7adbd3046fa813a272a333f0a58",
+ "1c65128e06bd45ddafbb996947f4746d",
+ "11faa4b79c774470a6256e5bed9fc606",
+ "038fbc256aa943818c51112507e502f7",
+ "f3ba4f7daed0414b8c28cc7354af8d99",
+ "d17f3eb739dd479b80a7ef85471b7937",
+ "ed20aaccf62041a6a3fdaef43fc0b9f7",
+ "ae882f381a714169a48f4d34124c6d02",
+ "c1f6fb263ee04e1482b3d95414abb0a1",
+ "c607e763b8b940e6afbe677799d79fa6"
+ ]
+ },
+ "id": "_7K8SGgLgohy",
+ "outputId": "b1e65d74-316e-4fda-eeb5-b461dca82aa5"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "Map: 0%| | 0/133 [00:00, ? examples/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "3659e5c8996d42ccbd6b165960b6ebcb"
+ }
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "dataset_vocab - tokenizer_vocab"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "KuF4QM_DV8RK",
+ "outputId": "17ef0ff3-3914-4a50-dd66-475b943df7a9"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "{' ',\n",
+ " '\"',\n",
+ " '%',\n",
+ " ',',\n",
+ " '-',\n",
+ " '.',\n",
+ " '0',\n",
+ " '2',\n",
+ " '4',\n",
+ " '5',\n",
+ " 'a',\n",
+ " 'b',\n",
+ " 'c',\n",
+ " 'd',\n",
+ " 'e',\n",
+ " 'f',\n",
+ " 'g',\n",
+ " 'h',\n",
+ " 'i',\n",
+ " 'j',\n",
+ " 'k',\n",
+ " 'l',\n",
+ " 'm',\n",
+ " 'n',\n",
+ " 'o',\n",
+ " 'p',\n",
+ " 'q',\n",
+ " 'r',\n",
+ " 's',\n",
+ " 't',\n",
+ " 'u',\n",
+ " 'v',\n",
+ " 'w',\n",
+ " 'x',\n",
+ " 'y',\n",
+ " 'z',\n",
+ " '’'}"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 16
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "replacements = [\n",
+ " ('à', 'a'),\n",
+ " ('ç', 'c'),\n",
+ " ('è', 'e'),\n",
+ " ('ë', 'e'),\n",
+ " ('í', 'i'),\n",
+ " ('ï', 'i'),\n",
+ " ('ö', 'o'),\n",
+ " ('ü', 'u'),\n",
+ " ('’', \"'\"), # Replacing curly apostrophe with a standard one\n",
+ " ('%', ''), # Option to remove the percentage symbol\n",
+ " ('0', '0'), # Keep 0 as it is (no change)\n",
+ " ('2', '2'), # Keep 2 as it is (no change)\n",
+ " ('4', '4'), # Keep 4 as it is (no change)\n",
+ " ('5', '5'), # Keep 5 as it is (no change)\n",
+ " (' ', ' ') # Ensure spaces remain unchanged\n",
+ "]\n",
+ "\n",
+ "def cleanup_text(inputs):\n",
+ " text_column_name = \"Text Example\" # Update with the correct column name from your dataset\n",
+ " # Check if the value is not None using the correct column name\n",
+ " if inputs[text_column_name] is not None:\n",
+ " for src, dst in replacements:\n",
+ " # Update this line to use the correct column name\n",
+ " inputs[text_column_name] = inputs[text_column_name].replace(src, dst)\n",
+ " return inputs\n",
+ "\n",
+ "# Apply the function to the dataset\n",
+ "dataset = dataset.map(cleanup_text)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 49,
+ "referenced_widgets": [
+ "25e3bbaabea54530b09017d1b89614c2",
+ "bc2c8c6b5a63418cbf967553012c6fb2",
+ "1838da789aae4c07874c9d3b777dde5a",
+ "a6b1e677a10b4d2ca91dd2e9da1d7aea",
+ "cceb7ceda8f94f9d87c62562f6803efe",
+ "35d690584d9942c5922d17b3d6efda15",
+ "ec18c6121c734b7788387cb7eb903657",
+ "370e5ff52db4411d914bbaa5a2ccc5d3",
+ "6a467e0fbed14e63b7884a8abd3c802b",
+ "b0f3fa5644784a39b02491fd820d4868",
+ "d413f0963826479cabada7a15c680ce5"
+ ]
+ },
+ "id": "0snxxfPdje6w",
+ "outputId": "46aed7b0-d27b-42f2-fd33-d45547d40a50"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "Map: 0%| | 0/133 [00:00, ? examples/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "25e3bbaabea54530b09017d1b89614c2"
+ }
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%pip install -q speechbrain\n",
+ "import os\n",
+ "import torch\n",
+ "# speechbrain >= 1.0 moved pretrained interfaces to speechbrain.inference;\n",
+ "# the old speechbrain.pretrained path only works via a deprecation shim.\n",
+ "from speechbrain.inference.classifiers import EncoderClassifier\n",
+ "\n",
+ "spk_model_name = \"speechbrain/spkrec-xvect-voxceleb\"\n",
+ "\n",
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+ "speaker_model = EncoderClassifier.from_hparams(\n",
+ " source=spk_model_name,\n",
+ " run_opts={\"device\": device},\n",
+ " savedir=os.path.join(\"/tmp\", spk_model_name),\n",
+ ")\n",
+ "\n",
+ "\n",
+ "def create_speaker_embedding(waveform):\n",
+ " with torch.no_grad():\n",
+ " speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))\n",
+ " speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)\n",
+ " speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()\n",
+ " return speaker_embeddings"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000,
+ "referenced_widgets": [
+ "fa8757aaa0a741cf839b9241c827d971",
+ "b13821240ecc45a385b2a2b1effa86f7",
+ "e68376643bcf4350b60e91f57784a5bd",
+ "c43dde973e8e437185ae3fc4a70223e7",
+ "408e99ba3be2433eaed5461ad9f09867",
+ "e62bf5257aec4e688b52a275fd157da3",
+ "cab5270d6de94e1592d9bda2e2cdfe1a",
+ "624ab1702ede421682fa47530ed99913",
+ "e58492565fa045fbaf0d9052e8574922",
+ "6dafbc1cdbbd4a099ded9ed44e88fc50",
+ "ea9dc779cb664fb7835a0d5a132f82f1",
+ "1289e1adffb040c79930ab4709dc7224",
+ "bbf4f9046aa24a69b65d2361081f0288",
+ "50dae551f435430f9e95905c36a5d9e3",
+ "c33e6a32b0634c7ea1b1131127475fdf",
+ "92e09ba91998416a9797b0f1def75add",
+ "ff4914ff0f924cb18d743ecb8f10fec4",
+ "74d02c59834942b69fa4c9b80b09f48a",
+ "50d58beaee4442b4a69197fea3f1d442",
+ "942355e5534f4404899e18436e07fa30",
+ "d158d0fac7674b45a58d83924abbc07c",
+ "fc0e64fad426421896620ac66dcafa63",
+ "2d381129b2474b9ca0227cd790c4439b",
+ "14bbec4c5e7f4f11b4bf1c6387991a7d",
+ "a85c5408987049d893af6c46b40b4ee6",
+ "b962c1297c6540659780411a92632a5f",
+ "737785d6a113473c8e7aec888505713b",
+ "630d28962ac542659f4c424f385a65f4",
+ "d39893e895c54010bf6f2aa00ec1d8d2",
+ "2263fcc46a3346c499619903ef1309c8",
+ "8c4a3e2f5e0241b5bb88c2871d2bfb10",
+ "1de033bae98c4f718c133b34c74f07a6",
+ "32804329bfe34cc0adb781561a371878",
+ "30b68f4079514c8db53790ed98a57a0f",
+ "fd93f9dfc78e495f8d8c1de4ad4399fe",
+ "73576e72b9164e7f87ff44b0200e4b7e",
+ "b4e4186c328147d3b71ef5444f59f78f",
+ "72e6d7b5d2b0416a8014b15f2c0f131d",
+ "d8e396928f8e4cd49bb56fab121e0195",
+ "13de4de37676426b95639c1b1eb6cc90",
+ "e5f0d98066fa4f28851baa54938c861a",
+ "9bcba59f919341f7973cc6e842016a6d",
+ "09cba933c19345538cfe6985485f3f28",
+ "1980e08341ad43f4aab42dd43f033d09",
+ "8c4a1685d4af4fd2bd4e403a7c333744",
+ "d03b3cb3e8444b4f83ce92c4c8a12195",
+ "97c5de7959924eb39c57ec305cb7043d",
+ "28dc36b8a8b341e2adedefd37b394d00",
+ "e5dc20b4193d4bc2afe967914d46a7dd",
+ "ad2247b2855b4660a2e08ebb1f005182",
+ "331fb19489804a7097effd9eca84a310",
+ "b9c1f6499f214190b4b9f90fa32c9049",
+ "95f57a74b13b4d0ab1102be0f45e3c3d",
+ "27a28aff2f464af58bf82bb65f4662ab",
+ "d610309145cf47fcbf587ba4ea6f6c10"
+ ]
+ },
+ "id": "j-izN78sjlR9",
+ "outputId": "0518403a-2cbc-4d96-e7fc-e8611c1e5263"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Collecting speechbrain\n",
+ " Downloading speechbrain-1.0.1-py3-none-any.whl.metadata (24 kB)\n",
+ "Collecting hyperpyyaml (from speechbrain)\n",
+ " Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)\n",
+ "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from speechbrain) (1.4.2)\n",
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from speechbrain) (1.26.4)\n",
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from speechbrain) (24.1)\n",
+ "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from speechbrain) (1.13.1)\n",
+ "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from speechbrain) (0.2.0)\n",
+ "Requirement already satisfied: torch>=1.9 in /usr/local/lib/python3.10/dist-packages (from speechbrain) (2.4.1+cu121)\n",
+ "Requirement already satisfied: torchaudio in /usr/local/lib/python3.10/dist-packages (from speechbrain) (2.4.1+cu121)\n",
+ "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from speechbrain) (4.66.5)\n",
+ "Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from speechbrain) (0.24.7)\n",
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.9->speechbrain) (3.16.1)\n",
+ "Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.9->speechbrain) (4.12.2)\n",
+ "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.9->speechbrain) (1.13.3)\n",
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.9->speechbrain) (3.4.1)\n",
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.9->speechbrain) (3.1.4)\n",
+ "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch>=1.9->speechbrain) (2024.6.1)\n",
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub->speechbrain) (6.0.2)\n",
+ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub->speechbrain) (2.32.3)\n",
+ "Collecting ruamel.yaml>=0.17.28 (from hyperpyyaml->speechbrain)\n",
+ " Downloading ruamel.yaml-0.18.6-py3-none-any.whl.metadata (23 kB)\n",
+ "Collecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml>=0.17.28->hyperpyyaml->speechbrain)\n",
+ " Downloading ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)\n",
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.9->speechbrain) (3.0.2)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub->speechbrain) (3.4.0)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub->speechbrain) (3.10)\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub->speechbrain) (2.2.3)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub->speechbrain) (2024.8.30)\n",
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.9->speechbrain) (1.3.0)\n",
+ "Downloading speechbrain-1.0.1-py3-none-any.whl (807 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m807.2/807.2 kB\u001b[0m \u001b[31m16.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading HyperPyYAML-1.2.2-py3-none-any.whl (16 kB)\n",
+ "Downloading ruamel.yaml-0.18.6-py3-none-any.whl (117 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m117.8/117.8 kB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (722 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m722.2/722.2 kB\u001b[0m \u001b[31m30.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hInstalling collected packages: ruamel.yaml.clib, ruamel.yaml, hyperpyyaml, speechbrain\n",
+ "Successfully installed hyperpyyaml-1.2.2 ruamel.yaml-0.18.6 ruamel.yaml.clib-0.2.12 speechbrain-1.0.1\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ ":4: UserWarning: Module 'speechbrain.pretrained' was deprecated, redirecting to 'speechbrain.inference'. Please update your script. This is a change from SpeechBrain 1.0. See: https://github.com/speechbrain/speechbrain/releases/tag/v1.0.0\n",
+ " from speechbrain.pretrained import EncoderClassifier\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "hyperparams.yaml: 0%| | 0.00/2.04k [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "fa8757aaa0a741cf839b9241c827d971"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.10/dist-packages/speechbrain/utils/autocast.py:68: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.\n",
+ " wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "embedding_model.ckpt: 0%| | 0.00/16.9M [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "1289e1adffb040c79930ab4709dc7224"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "mean_var_norm_emb.ckpt: 0%| | 0.00/3.20k [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "2d381129b2474b9ca0227cd790c4439b"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "classifier.ckpt: 0%| | 0.00/15.9M [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "30b68f4079514c8db53790ed98a57a0f"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "label_encoder.txt: 0%| | 0.00/129k [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "8c4a1685d4af4fd2bd4e403a7c333744"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.10/dist-packages/speechbrain/utils/checkpoints.py:194: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
+ " state_dict = torch.load(path, map_location=device)\n",
+ "/usr/local/lib/python3.10/dist-packages/speechbrain/processing/features.py:1311: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
+ " stats = torch.load(path, map_location=device)\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+        "def prepare_dataset(example):\n",
+        "    \"\"\"Map one dataset example to SpeechT5 training features.\n",
+        "\n",
+        "    Tokenizes `normalized_text` and converts the raw audio into the\n",
+        "    processor's target features (`labels`), then attaches a SpeechBrain\n",
+        "    x-vector as `speaker_embeddings`. Relies on the globals `processor`\n",
+        "    and `create_speaker_embedding` defined in earlier cells.\n",
+        "\n",
+        "    NOTE(review): the next cell defines `prepare_dataset` again and\n",
+        "    silently shadows this one -- keep only one definition.\n",
+        "    \"\"\"\n",
+        "    audio = example[\"audio\"]\n",
+        "\n",
+        "    example = processor(\n",
+        "        text=example[\"normalized_text\"],\n",
+        "        audio_target=audio[\"array\"],\n",
+        "        sampling_rate=audio[\"sampling_rate\"],\n",
+        "        return_attention_mask=False,\n",
+        "    )\n",
+        "\n",
+        "    # strip off the batch dimension\n",
+        "    example[\"labels\"] = example[\"labels\"][0]\n",
+        "\n",
+        "    # use SpeechBrain to obtain x-vector\n",
+        "    example[\"speaker_embeddings\"] = create_speaker_embedding(audio[\"array\"])\n",
+        "\n",
+        "    return example"
+ ],
+ "metadata": {
+ "id": "u1SOUSD4j2uh"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+        "def prepare_dataset(example):\n",
+        "    \"\"\"Map one example to SpeechT5 features, guarding a missing 'audio' key.\n",
+        "\n",
+        "    NOTE(review): this redefines (shadows) `prepare_dataset` from the\n",
+        "    previous cell.\n",
+        "    NOTE(review): the run below printed the skip message for every single\n",
+        "    example, i.e. the dataset has no 'audio' column at all -- this guard is\n",
+        "    masking a data-loading problem, not handling rare bad rows.\n",
+        "    NOTE(review): `Dataset.map` is not documented to drop examples when the\n",
+        "    function returns None -- verify this actually skips rather than writing\n",
+        "    null rows; `dataset.filter` is the documented way to drop examples.\n",
+        "    \"\"\"\n",
+        "    # Check if 'audio' key exists before accessing it\n",
+        "    if 'audio' in example:\n",
+        "        audio = example[\"audio\"]\n",
+        "    else:\n",
+        "        # Handle the case where 'audio' key is missing\n",
+        "        # This could be raising an error, skipping the example, or using a default value\n",
+        "        # For example, to skip the example:\n",
+        "        print(\"Skipping example due to missing 'audio' key\")\n",
+        "        return None\n",
+        "\n",
+        "    example = processor(\n",
+        "        text=example[\"normalized_text\"],\n",
+        "        audio_target=audio[\"array\"],\n",
+        "        sampling_rate=audio[\"sampling_rate\"],\n",
+        "        return_attention_mask=False,\n",
+        "    )\n",
+        "\n",
+        "    # strip off the batch dimension\n",
+        "    example[\"labels\"] = example[\"labels\"][0]\n",
+        "\n",
+        "    # use SpeechBrain to obtain x-vector\n",
+        "    example[\"speaker_embeddings\"] = create_speaker_embedding(audio[\"array\"])\n",
+        "\n",
+        "    return example"
+ ],
+ "metadata": {
+ "id": "F9vyasCLkAAd"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+        "# Transform every example; remove_columns drops all original columns so\n",
+        "# only the processor outputs (labels, speaker_embeddings, ...) remain.\n",
+        "dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000,
+ "referenced_widgets": [
+ "7647fa5286d44b689acf83d9022b887f",
+ "b525f28a822340f9a1a960f1a0b7aeaf",
+ "ffb6181ea80840b2a225fdf85e2a134b",
+ "e1111e23ff4849a9b89c8f845eb33b6d",
+ "da49275b933040a0b93da7feee8c4c95",
+ "1d3538f3b26543cfb334d1e2ca168607",
+ "40425a5382bc484dbc0250f00dc1b44b",
+ "7e8295acdab3420ea019548ab2eea90c",
+ "e17732d2cc874ac497dba8bff9312480",
+ "ca781213ede347fba54ade5ffd5be0d0",
+ "ee86a6b031334d0c99a5494309e9d482"
+ ]
+ },
+ "id": "sRaWDQU9kM0C",
+ "outputId": "8f72dc0a-9ad1-4fe0-a619-ed718720eae3"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "Map: 0%| | 0/133 [00:00, ? examples/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "7647fa5286d44b689acf83d9022b887f"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n",
+ "Skipping example due to missing 'audio' key\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+        "def is_not_too_long(example):\n",
+        "    \"\"\"Filter predicate: keep examples whose 'Text Example' field exists\n",
+        "    and is shorter than 300 characters.\n",
+        "\n",
+        "    Returns False for examples where the field is missing or None, which\n",
+        "    drops them from the dataset.\n",
+        "    \"\"\"\n",
+        "    # Renamed from `input_ids`: this is the raw text string, not token ids,\n",
+        "    # so the length check below counts characters, not tokens.\n",
+        "    text = example.get('Text Example')\n",
+        "    if text is not None:\n",
+        "        return len(text) < 300\n",
+        "    else:\n",
+        "        return False  # skip examples with missing or None 'Text Example'\n",
+        "\n",
+        "\n",
+        "dataset = dataset.filter(is_not_too_long)\n",
+        "len(dataset)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 66,
+ "referenced_widgets": [
+ "d843722bfdda42c09531ea8d1c314946",
+ "a6a742b781754e0a8639cab0ae999217",
+ "9e6e888085074714b2a19778ec69749d",
+ "c5795368e4fc4436bf747ad2410d406d",
+ "fddb9c7814db4992a3a1651cfc90de32",
+ "1fe90e619a9e419abbd7afbe153801c7",
+ "ea7bdc114d204b85ab9961af33c38afb",
+ "c510643600ac49019722461ae6f42d45",
+ "aaddf15fde274701b5fd1393d07ba601",
+ "4e81ca67742f49daacc0adf6fd6b91bf",
+ "6a851c8e195941df945af7704622a914"
+ ]
+ },
+ "id": "KAF3OhVTkTJf",
+ "outputId": "725f6afa-6cab-4fb5-b97e-ac7e8721df2d"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "Filter: 0%| | 0/133 [00:00, ? examples/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "d843722bfdda42c09531ea8d1c314946"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "105"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 24
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+        "from dataclasses import dataclass\n",
+        "from typing import Any, Dict, List, Union\n",
+        "\n",
+        "\n",
+        "@dataclass\n",
+        "class TTSDataCollatorWithPadding:\n",
+        "    \"\"\"Collate prepared examples into one padded training batch.\n",
+        "\n",
+        "    Pads `input_ids` and spectrogram `labels` with the SpeechT5 processor,\n",
+        "    masks label padding with -100 so the loss ignores it, trims label\n",
+        "    length to a multiple of the model's reduction factor, and stacks the\n",
+        "    speaker embeddings into a tensor.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    processor: Any  # SpeechT5Processor used for padding\n",
+        "\n",
+        "    def __call__(\n",
+        "        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]\n",
+        "    ) -> Dict[str, torch.Tensor]:\n",
+        "        input_ids = [{\"input_ids\": feature[\"input_ids\"]} for feature in features]\n",
+        "        label_features = [{\"input_values\": feature[\"labels\"]} for feature in features]\n",
+        "        speaker_features = [feature[\"speaker_embeddings\"] for feature in features]\n",
+        "\n",
+        "        # collate the inputs and targets into a batch\n",
+        "        # (fixed: use self.processor -- the original called the *global*\n",
+        "        # `processor`, defeating the purpose of the dataclass field)\n",
+        "        batch = self.processor.pad(\n",
+        "            input_ids=input_ids, labels=label_features, return_tensors=\"pt\"\n",
+        "        )\n",
+        "\n",
+        "        # replace label padding with -100 to ignore it in the loss\n",
+        "        batch[\"labels\"] = batch[\"labels\"].masked_fill(\n",
+        "            batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100\n",
+        "        )\n",
+        "        # the decoder attention mask itself is not used during fine-tuning\n",
+        "        del batch[\"decoder_attention_mask\"]\n",
+        "\n",
+        "        # round down target lengths to a multiple of the reduction factor\n",
+        "        # NOTE(review): still reads the global `model`; consider passing the\n",
+        "        # reduction factor in as another field of this dataclass.\n",
+        "        if model.config.reduction_factor > 1:\n",
+        "            target_lengths = torch.tensor(\n",
+        "                [len(feature[\"input_values\"]) for feature in label_features]\n",
+        "            )\n",
+        "            target_lengths = target_lengths.new(\n",
+        "                [\n",
+        "                    length - length % model.config.reduction_factor\n",
+        "                    for length in target_lengths\n",
+        "                ]\n",
+        "            )\n",
+        "            max_length = max(target_lengths)\n",
+        "            batch[\"labels\"] = batch[\"labels\"][:, :max_length]\n",
+        "\n",
+        "        # also add in the speaker embeddings\n",
+        "        batch[\"speaker_embeddings\"] = torch.tensor(speaker_features)\n",
+        "\n",
+        "        return batch"
+ ],
+ "metadata": {
+ "id": "VPmLwOUIkXVJ"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+        "# Instantiate the collator with the processor so padding is self-contained\n",
+        "data_collator = TTSDataCollatorWithPadding(processor=processor)"
+ ],
+ "metadata": {
+ "id": "t7TCP0SemAV4"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+        "from transformers import SpeechT5ForTextToSpeech\n",
+        "\n",
+        "# Hub id (or local path) of the pretrained model to fine-tune\n",
+        "checkpoint = \"microsoft/speecht5_tts\"  # base SpeechT5 text-to-speech checkpoint\n",
+        "\n",
+        "model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 81,
+ "referenced_widgets": [
+ "7e9f6e39ae5b4a67a6c3f6784c1b27fd",
+ "f9f524d062c94f8292546b8e7725e173",
+ "644230da1cf94c03a94fdc6f5d50e679",
+ "06b1f0f29e1443c2872991a640069f9a",
+ "7fdf330be2814e1ebaed555e6ffb741d",
+ "e764e6b820444e2dbc8662de23fe4fe2",
+ "1e96402bce724195888a2ee988c47949",
+ "a494bdb761414c678e6df357b5d17f33",
+ "ff767d13a25b4392ace435eefeead147",
+ "5d3d3d387ac54d6bb5127ade744b6ce7",
+ "e04eeb09c5d7469ea0737041d4166d20",
+ "1d9c4b97bb814dec804419862b9d4519",
+ "bbebee553dfa41a8b972c9b11ca28db1",
+ "9385cbb7ddec444c8de50ea8f19fe41a",
+ "8ff4fc045d6d4bb99431309f51e7f3f2",
+ "b96bf4c87a39442fa7321e1a6a17a668",
+ "8c21060511fa459b93af66c63ab89614",
+ "a687e10459064680a1b632a7e6d25253",
+ "fd0d2e45e27547d097870d268a37cc7b",
+ "0c0526a71b254c51b830c9544fd5e70f",
+ "1f69dc2914274109a9cb12d0567193a0",
+ "8e7329e6d9da48379e07cafe8633b8af"
+ ]
+ },
+ "id": "o5jN1MH2mE0M",
+ "outputId": "85be9a6e-c759-445c-d288-6b552cc7aef7"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "config.json: 0%| | 0.00/2.06k [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "7e9f6e39ae5b4a67a6c3f6784c1b27fd"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "pytorch_model.bin: 0%| | 0.00/585M [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "1d9c4b97bb814dec804419862b9d4519"
+ }
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+        "from functools import partial\n",
+        "\n",
+        "# disable cache during training since it's incompatible with gradient checkpointing\n",
+        "model.config.use_cache = False\n",
+        "\n",
+        "# re-enable the cache for generation only (training above still runs without it)\n",
+        "model.generate = partial(model.generate, use_cache=True)"
+ ],
+ "metadata": {
+ "id": "DfnSHysemIDV"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+        "from transformers import Seq2SeqTrainingArguments\n",
+        "\n",
+        "training_args = Seq2SeqTrainingArguments(\n",
+        "    output_dir=\"/content/speecht5_finetuned_English\",  # change to a repo name of your choice\n",
+        "    per_device_train_batch_size=4,\n",
+        "    gradient_accumulation_steps=8,  # effective train batch size: 4 * 8 = 32\n",
+        "    learning_rate=1e-4,\n",
+        "    warmup_steps=100,\n",
+        "    max_steps=1500,  # overrides num_train_epochs\n",
+        "    gradient_checkpointing=True,  # trade compute for memory\n",
+        "    fp16=True,\n",
+        "    eval_strategy=\"steps\",  # renamed from deprecated `evaluation_strategy` (removed in transformers 4.46)\n",
+        "    per_device_eval_batch_size=2,\n",
+        "    save_steps=100,  # must align with eval_steps for load_best_model_at_end\n",
+        "    eval_steps=100,\n",
+        "    logging_steps=25,\n",
+        "    report_to=[\"tensorboard\"],\n",
+        "    load_best_model_at_end=True,  # best = lowest eval loss (greater_is_better=False)\n",
+        "    greater_is_better=False,\n",
+        "    label_names=[\"labels\"],\n",
+        "    push_to_hub=True,\n",
+        ")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "gl-7E0SbmWBX",
+ "outputId": "93536bd1-261d-4d1b-bc10-85db55fd0ccd"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py:1525: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+        "from huggingface_hub import notebook_login\n",
+        "\n",
+        "# Interactive Hub login so push_to_hub can upload checkpoints; paste a\n",
+        "# token with write access -- never hardcode the token in the notebook.\n",
+        "notebook_login()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 145,
+ "referenced_widgets": [
+ "872ae79283d844358684cf964109b39a",
+ "58560800f00a4b1e9a5762a270c5c856",
+ "e17f80cdc518426190e3473282b14f88",
+ "0f2c0d02f9834b98bd2249fb6cbb68f0",
+ "d83f7fc747cd43d489698f41d720cb9e",
+ "e81ca8e7750d4d3f96ccf5efb56be885",
+ "d7259524214047f29d20af50cedece94",
+ "00921b12291c445d899afe15a74ae942",
+ "8e6488d2f9344d6785cc20d6cadfc039",
+ "4f53aaf8fabf42e88ad67a3e2e94275b",
+ "d77ac34cc6e44d248304a66938d9125f",
+ "c0aa1da1b83c42d6b1322b3e413d267f",
+ "301834cb31224c8ebd87d38d883cb32b",
+ "e8210f2315a245f6a5de23de281fda91",
+ "701c416ec0a7435a82440d89a1f3f3d1",
+ "18f28609d8234c6eaa33526f4f9e63a1",
+ "f2be2eaddd254f57964b3d4e0ffe61c2",
+ "9803190fe0af490e8e2af59b40fb99b2",
+ "d2cf1af5949a490680a1eab00811134e",
+ "9dd703c79e074bb4ae16b951be1eacad",
+ "6ddfe455fce2484d8d202e91ebb14b58",
+ "32492dc2b473452a96cbdb13f21bb962",
+ "22c836e57a6b46bd9247a37ac22a46de",
+ "637273ba4c9f4213a0bac13d1717a451",
+ "7cc35c734f1142e397551c953562058e",
+ "ed5aaf221249493592da60030146dad4",
+ "31cac28abdac4de0903d07a500e73a73",
+ "73939060d69346a790a02a18b090a2dd",
+ "5f806d2631034fb38f84bd146177f001",
+ "daef6b610c284f4a8b5620a4fa21da37",
+ "bef5422495b34cf28baddbd74d83254a",
+ "c1d5127b256d4348bf1c238f183d4eb5"
+ ]
+ },
+ "id": "2ubkRqMSw_ev",
+ "outputId": "d1fe7c36-5aaf-4cbb-91c3-21f836653973"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "VBox(children=(HTML(value=' =1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.26.4)\n",
+ "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (16.1.0)\n",
+ "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\n",
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n",
+ "Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.32.3)\n",
+ "Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.5)\n",
+ "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.5.0)\n",
+ "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)\n",
+ "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2024.6.1)\n",
+ "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.10.10)\n",
+ "Requirement already satisfied: huggingface-hub>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.24.7)\n",
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.1)\n",
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.2)\n",
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (2024.9.11)\n",
+ "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (0.4.5)\n",
+ "Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (0.19.1)\n",
+ "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (2.4.1+cu121)\n",
+ "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate) (5.9.5)\n",
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.3)\n",
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n",
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.1)\n",
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n",
+ "Requirement already satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.15.4)\n",
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n",
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.23.0->datasets) (4.12.2)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.4.0)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.10)\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2.2.3)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2024.8.30)\n",
+ "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (1.13.3)\n",
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (3.4.1)\n",
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (3.1.4)\n",
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
+ "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
+ "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from yarl<2.0,>=1.12.0->aiohttp->datasets) (0.2.0)\n",
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->transformers[torch]) (3.0.2)\n",
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch->transformers[torch]) (1.3.0)\n",
+ "Collecting sklearn\n",
+ " Using cached sklearn-0.0.post12.tar.gz (2.6 kB)\n",
+ " \u001b[1;31merror\u001b[0m: \u001b[1msubprocess-exited-with-error\u001b[0m\n",
+ " \n",
+ " \u001b[31m×\u001b[0m \u001b[32mpython setup.py egg_info\u001b[0m did not run successfully.\n",
+ " \u001b[31m│\u001b[0m exit code: \u001b[1;36m1\u001b[0m\n",
+ " \u001b[31m╰─>\u001b[0m See above for output.\n",
+ " \n",
+ " \u001b[1;35mnote\u001b[0m: This error originates from a subprocess, and is likely not a problem with pip.\n",
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25herror\n",
+ "\u001b[1;31merror\u001b[0m: \u001b[1mmetadata-generation-failed\u001b[0m\n",
+ "\n",
+ "\u001b[31m×\u001b[0m Encountered error while generating package metadata.\n",
+ "\u001b[31m╰─>\u001b[0m See above for output.\n",
+ "\n",
+ "\u001b[1;35mnote\u001b[0m: This is an issue with the package mentioned above, not pip.\n",
+ "\u001b[1;36mhint\u001b[0m: See above for details.\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "max_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!pip install datasets transformers[torch] accelerate\n",
+ "!pip install sklearn\n",
+ "\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "import datasets\n",
+ "import numpy as np\n",
+ "\n",
+ "# Assuming 'dataset' is your Hugging Face Dataset\n",
+ "# Rename or add the required columns to match the model's expected input.\n",
+ "# For example, if your dataset has a column named 'text' containing the target text,\n",
+ "# rename it to 'decoder_input_values':\n",
+ "dataset = dataset.rename_column('Text Example', 'decoder_input_values')\n",
+ "# Similarly, rename or add other necessary columns like 'input_values', 'attention_mask', and 'decoder_attention_mask'.\n",
+ "\n",
+ "# Convert the Hugging Face Dataset to a NumPy array or list\n",
+ "# before splitting. Assuming 'dataset' is a Hugging Face Dataset\n",
+ "dataset_np = dataset[:] # If dataset is a DatasetDict, select the desired split, e.g., dataset[\"train\"][:]\n",
+ "#dataset_np = np.array(dataset) #If you need a NumPy array specifically\n",
+ "\n",
+ "# Split the NumPy array using train_test_split\n",
+ "train_indices, test_indices = train_test_split(np.arange(len(dataset_np)), test_size=0.2, random_state=42) # Adjust test_size and random_state as needed\n",
+ "\n",
+ "# Create new Hugging Face Datasets using the split indices\n",
+ "train_dataset = dataset.select(train_indices)\n",
+ "test_dataset = dataset.select(test_indices)\n",
+ "\n",
+ "# You might need to define a data collator to prepare the data for the model:\n",
+ "def data_collator(features):\n",
+ " # Implement logic to collate data based on the columns present in your dataset\n",
+ " # and the format required by the SpeechT5ForTextToSpeech model.\n",
+ " # This might involve padding, creating attention masks, etc.\n",
+ " # You can refer to the documentation for examples of data collators:\n",
+ " # https://huggingface.co/docs/transformers/main_classes/data_collator\n",
+ " pass # Replace with your data collation logic\n",
+ "\n",
+ "# Now you can pass these datasets to the trainer\n",
+ "from transformers import Seq2SeqTrainer\n",
+ "trainer = Seq2SeqTrainer(\n",
+ " args=training_args,\n",
+ " model=model,\n",
+ " train_dataset=train_dataset, # Use train_dataset instead of dataset[\"train\"]\n",
+ " eval_dataset=test_dataset, # Use test_dataset instead of dataset[\"test\"]\n",
+ " data_collator=data_collator,\n",
+ " tokenizer=processor, # Assuming 'processor' is your SpeechT5Processor\n",
+ ")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "f5200UGhmm1y",
+ "outputId": "589f5ef9-f923-4e43-db28-a2cb3187fae3"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (3.0.2)\n",
+ "Requirement already satisfied: accelerate in /usr/local/lib/python3.10/dist-packages (0.34.2)\n",
+ "Requirement already satisfied: transformers[torch] in /usr/local/lib/python3.10/dist-packages (4.44.2)\n",
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.16.1)\n",
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.26.4)\n",
+ "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (16.1.0)\n",
+ "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\n",
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n",
+ "Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.32.3)\n",
+ "Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.5)\n",
+ "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.5.0)\n",
+ "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)\n",
+ "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2024.6.1)\n",
+ "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.10.10)\n",
+ "Requirement already satisfied: huggingface-hub>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.24.7)\n",
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.1)\n",
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.2)\n",
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (2024.9.11)\n",
+ "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (0.4.5)\n",
+ "Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (0.19.1)\n",
+ "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (2.4.1+cu121)\n",
+ "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate) (5.9.5)\n",
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.3)\n",
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n",
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.1)\n",
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n",
+ "Requirement already satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.15.4)\n",
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n",
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.23.0->datasets) (4.12.2)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.4.0)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.10)\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2.2.3)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2024.8.30)\n",
+ "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (1.13.3)\n",
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (3.4.1)\n",
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (3.1.4)\n",
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
+ "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
+ "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from yarl<2.0,>=1.12.0->aiohttp->datasets) (0.2.0)\n",
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->transformers[torch]) (3.0.2)\n",
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch->transformers[torch]) (1.3.0)\n",
+ "Collecting sklearn\n",
+ " Using cached sklearn-0.0.post12.tar.gz (2.6 kB)\n",
+ " \u001b[1;31merror\u001b[0m: \u001b[1msubprocess-exited-with-error\u001b[0m\n",
+ " \n",
+ " \u001b[31m×\u001b[0m \u001b[32mpython setup.py egg_info\u001b[0m did not run successfully.\n",
+ " \u001b[31m│\u001b[0m exit code: \u001b[1;36m1\u001b[0m\n",
+ " \u001b[31m╰─>\u001b[0m See above for output.\n",
+ " \n",
+ " \u001b[1;35mnote\u001b[0m: This error originates from a subprocess, and is likely not a problem with pip.\n",
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25herror\n",
+ "\u001b[1;31merror\u001b[0m: \u001b[1mmetadata-generation-failed\u001b[0m\n",
+ "\n",
+ "\u001b[31m×\u001b[0m Encountered error while generating package metadata.\n",
+ "\u001b[31m╰─>\u001b[0m See above for output.\n",
+ "\n",
+ "\u001b[1;35mnote\u001b[0m: This is an issue with the package mentioned above, not pip.\n",
+ "\u001b[1;36mhint\u001b[0m: See above for details.\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "max_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from transformers import SpeechT5HifiGan\n",
+ "vocoder = SpeechT5HifiGan.from_pretrained(\"microsoft/speecht5_hifigan\")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 81,
+ "referenced_widgets": [
+ "f7f64603670540cca1cfa1a6bb95d8d4",
+ "eecd754bff9d41f5803b1d543d8eb859",
+ "af79f746f2d94ac9802b1f0e3bac9ef5",
+ "b15adb94a0544448aa19d7847e0bc3d2",
+ "2d6eef2155c94cd0aa87769b945f77cb",
+ "b57381a0a4b140118f4acb711ae0b930",
+ "39bab19c0cf64667b772f2292c5f70b7",
+ "54e297f441114a2eb00a418eee763bde",
+ "24a3f21743fe43e49b5eac5b13070564",
+ "b4316acb39974b45beae8e5adadfdbeb",
+ "ddbdb7af43a147eb9a4b74a6677fe0aa",
+ "7e85f12904de48beabb658b439d35fc4",
+ "211f9427940545128436e2472903176f",
+ "8cf800c4dfce4ea1b513881fd9e0a101",
+ "f887a58fdd7a495f8420760c9566deb1",
+ "b5991c8c911b46b3a4d7889b51df7759",
+ "442e7734e9874c72b085c11a49d1e710",
+ "f1858842eec545a9bfa48a8b49db3966",
+ "5787962f856f46e9b4d15b9b8f009a15",
+ "102b5eb12c8344f587cfeba6f70f78f8",
+ "20d218dcba4b48b6afc9ccbf43904cac",
+ "4452e9c0da20434b976c3719764ea341"
+ ]
+ },
+ "id": "2VZIjVFjnCzW",
+ "outputId": "c6a477c3-5761-4606-ef64-ab907b1b1532"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "config.json: 0%| | 0.00/636 [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "f7f64603670540cca1cfa1a6bb95d8d4"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "pytorch_model.bin: 0%| | 0.00/50.7M [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "7e85f12904de48beabb658b439d35fc4"
+ }
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(example[\"decoder_input_values\"]) # Check if the values exist and are valid"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "amRLGntkzAZM",
+ "outputId": "332bce82-6c89-4963-fc23-efcd67645d4a"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "When you connect to an external service, make sure the API returns the correct status codes.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(processed_example[\"input_ids\"]) # Ensure input_ids are not None\n",
+ "print(processed_example[\"attention_mask\"]) # Ensure attention_mask is not None"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "010uMrqWzHkU",
+ "outputId": "562a7f45-4beb-4583-9fff-809338c88499"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "tensor([[ 4, 38, 11, 5, 9, 4, 22, 8, 16, 4, 17, 8, 9, 9, 5, 17, 6, 4,\n",
+ " 6, 8, 4, 7, 9, 4, 5, 37, 6, 5, 13, 9, 7, 15, 4, 12, 5, 13,\n",
+ " 27, 10, 17, 5, 23, 4, 18, 7, 28, 5, 4, 12, 16, 13, 5, 4, 6, 11,\n",
+ " 5, 4, 33, 49, 30, 4, 13, 5, 6, 16, 13, 9, 12, 4, 6, 11, 5, 4,\n",
+ " 17, 8, 13, 13, 5, 17, 6, 2]])\n",
+ "tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+ " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+ " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+ " 1, 1, 1, 1, 1, 1, 1, 1]])\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+        "# Inference"
+ ],
+ "metadata": {
+ "id": "mlFzR1_9pNgV"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from transformers import SpeechT5ForTextToSpeech\n",
+ "\n",
+ "model = SpeechT5ForTextToSpeech.from_pretrained(\"DeepDiveDev/speecht5_finetuned_English\")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 237,
+ "referenced_widgets": [
+ "9371b25e7a2744e49a56e1aac8741469",
+ "a6fa0e1363884ef3b6ee59f4daf0dc88",
+ "4e4030b299e848daa74d5675b1fc67f9",
+ "700c531271eb42348cc80187b5d98a5f",
+ "0df8bc3570e8417bbb72c8e186e9194d",
+ "dd184a876f784f408a5e1e2cd23cf1f6",
+ "5f5db4187db647598bc479557e509fd4",
+ "7f9773fbb2964a87b750e47bd28c9b44",
+ "459fe1e5144d4844baafccfae7e4bbd4",
+ "c0cae8dde43c40f8a4b8afe007443662",
+ "f434847f1a814f00ab26b49d0f83d100",
+ "4eeb7fb7774b4a3a8c033a5106976dfc",
+ "505cdbd6d5ce42048e7a2b19dab3cb15",
+ "3fee0d226f1c41be9fd34f4d81d6c198",
+ "22cb9edee06d4b9d910e6ed0510faef3",
+ "5d915c0eac7841a1b6fc639a38514fee",
+ "1f3922bab8a34c37a3ccac4438b65de0",
+ "eef5ca64f98f43778470edef77154ca0",
+ "c8764666c2cf48fbb2f35f8171d38eb8",
+ "90d69e36ac8f4f24bcf6aefc9eccc894",
+ "07af8551040b407abb8fbef1d1671205",
+ "75452d64c2b942fa80103b60f20793e6",
+ "a91ef2fb784f4e299f72478283098550",
+ "dc28dfbb43dd4216a4f57b25bdacb85a",
+ "199eede9e0ac456e8fb86ac751278e67",
+ "377081804d714eb59394da49bf06704d",
+ "e2f2fb59c3454794a4959eef3b4d3c11",
+ "dac011b1128649c9b1f2f75f4d822b91",
+ "a90c9a9f1a9748ac82ef3aad7c0566dd",
+ "52231342af134f188e345da31791a488",
+ "d88147051650489ebaa9b8daf2b5bbec",
+ "4530108e89904ba3ad13707a4782a720",
+ "ff322dafaa184d45a5971966cb194955"
+ ]
+ },
+ "id": "n0cNBOWQq_Ae",
+ "outputId": "85f16e3b-2d43-4f58-9dd0-5002704ac2db"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n",
+ "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
+ "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
+ "You will be able to reuse this secret in all of your notebooks.\n",
+ "Please note that authentication is recommended but still optional to access public models or datasets.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "config.json: 0%| | 0.00/2.11k [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "9371b25e7a2744e49a56e1aac8741469"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "model.safetensors: 0%| | 0.00/578M [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "4eeb7fb7774b4a3a8c033a5106976dfc"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "generation_config.json: 0%| | 0.00/190 [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "a91ef2fb784f4e299f72478283098550"
+ }
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Example MOS collection\n",
+ "mos_scores = {\n",
+ " 'sentence_1.wav': [4, 5, 4],\n",
+ " 'sentence_2.wav': [3, 4, 4],\n",
+ " # Add more scores for each audio sample\n",
+ "}\n",
+ "\n",
+ "# Calculate the average MOS score for each audio\n",
+ "average_mos = {file: sum(scores)/len(scores) for file, scores in mos_scores.items()}\n",
+ "print(average_mos)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "HJ8GNhpM3awJ",
+ "outputId": "c6c5c3d1-7214-456e-cd73-e9abeb175eeb"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "{'sentence_1.wav': 4.333333333333333, 'sentence_2.wav': 3.6666666666666665}\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import re\n",
+ "\n",
+ "number_words = {\n",
+ " 0: \"zero\", 1: \"one\", 2: \"two\", 3: \"three\", 4: \"four\", 5: \"five\", 6: \"six\", 7: \"seven\", 8: \"eight\", 9: \"nine\",\n",
+ " 10: \"ten\", 11: \"eleven\", 12: \"twelve\", 13: \"thirteen\", 14: \"fourteen\", 15: \"fifteen\", 16: \"sixteen\", 17: \"seventeen\",\n",
+ " 18: \"eighteen\", 19: \"nineteen\", 20: \"twenty\", 30: \"thirty\", 40: \"forty\", 50: \"fifty\", 60: \"sixty\", 70: \"seventy\",\n",
+ " 80: \"eighty\", 90: \"ninety\", 100: \"hundred\", 1000: \"thousand\"\n",
+ "}\n",
+ "\n",
+ "def number_to_words(number):\n",
+ " if number < 20:\n",
+ " return number_words[number]\n",
+ " elif number < 100:\n",
+ " tens, unit = divmod(number, 10)\n",
+ " return number_words[tens * 10] + (\"-\" + number_words[unit] if unit else \"\")\n",
+ " elif number < 1000:\n",
+ " hundreds, remainder = divmod(number, 100)\n",
+    "        return number_words[hundreds] + \" hundred\" + (\" and \" + number_to_words(remainder) if remainder else \"\")\n",
+ " elif number < 1000000:\n",
+ " thousands, remainder = divmod(number, 1000)\n",
+ " return number_to_words(thousands) + \" thousand\" + (\" \" + number_to_words(remainder) if remainder else \"\")\n",
+ " elif number < 1000000000:\n",
+ " millions, remainder = divmod(number, 1000000)\n",
+ " return number_to_words(millions) + \" million\" + (\" \" + number_to_words(remainder) if remainder else \"\")\n",
+ " elif number < 1000000000000:\n",
+ " billions, remainder = divmod(number, 1000000000)\n",
+ " return number_to_words(billions) + \" billion\" + (\" \" + number_to_words(remainder) if remainder else \"\")\n",
+ " else:\n",
+ " return str(number)\n",
+ "\n",
+ "def replace_numbers_with_words(text):\n",
+ " def replace(match):\n",
+ " number = int(match.group())\n",
+ " return number_to_words(number)\n",
+ "\n",
+ " # Find the numbers and change with words.\n",
+ " result = re.sub(r'\\b\\d+\\b', replace, text)\n",
+ "\n",
+ " return result"
+ ],
+ "metadata": {
+ "id": "WcG5qSO89itj"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import re\n",
+ "\n",
+ "# Dictionary of technical words and their spoken equivalents (each letter separated)\n",
+ "technical_words = {\n",
+ " \"API\": \"A P I\",\n",
+ " \"CUDA\": \"C U D A\",\n",
+ " \"OAuth\": \"O Auth\",\n",
+ " \"LLM\": \"L L M\",\n",
+ " \"HTTP\": \"H T T P\",\n",
+ " \"HTTPS\": \"H T T P S\",\n",
+ " \"URL\": \"U R L\",\n",
+ " \"SQL\": \"S Q L\",\n",
+ " \"JSON\": \"J S O N\",\n",
+ " \"XML\": \"X M L\",\n",
+ " \"REST\": \"R E S T\",\n",
+ " \"JWT\": \"J W T\",\n",
+ " \"FTP\": \"F T P\",\n",
+ " \"SSH\": \"S S H\",\n",
+ " \"GPU\": \"G P U\",\n",
+ " \"CPU\": \"C P U\",\n",
+ " \"IP\": \"I P\",\n",
+ " \"RAM\": \"R A M\",\n",
+ " \"ROM\": \"R O M\",\n",
+ " \"ID\": \"I D\",\n",
+ " \"UID\": \"U I D\",\n",
+ " \"UUID\": \"U U I D\",\n",
+ " \"NLP\": \"N L P\",\n",
+ " \"ML\": \"M L\",\n",
+ " \"AI\": \"A I\",\n",
+ " \"IoT\": \"I O T\",\n",
+ " \"VPN\": \"V P N\",\n",
+ " \"DNS\": \"D N S\",\n",
+ " \"SMTP\": \"S M T P\",\n",
+ " \"KNN\": \"K N N\",\n",
+ " \"CNN\": \"C N N\",\n",
+ " \"LSTM\": \"L S T M\",\n",
+ " \"GRU\": \"G R U\",\n",
+ " # Add more technical terms with letters separated\n",
+ "}\n",
+ "\n",
+ "def technical_words_to_speech(word):\n",
+ " # Convert technical words to their spoken form, with each letter separated by spaces\n",
+ " return technical_words.get(word, word) # If not in the dictionary, return the original word\n",
+ "\n",
+ "def replace_technical_words_with_speech(text):\n",
+ " # Split the text into words and check for technical terms\n",
+ " words = re.findall(r'\\b\\w+\\b', text)\n",
+ "\n",
+ " result = []\n",
+ " for word in words:\n",
+ " # Convert each word to its spoken form if it's a technical term\n",
+ " spoken_form = technical_words_to_speech(word)\n",
+ " result.append(spoken_form)\n",
+ "\n",
+ " return \" \".join(result)\n",
+ "\n",
+ "# Example usage\n",
+ "text = \"I will use an API with OAuth and CUDA to train the LLM model on a GPU.\"\n",
+ "spoken_text = replace_technical_words_with_speech(text)\n",
+ "print(spoken_text)"
+ ],
+ "metadata": {
+ "id": "WaHEQ1eX9nHx"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "inputs = processor(text=final_text, return_tensors=\"pt\")"
+ ],
+ "metadata": {
+ "id": "pGmsxMrG-I_9"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from transformers import SpeechT5HifiGan\n",
+ "\n",
+ "vocoder = SpeechT5HifiGan.from_pretrained(\"microsoft/speecht5_hifigan\")\n",
+ "speech = model.generate_speech(inputs[\"input_ids\"], speaker_embeddings, vocoder=vocoder)"
+ ],
+ "metadata": {
+ "id": "ayrv9MJu-N-c"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from IPython.display import Audio\n",
+ "import soundfile as sf\n",
+ "\n",
+ "Audio(speech.numpy(), rate=16000)\n",
+ "# Save the audio to a file (e.g., 'output.wav')\n",
+ "sf.write('output.wav', speech.numpy(), 16000)"
+ ],
+ "metadata": {
+ "id": "9znt0YGZ-UXw"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file