diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..19eb09dff8fa100af1f9808efffa83237b782d09 --- /dev/null +++ b/app.py @@ -0,0 +1,8 @@ +import os +import sys + +# "vakyansh-tts" contains a hyphen, so it cannot be imported as a package name directly; +# add its inference utilities to sys.path and import the Gradio demo script from there. +sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "vakyansh-tts", "utils", "inference")) + +from run_gradio import *
diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb2a48e8fd2c541b1a5f858fbccfddc547b68662 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,17 @@ +Cython==0.29.24 +layers==0.1.5 +librosa==0.8.1 +matplotlib==3.3.4 +numpy==1.21.0 +scipy==1.5.4 +tensorboardX==2.4 +tensorboard==2.7.0 +tqdm==4.62.3 +fastapi==0.70.0 +uvicorn==0.15.0 +gradio==2.5.2 +wavio==0.0.4 +mosestokenizer==1.2.1 +indic-nlp-library==0.81 +inflect==5.3.0 +Unidecode==1.3.2
diff --git a/vakyansh-tts/.gitignore b/vakyansh-tts/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..45bf260a4a2f452da0b71f9adaacd7b49a6b51de --- /dev/null +++ b/vakyansh-tts/.gitignore @@ -0,0 +1,132 @@ +# Byte-compiled / optimized / DLL files +.DS_Store +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g.
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +.idea/
diff --git a/vakyansh-tts/LICENSE.md b/vakyansh-tts/LICENSE.md new file mode 100644 index 0000000000000000000000000000000000000000..5fd2e54913fd05b69de2874ec8f9a10c7f4e8d3f --- /dev/null +++ b/vakyansh-tts/LICENSE.md @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Open-Speech-EkStep + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE.
diff --git a/vakyansh-tts/README.md b/vakyansh-tts/README.md new file mode 100644 index 0000000000000000000000000000000000000000..02892bc9dd4344e550596d238e2b71870cfc7dd3 --- /dev/null +++ b/vakyansh-tts/README.md @@ -0,0 +1,220 @@ +# vakyansh-tts +Text to Speech for Indic languages + +## 1. Installation and Setup for training + +Clone the repo +Note : for multispeaker glow-tts training use the [multispeaker](https://github.com/Open-Speech-EkStep/vakyansh-tts/tree/multispeaker) branch
+``` +git clone https://github.com/Open-Speech-EkStep/vakyansh-tts +``` +Build a conda virtual environment +``` +cd ./vakyansh-tts +conda create --name <env_name> python=3.7 +conda activate <env_name> +pip install -r requirements.txt +```
+Install [apex](https://github.com/NVIDIA/apex) (commit 37cdaf4) for mixed-precision training + +Note : used only for glow-tts +``` +cd .. +git clone https://github.com/NVIDIA/apex +cd apex +git checkout 37cdaf4 +pip install -v --disable-pip-version-check --no-cache-dir ./ +cd ../vakyansh-tts +```
+Build the Monotonic Alignment Search code (Cython) + +Note : used only for glow-tts +``` +bash install.sh +``` + +## 2. Data Resampling + +The data should be organised as a folder containing all the .wav files for glow-tts and a text file listing each filename with its sentence. + +Directory structure: + +``` +language_folder_name +|-- ./wav/*.wav +|-- ./text_file_name.txt +```
+The format of text_file_name.txt (the text file is only needed for glow-tts training): + +``` +( audio1.wav "Sentence1." ) +( audio2.wav "Sentence2."
) +``` + +To resample the .wav files to a 22050 Hz sample rate, change the following parameters in vakyansh-tts/scripts/data/resample.sh: + +``` +input_wav_path : absolute path to wav file folder in vakyansh_tts/data/ +output_wav_path : absolute path to vakyansh_tts/data/resampled_wav_folder_name +output_sample_rate : 22050 (or any other desired sample rate) +``` + +To run: +```bash +cd scripts/data/ +bash resample.sh +``` + +
+## 3. Spectrogram Training (glow-tts) + +### 3.1 Data Preparation + + +To prepare the data, edit the vakyansh-tts/scripts/glow/prepare_data.sh file and change the following parameters: +``` +input_text_path : absolute path to vakyansh_tts/data/text_file_name.txt +input_wav_path : absolute path to vakyansh_tts/data/resampled_wav_folder_name +gender : female or male voice +``` +To run: +```bash +cd scripts/glow/ +bash prepare_data.sh +```
+### 3.2 Training glow-tts + +To start the spectrogram training, edit the vakyansh-tts/scripts/glow/train_glow.sh file and change the following parameter: +``` +gender : female or male voice +``` +Make sure that the gender is the same as in the prepare_data.sh file + +To start the training, run: +```bash +cd scripts/glow/ +bash train_glow.sh +```
+## 4. Vocoder Training (hifi-gan) + +### 4.1 Data Preparation + +To prepare the data, edit the vakyansh-tts/scripts/hifi/prepare_data.sh file and change the following parameters: +``` +input_wav_path : absolute path to vakyansh_tts/data/resampled_wav_folder_name +gender : female or male voice +``` +To run: +```bash +cd scripts/hifi/ +bash prepare_data.sh +```
+### 4.2 Training hifi-gan + +To start the vocoder training, edit the vakyansh-tts/scripts/hifi/train_hifi.sh file and change the following parameter: +``` +gender : female or male voice +``` +Make sure that the gender is the same as in the prepare_data.sh file + +To start the training, run: +```bash +cd scripts/hifi/ +bash train_hifi.sh +``` +
+## 5. Inference + +### 5.1 Using Gradio + +To use the Gradio link, edit the following parameters in the vakyansh-tts/scripts/inference/gradio.sh file: +``` +gender : female or male voice +device : cpu or cuda +lang : language code +``` + +To run: +```bash +cd scripts/inference/ +bash gradio.sh +```
+### 5.2 Using FastAPI +To use the FastAPI endpoint, edit the parameters in the vakyansh-tts/scripts/inference/api.sh file as in section 5.1 + +To run: +```bash +cd scripts/inference/ +bash api.sh +``` +
+### 5.3 Direct Inference using text +To infer, edit the parameters in the vakyansh-tts/scripts/inference/infer.sh file as in section 5.1 and set the text in the text variable + +To run: +```bash +cd scripts/inference/ +bash infer.sh +``` +
+There is also a version that runs advanced inference with additional parameters: +``` +noise_scale : can vary from 0 to 1 for the noise factor +length_scale : can vary from 0 to 2 to change the speed of the generated audio +transliteration : whether to switch transliteration on/off. 1: ON, 0: OFF +number_conversion : whether to switch number-to-words conversion on/off. 1: ON, 0: OFF +split_sentences : whether to switch sentence splitting on/off. 1: ON, 0: OFF +``` +To run: +```bash +cd scripts/inference/ +bash advanced_infer.sh +``` +
+### 5.4 Installation of the tts_infer package + +The tts_infer package currently has two components: + + 1. Transliteration (AI4Bharat's open-sourced models) (Languages supported: {'hi', 'gu', 'mr', 'bn', 'te', 'ta', 'kn', 'pa', 'gom', 'mai', 'ml', 'sd', 'si', 'ur'} ) + + 2.
Num to Word (Languages supported: {'en', 'hi', 'gu', 'mr', 'bn', 'te', 'ta', 'kn', 'or', 'pa'} ) +``` +git clone https://github.com/Open-Speech-EkStep/vakyansh-tts +cd vakyansh-tts +bash install.sh +python setup.py bdist_wheel +pip install -e . +cd tts_infer +gsutil -m cp -r gs://vakyaansh-open-models/translit_models . +``` + +Usage: Refer to example file in tts_infer/ +``` +from tts_infer.tts import TextToMel, MelToWav +from tts_infer.transliterate import XlitEngine +from tts_infer.num_to_word_on_sent import normalize_nums + +import re +from scipy.io.wavfile import write + +text_to_mel = TextToMel(glow_model_dir='/path/to/glow-tts/checkpoint/dir', device='cuda') +mel_to_wav = MelToWav(hifi_model_dir='/path/to/hifi/checkpoint/dir', device='cuda') + +def translit(text, lang): + reg = re.compile(r'[a-zA-Z]') + engine = XlitEngine(lang) + words = [engine.translit_word(word, topk=1)[lang][0] if reg.match(word) else word for word in text.split()] + updated_sent = ' '.join(words) + return updated_sent + +def run_tts(text, lang): + text = text.replace('।', '.') # only for hindi models + text_num_to_word = normalize_nums(text, lang) # converting numbers to words in lang + text_num_to_word_and_transliterated = translit(text_num_to_word, lang) # transliterating english words to lang + + mel = text_to_mel.generate_mel(text_num_to_word_and_transliterated) + audio, sr = mel_to_wav.generate_wav(mel) + write(filename='temp.wav', rate=sr, data=audio) # for saving wav file, if needed + return (sr, audio) +``` diff --git a/vakyansh-tts/checkpoints/glow/.gitkeep b/vakyansh-tts/checkpoints/glow/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vakyansh-tts/checkpoints/hifi/.gitkeep b/vakyansh-tts/checkpoints/hifi/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vakyansh-tts/config/.gitkeep b/vakyansh-tts/config/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vakyansh-tts/config/glow/base.json b/vakyansh-tts/config/glow/base.json new file mode 100644 index 0000000000000000000000000000000000000000..c87165196faa226bcef5f995113281489aea0de7 --- /dev/null +++ b/vakyansh-tts/config/glow/base.json @@ -0,0 +1,54 @@ +{ + "train": { + "use_cuda": true, + "log_interval": 20, + "seed": 1234, + "epochs": 10000, + "learning_rate": 1e0, + "betas": [0.9, 0.98], + "eps": 1e-9, + "warmup_steps": 4000, + "scheduler": "noam", + "batch_size": 16, + "ddi": true, + "fp16_run": true, + "save_epoch": 1 + }, + "data": { + "load_mel_from_disk": false, + "training_files":"../data/training/train.txt", + "validation_files":"../data/training/valid.txt", + "chars":"", + "punc":"", + "text_cleaners":["basic_indic_cleaners"], + "max_wav_value": 32768.0, + "sampling_rate": 22050, + "filter_length": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 80.0, + "mel_fmax": 7600.0, + "add_noise": true + }, + "model": { + "hidden_channels": 192, + "filter_channels": 768, + "filter_channels_dp": 256, + "kernel_size": 3, + "p_dropout": 0.1, + "n_blocks_dec": 12, + "n_layers_enc": 6, + "n_heads": 2, + "p_dropout_dec": 0.05, + "dilation_rate": 1, + "kernel_size_dec": 5, + "n_block_layers": 4, + "n_sqz": 2, + "prenet": true, + "mean_only": true, + "hidden_channels_enc": 192, + "hidden_channels_dec": 192, + "window_size": 4 + } +} diff --git 
a/vakyansh-tts/config/glow/base_blank.json b/vakyansh-tts/config/glow/base_blank.json new file mode 100644 index 0000000000000000000000000000000000000000..2c359a73317e7769f51cedcac2127affbf40b7ff --- /dev/null +++ b/vakyansh-tts/config/glow/base_blank.json @@ -0,0 +1,55 @@ +{ + "train": { + "use_cuda": true, + "log_interval": 20, + "seed": 1234, + "epochs": 10000, + "learning_rate": 1e0, + "betas": [0.9, 0.98], + "eps": 1e-9, + "warmup_steps": 4000, + "scheduler": "noam", + "batch_size": 16, + "ddi": true, + "fp16_run": true, + "save_epoch": 1 + }, + "data": { + "load_mel_from_disk": false, + "training_files":"../data/training/train.txt", + "validation_files":"../data/training/valid.txt", + "chars":"", + "punc":"", + "text_cleaners":["basic_indic_cleaners"], + "max_wav_value": 32768.0, + "sampling_rate": 22050, + "filter_length": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 80.0, + "mel_fmax": 7600.0, + "add_noise": true, + "add_blank": true + }, + "model": { + "hidden_channels": 192, + "filter_channels": 768, + "filter_channels_dp": 256, + "kernel_size": 3, + "p_dropout": 0.1, + "n_blocks_dec": 12, + "n_layers_enc": 6, + "n_heads": 2, + "p_dropout_dec": 0.05, + "dilation_rate": 1, + "kernel_size_dec": 5, + "n_block_layers": 4, + "n_sqz": 2, + "prenet": true, + "mean_only": true, + "hidden_channels_enc": 192, + "hidden_channels_dec": 192, + "window_size": 4 + } +} diff --git a/vakyansh-tts/config/hifi/config_v1.json b/vakyansh-tts/config/hifi/config_v1.json new file mode 100644 index 0000000000000000000000000000000000000000..cb82eda796ce9f9e60de119cd503b617d4efdba2 --- /dev/null +++ b/vakyansh-tts/config/hifi/config_v1.json @@ -0,0 +1,37 @@ +{ + "resblock": "1", + "num_gpus": 0, + "batch_size": 24, + "learning_rate": 0.0002, + "adam_b1": 0.8, + "adam_b2": 0.99, + "lr_decay": 0.999, + "seed": 1234, + + "upsample_rates": [8,8,2,2], + "upsample_kernel_sizes": [16,16,4,4], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + + "segment_size": 8192, + "num_mels": 80, + "num_freq": 1025, + "n_fft": 1024, + "hop_size": 256, + "win_size": 1024, + + "sampling_rate": 22050, + + "fmin": 80, + "fmax": 7600, + "fmax_for_loss": null, + + "num_workers": 4, + + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321", + "world_size": 1 + } +} diff --git a/vakyansh-tts/config/hifi/config_v2.json b/vakyansh-tts/config/hifi/config_v2.json new file mode 100644 index 0000000000000000000000000000000000000000..b5a85ef874ed03d4002258ab5901f9bdf9a4f07b --- /dev/null +++ b/vakyansh-tts/config/hifi/config_v2.json @@ -0,0 +1,37 @@ +{ + "resblock": "1", + "num_gpus": 0, + "batch_size": 24, + "learning_rate": 0.0002, + "adam_b1": 0.8, + "adam_b2": 0.99, + "lr_decay": 0.999, + "seed": 1234, + + "upsample_rates": [8,8,2,2], + "upsample_kernel_sizes": [16,16,4,4], + "upsample_initial_channel": 128, + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + + "segment_size": 8192, + "num_mels": 80, + "num_freq": 1025, + "n_fft": 1024, + "hop_size": 256, + "win_size": 1024, + + "sampling_rate": 22050, + + "fmin": 80, + "fmax": 7600, + "fmax_for_loss": null, + + "num_workers": 4, + + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321", + "world_size": 1 + } +} diff --git a/vakyansh-tts/config/hifi/config_v3.json b/vakyansh-tts/config/hifi/config_v3.json new file mode 100644 index 
0000000000000000000000000000000000000000..7d6bafd26a180906df23e38f8ff59ce6f3469a03 --- /dev/null +++ b/vakyansh-tts/config/hifi/config_v3.json @@ -0,0 +1,37 @@ +{ + "resblock": "2", + "num_gpus": 0, + "batch_size": 24, + "learning_rate": 0.0002, + "adam_b1": 0.8, + "adam_b2": 0.99, + "lr_decay": 0.999, + "seed": 1234, + + "upsample_rates": [8,8,4], + "upsample_kernel_sizes": [16,16,8], + "upsample_initial_channel": 256, + "resblock_kernel_sizes": [3,5,7], + "resblock_dilation_sizes": [[1,2], [2,6], [3,12]], + + "segment_size": 8192, + "num_mels": 80, + "num_freq": 1025, + "n_fft": 1024, + "hop_size": 256, + "win_size": 1024, + + "sampling_rate": 22050, + + "fmin": 80, + "fmax": 7600, + "fmax_for_loss": null, + + "num_workers": 4, + + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321", + "world_size": 1 + } +} diff --git a/vakyansh-tts/data/.gitkeep b/vakyansh-tts/data/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vakyansh-tts/install.sh b/vakyansh-tts/install.sh new file mode 100644 index 0000000000000000000000000000000000000000..51e038d5a0098f21d4efd8051a15b7f0cdeb4b73 --- /dev/null +++ b/vakyansh-tts/install.sh @@ -0,0 +1,6 @@ +cd src/glow_tts/monotonic_align/ +pip install . +cd ../../../ + +# torch +pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html diff --git a/vakyansh-tts/logs/glow/.gitkeep b/vakyansh-tts/logs/glow/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vakyansh-tts/logs/hifi/.gitkeep b/vakyansh-tts/logs/hifi/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vakyansh-tts/notebooks/vakyansh_tts_demo.ipynb b/vakyansh-tts/notebooks/vakyansh_tts_demo.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..a39c80d91a8a2f1edef63bf09e7337712c54d9d3 --- /dev/null +++ b/vakyansh-tts/notebooks/vakyansh_tts_demo.ipynb @@ -0,0 +1,546 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "vakyansh_tts_demo.ipynb", + "provenance": [], + "authorship_tag": "ABX9TyNhhwduU9+eajfOP6r1Y98A", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Installing Dependencies" + ], + "metadata": { + "id": "oyoFPN29HrRt" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "5x4wJQGUaysK", + "outputId": "90d49030-311e-4100-b42a-3849df217887" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Cloning into 'vakyansh-tts'...\n", + "remote: Enumerating objects: 466, done.\u001b[K\n", + "remote: Counting objects: 100% (201/201), done.\u001b[K\n", + "remote: Compressing objects: 100% (175/175), done.\u001b[K\n", + "remote: Total 466 (delta 89), reused 64 (delta 22), pack-reused 265\u001b[K\n", + "Receiving objects: 100% (466/466), 259.27 KiB | 1.39 MiB/s, done.\n", + "Resolving deltas: 100% (229/229), done.\n", + "Processing 
/content/vakyansh-tts/src/glow_tts/monotonic_align\n", + "\u001b[33m DEPRECATION: A future pip version will change local packages to be built in-place without first copying to a temporary directory. We recommend you use --use-feature=in-tree-build to test your packages with this new behavior before it becomes the default.\n", + " pip 21.3 will remove support for this functionality. You can find discussion regarding this at https://github.com/pypa/pip/issues/7555.\u001b[0m\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from monotonic-align==1.1) (1.19.5)\n", + "Building wheels for collected packages: monotonic-align\n", + " Building wheel for monotonic-align (PEP 517) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for monotonic-align: filename=monotonic_align-1.1-cp37-cp37m-linux_x86_64.whl size=237012 sha256=3ffba87629daf17ecf86f538ead38094792d74d16b36cf691371c36f2e2c8ead\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-m1jlgsel/wheels/3a/e4/2d/953a66d439600fcb1836ffba5ef6915b944df396e8228909cb\n", + "Successfully built monotonic-align\n", + "Installing collected packages: monotonic-align\n", + "Successfully installed monotonic-align-1.1\n", + "running bdist_wheel\n", + "running build\n", + "running build_py\n", + "creating build\n", + "creating build/lib\n", + "creating build/lib/tts_infer\n", + "copying tts_infer/tts.py -> build/lib/tts_infer\n", + "copying tts_infer/num_to_word_on_sent.py -> build/lib/tts_infer\n", + "copying tts_infer/transliterate.py -> build/lib/tts_infer\n", + "copying tts_infer/__init__.py -> build/lib/tts_infer\n", + "running egg_info\n", + "creating vakyansh_tts.egg-info\n", + "writing vakyansh_tts.egg-info/PKG-INFO\n", + "writing dependency_links to vakyansh_tts.egg-info/dependency_links.txt\n", + "writing requirements to vakyansh_tts.egg-info/requires.txt\n", + "writing top-level names to vakyansh_tts.egg-info/top_level.txt\n", + "writing manifest file 'vakyansh_tts.egg-info/SOURCES.txt'\n", + "adding license file 'LICENSE.md'\n", + "writing manifest file 'vakyansh_tts.egg-info/SOURCES.txt'\n", + "copying tts_infer/requirements.txt -> build/lib/tts_infer\n", + "installing to build/bdist.linux-x86_64/wheel\n", + "running install\n", + "running install_lib\n", + "creating build/bdist.linux-x86_64\n", + "creating build/bdist.linux-x86_64/wheel\n", + "creating build/bdist.linux-x86_64/wheel/tts_infer\n", + "copying build/lib/tts_infer/tts.py -> build/bdist.linux-x86_64/wheel/tts_infer\n", + "copying build/lib/tts_infer/num_to_word_on_sent.py -> build/bdist.linux-x86_64/wheel/tts_infer\n", + "copying build/lib/tts_infer/transliterate.py -> build/bdist.linux-x86_64/wheel/tts_infer\n", + "copying build/lib/tts_infer/__init__.py -> build/bdist.linux-x86_64/wheel/tts_infer\n", + "copying build/lib/tts_infer/requirements.txt -> build/bdist.linux-x86_64/wheel/tts_infer\n", + "running install_egg_info\n", + "Copying vakyansh_tts.egg-info to build/bdist.linux-x86_64/wheel/vakyansh_tts-0.0.1-py3.7.egg-info\n", + "running install_scripts\n", + "adding license file \"LICENSE.md\" (matched pattern \"LICEN[CS]E*\")\n", + "creating build/bdist.linux-x86_64/wheel/vakyansh_tts-0.0.1.dist-info/WHEEL\n", + "creating 'dist/vakyansh_tts-0.0.1-py3-none-any.whl' and adding 'build/bdist.linux-x86_64/wheel' to it\n", + "adding 
'tts_infer/__init__.py'\n", + "adding 'tts_infer/num_to_word_on_sent.py'\n", + "adding 'tts_infer/requirements.txt'\n", + "adding 'tts_infer/transliterate.py'\n", + "adding 'tts_infer/tts.py'\n", + "adding 'vakyansh_tts-0.0.1.dist-info/LICENSE.md'\n", + "adding 'vakyansh_tts-0.0.1.dist-info/METADATA'\n", + "adding 'vakyansh_tts-0.0.1.dist-info/WHEEL'\n", + "adding 'vakyansh_tts-0.0.1.dist-info/top_level.txt'\n", + "adding 'vakyansh_tts-0.0.1.dist-info/RECORD'\n", + "removing build/bdist.linux-x86_64/wheel\n", + "Obtaining file:///content/vakyansh-tts\n", + "Requirement already satisfied: Cython==0.29.24 in /usr/local/lib/python3.7/dist-packages (from vakyansh-tts==0.0.1) (0.29.24)\n", + "Collecting inflect==5.3.0\n", + " Downloading inflect-5.3.0-py3-none-any.whl (32 kB)\n", + "Collecting layers==0.1.5\n", + " Downloading layers-0.1.5.tar.gz (5.5 kB)\n", + "Requirement already satisfied: librosa==0.8.1 in /usr/local/lib/python3.7/dist-packages (from vakyansh-tts==0.0.1) (0.8.1)\n", + "Collecting matplotlib==3.3.4\n", + " Downloading matplotlib-3.3.4-cp37-cp37m-manylinux1_x86_64.whl (11.5 MB)\n", + "\u001b[K |████████████████████████████████| 11.5 MB 11.3 MB/s \n", + "\u001b[?25hRequirement already satisfied: numpy==1.19.5 in /usr/local/lib/python3.7/dist-packages (from vakyansh-tts==0.0.1) (1.19.5)\n", + "Collecting scipy==1.5.4\n", + " Downloading scipy-1.5.4-cp37-cp37m-manylinux1_x86_64.whl (25.9 MB)\n", + "\u001b[K |████████████████████████████████| 25.9 MB 1.2 MB/s \n", + "\u001b[?25hCollecting tensorboardX==2.4\n", + " Downloading tensorboardX-2.4-py2.py3-none-any.whl (124 kB)\n", + "\u001b[K |████████████████████████████████| 124 kB 57.6 MB/s \n", + "\u001b[?25hRequirement already satisfied: tensorboard==2.7.0 in /usr/local/lib/python3.7/dist-packages (from vakyansh-tts==0.0.1) (2.7.0)\n", + "Collecting torch==1.5.1\n", + " Downloading torch-1.5.1-cp37-cp37m-manylinux1_x86_64.whl (753.2 MB)\n", + "\u001b[K |████████████████████████████████| 753.2 MB 13 kB/s \n", + "\u001b[?25hCollecting Unidecode==1.3.2\n", + " Downloading Unidecode-1.3.2-py3-none-any.whl (235 kB)\n", + "\u001b[K |████████████████████████████████| 235 kB 65.7 MB/s \n", + "\u001b[?25hRequirement already satisfied: tqdm==4.62.3 in /usr/local/lib/python3.7/dist-packages (from vakyansh-tts==0.0.1) (4.62.3)\n", + "Collecting fastapi==0.70.0\n", + " Downloading fastapi-0.70.0-py3-none-any.whl (51 kB)\n", + "\u001b[K |████████████████████████████████| 51 kB 706 kB/s \n", + "\u001b[?25hCollecting uvicorn==0.15.0\n", + " Downloading uvicorn-0.15.0-py3-none-any.whl (54 kB)\n", + "\u001b[K |████████████████████████████████| 54 kB 3.2 MB/s \n", + "\u001b[?25hCollecting gradio==2.5.2\n", + " Downloading gradio-2.5.2-py3-none-any.whl (982 kB)\n", + "\u001b[K |████████████████████████████████| 982 kB 61.2 MB/s \n", + "\u001b[?25hCollecting wavio==0.0.4\n", + " Downloading wavio-0.0.4-py2.py3-none-any.whl (9.0 kB)\n", + "Collecting pydload==1.0.9\n", + " Downloading pydload-1.0.9-py2.py3-none-any.whl (16 kB)\n", + "Collecting pydantic!=1.7,!=1.7.1,!=1.7.2,!=1.7.3,!=1.8,!=1.8.1,<2.0.0,>=1.6.2\n", + " Downloading pydantic-1.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)\n", + "\u001b[K |████████████████████████████████| 10.9 MB 35.1 MB/s \n", + "\u001b[?25hCollecting starlette==0.16.0\n", + " Downloading starlette-0.16.0-py3-none-any.whl (61 kB)\n", + "\u001b[K |████████████████████████████████| 61 kB 298 kB/s \n", + "\u001b[?25hCollecting Flask-Login\n", + " Downloading Flask_Login-0.5.0-py2.py3-none-any.whl 
(16 kB)\n", + "Collecting flask-cachebuster\n", + " Downloading Flask-CacheBuster-1.0.0.tar.gz (3.1 kB)\n", + "Collecting ffmpy\n", + " Downloading ffmpy-0.3.0.tar.gz (4.8 kB)\n", + "Collecting pydub\n", + " Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from gradio==2.5.2->vakyansh-tts==0.0.1) (1.1.5)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from gradio==2.5.2->vakyansh-tts==0.0.1) (2.23.0)\n", + "Requirement already satisfied: Flask>=1.1.1 in /usr/local/lib/python3.7/dist-packages (from gradio==2.5.2->vakyansh-tts==0.0.1) (1.1.4)\n", + "Collecting Flask-Cors>=3.0.8\n", + " Downloading Flask_Cors-3.0.10-py2.py3-none-any.whl (14 kB)\n", + "Requirement already satisfied: pillow in /usr/local/lib/python3.7/dist-packages (from gradio==2.5.2->vakyansh-tts==0.0.1) (7.1.2)\n", + "Collecting markdown2\n", + " Downloading markdown2-2.4.2-py2.py3-none-any.whl (34 kB)\n", + "Collecting analytics-python\n", + " Downloading analytics_python-1.4.0-py2.py3-none-any.whl (15 kB)\n", + "Collecting paramiko\n", + " Downloading paramiko-2.9.1-py2.py3-none-any.whl (210 kB)\n", + "\u001b[K |████████████████████████████████| 210 kB 61.1 MB/s \n", + "\u001b[?25hCollecting pycryptodome\n", + " Downloading pycryptodome-3.12.0-cp35-abi3-manylinux2010_x86_64.whl (2.0 MB)\n", + "\u001b[K |████████████████████████████████| 2.0 MB 42.3 MB/s \n", + "\u001b[?25hRequirement already satisfied: PyYaml in /usr/local/lib/python3.7/dist-packages (from layers==0.1.5->vakyansh-tts==0.0.1) (3.13)\n", + "Collecting bashutils\n", + " Downloading Bashutils-0.0.4.tar.gz (4.2 kB)\n", + "Requirement already satisfied: resampy>=0.2.2 in /usr/local/lib/python3.7/dist-packages (from librosa==0.8.1->vakyansh-tts==0.0.1) (0.2.2)\n", + "Requirement already satisfied: pooch>=1.0 in /usr/local/lib/python3.7/dist-packages (from librosa==0.8.1->vakyansh-tts==0.0.1) (1.5.2)\n", + "Requirement already satisfied: numba>=0.43.0 in /usr/local/lib/python3.7/dist-packages (from librosa==0.8.1->vakyansh-tts==0.0.1) (0.51.2)\n", + "Requirement already satisfied: audioread>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from librosa==0.8.1->vakyansh-tts==0.0.1) (2.1.9)\n", + "Requirement already satisfied: scikit-learn!=0.19.0,>=0.14.0 in /usr/local/lib/python3.7/dist-packages (from librosa==0.8.1->vakyansh-tts==0.0.1) (1.0.1)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from librosa==0.8.1->vakyansh-tts==0.0.1) (21.3)\n", + "Requirement already satisfied: soundfile>=0.10.2 in /usr/local/lib/python3.7/dist-packages (from librosa==0.8.1->vakyansh-tts==0.0.1) (0.10.3.post1)\n", + "Requirement already satisfied: joblib>=0.14 in /usr/local/lib/python3.7/dist-packages (from librosa==0.8.1->vakyansh-tts==0.0.1) (1.1.0)\n", + "Requirement already satisfied: decorator>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from librosa==0.8.1->vakyansh-tts==0.0.1) (4.4.2)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.3.4->vakyansh-tts==0.0.1) (3.0.6)\n", + "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.3.4->vakyansh-tts==0.0.1) (2.8.2)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.3.4->vakyansh-tts==0.0.1) (0.11.0)\n", + "Requirement already 
satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.3.4->vakyansh-tts==0.0.1) (1.3.2)\n", + "Requirement already satisfied: progressbar2 in /usr/local/lib/python3.7/dist-packages (from pydload==1.0.9->vakyansh-tts==0.0.1) (3.38.0)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from starlette==0.16.0->fastapi==0.70.0->vakyansh-tts==0.0.1) (3.10.0.2)\n", + "Collecting anyio<4,>=3.0.0\n", + " Downloading anyio-3.4.0-py3-none-any.whl (78 kB)\n", + "\u001b[K |████████████████████████████████| 78 kB 7.8 MB/s \n", + "\u001b[?25hRequirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (3.3.6)\n", + "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (0.4.6)\n", + "Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (0.6.1)\n", + "Requirement already satisfied: protobuf>=3.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (3.17.3)\n", + "Requirement already satisfied: absl-py>=0.4 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (0.12.0)\n", + "Requirement already satisfied: grpcio>=1.24.3 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (1.42.0)\n", + "Requirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (0.37.0)\n", + "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (1.0.1)\n", + "Requirement already satisfied: google-auth<3,>=1.6.3 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (1.35.0)\n", + "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (57.4.0)\n", + "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (1.8.0)\n", + "Requirement already satisfied: future in /usr/local/lib/python3.7/dist-packages (from torch==1.5.1->vakyansh-tts==0.0.1) (0.16.0)\n", + "Collecting asgiref>=3.4.0\n", + " Downloading asgiref-3.4.1-py3-none-any.whl (25 kB)\n", + "Requirement already satisfied: click>=7.0 in /usr/local/lib/python3.7/dist-packages (from uvicorn==0.15.0->vakyansh-tts==0.0.1) (7.1.2)\n", + "Collecting h11>=0.8\n", + " Downloading h11-0.12.0-py3-none-any.whl (54 kB)\n", + "\u001b[K |████████████████████████████████| 54 kB 3.7 MB/s \n", + "\u001b[?25hRequirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from absl-py>=0.4->tensorboard==2.7.0->vakyansh-tts==0.0.1) (1.15.0)\n", + "Collecting sniffio>=1.1\n", + " Downloading sniffio-1.2.0-py3-none-any.whl (10 kB)\n", + "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.7/dist-packages (from anyio<4,>=3.0.0->starlette==0.16.0->fastapi==0.70.0->vakyansh-tts==0.0.1) (2.10)\n", + "Requirement already satisfied: itsdangerous<2.0,>=0.24 in /usr/local/lib/python3.7/dist-packages (from Flask>=1.1.1->gradio==2.5.2->vakyansh-tts==0.0.1) (1.1.0)\n", + "Requirement already satisfied: Jinja2<3.0,>=2.10.1 in 
/usr/local/lib/python3.7/dist-packages (from Flask>=1.1.1->gradio==2.5.2->vakyansh-tts==0.0.1) (2.11.3)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.7/dist-packages (from google-auth<3,>=1.6.3->tensorboard==2.7.0->vakyansh-tts==0.0.1) (4.8)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.7/dist-packages (from google-auth<3,>=1.6.3->tensorboard==2.7.0->vakyansh-tts==0.0.1) (0.2.8)\n", + "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from google-auth<3,>=1.6.3->tensorboard==2.7.0->vakyansh-tts==0.0.1) (4.2.4)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.7/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard==2.7.0->vakyansh-tts==0.0.1) (1.3.0)\n", + "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2<3.0,>=2.10.1->Flask>=1.1.1->gradio==2.5.2->vakyansh-tts==0.0.1) (2.0.1)\n", + "Requirement already satisfied: importlib-metadata>=4.4 in /usr/local/lib/python3.7/dist-packages (from markdown>=2.6.8->tensorboard==2.7.0->vakyansh-tts==0.0.1) (4.8.2)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=4.4->markdown>=2.6.8->tensorboard==2.7.0->vakyansh-tts==0.0.1) (3.6.0)\n", + "Requirement already satisfied: llvmlite<0.35,>=0.34.0.dev0 in /usr/local/lib/python3.7/dist-packages (from numba>=0.43.0->librosa==0.8.1->vakyansh-tts==0.0.1) (0.34.0)\n", + "Requirement already satisfied: appdirs in /usr/local/lib/python3.7/dist-packages (from pooch>=1.0->librosa==0.8.1->vakyansh-tts==0.0.1) (1.4.4)\n", + "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.7/dist-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard==2.7.0->vakyansh-tts==0.0.1) (0.4.8)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->gradio==2.5.2->vakyansh-tts==0.0.1) (3.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->gradio==2.5.2->vakyansh-tts==0.0.1) (2021.10.8)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->gradio==2.5.2->vakyansh-tts==0.0.1) (1.24.3)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard==2.7.0->vakyansh-tts==0.0.1) (3.1.1)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn!=0.19.0,>=0.14.0->librosa==0.8.1->vakyansh-tts==0.0.1) (3.0.0)\n", + "Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.7/dist-packages (from soundfile>=0.10.2->librosa==0.8.1->vakyansh-tts==0.0.1) (1.15.0)\n", + "Requirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi>=1.0->soundfile>=0.10.2->librosa==0.8.1->vakyansh-tts==0.0.1) (2.21)\n", + "Collecting monotonic>=1.5\n", + " Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)\n", + "Collecting backoff==1.10.0\n", + " Downloading backoff-1.10.0-py2.py3-none-any.whl (31 kB)\n", + "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->gradio==2.5.2->vakyansh-tts==0.0.1) (2018.9)\n", + "Collecting bcrypt>=3.1.3\n", + " Downloading 
bcrypt-3.2.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (61 kB)\n", + "\u001b[K |████████████████████████████████| 61 kB 386 kB/s \n", + "\u001b[?25hCollecting cryptography>=2.5\n", + " Downloading cryptography-36.0.1-cp36-abi3-manylinux_2_24_x86_64.whl (3.6 MB)\n", + "\u001b[K |████████████████████████████████| 3.6 MB 40.6 MB/s \n", + "\u001b[?25hCollecting pynacl>=1.0.1\n", + " Downloading PyNaCl-1.4.0-cp35-abi3-manylinux1_x86_64.whl (961 kB)\n", + "\u001b[K |████████████████████████████████| 961 kB 49.8 MB/s \n", + "\u001b[?25hRequirement already satisfied: python-utils>=2.3.0 in /usr/local/lib/python3.7/dist-packages (from progressbar2->pydload==1.0.9->vakyansh-tts==0.0.1) (2.5.6)\n", + "Building wheels for collected packages: layers, bashutils, ffmpy, flask-cachebuster\n", + " Building wheel for layers (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for layers: filename=layers-0.1.5-py3-none-any.whl size=5379 sha256=759f381849c193619d4e1d46982ad55fd081f3359d2b70d3fede9092d81d6b24\n", + " Stored in directory: /root/.cache/pip/wheels/75/6f/32/757f357608178c55254f10906905e7f8cd63b566173377c819\n", + " Building wheel for bashutils (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for bashutils: filename=Bashutils-0.0.4-py3-none-any.whl size=5472 sha256=60c44cb259b33784163362297bfeb8a6c349296e3acb89196eb4d9cab2274c08\n", + " Stored in directory: /root/.cache/pip/wheels/c7/a0/9a/b99da313eb952e5d8ab2622528c0102544d5cddca1ffc9b15e\n", + " Building wheel for ffmpy (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for ffmpy: filename=ffmpy-0.3.0-py3-none-any.whl size=4710 sha256=a2f3fdb0f222e1f7efb4cec778da16dc98a2dd5504bc1aa55f8d9210904764bf\n", + " Stored in directory: /root/.cache/pip/wheels/13/e4/6c/e8059816e86796a597c6e6b0d4c880630f51a1fcfa0befd5e6\n", + " Building wheel for flask-cachebuster (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for flask-cachebuster: filename=Flask_CacheBuster-1.0.0-py3-none-any.whl size=3371 sha256=ed02a328b3fdd4faad60c78aff1cdd40efd352cf5bcd5a15e0783d47789aaf19\n", + " Stored in directory: /root/.cache/pip/wheels/28/c0/c4/44687421dab41455be93112bd1b0dee1f3c5a9aa27bee63708\n", + "Successfully built layers bashutils ffmpy flask-cachebuster\n", + "Installing collected packages: sniffio, scipy, pynacl, monotonic, cryptography, bcrypt, backoff, anyio, starlette, pydub, pydantic, pycryptodome, paramiko, matplotlib, markdown2, h11, Flask-Login, Flask-Cors, flask-cachebuster, ffmpy, bashutils, asgiref, analytics-python, wavio, uvicorn, Unidecode, torch, tensorboardX, pydload, layers, inflect, gradio, fastapi, vakyansh-tts\n", + " Attempting uninstall: scipy\n", + " Found existing installation: scipy 1.4.1\n", + " Uninstalling scipy-1.4.1:\n", + " Successfully uninstalled scipy-1.4.1\n", + " Attempting uninstall: matplotlib\n", + " Found existing installation: matplotlib 3.2.2\n", + " Uninstalling matplotlib-3.2.2:\n", + " Successfully uninstalled matplotlib-3.2.2\n", + " Attempting uninstall: torch\n", + " Found existing installation: torch 1.10.0+cu111\n", + " Uninstalling torch-1.10.0+cu111:\n", + " Successfully uninstalled torch-1.10.0+cu111\n", + " Attempting uninstall: inflect\n", + " Found existing installation: inflect 2.1.0\n", + " Uninstalling inflect-2.1.0:\n", + " Successfully uninstalled inflect-2.1.0\n", + " Running setup.py develop for vakyansh-tts\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "torchvision 0.11.1+cu111 requires torch==1.10.0, but you have torch 1.5.1 which is incompatible.\n", + "torchtext 0.11.0 requires torch==1.10.0, but you have torch 1.5.1 which is incompatible.\n", + "torchaudio 0.10.0+cu111 requires torch==1.10.0, but you have torch 1.5.1 which is incompatible.\n", + "albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.\u001b[0m\n", + "Successfully installed Flask-Cors-3.0.10 Flask-Login-0.5.0 Unidecode-1.3.2 analytics-python-1.4.0 anyio-3.4.0 asgiref-3.4.1 backoff-1.10.0 bashutils-0.0.4 bcrypt-3.2.0 cryptography-36.0.1 fastapi-0.70.0 ffmpy-0.3.0 flask-cachebuster-1.0.0 gradio-2.5.2 h11-0.12.0 inflect-5.3.0 layers-0.1.5 markdown2-2.4.2 matplotlib-3.3.4 monotonic-1.6 paramiko-2.9.1 pycryptodome-3.12.0 pydantic-1.9.0 pydload-1.0.9 pydub-0.25.1 pynacl-1.4.0 scipy-1.5.4 sniffio-1.2.0 starlette-0.16.0 tensorboardX-2.4 torch-1.5.1 uvicorn-0.15.0 vakyansh-tts-0.0.1 wavio-0.0.4\n" + ] + }, + { + "output_type": "display_data", + "data": { + "application/vnd.colab-display-data+json": { + "pip_warning": { + "packages": [ + "matplotlib", + "mpl_toolkits" + ] + } + } + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2022-01-04 08:20:03-- https://storage.googleapis.com/vakyaansh-open-models/translit_models/default_lineup.json\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.141.128, 2607:f8b0:4023:c0b::80\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.141.128|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 3422 (3.3K) [application/json]\n", + "Saving to: ‘default_lineup.json’\n", + "\n", + "\rdefault_lineup.json 0%[ ] 0 --.-KB/s \rdefault_lineup.json 100%[===================>] 3.34K --.-KB/s in 0s \n", + "\n", + "2022-01-04 08:20:03 (44.3 MB/s) - ‘default_lineup.json’ saved [3422/3422]\n", + "\n", + "--2022-01-04 08:20:03-- https://storage.googleapis.com/vakyaansh-open-models/translit_models/hindi/hindi_transliteration.zip\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.141.128, 2607:f8b0:4023:c0b::80\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.141.128|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 45018357 (43M) [application/zip]\n", + "Saving to: ‘hindi_transliteration.zip’\n", + "\n", + "hindi_transliterati 100%[===================>] 42.93M 113MB/s in 0.4s \n", + "\n", + "2022-01-04 08:20:04 (113 MB/s) - ‘hindi_transliteration.zip’ saved [45018357/45018357]\n", + "\n", + "Archive: hindi_transliteration.zip\n", + " inflating: hi_111_model.pth \n", + " inflating: hi_scripts.json \n", + " inflating: hi_words_a4b.json \n", + "--2022-01-04 08:20:05-- https://storage.googleapis.com/vakyansh-open-models/tts/hindi/hi-IN/female_voice_0/glow.zip\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.141.128, 2607:f8b0:4023:c0b::80\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.141.128|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 313981548 (299M) [application/zip]\n", + "Saving to: ‘glow.zip’\n", + "\n", + "glow.zip 100%[===================>] 299.44M 109MB/s in 2.7s \n", + "\n", + "2022-01-04 08:20:08 (109 MB/s) - ‘glow.zip’ saved [313981548/313981548]\n", + "\n", + "Archive: glow.zip\n", + " creating: glow_ckp/\n", + " inflating: glow_ckp/config.json \n", + " inflating: glow_ckp/G_250.pth \n", + "--2022-01-04 08:20:12-- https://storage.googleapis.com/vakyansh-open-models/tts/hindi/hi-IN/female_voice_0/hifi.zip\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.141.128, 2607:f8b0:4023:c0b::80\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.141.128|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 51788492 (49M) [application/zip]\n", + "Saving to: ‘hifi.zip’\n", + "\n", + "hifi.zip 100%[===================>] 49.39M 88.9MB/s in 0.6s \n", + "\n", + "2022-01-04 08:20:13 (88.9 MB/s) - ‘hifi.zip’ saved [51788492/51788492]\n", + "\n", + "Archive: hifi.zip\n", + " creating: hifi_ckp/\n", + " inflating: hifi_ckp/config.json \n", + " inflating: hifi_ckp/g_00100000 \n" + ] + } + ], + "source": [ + "import os\n", + "!git clone https://github.com/Open-Speech-EkStep/vakyansh-tts\n", + "os.chdir('vakyansh-tts') \n", + "!bash install.sh\n", + "!python setup.py bdist_wheel\n", + "!pip install -e .\n", + "os.chdir('tts_infer')\n", + "!mkdir translit_models\n", + "os.chdir('translit_models')\n", + "!wget https://storage.googleapis.com/vakyaansh-open-models/translit_models/default_lineup.json\n", + "!mkdir hindi\n", + "os.chdir('hindi')\n", + "!wget https://storage.googleapis.com/vakyaansh-open-models/translit_models/hindi/hindi_transliteration.zip\n", + "!unzip hindi_transliteration\n", + "\n", + "!wget https://storage.googleapis.com/vakyansh-open-models/tts/hindi/hi-IN/female_voice_0/glow.zip\n", + "!unzip glow.zip\n", + "\n", + "!wget https://storage.googleapis.com/vakyansh-open-models/tts/hindi/hi-IN/female_voice_0/hifi.zip\n", + "!unzip hifi.zip\n", + "\n", + "!rm glow.zip\n", + "!rm hifi.zip\n", + "\n", + "os.chdir('/content/vakyansh-tts/')" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Inference Code" + ], + "metadata": { + "id": "NvQoCgYzKbWN" + } + }, + { + "cell_type": "code", + "source": [ + "from tts_infer.tts import TextToMel, MelToWav\n", + "from tts_infer.transliterate import XlitEngine\n", + "from tts_infer.num_to_word_on_sent import normalize_nums\n", + "\n", + "import re\n", + "from scipy.io.wavfile import write\n", + "device = 'cpu'\n", + "\n", + "text_to_mel = TextToMel(glow_model_dir='/content/vakyansh-tts/tts_infer/translit_models/hindi/glow_ckp', device=device)\n", + "mel_to_wav = MelToWav(hifi_model_dir='/content/vakyansh-tts/tts_infer/translit_models/hindi/hifi_ckp', device=device)\n", + "\n", + "def translit(text, lang):\n", + " reg = re.compile(r'[a-zA-Z]')\n", + " engine = XlitEngine(lang)\n", + " words = [engine.translit_word(word, topk=1)[lang][0] if reg.match(word) else word for word in text.split()]\n", + " updated_sent = ' '.join(words)\n", + " return updated_sent\n", + " \n", + "def run_tts(text, lang):\n", + " text = text.replace('।', '.') # only for hindi models\n", + " text_num_to_word = normalize_nums(text, lang) # converting numbers to words in lang\n", + " text_num_to_word_and_transliterated = translit(text_num_to_word, lang) # transliterating english words to lang\n", + " \n", + " mel = text_to_mel.generate_mel(text_num_to_word_and_transliterated)\n", + " audio, sr = mel_to_wav.generate_wav(mel)\n", + " write(filename='temp.wav', rate=sr, data=audio) # for saving wav file, if needed\n", + " return (sr, audio)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TVW_x9L0b5W4", + "outputId": "28f0a3b9-8f72-4562-db4b-af49699d6cc3" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content/vakyansh-tts/tts_infer/translit_models/hindi/glow_ckp/G_250.pth\n", + "INFO:root:Loaded checkpoint '/content/vakyansh-tts/tts_infer/translit_models/hindi/glow_ckp/G_250.pth' (iteration 250)\n", + "/content/vakyansh-tts/tts_infer/translit_models/hindi/hifi_ckp/g_00100000\n", + "Loading 
'/content/vakyansh-tts/tts_infer/translit_models/hindi/hifi_ckp/g_00100000'\n", + "Complete.\n", + "Removing weight norm...\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "_, audio = run_tts('hello my name is harveen', 'hi')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "aqZ5xOVidczp", + "outputId": "bdf8f92b-c673-4738-860e-0cbf3f339d6e" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Loading hi...\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Results" + ], + "metadata": { + "id": "jaFjD59HKghg" + } + }, + { + "cell_type": "code", + "source": [ + "import IPython.display as ipd\n", + "ipd.Audio('temp.wav')" + ], + "metadata": { + "id": "zC9I2Zt5fijp", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 75 + }, + "outputId": "86d09807-41a8-48e7-ec71-4734b6ccbdc8" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ] + } + ] +} \ No newline at end of file diff --git a/vakyansh-tts/results/api/.gitkeep b/vakyansh-tts/results/api/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vakyansh-tts/scripts/data/duration.sh b/vakyansh-tts/scripts/data/duration.sh new file mode 100644 index 0000000000000000000000000000000000000000..6fc586c05259d3d576fa4437dea5f650fe5f5031 --- /dev/null +++ b/vakyansh-tts/scripts/data/duration.sh @@ -0,0 +1,9 @@ +wav_path='/home/harveen/en/iitm_data/english/wav_22k' +####################### + +dir=$PWD +parentdir="$(dirname "$dir")" +parentdir="$(dirname "$parentdir")" + + +python $parentdir/utils/data/duration.py $wav_path diff --git a/vakyansh-tts/scripts/data/resample.sh b/vakyansh-tts/scripts/data/resample.sh new file mode 100644 index 0000000000000000000000000000000000000000..8489b0a0056d46a93d24db8dba173ad7a4b8a44a --- /dev/null +++ b/vakyansh-tts/scripts/data/resample.sh @@ -0,0 +1,14 @@ +input_wav_path='/home/harveen/en/iitm_data/english/wav/' +output_wav_path='/home/harveen/en/iitm_data/english/wav_22k/' +output_sample_rate=22050 + +####################### + +dir=$PWD +parentdir="$(dirname "$dir")" +parentdir="$(dirname "$parentdir")" + +mkdir -p $output_wav_path +python $parentdir/utils/data/resample.py -i $input_wav_path -o $output_wav_path -s $output_sample_rate + +python $parentdir/utils/data/duration.py $output_wav_path diff --git a/vakyansh-tts/scripts/glow/prepare_data.sh b/vakyansh-tts/scripts/glow/prepare_data.sh new file mode 100644 index 0000000000000000000000000000000000000000..2357eeebd0fb7e6fba858242af44e8b8aa87fdf9 --- /dev/null +++ b/vakyansh-tts/scripts/glow/prepare_data.sh @@ -0,0 +1,12 @@ +input_text_path='/home/harveen/en/iitm_data/english/txt.done.data' +input_wav_path='/home/harveen/en/iitm_data/english/wav_22k' +gender='male' + + +output_data_path='../../data/glow/'$gender + +valid_samples=100 +test_samples=10 + +mkdir -p $output_data_path +python ../../utils/glow/prepare_iitm_data_glow_en.py -i $input_text_path -o $output_data_path -w $input_wav_path -v $valid_samples -t $test_samples diff --git a/vakyansh-tts/scripts/glow/train_glow.sh b/vakyansh-tts/scripts/glow/train_glow.sh new file mode 100755 index 0000000000000000000000000000000000000000..f12939d5d4563de555bf49408fa7a27397e0dae3 --- /dev/null +++ 
b/vakyansh-tts/scripts/glow/train_glow.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +gender='male' + +config='../../config/glow/'$gender'.json' +modeldir='../../checkpoints/glow/'$gender +logdir='../../logs/glow/'$gender +init=1 # 1 if start from scratch. 0 if start from last checkpoint + + +#################################################### + +if [[ $init -eq 1 ]] +then + python ../../src/glow_tts/init.py -c $config -m $modeldir -l $logdir +fi +python ../../src/glow_tts/train.py -c $config -m $modeldir -l $logdir diff --git a/vakyansh-tts/scripts/hifi/prepare_data.sh b/vakyansh-tts/scripts/hifi/prepare_data.sh new file mode 100644 index 0000000000000000000000000000000000000000..d620cfeb93d8de9b2f750ad9bd52a937b0b88c33 --- /dev/null +++ b/vakyansh-tts/scripts/hifi/prepare_data.sh @@ -0,0 +1,10 @@ +input_wav_path='/home/harveen/en/iitm_data/english/wav_22k' #give multiple folders separated by comma(,) +gender='male' + +output_data_path='../../data/hifi/'$gender + +valid_samples=100 +test_samples=10 + +mkdir -p $output_data_path +python ../../utils/hifi/prepare_iitm_data_hifi.py -i $input_wav_path -v $valid_samples -t $test_samples -d $output_data_path diff --git a/vakyansh-tts/scripts/hifi/train_hifi.sh b/vakyansh-tts/scripts/hifi/train_hifi.sh new file mode 100644 index 0000000000000000000000000000000000000000..287ca1159b5bf8f779d66885197fadbcd23b911e --- /dev/null +++ b/vakyansh-tts/scripts/hifi/train_hifi.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +gender='male' + +config='../../config/hifi/config_v1.json' +modeldir='../../checkpoints/hifi/'$gender +logdir='../../logs/hifi/'$gender + + +#################################################### + + + +python ../../src/hifi_gan/train.py \ + --config $config \ + --input_training_file '../../data/hifi/'$gender'/train.txt' \ + --input_validation_file '../../data/hifi/'$gender'/valid.txt' \ + --checkpoint_path $modeldir \ + --logs_path $logdir \ + --checkpoint_interval 10000 \ + --stdout_interval 50 diff --git a/vakyansh-tts/scripts/inference/advanced_infer.sh b/vakyansh-tts/scripts/inference/advanced_infer.sh new file mode 100644 index 0000000000000000000000000000000000000000..6bbd53454331f0bd5157aa4e38ae4d329fba05fd --- /dev/null +++ b/vakyansh-tts/scripts/inference/advanced_infer.sh @@ -0,0 +1,22 @@ +gender='male' +glowdir='../../checkpoints/glow/'$gender'/' +hifidir='../../checkpoints/hifi/'$gender'/' +device='cpu' +text='Hey mr. I am testing this one. Now on multiple sentences. Just want to see the flow.' 
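+# Parameter notes (see README section 5.3): noise_scale (0-1) controls the sampling noise factor,
+# length_scale (0-2) changes the speed of the generated audio, and transliteration,
+# number_conversion and split_sentences are 1/0 (on/off) switches.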
+noise_scale='0.667' +length_scale='1.0' +transliteration=1 +number_conversion=1 +split_sentences=1 +lang='en' + + +timestamp=$(date +%s) +wav='../../results/'$gender'/' +wav_file=$wav/$timestamp'.wav' + + +mkdir -p $wav + +python ../../utils/inference/advanced_tts.py -a $glowdir -v $hifidir -d $device -t "$text" -w $wav_file -L $lang -n $noise_scale -l $length_scale -T $transliteration -N $number_conversion -S $split_sentences +echo "File saved at: "$wav_file diff --git a/vakyansh-tts/scripts/inference/api.sh b/vakyansh-tts/scripts/inference/api.sh new file mode 100644 index 0000000000000000000000000000000000000000..4f6ce2a2147f69e5b3da851c8222bef830056338 --- /dev/null +++ b/vakyansh-tts/scripts/inference/api.sh @@ -0,0 +1,8 @@ +gender='male' +glowdir='../../checkpoints/glow/'$gender'/' +hifidir='../../checkpoints/hifi/'$gender'/' +device='cpu' +lang='en' + + +python ../../utils/inference/api.py -a $glowdir -v $hifidir -d $device -L $lang diff --git a/vakyansh-tts/scripts/inference/gradio.sh b/vakyansh-tts/scripts/inference/gradio.sh new file mode 100644 index 0000000000000000000000000000000000000000..2b6657952c21ca7821a9a82ed0a38f7dcf78b8e1 --- /dev/null +++ b/vakyansh-tts/scripts/inference/gradio.sh @@ -0,0 +1,8 @@ +gender='male' +glowdir='../../checkpoints/glow/'$gender'/' +hifidir='../../checkpoints/hifi/'$gender'/' +device='cpu' +lang='en' + + +python ../../utils/inference/run_gradio.py -a $glowdir -v $hifidir -d $device -L $lang \ No newline at end of file diff --git a/vakyansh-tts/scripts/inference/infer.sh b/vakyansh-tts/scripts/inference/infer.sh new file mode 100644 index 0000000000000000000000000000000000000000..dec70e1f30fb80f6957f4f3382b4c0963827cf43 --- /dev/null +++ b/vakyansh-tts/scripts/inference/infer.sh @@ -0,0 +1,15 @@ +gender='male' +glowdir='../../checkpoints/glow/'$gender'/' +hifidir='../../checkpoints/hifi/'$gender'/' +device='cpu' +text='testing this one' + + +timestamp=$(date +%s) +wav='../../results/'$gender'/' +wav_file=$wav/$timestamp'.wav' + + +mkdir -p $wav +python ../../utils/inference/tts.py -a $glowdir -v $hifidir -d $device -t "$text" -w $wav_file +echo "File saved at: "$wav_file diff --git a/vakyansh-tts/setup.py b/vakyansh-tts/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..9d2c73345b8406195aaa6327cb3148bb92b65190 --- /dev/null +++ b/vakyansh-tts/setup.py @@ -0,0 +1,55 @@ +from setuptools import setup, find_packages + +with open("README.md", "r") as f: + long_description = f.read() + +setup( + name="vakyansh-tts", + version="0.0.5", + description="Text to speech for Indic languages", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/Open-Speech-EkStep/vakyansh-tts.git", + keywords="nlp, tts, Indic languages, deep learning, text to speech", + # package_dir={'': 'src'}, + # packages=find_packages(where='src'), + packages=["tts_infer"], + python_requires=">=3.7, <4", + install_requires=[ + "Cython==0.29.24", + "layers==0.1.5", + "librosa==0.8.1", + "matplotlib==3.3.4", + "numpy==1.20.2", + "scipy==1.5.4", + "tensorboardX==2.4", + "tensorboard==2.7.0", + "tqdm==4.62.3", + "fastapi==0.70.0", + "uvicorn==0.15.0", + "gradio==2.5.2", + "wavio==0.0.4", + "pydload==1.0.9", + "mosestokenizer==1.2.1", + "indic-nlp-library==0.81" + ], + classifiers=[ + # How mature is this project? 
Common values are + # 3 - Alpha + # 4 - Beta + # 5 - Production/Stable + "Development Status :: 3 - Alpha", + # Indicate who your project is intended for + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Text Processing :: Linguistic", + # Pick your license as you wish (should match "license" above) + "License :: OSI Approved :: MIT License", + # Specify the Python versions you support here. In particular, ensure + # that you indicate whether you support Python 2, Python 3 or both. + "Programming Language :: Python :: 3.7", + ], + include_package_data=True, +) diff --git a/vakyansh-tts/src/glow_tts/attentions.py b/vakyansh-tts/src/glow_tts/attentions.py new file mode 100644 index 0000000000000000000000000000000000000000..62b8c83acbd3150b6d6686f21f3627781107c1ba --- /dev/null +++ b/vakyansh-tts/src/glow_tts/attentions.py @@ -0,0 +1,378 @@ +import copy +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +import commons +import modules +from modules import LayerNorm + + +class Encoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=None, + block_length=None, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + self.block_length = block_length + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + window_size=window_size, + p_dropout=p_dropout, + block_length=block_length, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + for i in range(self.n_layers): + x = x * x_mask + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class CouplingBlock(nn.Module): + def __init__( + self, + in_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + p_dropout=0, + sigmoid_scale=False, + ): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + self.sigmoid_scale = sigmoid_scale + + start = torch.nn.Conv1d(in_channels // 2, hidden_channels, 1) + start = torch.nn.utils.weight_norm(start) + self.start = start + # Initializing last layer to 0 makes the affine coupling layers + # do nothing at first. It helps to stabilze training. 
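+        # The zero-initialised output projection below produces both the shift (m)
+        # and the log-scale (logs) that the affine coupling applies in forward().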
+ end = torch.nn.Conv1d(hidden_channels, in_channels, 1) + end.weight.data.zero_() + end.bias.data.zero_() + self.end = end + + self.wn = modules.WN( + in_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels, + p_dropout, + ) + + def forward(self, x, x_mask=None, reverse=False, g=None, **kwargs): + b, c, t = x.size() + if x_mask is None: + x_mask = 1 + x_0, x_1 = x[:, : self.in_channels // 2], x[:, self.in_channels // 2 :] + + x = self.start(x_0) * x_mask + x = self.wn(x, x_mask, g) + out = self.end(x) + + z_0 = x_0 + m = out[:, : self.in_channels // 2, :] + logs = out[:, self.in_channels // 2 :, :] + if self.sigmoid_scale: + logs = torch.log(1e-6 + torch.sigmoid(logs + 2)) + + if reverse: + z_1 = (x_1 - m) * torch.exp(-logs) * x_mask + logdet = None + else: + z_1 = (m + torch.exp(logs) * x_1) * x_mask + logdet = torch.sum(logs * x_mask, [1, 2]) + + z = torch.cat([z_0, z_1], 1) + return z, logdet + + def store_inverse(self): + self.wn.remove_weight_norm() + + +class MultiHeadAttention(nn.Module): + def __init__( + self, + channels, + out_channels, + n_heads, + window_size=None, + heads_share=True, + p_dropout=0.0, + block_length=None, + proximal_bias=False, + proximal_init=False, + ): + super().__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.p_dropout = p_dropout + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels ** -0.5 + self.emb_rel_k = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + if proximal_init: + self.conv_k.weight.data.copy_(self.conv_q.weight.data) + self.conv_k.bias.data.copy_(self.conv_q.bias.data) + nn.init.xavier_uniform_(self.conv_v.weight) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.k_channels) + if self.window_size is not None: + assert ( + t_s == t_t + ), "Relative attention is only available for self-attention." 
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys(query, key_relative_embeddings) + rel_logits = self._relative_position_to_absolute_position(rel_logits) + scores_local = rel_logits / math.sqrt(self.k_channels) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype + ) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + block_mask = ( + torch.ones_like(scores) + .triu(-self.block_length) + .tril(self.block_length) + ) + scores = scores * block_mask + -1e4 * (1 - block_mask) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s + ) + output = output + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings + ) + output = ( + output.transpose(2, 3).contiguous().view(b, d, t_t) + ) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + ) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[ + :, slice_start_position:slice_end_position + ] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad( + x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) + ) + + # Reshape and slice out the padded elements. 
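+        # Viewing the padded, flattened tensor as (length + 1, 2*length - 1) shifts each
+        # successive row by one position, so the slice below reads off absolute positions.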
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ + :, :, :length, length - 1 : + ] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + batch, heads, length, _ = x.size() + # padd along column + x = F.pad( + x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) + ) + x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. + Returns: + a Tensor with shape [1, 1, length, length] + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(nn.Module): + def __init__( + self, + in_channels, + out_channels, + filter_channels, + kernel_size, + p_dropout=0.0, + activation=None, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + + self.conv_1 = nn.Conv1d( + in_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.conv_2 = nn.Conv1d( + filter_channels, out_channels, kernel_size, padding=kernel_size // 2 + ) + self.drop = nn.Dropout(p_dropout) + + def forward(self, x, x_mask): + x = self.conv_1(x * x_mask) + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(x * x_mask) + return x * x_mask diff --git a/vakyansh-tts/src/glow_tts/audio_processing.py b/vakyansh-tts/src/glow_tts/audio_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..3a4467355952fefaba117b6014864139ac319c6b --- /dev/null +++ b/vakyansh-tts/src/glow_tts/audio_processing.py @@ -0,0 +1,100 @@ +import torch +import numpy as np +from scipy.signal import get_window +import librosa.util as librosa_util + + +def window_sumsquare( + window, + n_frames, + hop_length=200, + win_length=800, + n_fft=800, + dtype=np.float32, + norm=None, +): + """ + # from librosa 0.6 + Compute the sum-square envelope of a window function at a given hop length. + + This is used to estimate modulation effects induced by windowing + observations in short-time fourier transforms. + + Parameters + ---------- + window : string, tuple, number, callable, or list-like + Window specification, as in `get_window` + + n_frames : int > 0 + The number of analysis frames + + hop_length : int > 0 + The number of samples to advance between frames + + win_length : [optional] + The length of the window function. By default, this matches `n_fft`. + + n_fft : int > 0 + The length of each analysis frame. 
+ + dtype : np.dtype + The data type of the output + + Returns + ------- + wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` + The sum-squared envelope of the window function + """ + if win_length is None: + win_length = n_fft + + n = n_fft + hop_length * (n_frames - 1) + x = np.zeros(n, dtype=dtype) + + # Compute the squared window at the desired length + win_sq = get_window(window, win_length, fftbins=True) + win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2 + win_sq = librosa_util.pad_center(win_sq, n_fft) + + # Fill the envelope + for i in range(n_frames): + sample = i * hop_length + x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))] + return x + + +def griffin_lim(magnitudes, stft_fn, n_iters=30): + """ + PARAMS + ------ + magnitudes: spectrogram magnitudes + stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods + """ + + angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) + angles = angles.astype(np.float32) + angles = torch.autograd.Variable(torch.from_numpy(angles)) + signal = stft_fn.inverse(magnitudes, angles).squeeze(1) + + for i in range(n_iters): + _, angles = stft_fn.transform(signal) + signal = stft_fn.inverse(magnitudes, angles).squeeze(1) + return signal + + +def dynamic_range_compression(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C diff --git a/vakyansh-tts/src/glow_tts/commons.py b/vakyansh-tts/src/glow_tts/commons.py new file mode 100644 index 0000000000000000000000000000000000000000..8da7b35049d768a29de6f66cbe8795a825967818 --- /dev/null +++ b/vakyansh-tts/src/glow_tts/commons.py @@ -0,0 +1,273 @@ +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +from librosa.filters import mel as librosa_mel_fn +from audio_processing import dynamic_range_compression +from audio_processing import dynamic_range_decompression +from stft import STFT + + +def intersperse(lst, item): + result = [item] * (len(lst) * 2 + 1) + result[1::2] = lst + return result + + +def mle_loss(z, m, logs, logdet, mask): + l = torch.sum(logs) + 0.5 * torch.sum( + torch.exp(-2 * logs) * ((z - m) ** 2) + ) # neg normal likelihood w/o the constant term + l = l - torch.sum(logdet) # log jacobian determinant + l = l / torch.sum( + torch.ones_like(z) * mask + ) # averaging across batch, channel and time axes + l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term + return l + + +def duration_loss(logw, logw_, lengths): + l = torch.sum((logw - logw_) ** 2) / torch.sum(lengths) + return l + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < 
length.unsqueeze(1) + + +def maximum_path(value, mask, max_neg_val=-np.inf): + """Numpy-friendly version. It's about 4 times faster than torch version. + value: [b, t_x, t_y] + mask: [b, t_x, t_y] + """ + value = value * mask + + device = value.device + dtype = value.dtype + value = value.cpu().detach().numpy() + mask = mask.cpu().detach().numpy().astype(np.bool) + + b, t_x, t_y = value.shape + direction = np.zeros(value.shape, dtype=np.int64) + v = np.zeros((b, t_x), dtype=np.float32) + x_range = np.arange(t_x, dtype=np.float32).reshape(1, -1) + for j in range(t_y): + v0 = np.pad(v, [[0, 0], [1, 0]], mode="constant", constant_values=max_neg_val)[ + :, :-1 + ] + v1 = v + max_mask = v1 >= v0 + v_max = np.where(max_mask, v1, v0) + direction[:, :, j] = max_mask + + index_mask = x_range <= j + v = np.where(index_mask, v_max + value[:, :, j], max_neg_val) + direction = np.where(mask, direction, 1) + + path = np.zeros(value.shape, dtype=np.float32) + index = mask[:, :, 0].sum(1).astype(np.int64) - 1 + index_range = np.arange(b) + for j in reversed(range(t_y)): + path[index_range, index, j] = 1 + index = index + direction[index_range, index, j] - 1 + path = path * mask.astype(np.float32) + path = torch.from_numpy(path).to(device=device, dtype=dtype) + return path + + +def generate_path(duration, mask): + """ + duration: [b, t_x] + mask: [b, t_x, t_y] + """ + device = duration.device + + b, t_x, t_y = mask.shape + cum_duration = torch.cumsum(duration, 1) + path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path * mask + return path + + +class Adam: + def __init__( + self, + params, + scheduler, + dim_model, + warmup_steps=4000, + lr=1e0, + betas=(0.9, 0.98), + eps=1e-9, + ): + self.params = params + self.scheduler = scheduler + self.dim_model = dim_model + self.warmup_steps = warmup_steps + self.lr = lr + self.betas = betas + self.eps = eps + + self.step_num = 1 + self.cur_lr = lr * self._get_lr_scale() + + self._optim = torch.optim.Adam(params, lr=self.cur_lr, betas=betas, eps=eps) + + def _get_lr_scale(self): + if self.scheduler == "noam": + return np.power(self.dim_model, -0.5) * np.min( + [ + np.power(self.step_num, -0.5), + self.step_num * np.power(self.warmup_steps, -1.5), + ] + ) + else: + return 1 + + def _update_learning_rate(self): + self.step_num += 1 + if self.scheduler == "noam": + self.cur_lr = self.lr * self._get_lr_scale() + for param_group in self._optim.param_groups: + param_group["lr"] = self.cur_lr + + def get_lr(self): + return self.cur_lr + + def step(self): + self._optim.step() + self._update_learning_rate() + + def zero_grad(self): + self._optim.zero_grad() + + def load_state_dict(self, d): + self._optim.load_state_dict(d) + + def state_dict(self): + return self._optim.state_dict() + + +class TacotronSTFT(nn.Module): + def __init__( + self, + filter_length=1024, + hop_length=256, + win_length=1024, + n_mel_channels=80, + sampling_rate=22050, + mel_fmin=0.0, + mel_fmax=8000.0, + ): + super(TacotronSTFT, self).__init__() + self.n_mel_channels = n_mel_channels + self.sampling_rate = sampling_rate + self.stft_fn = STFT(filter_length, hop_length, win_length) + mel_basis = librosa_mel_fn( + sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax + ) + mel_basis = torch.from_numpy(mel_basis).float() + 
self.register_buffer("mel_basis", mel_basis) + + def spectral_normalize(self, magnitudes): + output = dynamic_range_compression(magnitudes) + return output + + def spectral_de_normalize(self, magnitudes): + output = dynamic_range_decompression(magnitudes) + return output + + def mel_spectrogram(self, y): + """Computes mel-spectrograms from a batch of waves + PARAMS + ------ + y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] + + RETURNS + ------- + mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) + """ + assert torch.min(y.data) >= -1 + assert torch.max(y.data) <= 1 + + magnitudes, phases = self.stft_fn.transform(y) + magnitudes = magnitudes.data + mel_output = torch.matmul(self.mel_basis, magnitudes) + mel_output = self.spectral_normalize(mel_output) + return mel_output + + +def clip_grad_value_(parameters, clip_value, norm_type=2): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1.0 / norm_type) + return total_norm + + +def squeeze(x, x_mask=None, n_sqz=2): + b, c, t = x.size() + + t = (t // n_sqz) * n_sqz + x = x[:, :, :t] + x_sqz = x.view(b, c, t // n_sqz, n_sqz) + x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz) + + if x_mask is not None: + x_mask = x_mask[:, :, n_sqz - 1 :: n_sqz] + else: + x_mask = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype) + return x_sqz * x_mask, x_mask + + +def unsqueeze(x, x_mask=None, n_sqz=2): + b, c, t = x.size() + + x_unsqz = x.view(b, n_sqz, c // n_sqz, t) + x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz) + + if x_mask is not None: + x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz) + else: + x_mask = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype) + return x_unsqz * x_mask, x_mask diff --git a/vakyansh-tts/src/glow_tts/data_utils.py b/vakyansh-tts/src/glow_tts/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b58d84b3df3de3afb0a6a3bb8fadfd7a592dd602 --- /dev/null +++ b/vakyansh-tts/src/glow_tts/data_utils.py @@ -0,0 +1,274 @@ +import random +import numpy as np +import torch +import torch.utils.data + +import commons +from utils import load_wav_to_torch, load_filepaths_and_text +from text import text_to_sequence + +class TextMelLoader(torch.utils.data.Dataset): + """ + 1) loads audio,text pairs + 2) normalizes text and converts them to sequences of one-hot vectors + 3) computes mel-spectrograms from audio files. 
+ """ + + def __init__(self, audiopaths_and_text, hparams): + self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) + self.text_cleaners = hparams.text_cleaners + self.max_wav_value = hparams.max_wav_value + self.sampling_rate = hparams.sampling_rate + self.load_mel_from_disk = hparams.load_mel_from_disk + self.add_noise = hparams.add_noise + self.symbols = hparams.punc + hparams.chars + self.add_blank = getattr(hparams, "add_blank", False) # improved version + self.stft = commons.TacotronSTFT( + hparams.filter_length, + hparams.hop_length, + hparams.win_length, + hparams.n_mel_channels, + hparams.sampling_rate, + hparams.mel_fmin, + hparams.mel_fmax, + ) + random.seed(1234) + random.shuffle(self.audiopaths_and_text) + + def get_mel_text_pair(self, audiopath_and_text): + # separate filename and text + audiopath, text = audiopath_and_text[0], audiopath_and_text[1] + text = self.get_text(text) + mel = self.get_mel(audiopath) + return (text, mel) + + def get_mel(self, filename): + if not self.load_mel_from_disk: + audio, sampling_rate = load_wav_to_torch(filename) + if sampling_rate != self.stft.sampling_rate: + raise ValueError( + "{} {} SR doesn't match target {} SR".format( + sampling_rate, self.stft.sampling_rate + ) + ) + if self.add_noise: + audio = audio + torch.rand_like(audio) + audio_norm = audio / self.max_wav_value + audio_norm = audio_norm.unsqueeze(0) + melspec = self.stft.mel_spectrogram(audio_norm) + melspec = torch.squeeze(melspec, 0) + else: + melspec = torch.from_numpy(np.load(filename)) + assert ( + melspec.size(0) == self.stft.n_mel_channels + ), "Mel dimension mismatch: given {}, expected {}".format( + melspec.size(0), self.stft.n_mel_channels + ) + + return melspec + + def get_text(self, text): + text_norm = text_to_sequence(text, self.symbols, self.text_cleaners) + if self.add_blank: + text_norm = commons.intersperse( + text_norm, len(self.symbols) + ) # add a blank token, whose id number is len(symbols) + text_norm = torch.IntTensor(text_norm) + return text_norm + + def __getitem__(self, index): + return self.get_mel_text_pair(self.audiopaths_and_text[index]) + + def __len__(self): + return len(self.audiopaths_and_text) + + +class TextMelCollate: + """Zero-pads model inputs and targets based on number of frames per step""" + + def __init__(self, n_frames_per_step=1): + self.n_frames_per_step = n_frames_per_step + + def __call__(self, batch): + """Collate's training batch from normalized text and mel-spectrogram + PARAMS + ------ + batch: [text_normalized, mel_normalized] + """ + # Right zero-pad all one-hot text sequences to max input length + input_lengths, ids_sorted_decreasing = torch.sort( + torch.LongTensor([len(x[0]) for x in batch]), dim=0, descending=True + ) + max_input_len = input_lengths[0] + + text_padded = torch.LongTensor(len(batch), max_input_len) + text_padded.zero_() + for i in range(len(ids_sorted_decreasing)): + text = batch[ids_sorted_decreasing[i]][0] + text_padded[i, : text.size(0)] = text + + # Right zero-pad mel-spec + num_mels = batch[0][1].size(0) + max_target_len = max([x[1].size(1) for x in batch]) + if max_target_len % self.n_frames_per_step != 0: + max_target_len += ( + self.n_frames_per_step - max_target_len % self.n_frames_per_step + ) + assert max_target_len % self.n_frames_per_step == 0 + + # include mel padded + mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len) + mel_padded.zero_() + output_lengths = torch.LongTensor(len(batch)) + for i in range(len(ids_sorted_decreasing)): + mel = 
batch[ids_sorted_decreasing[i]][1] + mel_padded[i, :, : mel.size(1)] = mel + output_lengths[i] = mel.size(1) + + return text_padded, input_lengths, mel_padded, output_lengths + + +"""Multi speaker version""" + + +class TextMelSpeakerLoader(torch.utils.data.Dataset): + """ + 1) loads audio, speaker_id, text pairs + 2) normalizes text and converts them to sequences of one-hot vectors + 3) computes mel-spectrograms from audio files. + """ + + def __init__(self, audiopaths_sid_text, hparams): + self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text) + self.text_cleaners = hparams.text_cleaners + self.max_wav_value = hparams.max_wav_value + self.sampling_rate = hparams.sampling_rate + self.load_mel_from_disk = hparams.load_mel_from_disk + self.add_noise = hparams.add_noise + self.symbols = hparams.punc + hparams.chars + self.add_blank = getattr(hparams, "add_blank", False) # improved version + self.min_text_len = getattr(hparams, "min_text_len", 1) + self.max_text_len = getattr(hparams, "max_text_len", 190) + self.stft = commons.TacotronSTFT( + hparams.filter_length, + hparams.hop_length, + hparams.win_length, + hparams.n_mel_channels, + hparams.sampling_rate, + hparams.mel_fmin, + hparams.mel_fmax, + ) + + self._filter_text_len() + random.seed(1234) + random.shuffle(self.audiopaths_sid_text) + + def _filter_text_len(self): + audiopaths_sid_text_new = [] + for audiopath, sid, text in self.audiopaths_sid_text: + if self.min_text_len <= len(text) and len(text) <= self.max_text_len: + audiopaths_sid_text_new.append([audiopath, sid, text]) + self.audiopaths_sid_text = audiopaths_sid_text_new + + def get_mel_text_speaker_pair(self, audiopath_sid_text): + # separate filename, speaker_id and text + audiopath, sid, text = ( + audiopath_sid_text[0], + audiopath_sid_text[1], + audiopath_sid_text[2], + ) + text = self.get_text(text) + mel = self.get_mel(audiopath) + sid = self.get_sid(sid) + return (text, mel, sid) + + def get_mel(self, filename): + if not self.load_mel_from_disk: + audio, sampling_rate = load_wav_to_torch(filename) + if sampling_rate != self.stft.sampling_rate: + raise ValueError( + "{} {} SR doesn't match target {} SR".format( + sampling_rate, self.stft.sampling_rate + ) + ) + if self.add_noise: + audio = audio + torch.rand_like(audio) + audio_norm = audio / self.max_wav_value + audio_norm = audio_norm.unsqueeze(0) + melspec = self.stft.mel_spectrogram(audio_norm) + melspec = torch.squeeze(melspec, 0) + else: + melspec = torch.from_numpy(np.load(filename)) + assert ( + melspec.size(0) == self.stft.n_mel_channels + ), "Mel dimension mismatch: given {}, expected {}".format( + melspec.size(0), self.stft.n_mel_channels + ) + + return melspec + + def get_text(self, text): + text_norm = text_to_sequence(text, self.symbols, self.text_cleaners) + if self.add_blank: + text_norm = commons.intersperse( + text_norm, len(self.symbols) + ) # add a blank token, whose id number is len(symbols) + text_norm = torch.IntTensor(text_norm) + return text_norm + + def get_sid(self, sid): + sid = torch.IntTensor([int(sid)]) + return sid + + def __getitem__(self, index): + return self.get_mel_text_speaker_pair(self.audiopaths_sid_text[index]) + + def __len__(self): + return len(self.audiopaths_sid_text) + + +class TextMelSpeakerCollate: + """Zero-pads model inputs and targets based on number of frames per step""" + + def __init__(self, n_frames_per_step=1): + self.n_frames_per_step = n_frames_per_step + + def __call__(self, batch): + """Collate's training batch from normalized text and 
mel-spectrogram + PARAMS + ------ + batch: [text_normalized, mel_normalized] + """ + # Right zero-pad all one-hot text sequences to max input length + input_lengths, ids_sorted_decreasing = torch.sort( + torch.LongTensor([len(x[0]) for x in batch]), dim=0, descending=True + ) + max_input_len = input_lengths[0] + + text_padded = torch.LongTensor(len(batch), max_input_len) + text_padded.zero_() + for i in range(len(ids_sorted_decreasing)): + text = batch[ids_sorted_decreasing[i]][0] + text_padded[i, : text.size(0)] = text + + # Right zero-pad mel-spec + num_mels = batch[0][1].size(0) + max_target_len = max([x[1].size(1) for x in batch]) + if max_target_len % self.n_frames_per_step != 0: + max_target_len += ( + self.n_frames_per_step - max_target_len % self.n_frames_per_step + ) + assert max_target_len % self.n_frames_per_step == 0 + + # include mel padded & sid + mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len) + mel_padded.zero_() + output_lengths = torch.LongTensor(len(batch)) + sid = torch.LongTensor(len(batch)) + for i in range(len(ids_sorted_decreasing)): + mel = batch[ids_sorted_decreasing[i]][1] + mel_padded[i, :, : mel.size(1)] = mel + output_lengths[i] = mel.size(1) + sid[i] = batch[ids_sorted_decreasing[i]][2] + + return text_padded, input_lengths, mel_padded, output_lengths, sid diff --git a/vakyansh-tts/src/glow_tts/generate_mels.py b/vakyansh-tts/src/glow_tts/generate_mels.py new file mode 100644 index 0000000000000000000000000000000000000000..a3d331aef019cfd8cf45d6264db88d0fa26e5c0f --- /dev/null +++ b/vakyansh-tts/src/glow_tts/generate_mels.py @@ -0,0 +1,70 @@ +import numpy as np +import os +import torch +import commons + +import models +import utils +from argparse import ArgumentParser +from tqdm import tqdm +from text import text_to_sequence + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("-m", "--model_dir", required=True, type=str) + parser.add_argument("-s", "--mels_dir", required=True, type=str) + args = parser.parse_args() + MODEL_DIR = args.model_dir # path to model dir + SAVE_MELS_DIR = args.mels_dir # path to save generated mels + + if not os.path.exists(SAVE_MELS_DIR): + os.makedirs(SAVE_MELS_DIR) + + hps = utils.get_hparams_from_dir(MODEL_DIR) + symbols = list(hps.data.punc) + list(hps.data.chars) + checkpoint_path = utils.latest_checkpoint_path(MODEL_DIR) + cleaner = hps.data.text_cleaners + + model = models.FlowGenerator( + len(symbols) + getattr(hps.data, "add_blank", False), + out_channels=hps.data.n_mel_channels, + **hps.model + ).to("cuda") + + utils.load_checkpoint(checkpoint_path, model) + model.decoder.store_inverse() # do not calcuate jacobians for fast decoding + _ = model.eval() + + def get_mel(text, fpath): + if getattr(hps.data, "add_blank", False): + text_norm = text_to_sequence(text, symbols, cleaner) + text_norm = commons.intersperse(text_norm, len(symbols)) + else: # If not using "add_blank" option during training, adding spaces at the beginning and the end of utterance improves quality + text = " " + text.strip() + " " + text_norm = text_to_sequence(text, symbols, cleaner) + + sequence = np.array(text_norm)[None, :] + + x_tst = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long() + x_tst_lengths = torch.tensor([x_tst.shape[1]]).cuda() + + with torch.no_grad(): + noise_scale = 0.667 + length_scale = 1.0 + (y_gen_tst, *_), *_, (attn_gen, *_) = model( + x_tst, + x_tst_lengths, + gen=True, + noise_scale=noise_scale, + length_scale=length_scale, + ) + + 
np.save(os.path.join(SAVE_MELS_DIR, fpath), y_gen_tst.cpu().detach().numpy()) + + for f in [hps.data.training_files, hps.data.validation_files]: + file_lines = open(f).read().splitlines() + + for line in tqdm(file_lines): + fname, text = line.split("|") + fname = os.path.basename(fname).replace(".wav", ".npy") + get_mel(text, fname) diff --git a/vakyansh-tts/src/glow_tts/hifi/__init__.py b/vakyansh-tts/src/glow_tts/hifi/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0323b35a0fc2ef21ac417857d9336cc7c8a3b717 --- /dev/null +++ b/vakyansh-tts/src/glow_tts/hifi/__init__.py @@ -0,0 +1,5 @@ +from .env import AttrDict +from .models import Generator + +if __name__ == "__main__": + pass diff --git a/vakyansh-tts/src/glow_tts/hifi/env.py b/vakyansh-tts/src/glow_tts/hifi/env.py new file mode 100644 index 0000000000000000000000000000000000000000..2bdbc95d4f7a8bad8fd4f5eef657e2b51d946056 --- /dev/null +++ b/vakyansh-tts/src/glow_tts/hifi/env.py @@ -0,0 +1,15 @@ +import os +import shutil + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def build_env(config, config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) diff --git a/vakyansh-tts/src/glow_tts/hifi/models.py b/vakyansh-tts/src/glow_tts/hifi/models.py new file mode 100644 index 0000000000000000000000000000000000000000..aaf911836119d69129abe22aa4fc875f2ba3d53c --- /dev/null +++ b/vakyansh-tts/src/glow_tts/hifi/models.py @@ -0,0 +1,403 @@ +import torch +import torch.nn.functional as F +import torch.nn as nn +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from .utils import init_weights, get_padding + +LRELU_SLOPE = 0.1 + + +class ResBlock1(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.h = h + self.convs1 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]), + ) + ), + ] + ) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + ] + ) + self.convs2.apply(init_weights) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, 
self).__init__() + self.h = h + self.convs = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + ] + ) + self.convs.apply(init_weights) + + def forward(self, x): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Generator(torch.nn.Module): + def __init__(self, h): + super(Generator, self).__init__() + self.h = h + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + self.conv_pre = weight_norm( + Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3) + ) + resblock = ResBlock1 if h.resblock == "1" else ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + h.upsample_initial_channel // (2 ** i), + h.upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes) + ): + self.resblocks.append(resblock(h, ch, k, d)) + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + self.ups.apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x): + x = self.conv_pre(x) + for i in range(self.num_upsamples): + x = F.leaky_relu(x, LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print("Removing weight norm...") + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(5, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(5, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(5, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(5, 1), 0), + ) + ), + norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + 
fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self): + super(MultiPeriodDiscriminator, self).__init__() + self.discriminators = nn.ModuleList( + [ + DiscriminatorP(2), + DiscriminatorP(3), + DiscriminatorP(5), + DiscriminatorP(7), + DiscriminatorP(11), + ] + ) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 128, 15, 1, padding=7)), + norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), + norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), + norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiScaleDiscriminator(torch.nn.Module): + def __init__(self): + super(MultiScaleDiscriminator, self).__init__() + self.discriminators = nn.ModuleList( + [ + DiscriminatorS(use_spectral_norm=True), + DiscriminatorS(), + DiscriminatorS(), + ] + ) + self.meanpools = nn.ModuleList( + [AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)] + ) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + if i != 0: + y = self.meanpools[i - 1](y) + y_hat = self.meanpools[i - 1](y_hat) + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + loss += torch.mean(torch.abs(rl - gl)) + + return loss * 2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + r_loss = torch.mean((1 - dr) ** 2) + g_loss = torch.mean(dg ** 2) + loss += r_loss + g_loss + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + l = torch.mean((1 - dg) ** 2) + gen_losses.append(l) + loss += l + + return loss, gen_losses diff --git a/vakyansh-tts/src/glow_tts/hifi/utils.py b/vakyansh-tts/src/glow_tts/hifi/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..71e9b2c99e053e2d4239074a67d64b834898c348 --- /dev/null +++ b/vakyansh-tts/src/glow_tts/hifi/utils.py @@ -0,0 +1,57 @@ +import glob +import os +import matplotlib +import torch +from torch.nn.utils import weight_norm + +matplotlib.use("Agg") +import matplotlib.pylab 
as plt + + +def plot_spectrogram(spectrogram): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def apply_weight_norm(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + weight_norm(m) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print("Loading '{}'".format(filepath)) + checkpoint_dict = torch.load(filepath, map_location=device) + print("Complete.") + return checkpoint_dict + + +def save_checkpoint(filepath, obj): + print("Saving checkpoint to {}".format(filepath)) + torch.save(obj, filepath) + print("Complete.") + + +def scan_checkpoint(cp_dir, prefix): + pattern = os.path.join(cp_dir, prefix + "????????") + cp_list = glob.glob(pattern) + if len(cp_list) == 0: + return None + return sorted(cp_list)[-1] diff --git a/vakyansh-tts/src/glow_tts/init.py b/vakyansh-tts/src/glow_tts/init.py new file mode 100644 index 0000000000000000000000000000000000000000..39dd83dbd55475d562a3f54d951cb822800d2e0f --- /dev/null +++ b/vakyansh-tts/src/glow_tts/init.py @@ -0,0 +1,79 @@ +import os +import json +import argparse +import math +import torch +from torch import nn, optim +from torch.nn import functional as F +from torch.utils.data import DataLoader + +from data_utils import TextMelLoader, TextMelCollate +import models +import commons +import utils + + +class FlowGenerator_DDI(models.FlowGenerator): + """A helper for Data-dependent Initialization""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + for f in self.decoder.flows: + if getattr(f, "set_ddi", False): + f.set_ddi(True) + + +def main(): + hps = utils.get_hparams() + logger = utils.get_logger(hps.log_dir) + logger.info(hps) + utils.check_git_hash(hps.log_dir) + + torch.manual_seed(hps.train.seed) + + train_dataset = TextMelLoader(hps.data.training_files, hps.data) + collate_fn = TextMelCollate(1) + train_loader = DataLoader( + train_dataset, + num_workers=8, + shuffle=True, + batch_size=hps.train.batch_size, + pin_memory=True, + drop_last=True, + collate_fn=collate_fn, + ) + symbols = hps.data.punc + hps.data.chars + generator = FlowGenerator_DDI( + len(symbols) + getattr(hps.data, "add_blank", False), + out_channels=hps.data.n_mel_channels, + **hps.model + ).cuda() + optimizer_g = commons.Adam( + generator.parameters(), + scheduler=hps.train.scheduler, + dim_model=hps.model.hidden_channels, + warmup_steps=hps.train.warmup_steps, + lr=hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) + + generator.train() + for batch_idx, (x, x_lengths, y, y_lengths) in enumerate(train_loader): + x, x_lengths = x.cuda(), x_lengths.cuda() + y, y_lengths = y.cuda(), y_lengths.cuda() + + _ = generator(x, x_lengths, y, y_lengths, gen=False) + break + + utils.save_checkpoint( + generator, + optimizer_g, + hps.train.learning_rate, + 0, + os.path.join(hps.model_dir, "ddi_G.pth"), + ) + + +if __name__ == "__main__": + main() diff --git a/vakyansh-tts/src/glow_tts/models.py b/vakyansh-tts/src/glow_tts/models.py new file mode 100644 index 0000000000000000000000000000000000000000..a77596153fa2e7e6fdd52ee0028a0c8ce02050b4 --- /dev/null +++ 
b/vakyansh-tts/src/glow_tts/models.py @@ -0,0 +1,403 @@ +import math +import torch +from torch import nn +from torch.nn import functional as F + +import modules +import commons +import attentions +import monotonic_align + + +class DurationPredictor(nn.Module): + def __init__(self, in_channels, filter_channels, kernel_size, p_dropout): + super().__init__() + + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + + self.drop = nn.Dropout(p_dropout) + self.conv_1 = nn.Conv1d( + in_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_1 = attentions.LayerNorm(filter_channels) + self.conv_2 = nn.Conv1d( + filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_2 = attentions.LayerNorm(filter_channels) + self.proj = nn.Conv1d(filter_channels, 1, 1) + + def forward(self, x, x_mask): + x = self.conv_1(x * x_mask) + x = torch.relu(x) + x = self.norm_1(x) + x = self.drop(x) + x = self.conv_2(x * x_mask) + x = torch.relu(x) + x = self.norm_2(x) + x = self.drop(x) + x = self.proj(x * x_mask) + return x * x_mask + + +class TextEncoder(nn.Module): + def __init__( + self, + n_vocab, + out_channels, + hidden_channels, + filter_channels, + filter_channels_dp, + n_heads, + n_layers, + kernel_size, + p_dropout, + window_size=None, + block_length=None, + mean_only=False, + prenet=False, + gin_channels=0, + ): + + super().__init__() + + self.n_vocab = n_vocab + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.filter_channels_dp = filter_channels_dp + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + self.block_length = block_length + self.mean_only = mean_only + self.prenet = prenet + self.gin_channels = gin_channels + + self.emb = nn.Embedding(n_vocab, hidden_channels) + nn.init.normal_(self.emb.weight, 0.0, hidden_channels ** -0.5) + + if prenet: + self.pre = modules.ConvReluNorm( + hidden_channels, + hidden_channels, + hidden_channels, + kernel_size=5, + n_layers=3, + p_dropout=0.5, + ) + self.encoder = attentions.Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + window_size=window_size, + block_length=block_length, + ) + + self.proj_m = nn.Conv1d(hidden_channels, out_channels, 1) + if not mean_only: + self.proj_s = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj_w = DurationPredictor( + hidden_channels + gin_channels, filter_channels_dp, kernel_size, p_dropout + ) + + def forward(self, x, x_lengths, g=None): + x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + + if self.prenet: + x = self.pre(x, x_mask) + x = self.encoder(x, x_mask) + + if g is not None: + g_exp = g.expand(-1, -1, x.size(-1)) + x_dp = torch.cat([torch.detach(x), g_exp], 1) + else: + x_dp = torch.detach(x) + + x_m = self.proj_m(x) * x_mask + if not self.mean_only: + x_logs = self.proj_s(x) * x_mask + else: + x_logs = torch.zeros_like(x_m) + + logw = self.proj_w(x_dp, x_mask) + return x_m, x_logs, logw, x_mask + + +class FlowSpecDecoder(nn.Module): + def __init__( + self, + in_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_blocks, + n_layers, + p_dropout=0.0, + n_split=4, + n_sqz=2, + sigmoid_scale=False, + 
gin_channels=0, + ): + super().__init__() + + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_blocks = n_blocks + self.n_layers = n_layers + self.p_dropout = p_dropout + self.n_split = n_split + self.n_sqz = n_sqz + self.sigmoid_scale = sigmoid_scale + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for b in range(n_blocks): + self.flows.append(modules.ActNorm(channels=in_channels * n_sqz)) + self.flows.append( + modules.InvConvNear(channels=in_channels * n_sqz, n_split=n_split) + ) + self.flows.append( + attentions.CouplingBlock( + in_channels * n_sqz, + hidden_channels, + kernel_size=kernel_size, + dilation_rate=dilation_rate, + n_layers=n_layers, + gin_channels=gin_channels, + p_dropout=p_dropout, + sigmoid_scale=sigmoid_scale, + ) + ) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + flows = self.flows + logdet_tot = 0 + else: + flows = reversed(self.flows) + logdet_tot = None + + if self.n_sqz > 1: + x, x_mask = commons.squeeze(x, x_mask, self.n_sqz) + for f in flows: + if not reverse: + x, logdet = f(x, x_mask, g=g, reverse=reverse) + logdet_tot += logdet + else: + x, logdet = f(x, x_mask, g=g, reverse=reverse) + if self.n_sqz > 1: + x, x_mask = commons.unsqueeze(x, x_mask, self.n_sqz) + return x, logdet_tot + + def store_inverse(self): + for f in self.flows: + f.store_inverse() + + +class FlowGenerator(nn.Module): + def __init__( + self, + n_vocab, + hidden_channels, + filter_channels, + filter_channels_dp, + out_channels, + kernel_size=3, + n_heads=2, + n_layers_enc=6, + p_dropout=0.0, + n_blocks_dec=12, + kernel_size_dec=5, + dilation_rate=5, + n_block_layers=4, + p_dropout_dec=0.0, + n_speakers=0, + gin_channels=0, + n_split=4, + n_sqz=1, + sigmoid_scale=False, + window_size=None, + block_length=None, + mean_only=False, + hidden_channels_enc=None, + hidden_channels_dec=None, + prenet=False, + **kwargs + ): + + super().__init__() + self.n_vocab = n_vocab + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.filter_channels_dp = filter_channels_dp + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_heads = n_heads + self.n_layers_enc = n_layers_enc + self.p_dropout = p_dropout + self.n_blocks_dec = n_blocks_dec + self.kernel_size_dec = kernel_size_dec + self.dilation_rate = dilation_rate + self.n_block_layers = n_block_layers + self.p_dropout_dec = p_dropout_dec + self.n_speakers = n_speakers + self.gin_channels = gin_channels + self.n_split = n_split + self.n_sqz = n_sqz + self.sigmoid_scale = sigmoid_scale + self.window_size = window_size + self.block_length = block_length + self.mean_only = mean_only + self.hidden_channels_enc = hidden_channels_enc + self.hidden_channels_dec = hidden_channels_dec + self.prenet = prenet + + self.encoder = TextEncoder( + n_vocab, + out_channels, + hidden_channels_enc or hidden_channels, + filter_channels, + filter_channels_dp, + n_heads, + n_layers_enc, + kernel_size, + p_dropout, + window_size=window_size, + block_length=block_length, + mean_only=mean_only, + prenet=prenet, + gin_channels=gin_channels, + ) + + self.decoder = FlowSpecDecoder( + out_channels, + hidden_channels_dec or hidden_channels, + kernel_size_dec, + dilation_rate, + n_blocks_dec, + n_block_layers, + p_dropout=p_dropout_dec, + n_split=n_split, + n_sqz=n_sqz, + sigmoid_scale=sigmoid_scale, + gin_channels=gin_channels, + ) + + if n_speakers > 1: + self.emb_g = 
nn.Embedding(n_speakers, gin_channels) + nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) + + def forward( + self, + x, + x_lengths, + y=None, + y_lengths=None, + g=None, + gen=False, + noise_scale=1.0, + length_scale=1.0, + ): + if g is not None: + g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h] + x_m, x_logs, logw, x_mask = self.encoder(x, x_lengths, g=g) + + if gen: + w = torch.exp(logw) * x_mask * length_scale + w_ceil = torch.ceil(w) + y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() + y_max_length = None + else: + y_max_length = y.size(2) + y, y_lengths, y_max_length = self.preprocess(y, y_lengths, y_max_length) + z_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y_max_length), 1).to( + x_mask.dtype + ) + attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(z_mask, 2) + + if gen: + attn = commons.generate_path( + w_ceil.squeeze(1), attn_mask.squeeze(1) + ).unsqueeze(1) + z_m = torch.matmul( + attn.squeeze(1).transpose(1, 2), x_m.transpose(1, 2) + ).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + z_logs = torch.matmul( + attn.squeeze(1).transpose(1, 2), x_logs.transpose(1, 2) + ).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + logw_ = torch.log(1e-8 + torch.sum(attn, -1)) * x_mask + + z = (z_m + torch.exp(z_logs) * torch.randn_like(z_m) * noise_scale) * z_mask + y, logdet = self.decoder(z, z_mask, g=g, reverse=True) + return ( + (y, z_m, z_logs, logdet, z_mask), + (x_m, x_logs, x_mask), + (attn, logw, logw_), + ) + else: + z, logdet = self.decoder(y, z_mask, g=g, reverse=False) + with torch.no_grad(): + x_s_sq_r = torch.exp(-2 * x_logs) + logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - x_logs, [1]).unsqueeze( + -1 + ) # [b, t, 1] + logp2 = torch.matmul( + x_s_sq_r.transpose(1, 2), -0.5 * (z ** 2) + ) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp3 = torch.matmul( + (x_m * x_s_sq_r).transpose(1, 2), z + ) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp4 = torch.sum(-0.5 * (x_m ** 2) * x_s_sq_r, [1]).unsqueeze( + -1 + ) # [b, t, 1] + logp = logp1 + logp2 + logp3 + logp4 # [b, t, t'] + + attn = ( + monotonic_align.maximum_path(logp, attn_mask.squeeze(1)) + .unsqueeze(1) + .detach() + ) + z_m = torch.matmul( + attn.squeeze(1).transpose(1, 2), x_m.transpose(1, 2) + ).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + z_logs = torch.matmul( + attn.squeeze(1).transpose(1, 2), x_logs.transpose(1, 2) + ).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + logw_ = torch.log(1e-8 + torch.sum(attn, -1)) * x_mask + return ( + (z, z_m, z_logs, logdet, z_mask), + (x_m, x_logs, x_mask), + (attn, logw, logw_), + ) + + def preprocess(self, y, y_lengths, y_max_length): + if y_max_length is not None: + y_max_length = (y_max_length // self.n_sqz) * self.n_sqz + y = y[:, :, :y_max_length] + y_lengths = (y_lengths // self.n_sqz) * self.n_sqz + return y, y_lengths, y_max_length + + def store_inverse(self): + self.decoder.store_inverse() diff --git a/vakyansh-tts/src/glow_tts/modules.py b/vakyansh-tts/src/glow_tts/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..a192251aaccb036780d77d6c8b538b652a5e24e2 --- /dev/null +++ b/vakyansh-tts/src/glow_tts/modules.py @@ -0,0 +1,276 @@ +import copy +import math +import numpy as np +import scipy +import torch +from torch import nn +from torch.nn import functional as F + +import commons + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-4): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = 
nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + n_dims = len(x.shape) + mean = torch.mean(x, 1, keepdim=True) + variance = torch.mean((x - mean) ** 2, 1, keepdim=True) + + x = (x - mean) * torch.rsqrt(variance + self.eps) + + shape = [1, -1] + [1] * (n_dims - 2) + x = x * self.gamma.view(*shape) + self.beta.view(*shape) + return x + + +class ConvReluNorm(nn.Module): + def __init__( + self, + in_channels, + hidden_channels, + out_channels, + kernel_size, + n_layers, + p_dropout, + ): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." + + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append( + nn.Conv1d( + in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) + for _ in range(n_layers - 1): + self.conv_layers.append( + nn.Conv1d( + hidden_channels, + hidden_channels, + kernel_size, + padding=kernel_size // 2, + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class WN(torch.nn.Module): + def __init__( + self, + in_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + p_dropout=0, + ): + super(WN, self).__init__() + assert kernel_size % 2 == 1 + assert hidden_channels % 2 == 0 + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.kernel_size = (kernel_size,) + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(p_dropout) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d( + gin_channels, 2 * hidden_channels * n_layers, 1 + ) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") + + for i in range(n_layers): + dilation = dilation_rate ** i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d( + hidden_channels, + 2 * hidden_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x, x_mask=None, g=None, **kwargs): + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = self.cond_layer(g) + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + x_in = self.drop(x_in) + if g is not None: + cond_offset = i * 2 * 
self.hidden_channels + g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] + else: + g_l = torch.zeros_like(x_in) + + acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + x = (x + res_skip_acts[:, : self.hidden_channels, :]) * x_mask + output = output + res_skip_acts[:, self.hidden_channels :, :] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) + + +class ActNorm(nn.Module): + def __init__(self, channels, ddi=False, **kwargs): + super().__init__() + self.channels = channels + self.initialized = not ddi + + self.logs = nn.Parameter(torch.zeros(1, channels, 1)) + self.bias = nn.Parameter(torch.zeros(1, channels, 1)) + + def forward(self, x, x_mask=None, reverse=False, **kwargs): + if x_mask is None: + x_mask = torch.ones(x.size(0), 1, x.size(2)).to( + device=x.device, dtype=x.dtype + ) + x_len = torch.sum(x_mask, [1, 2]) + if not self.initialized: + self.initialize(x, x_mask) + self.initialized = True + + if reverse: + z = (x - self.bias) * torch.exp(-self.logs) * x_mask + logdet = None + else: + z = (self.bias + torch.exp(self.logs) * x) * x_mask + logdet = torch.sum(self.logs) * x_len # [b] + + return z, logdet + + def store_inverse(self): + pass + + def set_ddi(self, ddi): + self.initialized = not ddi + + def initialize(self, x, x_mask): + with torch.no_grad(): + denom = torch.sum(x_mask, [0, 2]) + m = torch.sum(x * x_mask, [0, 2]) / denom + m_sq = torch.sum(x * x * x_mask, [0, 2]) / denom + v = m_sq - (m ** 2) + logs = 0.5 * torch.log(torch.clamp_min(v, 1e-6)) + + bias_init = ( + (-m * torch.exp(-logs)).view(*self.bias.shape).to(dtype=self.bias.dtype) + ) + logs_init = (-logs).view(*self.logs.shape).to(dtype=self.logs.dtype) + + self.bias.data.copy_(bias_init) + self.logs.data.copy_(logs_init) + + +class InvConvNear(nn.Module): + def __init__(self, channels, n_split=4, no_jacobian=False, **kwargs): + super().__init__() + assert n_split % 2 == 0 + self.channels = channels + self.n_split = n_split + self.no_jacobian = no_jacobian + + w_init = torch.qr(torch.FloatTensor(self.n_split, self.n_split).normal_())[0] + if torch.det(w_init) < 0: + w_init[:, 0] = -1 * w_init[:, 0] + self.weight = nn.Parameter(w_init) + + def forward(self, x, x_mask=None, reverse=False, **kwargs): + b, c, t = x.size() + assert c % self.n_split == 0 + if x_mask is None: + x_mask = 1 + x_len = torch.ones((b,), dtype=x.dtype, device=x.device) * t + else: + x_len = torch.sum(x_mask, [1, 2]) + + x = x.view(b, 2, c // self.n_split, self.n_split // 2, t) + x = ( + x.permute(0, 1, 3, 2, 4) + .contiguous() + .view(b, self.n_split, c // self.n_split, t) + ) + + if reverse: + if hasattr(self, "weight_inv"): + weight = self.weight_inv + else: + weight = torch.inverse(self.weight.float()).to(dtype=self.weight.dtype) + logdet = None + else: + weight = self.weight + if self.no_jacobian: + logdet = 0 + else: + logdet = torch.logdet(self.weight) * (c / self.n_split) * x_len # [b] + + weight = weight.view(self.n_split, self.n_split, 1, 1) + z = F.conv2d(x, weight) + + z = z.view(b, 2, self.n_split // 2, c // self.n_split, t) + z = z.permute(0, 1, 3, 2, 4).contiguous().view(b, c, t) * x_mask + return z, logdet + + def store_inverse(self): + 
self.weight_inv = torch.inverse(self.weight.float()).to(dtype=self.weight.dtype) diff --git a/vakyansh-tts/src/glow_tts/monotonic_align/monotonic_align/__init__.py b/vakyansh-tts/src/glow_tts/monotonic_align/monotonic_align/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..47a4dbf3177302af6b8e7d08b0b78343b1329efa --- /dev/null +++ b/vakyansh-tts/src/glow_tts/monotonic_align/monotonic_align/__init__.py @@ -0,0 +1,5 @@ +import pkg_resources + +__version__ = pkg_resources.get_distribution("monotonic_align").version + +from monotonic_align.mas import * diff --git a/vakyansh-tts/src/glow_tts/monotonic_align/monotonic_align/core.pyx b/vakyansh-tts/src/glow_tts/monotonic_align/monotonic_align/core.pyx new file mode 100644 index 0000000000000000000000000000000000000000..6aabccc4c408cb1b555e2abb4d73e0d1ce4d346e --- /dev/null +++ b/vakyansh-tts/src/glow_tts/monotonic_align/monotonic_align/core.pyx @@ -0,0 +1,45 @@ +import numpy as np +cimport numpy as np +cimport cython +from cython.parallel import prange + + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil: + cdef int x + cdef int y + cdef float v_prev + cdef float v_cur + cdef float tmp + cdef int index = t_x - 1 + + for y in range(t_y): + for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): + if x == y: + v_cur = max_neg_val + else: + v_cur = value[x, y-1] + if x == 0: + if y == 0: + v_prev = 0. + else: + v_prev = max_neg_val + else: + v_prev = value[x-1, y-1] + value[x, y] = max(v_cur, v_prev) + value[x, y] + + for y in range(t_y - 1, -1, -1): + path[index, y] = 1 + if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]): + index = index - 1 + + +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil: + cdef int b = values.shape[0] + + cdef int i + for i in prange(b, nogil=True): + maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val) diff --git a/vakyansh-tts/src/glow_tts/monotonic_align/monotonic_align/mas.py b/vakyansh-tts/src/glow_tts/monotonic_align/monotonic_align/mas.py new file mode 100644 index 0000000000000000000000000000000000000000..207ab3e858389ec06c902fd6f5bec6c5da2996af --- /dev/null +++ b/vakyansh-tts/src/glow_tts/monotonic_align/monotonic_align/mas.py @@ -0,0 +1,57 @@ +from typing import overload +import numpy as np +import torch +from monotonic_align.core import maximum_path_c + + +def mask_from_len(lens: torch.Tensor, max_len=None): + """ + Make a `mask` from lens. + + :param inputs: (B, T, D) + :param lens: (B) + + :return: + `mask`: (B, T) + """ + if max_len is None: + max_len = lens.max() + index = torch.arange(max_len).to(lens).view(1, -1) + return index < lens.unsqueeze(1) # (B, T) + + +def mask_from_lens( + similarity: torch.Tensor, + symbol_lens: torch.Tensor, + mel_lens: torch.Tensor, +): + """ + :param similarity: (B, S, T) + :param symbol_lens: (B,) + :param mel_lens: (B,) + """ + _, S, T = similarity.size() + mask_S = mask_from_len(symbol_lens, S) + mask_T = mask_from_len(mel_lens, T) + mask_ST = mask_S.unsqueeze(2) * mask_T.unsqueeze(1) + return mask_ST.to(similarity) + + +def maximum_path(value, mask=None): + """Cython optimised version. 
+ value: [b, t_x, t_y] + mask: [b, t_x, t_y] + """ + if mask is None: + mask = torch.zeros_like(value) + + value = value * mask + device = value.device + dtype = value.dtype + value = value.data.cpu().numpy().astype(np.float32) + path = np.zeros_like(value).astype(np.int32) + mask = mask.data.cpu().numpy() + t_x_max = mask.sum(1)[:, 0].astype(np.int32) + t_y_max = mask.sum(2)[:, 0].astype(np.int32) + maximum_path_c(path, value, t_x_max, t_y_max) + return torch.from_numpy(path).to(device=device, dtype=dtype) diff --git a/vakyansh-tts/src/glow_tts/monotonic_align/pyproject.toml b/vakyansh-tts/src/glow_tts/monotonic_align/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..ea6358a08fd8d6fc177e2361a82e1a5cc7b837d9 --- /dev/null +++ b/vakyansh-tts/src/glow_tts/monotonic_align/pyproject.toml @@ -0,0 +1,7 @@ +[build-system] +requires = [ + "wheel", + "setuptools", + "cython>=0.24.0", + "numpy= win_length + # get window and zero center pad it to filter_length + fft_window = get_window(window, win_length, fftbins=True) + fft_window = pad_center(fft_window, filter_length) + fft_window = torch.from_numpy(fft_window).float() + + # window the bases + forward_basis *= fft_window + inverse_basis *= fft_window + + self.register_buffer("forward_basis", forward_basis.float()) + self.register_buffer("inverse_basis", inverse_basis.float()) + + def transform(self, input_data): + num_batches = input_data.size(0) + num_samples = input_data.size(1) + + self.num_samples = num_samples + + if input_data.device.type == "cuda": + # similar to librosa, reflect-pad the input + input_data = input_data.view(num_batches, 1, num_samples) + input_data = F.pad( + input_data.unsqueeze(1), + (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), + mode="reflect", + ) + input_data = input_data.squeeze(1) + + forward_transform = F.conv1d( + input_data, self.forward_basis, stride=self.hop_length, padding=0 + ) + + cutoff = int((self.filter_length / 2) + 1) + real_part = forward_transform[:, :cutoff, :] + imag_part = forward_transform[:, cutoff:, :] + else: + x = input_data.detach().numpy() + real_part = [] + imag_part = [] + for y in x: + y_ = stft( + y, self.filter_length, self.hop_length, self.win_length, self.window + ) + real_part.append(y_.real[None, :, :]) + imag_part.append(y_.imag[None, :, :]) + real_part = np.concatenate(real_part, 0) + imag_part = np.concatenate(imag_part, 0) + + real_part = torch.from_numpy(real_part).to(input_data.dtype) + imag_part = torch.from_numpy(imag_part).to(input_data.dtype) + + magnitude = torch.sqrt(real_part ** 2 + imag_part ** 2) + phase = torch.atan2(imag_part.data, real_part.data) + + return magnitude, phase + + def inverse(self, magnitude, phase): + recombine_magnitude_phase = torch.cat( + [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 + ) + + if magnitude.device.type == "cuda": + inverse_transform = F.conv_transpose1d( + recombine_magnitude_phase, + self.inverse_basis, + stride=self.hop_length, + padding=0, + ) + + if self.window is not None: + window_sum = window_sumsquare( + self.window, + magnitude.size(-1), + hop_length=self.hop_length, + win_length=self.win_length, + n_fft=self.filter_length, + dtype=np.float32, + ) + # remove modulation effects + approx_nonzero_indices = torch.from_numpy( + np.where(window_sum > tiny(window_sum))[0] + ) + window_sum = torch.from_numpy(window_sum).to(inverse_transform.device) + inverse_transform[:, :, approx_nonzero_indices] /= window_sum[ + approx_nonzero_indices + ] + + # scale by 
hop ratio + inverse_transform *= float(self.filter_length) / self.hop_length + + inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :] + inverse_transform = inverse_transform[ + :, :, : -int(self.filter_length / 2) : + ] + inverse_transform = inverse_transform.squeeze(1) + else: + x_org = recombine_magnitude_phase.detach().numpy() + n_b, n_f, n_t = x_org.shape + x = np.empty([n_b, n_f // 2, n_t], dtype=np.complex64) + x.real = x_org[:, : n_f // 2] + x.imag = x_org[:, n_f // 2 :] + inverse_transform = [] + for y in x: + y_ = istft(y, self.hop_length, self.win_length, self.window) + inverse_transform.append(y_[None, :]) + inverse_transform = np.concatenate(inverse_transform, 0) + inverse_transform = torch.from_numpy(inverse_transform).to( + recombine_magnitude_phase.dtype + ) + + return inverse_transform + + def forward(self, input_data): + self.magnitude, self.phase = self.transform(input_data) + reconstruction = self.inverse(self.magnitude, self.phase) + return reconstruction diff --git a/vakyansh-tts/src/glow_tts/t2s_fastapi.py b/vakyansh-tts/src/glow_tts/t2s_fastapi.py new file mode 100644 index 0000000000000000000000000000000000000000..e034fc01a4a5bcd54b365a49dad2e907b57504a1 --- /dev/null +++ b/vakyansh-tts/src/glow_tts/t2s_fastapi.py @@ -0,0 +1,63 @@ +from starlette.responses import StreamingResponse +from texttospeech import MelToWav, TextToMel +from typing import Optional +from pydantic import BaseModel +from fastapi import FastAPI, HTTPException +import uvicorn +import base64 + +app = FastAPI() + + +class TextJson(BaseModel): + text: str + lang: Optional[str] = "hi" + gender: Optional[str] = "male" + + +glow_hi_male = TextToMel(glow_model_dir="", device="") +glow_hi_female = TextToMel(glow_model_dir="", device="") +hifi_hi = MelToWav(hifi_model_dir="", device="") + + +available_choice = { + "hi_male": [glow_hi_male, hifi_hi], + "hi_female": [glow_hi_female, hifi_hi], +} + + +@app.post("/TTS/") +async def tts(input: TextJson): + text = input.text + lang = input.lang + gender = input.gender + + choice = lang + "_" + gender + if choice in available_choice.keys(): + t2s = available_choice[choice] + else: + raise HTTPException( + status_code=400, detail={"error": "Requested model not found"} + ) + + if text: + mel = t2s[0].generate_mel(text) + data, sr = t2s[1].generate_wav(mel) + t2s.save_audio("out.wav", data, sr) + else: + raise HTTPException(status_code=400, detail={"error": "No text"}) + + ## to return outpur as a file + # audio = open('out.wav', mode='rb') + # return StreamingResponse(audio, media_type="audio/wav") + + with open("out.wav", "rb") as audio_file: + encoded_bytes = base64.b64encode(audio_file.read()) + encoded_string = encoded_bytes.decode() + return {"encoding": "base64", "data": encoded_string, "sr": sr} + + +if __name__ == "__main__": + uvicorn.run( + "t2s_fastapi:app", host="127.0.0.1", port=5000, log_level="info", reload=True + ) diff --git a/vakyansh-tts/src/glow_tts/t2s_gradio.py b/vakyansh-tts/src/glow_tts/t2s_gradio.py new file mode 100644 index 0000000000000000000000000000000000000000..bd9acbe68761759ff259f4476bb3df57a75c78ff --- /dev/null +++ b/vakyansh-tts/src/glow_tts/t2s_gradio.py @@ -0,0 +1,24 @@ +import gradio as gr +from texttospeech import TextToMel, MelToWav + +text_to_mel = TextToMel( + glow_model_dir="/path/to/glow-tts/checkpoint/dir", device="cuda" +) +mel_to_wav = MelToWav(hifi_model_dir="/path/to/glow-tts/checkpoint/dir", device="cuda") + + +def run_tts(text): + mel = text_to_mel.generate_mel(text) + audio, sr = 
mel_to_wav.generate_wav(mel) + return (sr, audio) + + +# text = " सीआईएसएफ में उप-निरीक्षक महावीर प्रसाद गोदरा को मरणोपरांत 'शौर्य चक्र' से सम्मानित किया गया। " +# run_tts(text) + +textbox = gr.inputs.Textbox( + placeholder="Enter Telugu text here", default="", label="TTS" +) +op = gr.outputs.Audio(type="numpy", label=None) +iface = gr.Interface(fn=run_tts, inputs=textbox, outputs=op) +iface.launch(share=True) diff --git a/vakyansh-tts/src/glow_tts/text/__init__.py b/vakyansh-tts/src/glow_tts/text/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3f5aa62bfcd56165b85d064f5ca0ba59fbe34a72 --- /dev/null +++ b/vakyansh-tts/src/glow_tts/text/__init__.py @@ -0,0 +1,84 @@ +""" from https://github.com/keithito/tacotron """ +import re +from text import cleaners + +# Regular expression matching text enclosed in curly braces: +_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') + + +def get_arpabet(word, dictionary): + word_arpabet = dictionary.lookup(word) + if word_arpabet is not None: + return "{" + word_arpabet[0] + "}" + else: + return word + + +def text_to_sequence(text, symbols, cleaner_names, dictionary=None): + '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. + + The text can optionally have ARPAbet sequences enclosed in curly braces embedded + in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." + + Args: + text: string to convert to a sequence + cleaner_names: names of the cleaner functions to run the text through + dictionary: arpabet class with arpabet dictionary + + Returns: + List of integers corresponding to the symbols in the text + ''' + # Mappings from symbol to numeric ID and vice versa: + global _id_to_symbol, _symbol_to_id + _symbol_to_id = {s: i for i, s in enumerate(symbols)} + _id_to_symbol = {i: s for i, s in enumerate(symbols)} + + sequence = [] + + space = _symbols_to_sequence(' ') + # Check for curly braces and treat their contents as ARPAbet: + while len(text): + m = _curly_re.match(text) + if not m: + clean_text = _clean_text(text, cleaner_names) + if dictionary is not None: + clean_text = [get_arpabet(w, dictionary) for w in clean_text.split(" ")] + for i in range(len(clean_text)): + t = clean_text[i] + if t.startswith("{"): + sequence += _arpabet_to_sequence(t[1:-1]) + else: + sequence += _symbols_to_sequence(t) + sequence += space + else: + sequence += _symbols_to_sequence(clean_text) + break + sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) + sequence += _arpabet_to_sequence(m.group(2)) + text = m.group(3) + + # remove trailing space + if dictionary is not None: + sequence = sequence[:-1] if sequence[-1] == space[0] else sequence + return sequence + + +def _clean_text(text, cleaner_names): + for name in cleaner_names: + cleaner = getattr(cleaners, name) + if not cleaner: + raise Exception('Unknown cleaner: %s' % name) + text = cleaner(text) + return text + + +def _symbols_to_sequence(symbols): + return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] + + +def _arpabet_to_sequence(text): + return _symbols_to_sequence(['@' + s for s in text.split()]) + + +def _should_keep_symbol(s): + return s in _symbol_to_id and s is not '_' and s is not '~' \ No newline at end of file diff --git a/vakyansh-tts/src/glow_tts/text/cleaners.py b/vakyansh-tts/src/glow_tts/text/cleaners.py new file mode 100644 index 0000000000000000000000000000000000000000..a7d4e029baa436e88e4d68090e886afdd998a68d --- /dev/null +++ b/vakyansh-tts/src/glow_tts/text/cleaners.py @@ -0,0 +1,78 
@@ +import re + +from unidecode import unidecode +from .numbers import normalize_numbers + + + + +# Regular expression matching whitespace: +_whitespace_re = re.compile(r"\s+") + +def lowercase(text): + return text.lower() + +def collapse_whitespace(text): + return re.sub(_whitespace_re, " ", text) + +def basic_indic_cleaners(text): + """Basic pipeline that collapses whitespace without transliteration.""" + text = collapse_whitespace(text) + return text + + +def english_cleaner(text): + text = text.lower().replace('‘','\'').replace('’','\'') + return text + + +def lowercase(text): + return text.lower() + +def convert_to_ascii(text): + return unidecode(text) + +def expand_numbers(text): + return normalize_numbers(text) + +def expand_abbreviations(text): + for regex, replacement in _abbreviations: + text = re.sub(regex, replacement, text) + return text + +_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ + ('mrs', 'missus'), + ('mr', 'mister'), + ('dr', 'doctor'), + ('st', 'saint'), + ('co', 'company'), + ('jr', 'junior'), + ('maj', 'major'), + ('gen', 'general'), + ('drs', 'doctors'), + ('rev', 'reverend'), + ('lt', 'lieutenant'), + ('hon', 'honorable'), + ('sgt', 'sergeant'), + ('capt', 'captain'), + ('esq', 'esquire'), + ('ltd', 'limited'), + ('col', 'colonel'), + ('ft', 'fort'), + ('pvt', 'private'), + ('rs', 'Rupees') +]] + + + + + + +def english_cleaners(text): + '''Pipeline for English text, including number and abbreviation expansion.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_numbers(text) + text = expand_abbreviations(text) + text = collapse_whitespace(text) + return text diff --git a/vakyansh-tts/src/glow_tts/text/numbers.py b/vakyansh-tts/src/glow_tts/text/numbers.py new file mode 100644 index 0000000000000000000000000000000000000000..491634d692ee71e7ea0e5213b513e15be825c9b2 --- /dev/null +++ b/vakyansh-tts/src/glow_tts/text/numbers.py @@ -0,0 +1,69 @@ +import inflect +import re + + +_inflect = inflect.engine() +_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') +_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') +_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') +_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') +_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +_number_re = re.compile(r'[0-9]+') + + +def _remove_commas(m): + return m.group(1).replace(',', '') + + +def _expand_decimal_point(m): + return m.group(1).replace('.', ' point ') + + +def _expand_dollars(m): + match = m.group(1) + parts = match.split('.') + if len(parts) > 2: + return match + ' dollars' # Unexpected format + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) + elif dollars: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + return '%s %s' % (dollars, dollar_unit) + elif cents: + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s' % (cents, cent_unit) + else: + return 'zero dollars' + + +def _expand_ordinal(m): + return _inflect.number_to_words(m.group(0)) + + +def _expand_number(m): + num = int(m.group(0)) + if num > 1000 and num < 3000: + if num == 2000: + return 'two thousand' + elif num > 2000 and num < 2010: + return 'two thousand ' + _inflect.number_to_words(num % 100) + elif num % 100 == 0: + return _inflect.number_to_words(num // 100) + ' hundred' + else: + return 
_inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') + else: + return _inflect.number_to_words(num, andword='') + + +def normalize_numbers(text): + text = re.sub(_comma_number_re, _remove_commas, text) + text = re.sub(_pounds_re, r'\1 pounds', text) + text = re.sub(_dollars_re, _expand_dollars, text) + text = re.sub(_decimal_number_re, _expand_decimal_point, text) + text = re.sub(_ordinal_re, _expand_ordinal, text) + text = re.sub(_number_re, _expand_number, text) + return text \ No newline at end of file diff --git a/vakyansh-tts/src/glow_tts/texttospeech.py b/vakyansh-tts/src/glow_tts/texttospeech.py new file mode 100644 index 0000000000000000000000000000000000000000..3c88925cac0c56e52d35acfa5d6d7e5ce51329c7 --- /dev/null +++ b/vakyansh-tts/src/glow_tts/texttospeech.py @@ -0,0 +1,146 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +from typing import Tuple + +from scipy.io.wavfile import write +from hifi.env import AttrDict +from hifi.models import Generator + +import numpy as np +import os +import json + +import torch +from text import text_to_sequence +import commons +import models +import utils +import sys +from argparse import ArgumentParser + + +def check_directory(dir): + if not os.path.exists(dir): + sys.exit("Error: {} directory does not exist".format(dir)) + + +class TextToMel: + def __init__(self, glow_model_dir, device="cuda"): + self.glow_model_dir = glow_model_dir + check_directory(self.glow_model_dir) + self.device = device + self.hps, self.glow_tts_model = self.load_glow_tts() + pass + + def load_glow_tts(self): + hps = utils.get_hparams_from_dir(self.glow_model_dir) + checkpoint_path = utils.latest_checkpoint_path(self.glow_model_dir) + symbols = list(hps.data.punc) + list(hps.data.chars) + glow_tts_model = models.FlowGenerator( + len(symbols) + getattr(hps.data, "add_blank", False), + out_channels=hps.data.n_mel_channels, + **hps.model + ) # .to(self.device) + + if self.device == "cuda": + glow_tts_model.to("cuda") + + utils.load_checkpoint(checkpoint_path, glow_tts_model) + glow_tts_model.decoder.store_inverse() + _ = glow_tts_model.eval() + + return hps, glow_tts_model + + def generate_mel(self, text, noise_scale=0.667, length_scale=1.0): + symbols = list(self.hps.data.punc) + list(self.hps.data.chars) + cleaner = self.hps.data.text_cleaners + if getattr(self.hps.data, "add_blank", False): + text_norm = text_to_sequence(text, symbols, cleaner) + text_norm = commons.intersperse(text_norm, len(symbols)) + else: # If not using "add_blank" option during training, adding spaces at the beginning and the end of utterance improves quality + text = " " + text.strip() + " " + text_norm = text_to_sequence(text, symbols, cleaner) + + sequence = np.array(text_norm)[None, :] + + if self.device == "cuda": + x_tst = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long() + x_tst_lengths = torch.tensor([x_tst.shape[1]]).cuda() + else: + x_tst = torch.autograd.Variable(torch.from_numpy(sequence)).long() + x_tst_lengths = torch.tensor([x_tst.shape[1]]) + + with torch.no_grad(): + (y_gen_tst, *_), *_, (attn_gen, *_) = self.glow_tts_model( + x_tst, + x_tst_lengths, + gen=True, + noise_scale=noise_scale, + length_scale=length_scale, + ) + + return y_gen_tst + #return y_gen_tst.cpu().detach().numpy() + + +class MelToWav: + def __init__(self, hifi_model_dir, device="cuda"): + self.hifi_model_dir = hifi_model_dir + check_directory(self.hifi_model_dir) + self.device = device + self.h, self.hifi_gan_generator = 
self.load_hifi_gan() + pass + + def load_hifi_gan(self): + checkpoint_path = utils.latest_checkpoint_path(self.hifi_model_dir, regex="g_*") + config_file = os.path.join(self.hifi_model_dir, "config.json") + data = open(config_file).read() + json_config = json.loads(data) + h = AttrDict(json_config) + torch.manual_seed(h.seed) + + generator = Generator(h).to(self.device) + + assert os.path.isfile(checkpoint_path) + print("Loading '{}'".format(checkpoint_path)) + state_dict_g = torch.load(checkpoint_path, map_location=self.device) + print("Complete.") + + generator.load_state_dict(state_dict_g["generator"]) + + generator.eval() + generator.remove_weight_norm() + + return h, generator + + def generate_wav(self, mel): + #mel = torch.FloatTensor(mel).to(self.device) + + y_g_hat = self.hifi_gan_generator(mel.to(self.device)) # passing through vocoder + audio = y_g_hat.squeeze() + audio = audio * 32768.0 + audio = audio.cpu().detach().numpy().astype("int16") + + return audio, self.h.sampling_rate + + + + + +if __name__ == "__main__": + + parser = ArgumentParser() + parser.add_argument("-m", "--model", required=True, type=str) + parser.add_argument("-g", "--gan", required=True, type=str) + parser.add_argument("-d", "--device", type=str, default="cpu") + parser.add_argument("-t", "--text", type=str, required=True) + parser.add_argument("-w", "--wav", type=str, required=True) + + args = parser.parse_args() + + text_to_mel = TextToMel(glow_model_dir=args.model, device=args.device) + mel_to_wav = MelToWav(hifi_model_dir=args.gan, device=args.device) + + mel = text_to_mel.generate_mel(args.text) + audio, sr = mel_to_wav.generate_wav(mel) + + write(filename=args.wav, rate=sr, data=audio) \ No newline at end of file diff --git a/vakyansh-tts/src/glow_tts/train.py b/vakyansh-tts/src/glow_tts/train.py new file mode 100644 index 0000000000000000000000000000000000000000..79bf515a707b309e82e9686c140658f23acf1b91 --- /dev/null +++ b/vakyansh-tts/src/glow_tts/train.py @@ -0,0 +1,286 @@ +import os +import json +import argparse +import math +import torch +from torch import nn, optim +from torch.nn import functional as F +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +import torch.multiprocessing as mp +import torch.distributed as dist +from apex.parallel import DistributedDataParallel as DDP +from apex import amp + +from data_utils import TextMelLoader, TextMelCollate +import models +import commons +import utils + + +global_step = 0 + + +def main(): + """Assume Single Node Multi GPUs Training Only""" + assert torch.cuda.is_available(), "CPU training is not allowed." 
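For reference, the two classes defined in `texttospeech.py` above compose into a complete text-to-wav pipeline. A minimal programmatic sketch of the same flow as that file's `__main__` block, with placeholder checkpoint directories (not paths shipped with this diff):

```
# Illustrative only: mirrors the __main__ entry point of texttospeech.py.
# The checkpoint directories are placeholders for trained glow-tts / hifi-gan
# model folders (each containing config.json and checkpoint files).
from scipy.io.wavfile import write
from texttospeech import TextToMel, MelToWav

text_to_mel = TextToMel(glow_model_dir="checkpoints/glow/hi", device="cpu")
mel_to_wav = MelToWav(hifi_model_dir="checkpoints/hifi/hi", device="cpu")

mel = text_to_mel.generate_mel("नमस्ते", noise_scale=0.667, length_scale=1.0)
audio, sr = mel_to_wav.generate_wav(mel)
write(filename="out.wav", rate=sr, data=audio)
```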
+ + n_gpus = torch.cuda.device_count() + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "80000" + + hps = utils.get_hparams() + mp.spawn( + train_and_eval, + nprocs=n_gpus, + args=( + n_gpus, + hps, + ), + ) + + +def train_and_eval(rank, n_gpus, hps): + global global_step + if rank == 0: + logger = utils.get_logger(hps.log_dir) + logger.info(hps) + utils.check_git_hash(hps.log_dir) + writer = SummaryWriter(log_dir=hps.log_dir) + writer_eval = SummaryWriter(log_dir=os.path.join(hps.log_dir, "eval")) + + dist.init_process_group( + backend="nccl", init_method="env://", world_size=n_gpus, rank=rank + ) + torch.manual_seed(hps.train.seed) + torch.cuda.set_device(rank) + + train_dataset = TextMelLoader(hps.data.training_files, hps.data) + train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset, num_replicas=n_gpus, rank=rank, shuffle=True + ) + collate_fn = TextMelCollate(1) + train_loader = DataLoader( + train_dataset, + num_workers=8, + shuffle=False, + batch_size=hps.train.batch_size, + pin_memory=True, + drop_last=True, + collate_fn=collate_fn, + sampler=train_sampler, + ) + if rank == 0: + val_dataset = TextMelLoader(hps.data.validation_files, hps.data) + val_loader = DataLoader( + val_dataset, + num_workers=8, + shuffle=False, + batch_size=hps.train.batch_size, + pin_memory=True, + drop_last=True, + collate_fn=collate_fn, + ) + symbols = hps.data.punc + hps.data.chars + generator = models.FlowGenerator( + n_vocab=len(symbols) + getattr(hps.data, "add_blank", False), + out_channels=hps.data.n_mel_channels, + **hps.model + ).cuda(rank) + optimizer_g = commons.Adam( + generator.parameters(), + scheduler=hps.train.scheduler, + dim_model=hps.model.hidden_channels, + warmup_steps=hps.train.warmup_steps, + lr=hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) + if hps.train.fp16_run: + generator, optimizer_g._optim = amp.initialize( + generator, optimizer_g._optim, opt_level="O1" + ) + generator = DDP(generator) + epoch_str = 1 + global_step = 0 + try: + _, _, _, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), + generator, + optimizer_g, + ) + epoch_str += 1 + optimizer_g.step_num = (epoch_str - 1) * len(train_loader) + optimizer_g._update_learning_rate() + global_step = (epoch_str - 1) * len(train_loader) + except: + if hps.train.ddi and os.path.isfile(os.path.join(hps.model_dir, "ddi_G.pth")): + _ = utils.load_checkpoint( + os.path.join(hps.model_dir, "ddi_G.pth"), generator, optimizer_g + ) + + for epoch in range(epoch_str, hps.train.epochs + 1): + if rank == 0: + train( + rank, epoch, hps, generator, optimizer_g, train_loader, logger, writer + ) + evaluate( + rank, + epoch, + hps, + generator, + optimizer_g, + val_loader, + logger, + writer_eval, + ) + if epoch % hps.train.save_epoch == 0: + utils.save_checkpoint( + generator, + optimizer_g, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "G_{}.pth".format(epoch)), + ) + else: + train(rank, epoch, hps, generator, optimizer_g, train_loader, None, None) + + +def train(rank, epoch, hps, generator, optimizer_g, train_loader, logger, writer): + train_loader.sampler.set_epoch(epoch) + global global_step + + generator.train() + for batch_idx, (x, x_lengths, y, y_lengths) in enumerate(train_loader): + x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda( + rank, non_blocking=True + ) + y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda( + rank, non_blocking=True + ) + + # Train Generator + 
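The `commons.mle_loss` and `commons.duration_loss` helpers invoked a few lines below are not part of this excerpt; as an assumption, a sketch of the standard Glow-TTS definitions they are expected to follow (not necessarily the shipped `commons.py`):

```
# Assumed Glow-TTS loss formulation; commons.py itself is not in this diff.
import math
import torch

def mle_loss(z, m, logs, logdet, mask):
    # negative log-likelihood of z under N(m, exp(logs)^2) ...
    l = torch.sum(logs) + 0.5 * torch.sum(torch.exp(-2 * logs) * ((z - m) ** 2))
    # ... minus the flow's log-Jacobian determinant
    l = l - torch.sum(logdet)
    # average over all masked elements, then add the Gaussian constant term
    l = l / torch.sum(torch.ones_like(z) * mask)
    l = l + 0.5 * math.log(2 * math.pi)
    return l

def duration_loss(logw, logw_, lengths):
    # mean squared error between predicted and target log-durations
    return torch.sum((logw - logw_) ** 2) / torch.sum(lengths)
```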
optimizer_g.zero_grad() + + ( + (z, z_m, z_logs, logdet, z_mask), + (x_m, x_logs, x_mask), + (attn, logw, logw_), + ) = generator(x, x_lengths, y, y_lengths, gen=False) + l_mle = commons.mle_loss(z, z_m, z_logs, logdet, z_mask) + l_length = commons.duration_loss(logw, logw_, x_lengths) + + loss_gs = [l_mle, l_length] + loss_g = sum(loss_gs) + + if hps.train.fp16_run: + with amp.scale_loss(loss_g, optimizer_g._optim) as scaled_loss: + scaled_loss.backward() + grad_norm = commons.clip_grad_value_( + amp.master_params(optimizer_g._optim), 5 + ) + else: + loss_g.backward() + grad_norm = commons.clip_grad_value_(generator.parameters(), 5) + optimizer_g.step() + + if rank == 0: + if batch_idx % hps.train.log_interval == 0: + (y_gen, *_), *_ = generator.module(x[:1], x_lengths[:1], gen=True) + logger.info( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_idx * len(x), + len(train_loader.dataset), + 100.0 * batch_idx / len(train_loader), + loss_g.item(), + ) + ) + logger.info( + [x.item() for x in loss_gs] + [global_step, optimizer_g.get_lr()] + ) + + scalar_dict = { + "loss/g/total": loss_g, + "learning_rate": optimizer_g.get_lr(), + "grad_norm": grad_norm, + } + scalar_dict.update( + {"loss/g/{}".format(i): v for i, v in enumerate(loss_gs)} + ) + utils.summarize( + writer=writer, + global_step=global_step, + images={ + "y_org": utils.plot_spectrogram_to_numpy( + y[0].data.cpu().numpy() + ), + "y_gen": utils.plot_spectrogram_to_numpy( + y_gen[0].data.cpu().numpy() + ), + "attn": utils.plot_alignment_to_numpy( + attn[0, 0].data.cpu().numpy() + ), + }, + scalars=scalar_dict, + ) + global_step += 1 + + if rank == 0: + logger.info("====> Epoch: {}".format(epoch)) + + +def evaluate(rank, epoch, hps, generator, optimizer_g, val_loader, logger, writer_eval): + if rank == 0: + global global_step + generator.eval() + losses_tot = [] + with torch.no_grad(): + for batch_idx, (x, x_lengths, y, y_lengths) in enumerate(val_loader): + x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda( + rank, non_blocking=True + ) + y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda( + rank, non_blocking=True + ) + + ( + (z, z_m, z_logs, logdet, z_mask), + (x_m, x_logs, x_mask), + (attn, logw, logw_), + ) = generator(x, x_lengths, y, y_lengths, gen=False) + l_mle = commons.mle_loss(z, z_m, z_logs, logdet, z_mask) + l_length = commons.duration_loss(logw, logw_, x_lengths) + + loss_gs = [l_mle, l_length] + loss_g = sum(loss_gs) + + if batch_idx == 0: + losses_tot = loss_gs + else: + losses_tot = [x + y for (x, y) in zip(losses_tot, loss_gs)] + + if batch_idx % hps.train.log_interval == 0: + logger.info( + "Eval Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_idx * len(x), + len(val_loader.dataset), + 100.0 * batch_idx / len(val_loader), + loss_g.item(), + ) + ) + logger.info([x.item() for x in loss_gs]) + + losses_tot = [x / len(val_loader) for x in losses_tot] + loss_tot = sum(losses_tot) + scalar_dict = {"loss/g/total": loss_tot} + scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_tot)}) + utils.summarize( + writer=writer_eval, global_step=global_step, scalars=scalar_dict + ) + logger.info("====> Epoch: {}".format(epoch)) + + +if __name__ == "__main__": + main() diff --git a/vakyansh-tts/src/glow_tts/utils.py b/vakyansh-tts/src/glow_tts/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a591aa319ccb264110111cda55c4a232b41aae74 --- /dev/null +++ b/vakyansh-tts/src/glow_tts/utils.py @@ -0,0 +1,282 @@ +import 
os +import glob +import sys +import argparse +import logging +import json +import subprocess +import numpy as np +from scipy.io.wavfile import read +import torch + +MATPLOTLIB_FLAG = False + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +logger = logging + + +def load_checkpoint(checkpoint_path, model, optimizer=None): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + iteration = 1 + if "iteration" in checkpoint_dict.keys(): + iteration = checkpoint_dict["iteration"] + if "learning_rate" in checkpoint_dict.keys(): + learning_rate = checkpoint_dict["learning_rate"] + if optimizer is not None and "optimizer" in checkpoint_dict.keys(): + optimizer.load_state_dict(checkpoint_dict["optimizer"]) + saved_state_dict = checkpoint_dict["model"] + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): + try: + new_state_dict[k] = saved_state_dict[k] + except: + logger.info("%s is not in the checkpoint" % k) + new_state_dict[k] = v + if hasattr(model, "module"): + model.module.load_state_dict(new_state_dict) + else: + model.load_state_dict(new_state_dict) + logger.info( + "Loaded checkpoint '{}' (iteration {})".format(checkpoint_path, iteration) + ) + return model, optimizer, learning_rate, iteration + + +def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): + logger.info( + "Saving model and optimizer state at iteration {} to {}".format( + iteration, checkpoint_path + ) + ) + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + torch.save( + { + "model": state_dict, + "iteration": iteration, + "optimizer": optimizer.state_dict(), + "learning_rate": learning_rate, + }, + checkpoint_path, + ) + + +def summarize(writer, global_step, scalars={}, histograms={}, images={}): + for k, v in scalars.items(): + writer.add_scalar(k, v, global_step) + for k, v in histograms.items(): + writer.add_histogram(k, v, global_step) + for k, v in images.items(): + writer.add_image(k, v, global_step, dataformats="HWC") + + +def latest_checkpoint_path(dir_path, regex="G_*.pth"): + f_list = glob.glob(os.path.join(dir_path, regex)) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + x = f_list[-1] + print(x) + return x + + +def plot_spectrogram_to_numpy(spectrogram): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger("matplotlib") + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots() + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def plot_alignment_to_numpy(alignment, info=None): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger("matplotlib") + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow(alignment, aspect="auto", origin="lower", 
interpolation="none") + fig.colorbar(im, ax=ax) + xlabel = "Decoder timestep" + if info is not None: + xlabel += "\n\n" + info + plt.xlabel(xlabel) + plt.ylabel("Encoder timestep") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def load_wav_to_torch(full_path): + sampling_rate, data = read(full_path) + return torch.FloatTensor(data.astype(np.float32)), sampling_rate + + +def load_filepaths_and_text(filename, split="|"): + with open(filename, encoding="utf-8") as f: + filepaths_and_text = [line.strip().split(split) for line in f] + return filepaths_and_text + + +def get_hparams(init=True): + parser = argparse.ArgumentParser() + parser.add_argument("-c", "--config", type=str, help="JSON file for configuration") + parser.add_argument("-m", "--model", type=str, help="Model name") + # parser.add_argument('-g', '--gan', type=str, + # help='Model name') + parser.add_argument("-l", "--logs", type=str, help="logs name") + # parser.add_argument('-s', '--mels', type=str, + # help='logs name') + + args = parser.parse_args() + # model_dir = os.path.join("./logs", args.model) + model_dir = args.model + if not os.path.exists(model_dir): + os.makedirs(model_dir) + + config_path = args.config + config_save_path = os.path.join(model_dir, "config.json") + + # if not config_path : config_path = config_save_path + + if init: + with open(config_path, "r") as f: + data = f.read() + with open(config_save_path, "w") as f: + f.write(data) + else: + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.model_dir = model_dir + hparams.log_dir = args.logs + # hparams.mels_dir = args.mels + # hparams.gan_dir = args.gan + return hparams + + +def get_hparams_from_dir(model_dir): + config_save_path = os.path.join(model_dir, "config.json") + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_file(config_path): + with open(config_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + return hparams + + +def check_git_hash(model_dir): + source_dir = os.path.dirname(os.path.realpath(__file__)) + if not os.path.exists(os.path.join(source_dir, ".git")): + logger.warn( + "{} is not a git repository, therefore hash value comparison will be ignored.".format( + source_dir + ) + ) + return + + cur_hash = subprocess.getoutput("git rev-parse HEAD") + + path = os.path.join(model_dir, "githash") + if os.path.exists(path): + saved_hash = open(path).read() + if saved_hash != cur_hash: + logger.warn( + "git hash values are different. 
{}(saved) != {}(current)".format( + saved_hash[:8], cur_hash[:8] + ) + ) + else: + open(path, "w").write(cur_hash) + + +def get_logger(model_dir, filename="train.log"): + global logger + logger = logging.getLogger(os.path.basename(model_dir)) + logger.setLevel(logging.DEBUG) + + formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + h = logging.FileHandler(os.path.join(model_dir, filename)) + h.setLevel(logging.DEBUG) + h.setFormatter(formatter) + logger.addHandler(h) + return logger + + +class HParams: + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v + + def keys(self): + return self.__dict__.keys() + + def items(self): + return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__() diff --git a/vakyansh-tts/src/hifi_gan/env.py b/vakyansh-tts/src/hifi_gan/env.py new file mode 100644 index 0000000000000000000000000000000000000000..2bdbc95d4f7a8bad8fd4f5eef657e2b51d946056 --- /dev/null +++ b/vakyansh-tts/src/hifi_gan/env.py @@ -0,0 +1,15 @@ +import os +import shutil + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def build_env(config, config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) diff --git a/vakyansh-tts/src/hifi_gan/inference.py b/vakyansh-tts/src/hifi_gan/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..c70ee09b4110677b7cf9732d76a5e6ca93c8860c --- /dev/null +++ b/vakyansh-tts/src/hifi_gan/inference.py @@ -0,0 +1,98 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +import glob +import os +import argparse +import json +import torch +from scipy.io.wavfile import write +from env import AttrDict +from meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav +from models import Generator + +h = None +device = None + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print("Loading '{}'".format(filepath)) + checkpoint_dict = torch.load(filepath, map_location=device) + print("Complete.") + return checkpoint_dict + + +def get_mel(x): + return mel_spectrogram( + x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax + ) + + +def scan_checkpoint(cp_dir, prefix): + pattern = os.path.join(cp_dir, prefix + "*") + cp_list = glob.glob(pattern) + if len(cp_list) == 0: + return "" + return sorted(cp_list)[-1] + + +def inference(a): + generator = Generator(h).to(device) + + state_dict_g = load_checkpoint(a.checkpoint_file, device) + generator.load_state_dict(state_dict_g["generator"]) + + filelist = os.listdir(a.input_wavs_dir) + + os.makedirs(a.output_dir, exist_ok=True) + + generator.eval() + generator.remove_weight_norm() + with torch.no_grad(): + for i, filname in enumerate(filelist): + wav, sr = load_wav(os.path.join(a.input_wavs_dir, filname)) + wav = wav / MAX_WAV_VALUE + wav = torch.FloatTensor(wav).to(device) + x = get_mel(wav.unsqueeze(0)) + y_g_hat = generator(x) + 
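The `HParams` container and `get_hparams_from_file` helper defined in `glow_tts/utils.py` above expose a JSON config through attribute-style access. A small usage sketch; the config path is a placeholder, while the field names mirror ones referenced elsewhere in this diff (`hps.data.n_mel_channels`, `hps.train.batch_size`):

```
# Illustrative only: attribute-style access to a glow-tts config via HParams.
import utils

hps = utils.get_hparams_from_file("config.json")   # placeholder path
print(hps.data.n_mel_channels)     # nested dicts become nested HParams
print(hps["train"]["batch_size"])  # __getitem__ delegates to getattr
print("model" in hps)              # __contains__ checks top-level keys
```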
audio = y_g_hat.squeeze() + audio = audio * MAX_WAV_VALUE + audio = audio.cpu().numpy().astype("int16") + + output_file = os.path.join( + a.output_dir, os.path.splitext(filname)[0] + "_generated.wav" + ) + write(output_file, h.sampling_rate, audio) + print(output_file) + + +def main(): + print("Initializing Inference Process..") + + parser = argparse.ArgumentParser() + parser.add_argument("--input_wavs_dir", default="test_files") + parser.add_argument("--output_dir", default="generated_files") + parser.add_argument("--checkpoint_file", required=True) + a = parser.parse_args() + + config_file = os.path.join(os.path.split(a.checkpoint_file)[0], "config.json") + with open(config_file) as f: + data = f.read() + + global h + json_config = json.loads(data) + h = AttrDict(json_config) + + torch.manual_seed(h.seed) + global device + if torch.cuda.is_available(): + torch.cuda.manual_seed(h.seed) + device = torch.device("cuda") + else: + device = torch.device("cpu") + + inference(a) + + +if __name__ == "__main__": + main() diff --git a/vakyansh-tts/src/hifi_gan/inference_e2e.py b/vakyansh-tts/src/hifi_gan/inference_e2e.py new file mode 100644 index 0000000000000000000000000000000000000000..062aecd4280925336ab1d36420d2cd47febf661c --- /dev/null +++ b/vakyansh-tts/src/hifi_gan/inference_e2e.py @@ -0,0 +1,91 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +import glob +import os +import numpy as np +import argparse +import json +import torch +from scipy.io.wavfile import write +from env import AttrDict +from meldataset import MAX_WAV_VALUE +from models import Generator + +h = None +device = None + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print("Loading '{}'".format(filepath)) + checkpoint_dict = torch.load(filepath, map_location=device) + print("Complete.") + return checkpoint_dict + + +def scan_checkpoint(cp_dir, prefix): + pattern = os.path.join(cp_dir, prefix + "*") + cp_list = glob.glob(pattern) + if len(cp_list) == 0: + return "" + return sorted(cp_list)[-1] + + +def inference(a): + generator = Generator(h).to(device) + + state_dict_g = load_checkpoint(a.checkpoint_file, device) + generator.load_state_dict(state_dict_g["generator"]) + + filelist = os.listdir(a.input_mels_dir) + + os.makedirs(a.output_dir, exist_ok=True) + + generator.eval() + generator.remove_weight_norm() + with torch.no_grad(): + for i, filname in enumerate(filelist): + x = np.load(os.path.join(a.input_mels_dir, filname)) + x = torch.FloatTensor(x).to(device) + y_g_hat = generator(x) + audio = y_g_hat.squeeze() + audio = audio * MAX_WAV_VALUE + audio = audio.cpu().numpy().astype("int16") + + output_file = os.path.join( + a.output_dir, os.path.splitext(filname)[0] + "_generated_e2e.wav" + ) + write(output_file, h.sampling_rate, audio) + print(output_file) + + +def main(): + print("Initializing Inference Process..") + + parser = argparse.ArgumentParser() + parser.add_argument("--input_mels_dir", default="test_mel_files") + parser.add_argument("--output_dir", default="generated_files_from_mel") + parser.add_argument("--checkpoint_file", required=True) + a = parser.parse_args() + + config_file = os.path.join(os.path.split(a.checkpoint_file)[0], "config.json") + with open(config_file) as f: + data = f.read() + + global h + json_config = json.loads(data) + h = AttrDict(json_config) + + torch.manual_seed(h.seed) + global device + if torch.cuda.is_available(): + torch.cuda.manual_seed(h.seed) + device = torch.device("cuda") + else: + device = 
torch.device("cpu") + + inference(a) + + +if __name__ == "__main__": + main() diff --git a/vakyansh-tts/src/hifi_gan/meldataset.py b/vakyansh-tts/src/hifi_gan/meldataset.py new file mode 100644 index 0000000000000000000000000000000000000000..8c6ca9ec8a6cc6408a77492e795bffef7f86b611 --- /dev/null +++ b/vakyansh-tts/src/hifi_gan/meldataset.py @@ -0,0 +1,233 @@ +import math +import os +import random +import torch +import torch.utils.data +import numpy as np +from librosa.util import normalize +from scipy.io.wavfile import read +from librosa.filters import mel as librosa_mel_fn + +MAX_WAV_VALUE = 32768.0 + + +def load_wav(full_path): + sampling_rate, data = read(full_path) + return data, sampling_rate + + +def dynamic_range_compression(x, C=1, clip_val=1e-5): + return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) + + +def dynamic_range_decompression(x, C=1): + return np.exp(x) / C + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + output = dynamic_range_compression_torch(magnitudes) + return output + + +def spectral_de_normalize_torch(magnitudes): + output = dynamic_range_decompression_torch(magnitudes) + return output + + +mel_basis = {} +hann_window = {} + + +def mel_spectrogram( + y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False +): + if torch.min(y) < -1.0: + print("min value is ", torch.min(y)) + if torch.max(y) > 1.0: + print("max value is ", torch.max(y)) + + global mel_basis, hann_window + if fmax not in mel_basis: + mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) + mel_basis[str(fmax) + "_" + str(y.device)] = ( + torch.from_numpy(mel).float().to(y.device) + ) + hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) + + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), + mode="reflect", + ) + y = y.squeeze(1) + + spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[str(y.device)], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + ) + + spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) + + spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec) + spec = spectral_normalize_torch(spec) + + return spec + + +def get_dataset_filelist(a): + with open(a.input_training_file, "r", encoding="utf-8") as fi: + training_files = [x for x in fi.read().split("\n") if len(x) > 0] + + with open(a.input_validation_file, "r", encoding="utf-8") as fi: + validation_files = [x for x in fi.read().split("\n") if len(x) > 0] + return training_files, validation_files + + +class MelDataset(torch.utils.data.Dataset): + def __init__( + self, + training_files, + segment_size, + n_fft, + num_mels, + hop_size, + win_size, + sampling_rate, + fmin, + fmax, + split=True, + shuffle=True, + n_cache_reuse=1, + device=None, + fmax_loss=None, + fine_tuning=False, + base_mels_path=None, + ): + self.audio_files = training_files + random.seed(1234) + if shuffle: + random.shuffle(self.audio_files) + self.segment_size = segment_size + self.sampling_rate = sampling_rate + self.split = split + self.n_fft = n_fft + self.num_mels = num_mels + self.hop_size = hop_size + self.win_size = win_size + self.fmin = fmin + self.fmax = fmax + self.fmax_loss = fmax_loss + self.cached_wav = None + self.n_cache_reuse = n_cache_reuse + 
self._cache_ref_count = 0 + self.device = device + self.fine_tuning = fine_tuning + self.base_mels_path = base_mels_path + + def __getitem__(self, index): + filename = self.audio_files[index] + if self._cache_ref_count == 0: + audio, sampling_rate = load_wav(filename) + audio = audio / MAX_WAV_VALUE + if not self.fine_tuning: + audio = normalize(audio) * 0.95 + self.cached_wav = audio + if sampling_rate != self.sampling_rate: + raise ValueError( + "{} SR doesn't match target {} SR".format( + sampling_rate, self.sampling_rate + ) + ) + self._cache_ref_count = self.n_cache_reuse + else: + audio = self.cached_wav + self._cache_ref_count -= 1 + + audio = torch.FloatTensor(audio) + audio = audio.unsqueeze(0) + + if not self.fine_tuning: + if self.split: + if audio.size(1) >= self.segment_size: + max_audio_start = audio.size(1) - self.segment_size + audio_start = random.randint(0, max_audio_start) + audio = audio[:, audio_start : audio_start + self.segment_size] + else: + audio = torch.nn.functional.pad( + audio, (0, self.segment_size - audio.size(1)), "constant" + ) + + mel = mel_spectrogram( + audio, + self.n_fft, + self.num_mels, + self.sampling_rate, + self.hop_size, + self.win_size, + self.fmin, + self.fmax, + center=False, + ) + else: + mel = np.load( + os.path.join( + self.base_mels_path, + os.path.splitext(os.path.split(filename)[-1])[0] + ".npy", + ) + ) + mel = torch.from_numpy(mel) + + if len(mel.shape) < 3: + mel = mel.unsqueeze(0) + + if self.split: + frames_per_seg = math.ceil(self.segment_size / self.hop_size) + + if audio.size(1) >= self.segment_size: + mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1) + mel = mel[:, :, mel_start : mel_start + frames_per_seg] + audio = audio[ + :, + mel_start + * self.hop_size : (mel_start + frames_per_seg) + * self.hop_size, + ] + else: + mel = torch.nn.functional.pad( + mel, (0, frames_per_seg - mel.size(2)), "constant" + ) + audio = torch.nn.functional.pad( + audio, (0, self.segment_size - audio.size(1)), "constant" + ) + + mel_loss = mel_spectrogram( + audio, + self.n_fft, + self.num_mels, + self.sampling_rate, + self.hop_size, + self.win_size, + self.fmin, + self.fmax_loss, + center=False, + ) + + return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) + + def __len__(self): + return len(self.audio_files) diff --git a/vakyansh-tts/src/hifi_gan/models.py b/vakyansh-tts/src/hifi_gan/models.py new file mode 100644 index 0000000000000000000000000000000000000000..be51fa51407e6ce1daaee5e8d090f6acdbee0db9 --- /dev/null +++ b/vakyansh-tts/src/hifi_gan/models.py @@ -0,0 +1,403 @@ +import torch +import torch.nn.functional as F +import torch.nn as nn +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from utils import init_weights, get_padding + +LRELU_SLOPE = 0.1 + + +class ResBlock1(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.h = h + self.convs1 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]), + ) + ), + ] + ) + 
self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + ] + ) + self.convs2.apply(init_weights) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.h = h + self.convs = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + ] + ) + self.convs.apply(init_weights) + + def forward(self, x): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Generator(torch.nn.Module): + def __init__(self, h): + super(Generator, self).__init__() + self.h = h + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + self.conv_pre = weight_norm( + Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3) + ) + resblock = ResBlock1 if h.resblock == "1" else ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + h.upsample_initial_channel // (2 ** i), + h.upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes) + ): + self.resblocks.append(resblock(h, ch, k, d)) + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + self.ups.apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x): + x = self.conv_pre(x) + for i in range(self.num_upsamples): + x = F.leaky_relu(x, LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print("Removing weight norm...") + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + 
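+        # forward() below folds the 1-D waveform into a (frames x period) 2-D
+        # grid and scores it with strided 2-D convolutions, so each DiscriminatorP
+        # instance inspects samples spaced `period` apart (periods 2, 3, 5, 7 and
+        # 11 in MultiPeriodDiscriminator).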
self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(5, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(5, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(5, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(5, 1), 0), + ) + ), + norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self): + super(MultiPeriodDiscriminator, self).__init__() + self.discriminators = nn.ModuleList( + [ + DiscriminatorP(2), + DiscriminatorP(3), + DiscriminatorP(5), + DiscriminatorP(7), + DiscriminatorP(11), + ] + ) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 128, 15, 1, padding=7)), + norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), + norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), + norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiScaleDiscriminator(torch.nn.Module): + def __init__(self): + super(MultiScaleDiscriminator, self).__init__() + self.discriminators = nn.ModuleList( + [ + DiscriminatorS(use_spectral_norm=True), + DiscriminatorS(), + DiscriminatorS(), + ] + ) + self.meanpools = nn.ModuleList( + [AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)] + ) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + if i != 0: + y = self.meanpools[i - 1](y) + y_hat = self.meanpools[i - 1](y_hat) + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + loss += torch.mean(torch.abs(rl - gl)) + + return loss * 2 + + +def 
discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + r_loss = torch.mean((1 - dr) ** 2) + g_loss = torch.mean(dg ** 2) + loss += r_loss + g_loss + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + l = torch.mean((1 - dg) ** 2) + gen_losses.append(l) + loss += l + + return loss, gen_losses diff --git a/vakyansh-tts/src/hifi_gan/train.py b/vakyansh-tts/src/hifi_gan/train.py new file mode 100644 index 0000000000000000000000000000000000000000..709e085d019eb98006b26555f7fe2582d759efa6 --- /dev/null +++ b/vakyansh-tts/src/hifi_gan/train.py @@ -0,0 +1,400 @@ +import warnings + +warnings.simplefilter(action="ignore", category=FutureWarning) +import itertools +import os +import time +import argparse +import json +import torch +import torch.nn.functional as F +from torch.utils.tensorboard import SummaryWriter +from torch.utils.data import DistributedSampler, DataLoader +import torch.multiprocessing as mp +from torch.distributed import init_process_group +from torch.nn.parallel import DistributedDataParallel +from env import AttrDict, build_env +from meldataset import MelDataset, mel_spectrogram, get_dataset_filelist +from models import ( + Generator, + MultiPeriodDiscriminator, + MultiScaleDiscriminator, + feature_loss, + generator_loss, + discriminator_loss, +) +from utils import plot_spectrogram, scan_checkpoint, load_checkpoint, save_checkpoint + +torch.backends.cudnn.benchmark = True + + +def train(rank, a, h): + if h.num_gpus > 1: + init_process_group( + backend=h.dist_config["dist_backend"], + init_method=h.dist_config["dist_url"], + world_size=h.dist_config["world_size"] * h.num_gpus, + rank=rank, + ) + + torch.cuda.manual_seed(h.seed) + device = torch.device("cuda:{:d}".format(rank)) + + generator = Generator(h).to(device) + mpd = MultiPeriodDiscriminator().to(device) + msd = MultiScaleDiscriminator().to(device) + + if rank == 0: + print(generator) + os.makedirs(a.checkpoint_path, exist_ok=True) + print("checkpoints directory : ", a.checkpoint_path) + + if os.path.isdir(a.checkpoint_path): + cp_g = scan_checkpoint(a.checkpoint_path, "g_") + cp_do = scan_checkpoint(a.checkpoint_path, "do_") + + steps = 0 + if cp_g is None or cp_do is None: + state_dict_do = None + last_epoch = -1 + else: + state_dict_g = load_checkpoint(cp_g, device) + state_dict_do = load_checkpoint(cp_do, device) + generator.load_state_dict(state_dict_g["generator"]) + mpd.load_state_dict(state_dict_do["mpd"]) + msd.load_state_dict(state_dict_do["msd"]) + steps = state_dict_do["steps"] + 1 + last_epoch = state_dict_do["epoch"] + + if h.num_gpus > 1: + generator = DistributedDataParallel(generator, device_ids=[rank]).to(device) + mpd = DistributedDataParallel(mpd, device_ids=[rank]).to(device) + msd = DistributedDataParallel(msd, device_ids=[rank]).to(device) + + optim_g = torch.optim.AdamW( + generator.parameters(), h.learning_rate, betas=[h.adam_b1, h.adam_b2] + ) + optim_d = torch.optim.AdamW( + itertools.chain(msd.parameters(), mpd.parameters()), + h.learning_rate, + betas=[h.adam_b1, h.adam_b2], + ) + + if state_dict_do is not None: + optim_g.load_state_dict(state_dict_do["optim_g"]) + optim_d.load_state_dict(state_dict_do["optim_d"]) + + scheduler_g = torch.optim.lr_scheduler.ExponentialLR( + optim_g, gamma=h.lr_decay, last_epoch=last_epoch + ) + scheduler_d 
= torch.optim.lr_scheduler.ExponentialLR( + optim_d, gamma=h.lr_decay, last_epoch=last_epoch + ) + + training_filelist, validation_filelist = get_dataset_filelist(a) + + trainset = MelDataset( + training_filelist, + h.segment_size, + h.n_fft, + h.num_mels, + h.hop_size, + h.win_size, + h.sampling_rate, + h.fmin, + h.fmax, + n_cache_reuse=0, + shuffle=False if h.num_gpus > 1 else True, + fmax_loss=h.fmax_for_loss, + device=device, + fine_tuning=a.fine_tuning, + base_mels_path=a.input_mels_dir, + ) + + train_sampler = DistributedSampler(trainset) if h.num_gpus > 1 else None + + train_loader = DataLoader( + trainset, + num_workers=h.num_workers, + shuffle=False, + sampler=train_sampler, + batch_size=h.batch_size, + pin_memory=True, + drop_last=True, + ) + + if rank == 0: + validset = MelDataset( + validation_filelist, + h.segment_size, + h.n_fft, + h.num_mels, + h.hop_size, + h.win_size, + h.sampling_rate, + h.fmin, + h.fmax, + False, + False, + n_cache_reuse=0, + fmax_loss=h.fmax_for_loss, + device=device, + fine_tuning=a.fine_tuning, + base_mels_path=a.input_mels_dir, + ) + validation_loader = DataLoader( + validset, + num_workers=1, + shuffle=False, + sampler=None, + batch_size=1, + pin_memory=True, + drop_last=True, + ) + + sw = SummaryWriter(os.path.join(a.logs_path)) + + generator.train() + mpd.train() + msd.train() + for epoch in range(max(0, last_epoch), a.training_epochs): + if rank == 0: + start = time.time() + print("Epoch: {}".format(epoch + 1)) + + if h.num_gpus > 1: + train_sampler.set_epoch(epoch) + + for i, batch in enumerate(train_loader): + if rank == 0: + start_b = time.time() + x, y, _, y_mel = batch + x = torch.autograd.Variable(x.to(device, non_blocking=True)) + y = torch.autograd.Variable(y.to(device, non_blocking=True)) + y_mel = torch.autograd.Variable(y_mel.to(device, non_blocking=True)) + y = y.unsqueeze(1) + + y_g_hat = generator(x) + y_g_hat_mel = mel_spectrogram( + y_g_hat.squeeze(1), + h.n_fft, + h.num_mels, + h.sampling_rate, + h.hop_size, + h.win_size, + h.fmin, + h.fmax_for_loss, + ) + + optim_d.zero_grad() + + # MPD + y_df_hat_r, y_df_hat_g, _, _ = mpd(y, y_g_hat.detach()) + loss_disc_f, losses_disc_f_r, losses_disc_f_g = discriminator_loss( + y_df_hat_r, y_df_hat_g + ) + + # MSD + y_ds_hat_r, y_ds_hat_g, _, _ = msd(y, y_g_hat.detach()) + loss_disc_s, losses_disc_s_r, losses_disc_s_g = discriminator_loss( + y_ds_hat_r, y_ds_hat_g + ) + + loss_disc_all = loss_disc_s + loss_disc_f + + loss_disc_all.backward() + optim_d.step() + + # Generator + optim_g.zero_grad() + + # L1 Mel-Spectrogram Loss + loss_mel = F.l1_loss(y_mel, y_g_hat_mel) * 45 + + y_df_hat_r, y_df_hat_g, fmap_f_r, fmap_f_g = mpd(y, y_g_hat) + y_ds_hat_r, y_ds_hat_g, fmap_s_r, fmap_s_g = msd(y, y_g_hat) + loss_fm_f = feature_loss(fmap_f_r, fmap_f_g) + loss_fm_s = feature_loss(fmap_s_r, fmap_s_g) + loss_gen_f, losses_gen_f = generator_loss(y_df_hat_g) + loss_gen_s, losses_gen_s = generator_loss(y_ds_hat_g) + loss_gen_all = loss_gen_s + loss_gen_f + loss_fm_s + loss_fm_f + loss_mel + + loss_gen_all.backward() + optim_g.step() + + if rank == 0: + # STDOUT logging + if steps % a.stdout_interval == 0: + with torch.no_grad(): + mel_error = F.l1_loss(y_mel, y_g_hat_mel).item() + + print( + "Steps : {:d}, Gen Loss Total : {:4.3f}, Mel-Spec. 
Error : {:4.3f}, s/b : {:4.3f}".format( + steps, loss_gen_all, mel_error, time.time() - start_b + ) + ) + + # checkpointing + if steps % a.checkpoint_interval == 0 and steps != 0: + checkpoint_path = "{}/g_{:08d}".format(a.checkpoint_path, steps) + save_checkpoint( + checkpoint_path, + { + "generator": ( + generator.module if h.num_gpus > 1 else generator + ).state_dict() + }, + ) + checkpoint_path = "{}/do_{:08d}".format(a.checkpoint_path, steps) + save_checkpoint( + checkpoint_path, + { + "mpd": (mpd.module if h.num_gpus > 1 else mpd).state_dict(), + "msd": (msd.module if h.num_gpus > 1 else msd).state_dict(), + "optim_g": optim_g.state_dict(), + "optim_d": optim_d.state_dict(), + "steps": steps, + "epoch": epoch, + }, + ) + + # Tensorboard summary logging + if steps % a.summary_interval == 0: + sw.add_scalar("training/gen_loss_total", loss_gen_all, steps) + sw.add_scalar("training/mel_spec_error", mel_error, steps) + + # Validation + if steps % a.validation_interval == 0: # and steps != 0: + generator.eval() + torch.cuda.empty_cache() + val_err_tot = 0 + with torch.no_grad(): + for j, batch in enumerate(validation_loader): + x, y, _, y_mel = batch + y_g_hat = generator(x.to(device)) + y_mel = torch.autograd.Variable( + y_mel.to(device, non_blocking=True) + ) + y_g_hat_mel = mel_spectrogram( + y_g_hat.squeeze(1), + h.n_fft, + h.num_mels, + h.sampling_rate, + h.hop_size, + h.win_size, + h.fmin, + h.fmax_for_loss, + ) + val_err_tot += F.l1_loss(y_mel, y_g_hat_mel).item() + + if j <= 4: + if steps == 0: + sw.add_audio( + "gt/y_{}".format(j), + y[0], + steps, + h.sampling_rate, + ) + sw.add_figure( + "gt/y_spec_{}".format(j), + plot_spectrogram(x[0]), + steps, + ) + + sw.add_audio( + "generated/y_hat_{}".format(j), + y_g_hat[0], + steps, + h.sampling_rate, + ) + y_hat_spec = mel_spectrogram( + y_g_hat.squeeze(1), + h.n_fft, + h.num_mels, + h.sampling_rate, + h.hop_size, + h.win_size, + h.fmin, + h.fmax, + ) + sw.add_figure( + "generated/y_hat_spec_{}".format(j), + plot_spectrogram( + y_hat_spec.squeeze(0).cpu().numpy() + ), + steps, + ) + + val_err = val_err_tot / (j + 1) + sw.add_scalar("validation/mel_spec_error", val_err, steps) + + generator.train() + + steps += 1 + + scheduler_g.step() + scheduler_d.step() + + if rank == 0: + print( + "Time taken for epoch {} is {} sec\n".format( + epoch + 1, int(time.time() - start) + ) + ) + + +def main(): + print("Initializing Training Process..") + + parser = argparse.ArgumentParser() + + parser.add_argument("--group_name", default=None) + parser.add_argument("--input_wavs_dir", default="LJSpeech-1.1/wavs") + parser.add_argument("--input_mels_dir", default="ft_dataset") + parser.add_argument("--input_training_file", default="LJSpeech-1.1/training.txt") + parser.add_argument( + "--input_validation_file", default="LJSpeech-1.1/validation.txt" + ) + parser.add_argument("--checkpoint_path", default="cp_hifigan") + parser.add_argument("--logs_path", default="") + parser.add_argument("--config", default="") + parser.add_argument("--training_epochs", default=3100, type=int) + parser.add_argument("--stdout_interval", default=5, type=int) + parser.add_argument("--checkpoint_interval", default=5000, type=int) + parser.add_argument("--summary_interval", default=100, type=int) + parser.add_argument("--validation_interval", default=1000, type=int) + parser.add_argument("--fine_tuning", default=False, type=bool) + + a = parser.parse_args() + + with open(a.config) as f: + data = f.read() + + json_config = json.loads(data) + h = AttrDict(json_config) + 
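+    # A minimal launch sketch with hypothetical paths; the JSON config passed via
+    # --config must define the fields read here and in train(), e.g. seed,
+    # batch_size, learning_rate, lr_decay, adam_b1/adam_b2, segment_size, n_fft,
+    # num_mels, hop_size, win_size, sampling_rate, fmin, fmax, fmax_for_loss:
+    #
+    #   python train.py --config config.json \
+    #       --input_training_file data/training.txt \
+    #       --input_validation_file data/validation.txt \
+    #       --checkpoint_path cp_hifigan --logs_path logs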
build_env(a.config, "config.json", a.checkpoint_path) + + torch.manual_seed(h.seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(h.seed) + h.num_gpus = torch.cuda.device_count() + h.batch_size = int(h.batch_size / h.num_gpus) + print("Batch size per GPU :", h.batch_size) + else: + pass + + if h.num_gpus > 1: + mp.spawn( + train, + nprocs=h.num_gpus, + args=( + a, + h, + ), + ) + else: + train(0, a, h) + + +if __name__ == "__main__": + main() diff --git a/vakyansh-tts/src/hifi_gan/utils.py b/vakyansh-tts/src/hifi_gan/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..71e9b2c99e053e2d4239074a67d64b834898c348 --- /dev/null +++ b/vakyansh-tts/src/hifi_gan/utils.py @@ -0,0 +1,57 @@ +import glob +import os +import matplotlib +import torch +from torch.nn.utils import weight_norm + +matplotlib.use("Agg") +import matplotlib.pylab as plt + + +def plot_spectrogram(spectrogram): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def apply_weight_norm(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + weight_norm(m) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print("Loading '{}'".format(filepath)) + checkpoint_dict = torch.load(filepath, map_location=device) + print("Complete.") + return checkpoint_dict + + +def save_checkpoint(filepath, obj): + print("Saving checkpoint to {}".format(filepath)) + torch.save(obj, filepath) + print("Complete.") + + +def scan_checkpoint(cp_dir, prefix): + pattern = os.path.join(cp_dir, prefix + "????????") + cp_list = glob.glob(pattern) + if len(cp_list) == 0: + return None + return sorted(cp_list)[-1] diff --git a/vakyansh-tts/tts_infer/__init__.py b/vakyansh-tts/tts_infer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vakyansh-tts/tts_infer/example_inference.py b/vakyansh-tts/tts_infer/example_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..676718fff3c6a7120cea91b0cfc95f8872929da7 --- /dev/null +++ b/vakyansh-tts/tts_infer/example_inference.py @@ -0,0 +1,79 @@ +''' Example file to test tts_infer after installing it. Refer to section 1.1 in README.md for steps of installation. 
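+
+The flow in this script: normalize_nums turns digits into words for the target
+language, translit (XlitEngine) maps any remaining Latin-script words into the
+target script, TextToMel (glow) converts the normalized text to a mel
+spectrogram, and MelToWav (hifi) vocodes it to audio. Point glow_model_dir and
+hifi_model_dir at real checkpoint directories before running.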
''' + +from tts_infer.tts import TextToMel, MelToWav +from tts_infer.transliterate import XlitEngine +from tts_infer.num_to_word_on_sent import normalize_nums + +import re +import numpy as np +from scipy.io.wavfile import write + +from mosestokenizer import * +from indicnlp.tokenize import sentence_tokenize + +INDIC = ["as", "bn", "gu", "hi", "kn", "ml", "mr", "or", "pa", "ta", "te"] + +def split_sentences(paragraph, language): + if language == "en": + with MosesSentenceSplitter(language) as splitter: + return splitter([paragraph]) + elif language in INDIC: + return sentence_tokenize.sentence_split(paragraph, lang=language) + + +device='cpu' +text_to_mel = TextToMel(glow_model_dir='/path/to/glow_ckp', device=device) +mel_to_wav = MelToWav(hifi_model_dir='/path/to/hifi_ckp', device=device) + +lang='hi' # transliteration from En to Hi +engine = XlitEngine(lang) # loading translit model globally + +def translit(text, lang): + reg = re.compile(r'[a-zA-Z]') + words = [engine.translit_word(word, topk=1)[lang][0] if reg.match(word) else word for word in text.split()] + updated_sent = ' '.join(words) + return updated_sent + +def run_tts(text, lang): + text = text.replace('।', '.') # only for hindi models + text_num_to_word = normalize_nums(text, lang) # converting numbers to words in lang + text_num_to_word_and_transliterated = translit(text_num_to_word, lang) # transliterating english words to lang + final_text = ' ' + text_num_to_word_and_transliterated + + mel = text_to_mel.generate_mel(final_text) + audio, sr = mel_to_wav.generate_wav(mel) + write(filename='temp.wav', rate=sr, data=audio) # for saving wav file, if needed + return (sr, audio) + +def run_tts_paragraph(text, lang): + audio_list = [] + split_sentences_list = split_sentences(text, language='hi') + + for sent in split_sentences_list: + sr, audio = run_tts(sent, lang) + audio_list.append(audio) + + concatenated_audio = np.concatenate([i for i in audio_list]) + write(filename='temp_long.wav', rate=sr, data=concatenated_audio) + return (sr, concatenated_audio) + +if __name__ == "__main__": + _, audio = run_tts('mera naam neeraj hai', 'hi') + + para = ''' + भारत मेरा देश है और मुझे भारतीय होने पर गर्व है। ये विश्व का सातवाँ सबसे बड़ा और विश्व में दूसरा सबसे अधिक जनसंख्या वाला देश है। + इसे भारत, हिन्दुस्तान और आर्यव्रत के नाम से भी जाना जाता है। ये एक प्रायद्वीप है जो पूरब में बंगाल की खाड़ी, + पश्चिम में अरेबियन सागर और दक्षिण में भारतीय महासागर जैसे तीन महासगरों से घिरा हुआ है। + भारत का राष्ट्रीय पशु चीता, राष्ट्रीय पक्षी मोर, राष्ट्रीय फूल कमल, और राष्ट्रीय फल आम है। + भारत मेरा देश है और मुझे भारतीय होने पर गर्व है। ये विश्व का सातवाँ सबसे बड़ा और विश्व में दूसरा सबसे अधिक जनसंख्या वाला देश है। + इसे भारत, हिन्दुस्तान और आर्यव्रत के नाम से भी जाना जाता है। ये एक प्रायद्वीप है जो पूरब में बंगाल की खाड़ी, + पश्चिम में अरेबियन सागर और दक्षिण में भारतीय महासागर जैसे तीन महासगरों से घिरा हुआ है। + भारत का राष्ट्रीय पशु चीता, राष्ट्रीय पक्षी मोर, राष्ट्रीय फूल कमल, और राष्ट्रीय फल आम है। + भारत मेरा देश है और मुझे भारतीय होने पर गर्व है। ये विश्व का सातवाँ सबसे बड़ा और विश्व में दूसरा सबसे अधिक जनसंख्या वाला देश है। + इसे भारत, हिन्दुस्तान और आर्यव्रत के नाम से भी जाना जाता है। ये एक प्रायद्वीप है जो पूरब में बंगाल की खाड़ी, + पश्चिम में अरेबियन सागर और दक्षिण में भारतीय महासागर जैसे तीन महासगरों से घिरा हुआ है। + भारत का राष्ट्रीय पशु चीता, राष्ट्रीय पक्षी मोर, राष्ट्रीय फूल कमल, और राष्ट्रीय फल आम है। + ''' + + print('Num chars in paragraph: ', len(para)) + _, audio_long = run_tts_paragraph(para, 'hi') diff --git 
a/vakyansh-tts/tts_infer/num_to_word_on_sent.py b/vakyansh-tts/tts_infer/num_to_word_on_sent.py new file mode 100644 index 0000000000000000000000000000000000000000..de571c2be63fa467491d01daf0e2f38dada67de9 --- /dev/null +++ b/vakyansh-tts/tts_infer/num_to_word_on_sent.py @@ -0,0 +1,1319 @@ +import re +import string + +# ----------------------------- indic_num.py ----------------------------- +supported_lang = {"en", "hi", "gu", "mr", "bn", "te", "ta", "kn", "or", "pa"} +# supported_lang = {'eng', 'hin', 'guj', 'mar', 'ben', 'tel', 'tam', 'kan', 'ori', 'pan'} # Three alphabet lang code + +all_num = { + "en": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"], + "hi": ["०", "१", "२", "३", "४", "५", "६", "७", "८", "९"], + "gu": ["૦", "૧", "૨", "૩", "૪", "૫", "૬", "૭", "૮", "૯"], + "mr": ["०", "१", "२", "३", "४", "५", "६", "७", "८", "९"], + "bn": ["০", "১", "২", "৩", "৪", "৫", "৬", "৭", "৮", "৯"], + "te": ["౦", "౧", "౨", "౩", "౪", "౫", "౬", "౭", "౮", "౯"], + "ta": ["0", "௧", "௨", "௩", "௪", "௫", "௬", "௭", "௮", "௯", "௰"], + "kn": ["೦", "೧", "೨", "೩", "೪", "೫", "೬", "೭", "೮", "೯"], + "or": ["୦", "୧", "୨", "୩", "୪", "୫", "୬", "୭", "୮", "୯"], + "pa": ["੦", "੧", "੨", "੩", "੪", "੫", "੬", "੭", "੮", "੯"], +} + +num_dict = dict() +num_dict["en"] = { + "0": "zero", + "1": "one", + "2": "two", + "3": "three", + "4": "four", + "5": "five", + "6": "six", + "7": "seven", + "8": "eight", + "9": "nine", + "10": "ten", + "11": "eleven", + "12": "twelve", + "13": "thirteen", + "14": "fourteen", + "15": "fifteen", + "16": "sixteen", + "17": "seventeen", + "18": "eighteen", + "19": "nineteen", + "20": "twenty", + "21": "twenty-one", + "22": "twenty-two", + "23": "twenty-three", + "24": "twenty-four", + "25": "twenty-five", + "26": "twenty-six", + "27": "twenty-seven", + "28": "twenty-eight", + "29": "twenty-nine", + "30": "thirty", + "31": "thirty-one", + "32": "thirty-two", + "33": "thirty-three", + "34": "thirty-four", + "35": "thirty-five", + "36": "thirty-six", + "37": "thirty-seven", + "38": "thirty-eight", + "39": "thirty-nine", + "40": "forty", + "41": "forty-one", + "42": "forty-two", + "43": "forty-three", + "44": "forty-four", + "45": "forty-five", + "46": "forty-six", + "47": "forty-seven", + "48": "forty-eight", + "49": "forty-nine", + "50": "fifty", + "51": "fifty-one", + "52": "fifty-two", + "53": "fifty-three", + "54": "fifty-four", + "55": "fifty-five", + "56": "fifty-six", + "57": "fifty-seven", + "58": "fifty-eight", + "59": "fifty-nine", + "60": "sixty", + "61": "sixty-one", + "62": "sixty-two", + "63": "sixty-three", + "64": "sixty-four", + "65": "sixty-five", + "66": "sixty-six", + "67": "sixty-seven", + "68": "sixty-eight", + "69": "sixty-nine", + "70": "seventy", + "71": "seventy-one", + "72": "seventy-two", + "73": "seventy-three", + "74": "seventy-four", + "75": "seventy-five", + "76": "seventy-six", + "77": "seventy-seven", + "78": "seventy-eight", + "79": "seventy-nine", + "80": "eighty", + "81": "eighty-one", + "82": "eighty-two", + "83": "eighty-three", + "84": "eighty-four", + "85": "eighty-five", + "86": "eighty-six", + "87": "eighty-seven", + "88": "eighty-eight", + "89": "eighty-nine", + "90": "ninety", + "91": "ninety-one", + "92": "ninety-two", + "93": "ninety-three", + "94": "ninety-four", + "95": "ninety-five", + "96": "ninety-six", + "97": "ninety-seven", + "98": "ninety-eight", + "99": "ninety-nine", + "100": "hundred", + "1000": "thousand", + "100000": "lac", + "10000000": "crore", + "1000000000": "arab", +} # English-India +num_dict["hi"] = { + "0": "शून्य", + "1": "एक", + 
"2": "दो", + "3": "तीन", + "4": "चार", + "5": "पाँच", + "6": "छः", + "7": "सात", + "8": "आठ", + "9": "नौ", + "10": "दस", + "11": "ग्यारह", + "12": "बारह", + "13": "तेरह", + "14": "चौदह", + "15": "पंद्रह", + "16": "सोलह", + "17": "सत्रह", + "18": "अट्ठारह", + "19": "उन्नीस", + "20": "बीस", + "21": "इक्कीस", + "22": "बाईस", + "23": "तेईस", + "24": "चौबिस", + "25": "पच्चीस", + "26": "छब्बीस", + "27": "सत्ताईस", + "28": "अट्ठाईस", + "29": "उनतीस", + "30": "तीस", + "31": "इकतीस", + "32": "बत्तीस", + "33": "तैंतीस", + "34": "चौंतीस", + "35": "पैंतीस", + "36": "छत्तीस", + "37": "सैंतीस", + "38": "अड़तीस", + "39": "उनतालीस", + "40": "चालीस", + "41": "इकतालीस", + "42": "बयालीस", + "43": "तैंतालीस", + "44": "चौंतालीस", + "45": "पैंतालीस", + "46": "छियालीस", + "47": "सैंतालीस", + "48": "अड़तालीस", + "49": "उनचास", + "50": "पचास", + "51": "इक्यावन​", + "52": "बावन", + "53": "तिरेपन", + "54": "चौवन", + "55": "पचपन", + "56": "छप्पन", + "57": "सत्तावन", + "58": "अट्ठावन", + "59": "उनसठ", + "60": "साठ", + "61": "इकसठ", + "62": "बासठ", + "63": "तिरेसठ", + "64": "चौंसठ", + "65": "पैंसठ", + "66": "छयासठ", + "67": "सरसठ​", + "68": "अड़सठ", + "69": "उनहत्तर", + "70": "सत्तर", + "71": "इकहत्तर", + "72": "बहत्तर", + "73": "तिहत्तर", + "74": "चौहत्तर", + "75": "पचहत्तर", + "76": "छिहत्तर", + "77": "सतहत्तर", + "78": "अठहत्तर", + "79": "उन्यासी", + "80": "अस्सी", + "81": "इक्यासी", + "82": "बयासी", + "83": "तिरासी", + "84": "चौरासी", + "85": "पचासी", + "86": "छियासी", + "87": "सत्तासी", + "88": "अठासी", + "89": "नवासी", + "90": "नब्बे", + "91": "इक्यानवे", + "92": "बानवे", + "93": "तिरानवे", + "94": "चौरानवे", + "95": "पचानवे", + "96": "छियानवे", + "97": "सत्तानवे", + "98": "अट्ठानवे", + "99": "निन्यानवे", + "100": "सौ", + "1000": "हज़ार", + "100000": "लाख", + "10000000": "करोड़", + "1000000000": "अरब", +} # Hindi +num_dict["gu"] = { + "0": "શૂન્ય", + "1": "એક", + "2": "બે", + "3": "ત્રણ", + "4": "ચાર", + "5": "પાંચ", + "6": "છ", + "7": "સાત", + "8": "આઠ", + "9": "નવ", + "10": "દસ", + "11": "અગિયાર", + "12": "બાર", + "13": "તેર", + "14": "ચૌદ", + "15": "પંદર", + "16": "સોળ", + "17": "સત્તર", + "18": "અઢાર", + "19": "ઓગણિસ", + "20": "વીસ", + "21": "એકવીસ", + "22": "બાવીસ", + "23": "તેવીસ", + "24": "ચોવીસ", + "25": "પચ્ચીસ", + "26": "છવીસ", + "27": "સત્તાવીસ", + "28": "અઠ્ઠાવીસ", + "29": "ઓગણત્રીસ", + "30": "ત્રીસ", + "31": "એકત્રીસ", + "32": "બત્રીસ", + "33": "તેત્રીસ", + "34": "ચોત્રીસ", + "35": "પાંત્રીસ", + "36": "છત્રીસ", + "37": "સડત્રીસ", + "38": "અડત્રીસ", + "39": "ઓગણચાલીસ", + "40": "ચાલીસ", + "41": "એકતાલીસ", + "42": "બેતાલીસ", + "43": "ત્રેતાલીસ", + "44": "ચુંમાલીસ", + "45": "પિસ્તાલીસ", + "46": "છેતાલીસ", + "47": "સુડતાલીસ", + "48": "અડતાલીસ", + "49": "ઓગણપચાસ", + "50": "પચાસ", + "51": "એકાવન", + "52": "બાવન", + "53": "ત્રેપન", + "54": "ચોપન", + "55": "પંચાવન", + "56": "છપ્પન", + "57": "સત્તાવન", + "58": "અઠ્ઠાવન", + "59": "ઓગણસાઠ", + "60": "સાઈઠ", + "61": "એકસઠ", + "62": "બાસઠ", + "63": "ત્રેસઠ", + "64": "ચોસઠ", + "65": "પાંસઠ", + "66": "છાસઠ", + "67": "સડસઠ", + "68": "અડસઠ", + "69": "અગણોસિત્તેર", + "70": "સિત્તેર", + "71": "એકોતેર", + "72": "બોતેર", + "73": "તોતેર", + "74": "ચુમોતેર", + "75": "પંચોતેર", + "76": "છોતેર", + "77": "સિત્યોતેર", + "78": "ઇઠ્યોતેર", + "79": "ઓગણાએંસી", + "80": "એંસી", + "81": "એક્યાસી", + "82": "બ્યાસી", + "83": "ત્યાસી", + "84": "ચોર્યાસી", + "85": "પંચાસી", + "86": "છ્યાસી", + "87": "સિત્યાસી", + "88": "ઈઠ્યાસી", + "89": "નેવ્યાસી", + "90": "નેવું", + "91": "એકાણું", + "92": "બાણું", + "93": "ત્રાણું", + "94": "ચોરાણું", + "95": "પંચાણું", + "96": "છન્નું", + "97": 
"સત્તાણું", + "98": "અઠ્ઠાણું", + "99": "નવ્વાણું", + "100": "સો", + "1000": "હજાર", + "100000": "લાખ", + "1000000": "દસ લાખ", + "10000000": "કરોડ઼", +} # Gujarati +num_dict["mr"] = { + "0": "शून्य", + "1": "एक", + "2": "दोन", + "3": "तीन", + "4": "चार", + "5": "पाच", + "6": "सहा", + "7": "सात", + "8": "आठ", + "9": "नऊ", + "10": "दहा", + "11": "अकरा", + "12": "बारा", + "13": "तेरा", + "14": "चौदा", + "15": "पंधरा", + "16": "सोळा", + "17": "सतरा", + "18": "अठरा", + "19": "एकोणीस", + "20": "वीस", + "21": "एकवीस", + "22": "बावीस", + "23": "तेवीस", + "24": "चोवीस", + "25": "पंचवीस", + "26": "सव्वीस", + "27": "सत्तावीस", + "28": "अठ्ठावीस", + "29": "एकोणतीस", + "30": "तीस", + "31": "एकतीस", + "32": "बत्तीस", + "33": "तेहेतीस", + "34": "चौतीस", + "35": "पस्तीस", + "36": "छत्तीस", + "37": "सदतीस", + "38": "अडतीस", + "39": "एकोणचाळीस", + "40": "चाळीस", + "41": "एक्केचाळीस", + "42": "बेचाळीस", + "43": "त्रेचाळीस", + "44": "चव्वेचाळीस", + "45": "पंचेचाळीस", + "46": "सेहेचाळीस", + "47": "सत्तेचाळीस", + "48": "अठ्ठेचाळीस", + "49": "एकोणपन्नास", + "50": "पन्नास", + "51": "एक्कावन्न", + "52": "बावन्न", + "53": "त्रेपन्न", + "54": "चोपन्न", + "55": "पंचावन्न", + "56": "छप्पन्न", + "57": "सत्तावन्न", + "58": "अठ्ठावन्न", + "59": "एकोणसाठ", + "60": "साठ", + "61": "एकसष्ठ", + "62": "बासष्ठ", + "63": "त्रेसष्ठ", + "64": "चौसष्ठ", + "65": "पासष्ठ", + "66": "सहासष्ठ", + "67": "सदुसष्ठ", + "68": "अडुसष्ठ", + "69": "एकोणसत्तर", + "70": "सत्तर", + "71": "एक्काहत्तर", + "72": "बाहत्तर", + "73": "त्र्याहत्तर", + "74": "चौर्‍याहत्तर", + "75": "पंच्याहत्तर", + "76": "शहात्तर", + "77": "सत्याहत्तर", + "78": "अठ्ठ्याहत्तर", + "79": "एकोण ऐंशी", + "80": "ऐंशी", + "81": "एक्क्याऐंशी", + "82": "ब्याऐंशी", + "83": "त्र्याऐंशी", + "84": "चौऱ्याऐंशी", + "85": "पंच्याऐंशी", + "86": "शहाऐंशी", + "87": "सत्त्याऐंशी", + "88": "अठ्ठ्याऐंशी", + "89": "एकोणनव्वद", + "90": "नव्वद", + "91": "एक्क्याण्णव", + "92": "ब्याण्णव", + "93": "त्र्याण्णव", + "94": "चौऱ्याण्णव", + "95": "पंच्याण्णव", + "96": "शहाण्णव", + "97": "सत्त्याण्णव", + "98": "अठ्ठ्याण्णव", + "99": "नव्व्याण्णव", + "100": "शे", + "1000": "हजार", + "100000": "लाख", + "10000000": "कोटी", + "1000000000": "अब्ज", +} # Marathi +num_dict["bn"] = { + "0": "শূন্য", + "1": "এক", + "2": "দুই", + "3": "তিন", + "4": "চার", + "5": "পাঁচ", + "6": "ছয়", + "7": "সাত", + "8": "আট", + "9": "নয়", + "10": "দশ", + "11": "এগার", + "12": "বার", + "13": "তের", + "14": "চৌদ্দ", + "15": "পনের", + "16": "ষোল", + "17": "সতের", + "18": "আঠার", + "19": "ঊনিশ", + "20": "বিশ", + "21": "একুশ", + "22": "বাইশ", + "23": "তেইশ", + "24": "চব্বিশ", + "25": "পঁচিশ", + "26": "ছাব্বিশ", + "27": "সাতাশ", + "28": "আঠাশ", + "29": "ঊনত্রিশ", + "30": "ত্রিশ", + "31": "একত্রিশ", + "32": "বত্রিশ", + "33": "তেত্রিশ", + "34": "চৌত্রিশ", + "35": "পঁয়ত্রিশ", + "36": "ছত্রিশ", + "37": "সাঁইত্রিশ", + "38": "আটত্রিশ", + "39": "ঊনচল্লিশ", + "40": "চল্লিশ", + "41": "একচল্লিশ", + "42": "বিয়াল্লিশ", + "43": "তেতাল্লিশ", + "44": "চুয়াল্লিশ", + "45": "পঁয়তাল্লিশ", + "46": "ছেচল্লিশ", + "47": "সাতচল্লিশ", + "48": "আটচল্লিশ", + "49": "ঊনপঞ্চাশ", + "50": "পঞ্চাশ", + "51": "একান্ন", + "52": "বায়ান্ন", + "53": "তিপ্পান্ন", + "54": "চুয়ান্ন", + "55": "পঞ্চান্ন", + "56": "ছাপ্পান্ন", + "57": "সাতান্ন", + "58": "আটান্ন", + "59": "ঊনষাট", + "60": "ষাট", + "61": "একষট্টি", + "62": "বাষট্টি", + "63": "তেষট্টি", + "64": "চৌষট্টি", + "65": "পঁয়ষট্টি", + "66": "ছেষট্টি", + "67": "সাতষট্টি", + "68": "আটষট্টি", + "69": "ঊনসত্তর", + "70": "সত্তর", + "71": "একাত্তর", + "72": "বাহাত্তর", + "73": "তিয়াত্তর", + "74": "চুয়াত্তর", + "75": 
"পঁচাত্তর", + "76": "ছিয়াত্তর", + "77": "সাতাত্তর", + "78": "আটাত্তর", + "79": "ঊনআশি", + "80": "আশি", + "81": "একাশি", + "82": "বিরাশি", + "83": "তিরাশি", + "84": "চুরাশি", + "85": "পঁচাশি", + "86": "ছিয়াশি", + "87": "সাতাশি", + "88": "আটাশি", + "89": "ঊননব্বই", + "90": "নব্বই", + "91": "একানব্বই", + "92": "বিরানব্বই", + "93": "তিরানব্বই", + "94": "চুরানব্বই", + "95": "পঁচানব্বই", + "96": "ছিয়ানব্বই", + "97": "সাতানব্বই", + "98": "আটানব্বই", + "99": "নিরানব্বই", + "100": "শো", + "1000": "হাজার", + "100000": "লাখ", + "10000000": "কোটি", + "1000000000": "একশ’ কোটি", +} # Bengali +num_dict["te"] = { + "0": "సున్నా", + "1": "ఒకటి", + "2": "రెండు", + "3": "మూడు", + "4": "నాలుగు", + "5": "ఐదు", + "6": "ఆరు", + "7": "ఏడు", + "8": "ఎనిమిది", + "9": "తొమ్మిది", + "10": "పది", + "11": "పదకొండు", + "12": "పన్నెండు", + "13": "పదమూడు", + "14": "పద్నాలుగు", + "15": "పదిహేను", + "16": "పదహారు", + "17": "పదిహేడు", + "18": "పద్దెనిమిది", + "19": "పందొమ్మిది", + "20": "ఇరవై", + "21": "ఇరవై ఒకటి", + "22": "ఇరవై రెండు", + "23": "ఇరవై మూడు", + "24": "ఇరవై నాలుగు", + "25": "ఇరవై ఐదు", + "26": "ఇరవై ఆరు", + "27": "ఇరవై ఏడు", + "28": "ఇరవై ఎనిమిది", + "29": "ఇరవై తొమ్మిది", + "30": "ముప్పై", + "31": "ముప్పై ఒకటి", + "32": "ముప్పై రెండు", + "33": "ముప్పై మూడు", + "34": "ముప్పై నాలుగు", + "35": "ముప్పై ఐదు", + "36": "ముప్పై ఆరు", + "37": "ముప్పై ఏడు", + "38": "ముప్పై ఎనిమిది", + "39": "ముప్పై తొమ్మిది", + "40": "నలభై", + "41": "నలభై ఒకటి", + "42": "నలభై రెండు", + "43": "నలభై మూడు", + "44": "నలభై నాలుగు", + "45": "నలభై ఐదు", + "46": "నలభై ఆరు", + "47": "నలభై ఏడు", + "48": "నలభై ఎనిమిది", + "49": "నలభై తొమ్మిది", + "50": "యాభై", + "51": "యాభై ఒకటి", + "52": "యాభై రెండు", + "53": "యాభై మూడు", + "54": "యాభై నాలుగు", + "55": "యాభై ఐదు", + "56": "యాభై ఆరు", + "57": "యాభై ఏడు", + "58": "యాభై ఎనిమిది", + "59": "యాభై తొమ్మిది", + "60": "అరవై", + "61": "అరవై ఒకటి", + "62": "అరవై రెండు", + "63": "అరవై మూడు", + "64": "అరవై నాలుగు", + "65": "అరవై ఐదు", + "66": "అరవై ఆరు", + "67": "అరవై ఏడు", + "68": "అరవై ఎనిమిది", + "69": "అరవై తొమ్మిది", + "70": "డెబ్బై", + "71": "డెబ్బై ఒకటి", + "72": "డెబ్బై రెండు", + "73": "డెబ్బై మూడు", + "74": "డెబ్బై నాలుగు", + "75": "డెబ్బై ఐదు", + "76": "డెబ్బై ఆరు", + "77": "డెబ్బై ఏడు", + "78": "డెబ్బై ఎనిమిది", + "79": "డెబ్బై తొమ్మిది", + "80": "ఎనభై", + "81": "ఎనభై ఒకటి", + "82": "ఎనభై రెండు", + "83": "ఎనభై మూడు", + "84": "ఎనభై నాలుగు", + "85": "ఎనభై ఐదు", + "86": "ఎనభై ఆరు", + "87": "ఎనభై ఏడు", + "88": "ఎనభై ఎనిమిది", + "89": "ఎనభై తొమ్మిది", + "90": "తొంభై", + "91": "తొంభై ఒకటి", + "92": "తొంభై రెండు", + "93": "తొంభై మూడు", + "94": "తొంభై నాలుగు", + "95": "తొంభై ఐదు", + "96": "తొంభై ఆరు", + "97": "తొంభై ఏడు", + "98": "తొంభై ఎనిమిది", + "99": "తొంభై తొమ్మిది", + "100": "వందల", + "1000": "వేల", + "100000": "లక్షల", + "10000000": "కోట్ల", + "1000000000": "బిలియన్", +} # Telugu +num_dict["ta"] = { + "0": "பூஜ்ஜியம்", + "1": "ஒன்று", + "2": "இரண்டு", + "3": "மூன்று", + "4": "நான்கு", + "5": "ஐந்து", + "6": "ஆறு", + "7": "ஏழு", + "8": "எட்டு", + "9": "ஒன்பது", + "10": "பத்து", + "11": "பதினொன்று", + "12": "பன்னிரண்டு", + "13": "பதிமூன்று", + "14": "பதினான்கு", + "15": "பதினைந்து", + "16": "பதினாறு", + "17": "பதினேழு", + "18": "பதினெட்டு", + "19": "பத்தொன்பது", + "20": "இருபது", + "21": "இருபது ஒன்று", + "22": "இருபத்து இரண்டு", + "23": "இருபத்து மூன்று", + "24": "இருபத்து நான்கு", + "25": "இருபத்து ஐந்து", + "26": "இருபத்து ஆறு", + "27": "இருபத்து ஏழு", + "28": "இருபத்து எட்டு", + "29": "இருபத்து ஒன்பது", + "30": "முப்பது", + "31": "முப்பத்து ஒன்று", + "32": "முப்பத்து இரண்டு", + "33": "முப்பத்து 
மூன்று", + "34": "முப்பத்து நான்கு", + "35": "முப்பத்து ஐந்து", + "36": "முப்பத்து ஆறு", + "37": "முப்பத்து ஏழு", + "38": "முப்பத்து எட்டு", + "39": "முப்பத்து ஒன்பது", + "40": "நாற்பது", + "41": "நாற்பத்து ஒன்று", + "42": "நாற்பத்து இரண்டு", + "43": "நாற்பத்து மூன்று", + "44": "நாற்பத்து நான்கு", + "45": "நாற்பத்து ஐந்து", + "46": "நாற்பத்து ஆறு", + "47": " நாற்பத்து ஏழு", + "48": "நாற்பத்து எட்டு", + "49": "நாற்பத்து ஒன்பது", + "50": "ஐம்பது", + "51": "ஐம்பத்து ஒன்று", + "52": "ஐம்பத்து இரண்டு", + "53": "ஐம்பத்து மூன்று", + "54": "ஐம்பத்து நான்கு", + "55": "ஐம்பத்து ஐந்து", + "56": "ஐம்பத்து ஆறு", + "57": "ஐம்பத்து ஏழு", + "58": "ஐம்பத்து எட்டு", + "59": "ஐம்பத்து ஒன்பது", + "60": "அறுபது", + "61": "அறுபத்து ஒன்று", + "62": "அறுபத்து இரண்டு", + "63": "அறுபத்து மூன்று", + "64": "அறுபத்து நான்கு", + "65": "அறுபத்து ஐந்து", + "66": "அறுபத்து ஆறு", + "67": "அறுபத்து ஏழு", + "68": "அறுபத்து எட்டு", + "69": "அறுபத்து ஒன்பது", + "70": "எழுபது", + "71": "எழுபத்தி ஒன்று", + "72": "எழுபத்தி இரண்டு", + "73": "எழுபத்தி முச்சக்கர", + "74": "எழுபத்தி நான்கு", + "75": "எழுபத்தி ஐந்து", + "76": "எழுபத்தி ஆறு", + "77": "எழுபத்தி ஏழு", + "78": "எழுபத்தி எட்டு", + "79": "எழுபத்தி ஒன்பது", + "80": "எண்பது", + "81": "எண்பத்தியொன்று", + "82": "எண்பத்திரண்டு", + "83": "எண்பத்திமூன்று", + "84": "என்பதினான்கு", + "85": "என்பதினைந்து", + "86": "எண்பத்திஆறு", + "87": "எண்பத்திஏழு", + "88": "எண்பத்தியெட்டு", + "89": "எண்பத்தியொன்பது", + "90": "தொன்னூறு", + "91": "தொண்ணூற்றியொன்று", + "92": "தொண்ணூற்றிரண்டு", + "93": "தொண்ணூற்றிமூன்று", + "94": "தொண்ணூற்றிநான்கு", + "95": "தொண்ணூற்றிஐந்து", + "96": "தொண்ணூற்றியாறு", + "97": "தொண்ணூற்றியேழு", + "98": "தொண்ணூற்றியெட்டு", + "99": "தொண்ணூற்றிஒன்பது", + "100": "நூறு", + "1000": "ஆயிரம்", + "100000": "இலட்சம்", + "10000000": "கோடி", + "1000000000": "பில்லியன்", +} # Tamil +num_dict["kn"] = { + "0": "ಸೊನ್ನೆ", + "1": "ಒಂದು", + "2": "ಎರಡು", + "3": "ಮೂರು", + "4": "ನಾಲ್ಕು", + "5": "ಅಯ್ದು", + "6": "ಆರು", + "7": "ಏಳು", + "8": "ಎಂಟು", + "9": "ಒಂಬತ್ತು", + "10": "ಹತ್ತು", + "11": "ಹನ್ನೊಂದು", + "12": "ಹನ್ನೆರಡು", + "13": "ಹದಿಮೂರು", + "14": "ಹದಿನಾಲ್ಕು", + "15": "ಹದಿನೈದು", + "16": "ಹದಿನಾರು", + "17": "ಹದಿನೇಳು", + "18": "ಹದಿನೆಂಟು", + "19": "ಹತ್ತೊಂಬತ್ತು", + "20": "ಇಪ್ಪತ್ತು", + "21": "ಇಪ್ಪತ್ತ್’ಒಂದು", + "22": "ಇಪ್ಪತ್ತ್’ಎರಡು", + "23": "ಇಪ್ಪತ್ತ್’ಮೂರು", + "24": "ಇಪ್ಪತ್ತ್’ನಾಲ್ಕು", + "25": "ಇಪ್ಪತ್ತ್’ಐದು", + "26": "ಇಪ್ಪತ್ತ್’ಆರು", + "27": "ಇಪ್ಪತ್ತ್’ಏಳು", + "28": "ಇಪ್ಪತ್ತ್’ಎಂಟು", + "29": "ಇಪ್ಪತ್ತ್’ಒಂಬತ್ತು", + "30": "ಮೂವತ್ತು", + "31": "ಮುವತ್ತ್’ಒಂದು", + "32": "ಮುವತ್ತ್’ಎರಡು", + "33": "ಮುವತ್ತ್’ಮೂರು", + "34": "ಮೂವತ್ತ್’ನಾಲ್ಕು", + "35": "ಮೂವತ್ತ್’ಐದು", + "36": "ಮೂವತ್ತ್’ಆರು", + "37": "ಮೂವತ್ತ್’ಏಳು", + "38": "ಮೂವತ್ತ್’ಎಂಟು", + "39": "ಮೂವತ್ತ್’ಒಂಬತ್ತು", + "40": "ನಲವತ್ತು", + "41": "ನಲವತ್ತೊಂದು", + "42": "ನಲವತ್ತ್ ಎರಡು", + "43": "ನಲವತ್ತ್ ಮೂರು", + "44": "ನಲವತ್ತ್ ನಾಲ್ಕು", + "45": "ನಲವತ್ತೈದು", + "46": "ನಲವತ್ತಾರು", + "47": "ನಲವತ್ತೇಳು", + "48": "ನಲವತ್ತೆಂಟು", + "49": "ನಲವತ್ತೊಂಬತ್ತು", + "50": "ಐವತ್ತು", + "51": "ಐವತ್ತೊಂದು", + "52": "ಐವತ್ತೆರಡು", + "53": "ಐವತ್ತಮೂರು", + "54": "ಐವತ್ತ್ನಾಲ್ಕು", + "55": "ಐವತ್ತೈದು", + "56": "ಐವತ್ತಾರು", + "57": "ಐವತ್ತೇಳು", + "58": "ಐವತ್ತೆಂಟು", + "59": "ಐವತ್ತೊಂಬತ್ತು", + "60": "ಅರವತ್ತು", + "61": "ಅರವತ್ತೊಂದು", + "62": "ಅರವತ್ತೆರಡು", + "63": "ಅರವತ್ತ್ ಮೂರು", + "64": "ಅರವತ್ತ್ ನಾಲ್ಕು", + "65": "ಅರವತ್ತೈದು", + "66": "ಅರವತ್ತಾರು", + "67": "ಅರವತ್ತೇಳು", + "68": "ಅರವತ್ತೆಂಟು", + "69": "ಅರವತ್ತೊಂಬತ್ತು", + "70": "ಎಪ್ಪತ್ತು", + "71": "ಎಪ್ಪತ್ತೊಂದು", + "72": "ಎಪ್ಪತ್ತೆರಡು", + "73": "ಎಪ್ಪತ್ತ್ ಮೂರು", + "74": "ಎಪ್ಪತ್ತ್ ನಾಲ್ಕು", + "75": "ಎಪ್ಪತ್ತೈದು", + "76": "ಎಪ್ಪತ್ತಾರು", + "77": "ಎಪ್ಪತ್ತೇಳು", + "78": "ಎಪ್ಪತ್ತೆಂಟು", + "79": 
"ಎಪ್ಪತ್ತೊಂಬತ್ತು", + "80": "ಎಂಬತ್ತು", + "81": "ಎಂಬತ್ತೊಂದು", + "82": "ಎಂಬತ್ತೆರಡು", + "83": "ಎಂಬತ್ತ್ ಮೂರು", + "84": "ಎಂಬತ್ತ್ ನಾಲ್ಕು", + "85": "ಎಂಬತ್ತೈದು", + "86": "ಎಂಬತ್ತಾರು", + "87": "ಎಂಬತ್ತೇಳು", + "88": "ಎಂಬತ್ತೆಂಟು", + "89": "ಎಂಬತ್ತೊಂಬತ್ತು", + "90": "ತೊಂಬತ್ತು", + "91": "ತೊಂಬತ್ತೊಂದು", + "92": "ತೊಂಬತ್ತೆರಡು", + "93": "ತೊಂಬತ್ತ ಮೂರು", + "94": "ತೊಂಬತ್ತ ನಾಲ್ಕು", + "95": "ತೊಂಬತ್ತೈದು", + "96": "ತೊಂಬತ್ತಾರು", + "97": "ತೊಂಬತ್ತೇಳು", + "98": "ತೊಂಬತ್ತೆಂಟು", + "99": "ತೊಂಬತ್ತೊಂಬತ್ತು", + "100": "ನೂರ", + "1000": "ಸಾವಿರದ", + "100000": "ಲಕ್ಷದ", + "10000000": "ಕೋಟಿ", + "1000000000": "ಶತಕೋಟಿ", +} # Kannada +num_dict["or"] = { + "0": "ଶୁନ୍ୟ", + "1": "ଏକ", + "2": "ଦୁଇ", + "3": "ତିନି", + "4": "ଚାରି", + "5": "ପାଞ୍ଚ", + "6": "ଛଅ", + "7": "ସାତ", + "8": "ଆଠ", + "9": "ନଅ", + "10": "ନଅ", + "11": "ଏଗାର", + "12": "ବାର", + "13": "ତେର", + "14": "ଚଉଦ", + "15": "ପନ୍ଦର", + "16": "ଷୋହଳ", + "17": "ସତର", + "18": "ଅଠର", + "19": "ଊଣାଇଶ", + "20": "କୋଡିଏ", + "21": "ଏକୋଇଶି", + "22": "ବାଇଶି", + "23": "ତେଇଶି", + "24": "ଚବିଶି", + "25": "ପଚିଶି", + "26": "ଛବିଶି", + "27": "ସତାଇଶି", + "28": "ଅଠାଇଶି", + "29": "ଅଣତିରିଶି", + "30": "ତିରିଶି", + "31": "ଏକତିରିଶି", + "32": "ବତିଶି", + "33": "ତେତିଶି", + "34": "ଚଉତିରିଶି", + "35": "ପଞ୍ଚତିରିଶି", + "36": "ଛତିଶି", + "37": "ସଂଇତିରିଶି", + "38": "ଅଠତିରିଶି", + "39": "ଅଣଚାଳିଶି", + "40": "ଚାଳିଶି", + "41": "ଏକଚାଳିଶି", + "42": "ବୟାଳିଶି", + "43": "ତେୟାଳିଶି", + "44": "ଚଉରାଳିଶି", + "45": "ପଞ୍ଚଚାଳିଶି", + "46": "ଛୟାଳିଶି", + "47": "ସତଚାଳିଶି", + "48": "ଅଠଚାଳିଶି", + "49": "ଅଣଚାଶ", + "50": "ପଚାଶ", + "51": "ଏକାବନ", + "52": "ବାଉନ", + "53": "ତେପନ", + "54": "ଚଉବନ", + "55": "ପଞ୍ଚାବନ", + "56": "ଛପନ", + "57": "ସତାବନ", + "58": "ଅଠାବନ", + "59": "ଅଣଷଠି", + "60": "ଷାଠିଏ", + "61": "ଏକଷଠି", + "62": "ବାଷଠି", + "63": "ତେଷଠି", + "64": "ଚଉଷଠି", + "65": "ପଞ୍ଚଷଠି", + "66": "ଛଅଷଠି", + "67": "ସତଷଠି", + "68": "ଅଠଷଠି", + "69": "ଅଣସ୍ତରୀ", + "70": "ସତୂରୀ", + "71": "ଏକସ୍ତରୀ", + "72": "ବାସ୍ତରୀ", + "73": "ତେସ୍ତରୀ", + "74": "ଚଉସ୍ତରୀ", + "75": "ପଞ୍ଚସ୍ତରୀ", + "76": "ଛଅସ୍ତରୀ", + "77": "ସତସ୍ତରୀ", + "78": "ଅଠସ୍ତରୀ", + "79": "ଅଣାଅଶୀ", + "80": "ଅଶୀ", + "81": "ଏକାଅଶୀ", + "82": "ବୟାଅଶୀ", + "83": "ତେୟାଅଶୀ", + "84": "ଚଉରାଅଶୀ", + "85": "ପଞ୍ଚାଅଶୀ", + "86": "ଛୟାଅଶୀ", + "87": "ସତାଅଶୀ", + "88": "ଅଠାଅଶୀ", + "89": "ଅଣାନବେ", + "90": "ନବେ", + "91": "ଏକାନବେ", + "92": "ବୟାନବେ", + "93": "ତେୟାନବେ", + "94": "ଚଉରାନବେ", + "95": "ପଞ୍ଚାନବେ", + "96": "ଛୟାନବେ", + "97": "ସତାନବେ", + "98": "ଅଠାନବେ", + "99": "ଅନେଶତ", + "100": "ଶହେ", + "1000": "ହଜାର", + "100000": "ଲକ୍ଷ", + "10000000": "କୋଟି", + "1000000000": "କୋଟି", +} # Oriya +num_dict["pa"] = { + "0": "ਸਿਫਰ ", + "1": "ਇੱਕ", + "2": "ਦੋ", + "3": "ਤਿੰਨ", + "4": "ਚਾਰ", + "5": "ਪੰਜ", + "6": "ਛੇ", + "7": "ਸੱਤ", + "8": "ਅੱਠ", + "9": "ਨੌਂ", + "10": "ਦੱਸ", + "11": "ਗਿਆਰਾਂ", + "12": "ਬਾਰਾਂ", + "13": "ਤੇਰਾਂ", + "14": "ਚੌਦਾਂ", + "15": "ਪੰਦਰਾਂ", + "16": "ਸੋਲ਼ਾਂ", + "17": "ਸਤਾਰਾਂ", + "18": "ਅਠਾਰਾਂ", + "19": "ਉਨੀ", + "20": "ਵੀਹ", + "21": "ਇੱਕੀ", + "22": "ਬਾਈ", + "23": "ਤੇਈ", + "24": "ਚੌਵੀ", + "25": "ਪੰਝੀ", + "26": "ਛੱਬੀ", + "27": "ਸਤਾਈ", + "28": "ਅਠਾਈ", + "29": "ਉਨੱਤੀ", + "30": "ਤੀਹ", + "31": "ਇਕੱਤੀ", + "32": "ਬੱਤੀ", + "33": "ਤੇਤੀ", + "34": "ਚੌਂਤੀ", + "35": "ਪੈਂਤੀ", + "36": "ਛੱਤੀ", + "37": "ਸੈਂਤੀ", + "38": "ਅਠੱਤੀ", + "39": "ਉਨਤਾਲੀ", + "40": "ਚਾਲੀ", + "41": "ਇਕਤਾਲੀ", + "42": "ਬਤਾਲੀ", + "43": "ਤਰਤਾਲੀ", + "44": "ਚੌਤਾਲੀ", + "45": "ਪੰਜਤਾਲੀ", + "46": "ਛਿਆਲੀ", + "47": "ਸੰਤਾਲੀ", + "48": "ਅੱਠਤਾਲੀ", + "49": "ਉਣਿੰਜਾ", + "50": "ਪੰਜਾਹ", + "51": "ਇਕਵਿੰਜਾ", + "52": "ਬਵਿੰਜਾ", + "53": "ਤਰਵਿੰਜਾ", + "54": "ਚਰਿੰਜਾ", + "55": "ਪਚਵਿੰਜਾ", + "56": "ਛਪਿੰਜਾ", + "57": "ਸਤਵਿੰਜਾ", + "58": "ਅੱਠਵਿੰਜਾ", + "59": "ਉਣਾਠ", + "60": "ਸੱਠ", + "61": "ਇਕਾਠ", + "62": "ਬਾਠ੍ਹ", + "63": "ਤਰੇਠ੍ਹ", 
+ "64": "ਚੌਠ੍ਹ", + "65": "ਪੈਂਠ", + "66": "ਛਿਆਠ", + "67": "ਸਤਾਹਠ", + "68": "ਅੱਠਾਠ", + "69": "ਉਣੱਤਰ", + "70": "ਸੱਤਰ", + "71": "ਇਕ੍ਹੱਤਰ", + "72": "ਬਹੱਤਰ", + "73": "ਤਹੱਤਰ", + "74": "ਚੌਹੱਤਰ", + "75": "ਪੰਜੱਤਰ", + "76": "ਛਿਹੱਤਰ", + "77": "ਸਤੱਤਰ", + "78": "ਅਠੱਤਰ", + "79": "ਉਣਾਸੀ", + "80": "ਅੱਸੀ", + "81": "ਇਕਾਸੀ", + "82": "ਬਿਆਸੀ", + "83": "ਤਰਾਸੀ", + "84": "ਚਰਾਸੀ", + "85": "ਪੰਜਾਸੀ", + "86": "ਛਿਆਸੀ", + "87": "ਸਤਾਸੀ", + "88": "ਅਠਾਸੀ", + "89": "ਉਣਾਨਵੇਂ", + "90": "ਨੱਬੇ", + "91": "ਇਕਾਨਵੇਂ", + "92": "ਬਿਆਨਵੇਂ", + "93": "ਤਰਾਨਵੇਂ", + "94": "ਚਰਾਨਵੇਂ", + "95": "ਪਚਾਨਵੇਂ", + "96": "ਛਿਆਨਵੇਂ", + "97": "ਸਤਾਨਵੇਂ", + "98": "ਅਠਾਨਵੇਂ", + "99": "ਨਿੜਾਨਵੇਂ", + "100": "ਸੌ", + "1000": "ਹਜਾਰ", + "100000": "ਲੱਖ", + "10000000": "ਕਰੋੜ", + "1000000000": "ਅਰਬ", +} # Punjabi + +# --------------------------- num_to_word.py ------------------------------ +""" +Method to convert Numbers to Words +for indian languages + +Use cases:- +1) Speech recognition pre-processing +2) Language modeling Data pre-processing + +------------------------- +check indic_numbers.py to add support +for any indian language +""" + + +def language_specific_exception(words, lang, combiner): + """ + Language Specific Exception will come here + """ + + def occurs_at_end(piece): + return words[-len(piece) :] == piece + + if lang == "mr": + words = words.replace("एक" + combiner + "शे", "शंभर") + elif lang == "gu": + words = words.replace("બે" + combiner + "સો", "બસ્સો") + elif lang == "te": + exception_dict = { + "1": "ఒక", + "100": "వంద", + "100+": "వందలు", + "1000": "వెయ్యి", + "1000+": "వేలు", + "100000": "లక్ష", + "100000+": "లక్షలు", + "10000000": "కోటి", + "10000000+": "కోట్లు", + } + + test_case = ["100", "1000", "100000", "10000000"] + for test in test_case: + test_word = num_dict["te"][test] + match = num_dict["te"]["1"] + combiner + test_word + # for numbers like : 100, 1000, 100000 + if words == match: + return exception_dict[test] + # for numbers like : 200, 4000, 800000 + elif occurs_at_end(test_word): + words = words.replace(test_word, exception_dict[test + "+"]) + # for numbers like : 105, 1076, 123993 + elif not occurs_at_end(match): + replacement = exception_dict["1"] + combiner + exception_dict[test] + words = words.replace(match, replacement) + + # Exception case for 101...199 + special_case = "ఒక" + combiner + "వంద" + words = words.replace(special_case, "నూట") + elif lang == "kn": + # special case for 100 + if words == ("ಒಂದು" + combiner + "ನೂರ"): + return "ನೂರು" + exception_dict = { + "ನೂರ": "ನೂರು", + "ಸಾವಿರದ": "ಸಾವಿರ", + "ಲಕ್ಷದ": "ಲಕ್ಷ", + "ಕೋಟಿಯ": "ಕೋಟಿ", + } + for expt in exception_dict: + if occurs_at_end(expt): + words = words.replace(expt, exception_dict[expt]) + return words + + +def num_to_word(num, lang, separator=", ", combiner=" "): + """ + Main Method + :param num: Number digits from any indian language + :param lang: Language Code from supported Language + :param separator: Separator character i.e. separator = '-' --> 'two hundred-sixty' + :param combiner: combine number with position i.e. 
combiner = '-' --> 'two-hundred sixty' + :return: UTF-8 String of numbers in words + """ + lang = lang.lower() + num = str(num) + + # Load dictionary according to language code + assert lang in supported_lang, "Language not supported" + num_dic = num_dict[lang] + + # dash default combiner for english-india + if (lang == "en") & (combiner == " "): + combiner = "-" + + # Remove punctuations from numbers + num = str(num).replace(",", "").replace(" ", "") + + # return word as it is if not number + if not num.isdecimal(): + return num + + # Replace native language numbers with english digits + for language in supported_lang: + for num_index in range(10): + num = num.replace(all_num[language][num_index], all_num["en"][num_index]) + + # Assert that input contains only integer number + for digit in num: + assert digit in all_num["en"], "Give proper input" + + # Process + # For Number longer than 9 digits + def all_two_digit(digits_2): + if len(digits_2) <= 1: # Provided only one/zero digit + return num_dic.get(digits_2, "") + elif digits_2 == "00": # Two Zero provided + return num_dic["0"] + separator + num_dic["0"] + elif digits_2[0] == "0": # First digit is zero + return num_dic["0"] + separator + num_dic[digits_2[1]] + else: # Both digit provided + return num_dic[digits_2] + + # For Number less than 9 digits + def two_digit(digits_2): + digits_2 = digits_2.lstrip("0") + if len(digits_2) != 0: + return num_dic[digits_2] + else: + return "" + + def all_digit(digits): + digits = digits.lstrip("0") + digit_len = len(digits) + if digit_len > 3: + num_of_digits_to_process = (digit_len % 2) + 1 + process_digits = digits[:num_of_digits_to_process] + base = str(10 ** (int(digit_len / 2) * 2 - 1)) + remain_digits = digits[num_of_digits_to_process:] + return ( + num_dic[process_digits] + + combiner + + num_dic[base] + + separator + + all_digit(remain_digits) + ) + elif len(digits) == 3: + return ( + num_dic[digits[:1]] + + combiner + + num_dic["100"] + + separator + + two_digit(digits[1:]) + ) + else: + return two_digit(digits) + + num = num.lstrip("0") + full_digit_len = len(num) + + if full_digit_len == 0: + output = num_dic["0"] + elif full_digit_len <= 9: + output = all_digit(num) + else: + iteration = round(full_digit_len / 2) + output = all_two_digit(num[:2]) # First to digit + for i in range(1, iteration): + output = ( + output + separator + all_two_digit(num[i * 2 : (i + 1) * 2]) + ) # Next two digit pairs + remaining_digits = num[iteration * 2 :] + if not all_two_digit(remaining_digits) == "": + output = ( + output + separator + all_two_digit(remaining_digits) + ) # remaining Last one/two digits + + output = output.strip(separator) + + output = language_specific_exception(output, lang, combiner) + + return output + + +# --------------------------------- num_to_word_on_a_sent --------------------------------- + + +def is_digit(word, digit_pattern): + return re.search(digit_pattern, word) + + +def remove_punct(sent): + clean = re.sub("[%s]" % re.escape(string.punctuation), " ", sent) + return " ".join([word for word in clean.split() if word]) + + +def normalize_nums(text, lang): + """ + text: str (eg) + lang: lang code ['en', 'hi'] + + returns: str + (eg) + """ + + if lang in supported_lang: + text = text.replace('-',' - ') # space separate hyphen + words = text.split() + lang_digits = [str(i) for i in range(0, 10)] + + digit_pattern = "[" + "".join(lang_digits) + "]" + num_indices = [ + ind for ind, word in enumerate(words) if is_digit(word, digit_pattern) + ] + + words_up = [ + num_to_word(word, 
lang, separator=" ", combiner=" ") + if ind in num_indices + else word + for ind, word in enumerate(words) + ] + return " ".join(words_up) + else: + return text + + +if __name__ == "__main__": + print(normalize_nums("रीटा के पास 16 बिल्लियाँ हैं।", "hi")) diff --git a/vakyansh-tts/tts_infer/requirements.txt b/vakyansh-tts/tts_infer/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbbb8fff6386b8b986fd69a4328cbd93fdc4ce6a --- /dev/null +++ b/vakyansh-tts/tts_infer/requirements.txt @@ -0,0 +1,6 @@ +# will be installed with main setup.py, no need to reinstall + +ai4bharat-transliteration==0.5.0.3 +numpy==1.19.5 +pandas +pydload \ No newline at end of file diff --git a/vakyansh-tts/tts_infer/transliterate.py b/vakyansh-tts/tts_infer/transliterate.py new file mode 100644 index 0000000000000000000000000000000000000000..575430562683434cd44fd8d2e77d26dab9ced73b --- /dev/null +++ b/vakyansh-tts/tts_infer/transliterate.py @@ -0,0 +1,919 @@ +import torch +import torch.nn as nn +import numpy as np +import pandas as pd +import random +import sys +import os +import json +import enum +import traceback +import re + +F_DIR = os.path.dirname(os.environ.get('translit_model_base_path', os.path.realpath(__file__))) + + +class XlitError(enum.Enum): + lang_err = "Unsupported langauge ID requested ;( Please check available languages." + string_err = "String passed is incompatable ;(" + internal_err = "Internal crash ;(" + unknown_err = "Unknown Failure" + loading_err = "Loading failed ;( Check if metadata/paths are correctly configured." + + +##=================== Network ================================================== + + +class Encoder(nn.Module): + def __init__( + self, + input_dim, + embed_dim, + hidden_dim, + rnn_type="gru", + layers=1, + bidirectional=False, + dropout=0, + device="cpu", + ): + super(Encoder, self).__init__() + + self.input_dim = input_dim # src_vocab_sz + self.enc_embed_dim = embed_dim + self.enc_hidden_dim = hidden_dim + self.enc_rnn_type = rnn_type + self.enc_layers = layers + self.enc_directions = 2 if bidirectional else 1 + self.device = device + + self.embedding = nn.Embedding(self.input_dim, self.enc_embed_dim) + + if self.enc_rnn_type == "gru": + self.enc_rnn = nn.GRU( + input_size=self.enc_embed_dim, + hidden_size=self.enc_hidden_dim, + num_layers=self.enc_layers, + bidirectional=bidirectional, + ) + elif self.enc_rnn_type == "lstm": + self.enc_rnn = nn.LSTM( + input_size=self.enc_embed_dim, + hidden_size=self.enc_hidden_dim, + num_layers=self.enc_layers, + bidirectional=bidirectional, + ) + else: + raise Exception("XlitError: unknown RNN type mentioned") + + def forward(self, x, x_sz, hidden=None): + """ + x_sz: (batch_size, 1) - Unpadded sequence lengths used for pack_pad + """ + batch_sz = x.shape[0] + # x: batch_size, max_length, enc_embed_dim + x = self.embedding(x) + + ## pack the padded data + # x: max_length, batch_size, enc_embed_dim -> for pack_pad + x = x.permute(1, 0, 2) + x = nn.utils.rnn.pack_padded_sequence(x, x_sz, enforce_sorted=False) # unpad + + # output: packed_size, batch_size, enc_embed_dim + # hidden: n_layer**num_directions, batch_size, hidden_dim | if LSTM (h_n, c_n) + output, hidden = self.enc_rnn( + x + ) # gru returns hidden state of all timesteps as well as hidden state at last timestep + + ## pad the sequence to the max length in the batch + # output: max_length, batch_size, enc_emb_dim*directions) + output, _ = nn.utils.rnn.pad_packed_sequence(output) + + # output: batch_size, max_length, hidden_dim + output = 
output.permute(1, 0, 2) + + return output, hidden + + def get_word_embedding(self, x): + """ """ + x_sz = torch.tensor([len(x)]) + x_ = torch.tensor(x).unsqueeze(0).to(dtype=torch.long) + # x: 1, max_length, enc_embed_dim + x = self.embedding(x_) + + ## pack the padded data + # x: max_length, 1, enc_embed_dim -> for pack_pad + x = x.permute(1, 0, 2) + x = nn.utils.rnn.pack_padded_sequence(x, x_sz, enforce_sorted=False) # unpad + + # output: packed_size, 1, enc_embed_dim + # hidden: n_layer**num_directions, 1, hidden_dim | if LSTM (h_n, c_n) + output, hidden = self.enc_rnn( + x + ) # gru returns hidden state of all timesteps as well as hidden state at last timestep + + out_embed = hidden[0].squeeze() + + return out_embed + + +class Decoder(nn.Module): + def __init__( + self, + output_dim, + embed_dim, + hidden_dim, + rnn_type="gru", + layers=1, + use_attention=True, + enc_outstate_dim=None, # enc_directions * enc_hidden_dim + dropout=0, + device="cpu", + ): + super(Decoder, self).__init__() + + self.output_dim = output_dim # tgt_vocab_sz + self.dec_hidden_dim = hidden_dim + self.dec_embed_dim = embed_dim + self.dec_rnn_type = rnn_type + self.dec_layers = layers + self.use_attention = use_attention + self.device = device + if self.use_attention: + self.enc_outstate_dim = enc_outstate_dim if enc_outstate_dim else hidden_dim + else: + self.enc_outstate_dim = 0 + + self.embedding = nn.Embedding(self.output_dim, self.dec_embed_dim) + + if self.dec_rnn_type == "gru": + self.dec_rnn = nn.GRU( + input_size=self.dec_embed_dim + + self.enc_outstate_dim, # to concat attention_output + hidden_size=self.dec_hidden_dim, # previous Hidden + num_layers=self.dec_layers, + batch_first=True, + ) + elif self.dec_rnn_type == "lstm": + self.dec_rnn = nn.LSTM( + input_size=self.dec_embed_dim + + self.enc_outstate_dim, # to concat attention_output + hidden_size=self.dec_hidden_dim, # previous Hidden + num_layers=self.dec_layers, + batch_first=True, + ) + else: + raise Exception("XlitError: unknown RNN type mentioned") + + self.fc = nn.Sequential( + nn.Linear(self.dec_hidden_dim, self.dec_embed_dim), + nn.LeakyReLU(), + # nn.Linear(self.dec_embed_dim, self.dec_embed_dim), nn.LeakyReLU(), # removing to reduce size + nn.Linear(self.dec_embed_dim, self.output_dim), + ) + + ##----- Attention ---------- + if self.use_attention: + self.W1 = nn.Linear(self.enc_outstate_dim, self.dec_hidden_dim) + self.W2 = nn.Linear(self.dec_hidden_dim, self.dec_hidden_dim) + self.V = nn.Linear(self.dec_hidden_dim, 1) + + def attention(self, x, hidden, enc_output): + """ + x: (batch_size, 1, dec_embed_dim) -> after Embedding + enc_output: batch_size, max_length, enc_hidden_dim *num_directions + hidden: n_layers, batch_size, hidden_size | if LSTM (h_n, c_n) + """ + + ## perform addition to calculate the score + + # hidden_with_time_axis: batch_size, 1, hidden_dim + ## hidden_with_time_axis = hidden.permute(1, 0, 2) ## replaced with below 2lines + hidden_with_time_axis = ( + torch.sum(hidden, axis=0) + if self.dec_rnn_type != "lstm" + else torch.sum(hidden[0], axis=0) + ) # h_n + + hidden_with_time_axis = hidden_with_time_axis.unsqueeze(1) + + # score: batch_size, max_length, hidden_dim + score = torch.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)) + + # attention_weights: batch_size, max_length, 1 + # we get 1 at the last axis because we are applying score to self.V + attention_weights = torch.softmax(self.V(score), dim=1) + + # context_vector shape after sum == (batch_size, hidden_dim) + context_vector = attention_weights * 
enc_output + context_vector = torch.sum(context_vector, dim=1) + # context_vector: batch_size, 1, hidden_dim + context_vector = context_vector.unsqueeze(1) + + # attend_out (batch_size, 1, dec_embed_dim + hidden_size) + attend_out = torch.cat((context_vector, x), -1) + + return attend_out, attention_weights + + def forward(self, x, hidden, enc_output): + """ + x: (batch_size, 1) + enc_output: batch_size, max_length, dec_embed_dim + hidden: n_layer, batch_size, hidden_size | lstm: (h_n, c_n) + """ + if (hidden is None) and (self.use_attention is False): + raise Exception( + "XlitError: No use of a decoder with No attention and No Hidden" + ) + + batch_sz = x.shape[0] + + if hidden is None: + # hidden: n_layers, batch_size, hidden_dim + hid_for_att = torch.zeros( + (self.dec_layers, batch_sz, self.dec_hidden_dim) + ).to(self.device) + elif self.dec_rnn_type == "lstm": + hid_for_att = hidden[1] # c_n + + # x (batch_size, 1, dec_embed_dim) -> after embedding + x = self.embedding(x) + + if self.use_attention: + # x (batch_size, 1, dec_embed_dim + hidden_size) -> after attention + # aw: (batch_size, max_length, 1) + x, aw = self.attention(x, hidden, enc_output) + else: + x, aw = x, 0 + + # passing the concatenated vector to the GRU + # output: (batch_size, n_layers, hidden_size) + # hidden: n_layers, batch_size, hidden_size | if LSTM (h_n, c_n) + output, hidden = ( + self.dec_rnn(x, hidden) if hidden is not None else self.dec_rnn(x) + ) + + # output :shp: (batch_size * 1, hidden_size) + output = output.view(-1, output.size(2)) + + # output :shp: (batch_size * 1, output_dim) + output = self.fc(output) + + return output, hidden, aw + + +class Seq2Seq(nn.Module): + """ + Class dependency: Encoder, Decoder + """ + + def __init__( + self, encoder, decoder, pass_enc2dec_hid=False, dropout=0, device="cpu" + ): + super(Seq2Seq, self).__init__() + + self.encoder = encoder + self.decoder = decoder + self.device = device + self.pass_enc2dec_hid = pass_enc2dec_hid + _force_en2dec_hid_conv = False + + if self.pass_enc2dec_hid: + assert ( + decoder.dec_hidden_dim == encoder.enc_hidden_dim + ), "Hidden Dimension of encoder and decoder must be same, or unset `pass_enc2dec_hid`" + if decoder.use_attention: + assert ( + decoder.enc_outstate_dim + == encoder.enc_directions * encoder.enc_hidden_dim + ), "Set `enc_out_dim` correctly in decoder" + assert ( + self.pass_enc2dec_hid or decoder.use_attention + ), "No use of a decoder with No attention and No Hidden from Encoder" + + self.use_conv_4_enc2dec_hid = False + if ( + self.pass_enc2dec_hid + and (encoder.enc_directions * encoder.enc_layers != decoder.dec_layers) + ) or _force_en2dec_hid_conv: + if encoder.enc_rnn_type == "lstm" or encoder.enc_rnn_type == "lstm": + raise Exception( + "XlitError: conv for enc2dec_hid not implemented; Change the layer numbers appropriately" + ) + + self.use_conv_4_enc2dec_hid = True + self.enc_hid_1ax = encoder.enc_directions * encoder.enc_layers + self.dec_hid_1ax = decoder.dec_layers + self.e2d_hidden_conv = nn.Conv1d(self.enc_hid_1ax, self.dec_hid_1ax, 1) + + def enc2dec_hidden(self, enc_hidden): + """ + enc_hidden: n_layer, batch_size, hidden_dim*num_directions + TODO: Implement the logic for LSTm bsed model + """ + # hidden: batch_size, enc_layer*num_directions, enc_hidden_dim + hidden = enc_hidden.permute(1, 0, 2).contiguous() + # hidden: batch_size, dec_layers, dec_hidden_dim -> [N,C,Tstep] + hidden = self.e2d_hidden_conv(hidden) + + # hidden: dec_layers, batch_size , dec_hidden_dim + hidden_for_dec = hidden.permute(1, 0, 
2).contiguous() + + return hidden_for_dec + + def active_beam_inference(self, src, beam_width=3, max_tgt_sz=50): + """Search based decoding + src: (sequence_len) + """ + + def _avg_score(p_tup): + """Used for Sorting + TODO: Dividing by length of sequence power alpha as hyperparam + """ + return p_tup[0] + + import sys + + batch_size = 1 + start_tok = src[0] + end_tok = src[-1] + src_sz = torch.tensor([len(src)]) + src_ = src.unsqueeze(0) + + # enc_output: (batch_size, padded_seq_length, enc_hidden_dim*num_direction) + # enc_hidden: (enc_layers*num_direction, batch_size, hidden_dim) + enc_output, enc_hidden = self.encoder(src_, src_sz) + + if self.pass_enc2dec_hid: + # dec_hidden: dec_layers, batch_size , dec_hidden_dim + if self.use_conv_4_enc2dec_hid: + init_dec_hidden = self.enc2dec_hidden(enc_hidden) + else: + init_dec_hidden = enc_hidden + else: + # dec_hidden -> Will be initialized to zeros internally + init_dec_hidden = None + + # top_pred[][0] = Σ-log_softmax + # top_pred[][1] = sequence torch.tensor shape: (1) + # top_pred[][2] = dec_hidden + top_pred_list = [(0, start_tok.unsqueeze(0), init_dec_hidden)] + + for t in range(max_tgt_sz): + cur_pred_list = [] + + for p_tup in top_pred_list: + if p_tup[1][-1] == end_tok: + cur_pred_list.append(p_tup) + continue + + # dec_hidden: dec_layers, 1, hidden_dim + # dec_output: 1, output_dim + dec_output, dec_hidden, _ = self.decoder( + x=p_tup[1][-1].view(1, 1), # dec_input: (1,1) + hidden=p_tup[2], + enc_output=enc_output, + ) + + ## π{prob} = Σ{log(prob)} -> to prevent diminishing + # dec_output: (1, output_dim) + dec_output = nn.functional.log_softmax(dec_output, dim=1) + # pred_topk.values & pred_topk.indices: (1, beam_width) + pred_topk = torch.topk(dec_output, k=beam_width, dim=1) + + for i in range(beam_width): + sig_logsmx_ = p_tup[0] + pred_topk.values[0][i] + # seq_tensor_ : (seq_len) + seq_tensor_ = torch.cat((p_tup[1], pred_topk.indices[0][i].view(1))) + + cur_pred_list.append((sig_logsmx_, seq_tensor_, dec_hidden)) + + cur_pred_list.sort(key=_avg_score, reverse=True) # Maximized order + top_pred_list = cur_pred_list[:beam_width] + + # check if end_tok of all topk + end_flags_ = [1 if t[1][-1] == end_tok else 0 for t in top_pred_list] + if beam_width == sum(end_flags_): + break + + pred_tnsr_list = [t[1] for t in top_pred_list] + + return pred_tnsr_list + + +##===================== Glyph handlers ======================================= + + +class GlyphStrawboss: + def __init__(self, glyphs="en"): + """list of letters in a language in unicode + lang: ISO Language code + glyphs: json file with script information + """ + if glyphs == "en": + # Smallcase alone + self.glyphs = [chr(alpha) for alpha in range(97, 122 + 1)] + else: + self.dossier = json.load(open(glyphs, encoding="utf-8")) + self.glyphs = self.dossier["glyphs"] + self.numsym_map = self.dossier["numsym_map"] + + self.char2idx = {} + self.idx2char = {} + self._create_index() + + def _create_index(self): + + self.char2idx["_"] = 0 # pad + self.char2idx["$"] = 1 # start + self.char2idx["#"] = 2 # end + self.char2idx["*"] = 3 # Mask + self.char2idx["'"] = 4 # apostrophe U+0027 + self.char2idx["%"] = 5 # unused + self.char2idx["!"] = 6 # unused + + # letter to index mapping + for idx, char in enumerate(self.glyphs): + self.char2idx[char] = idx + 7 # +7 token initially + + # index to letter mapping + for char, idx in self.char2idx.items(): + self.idx2char[idx] = char + + def size(self): + return len(self.char2idx) + + def word2xlitvec(self, word): + """Converts given string of 
gyphs(word) to vector(numpy) + Also adds tokens for start and end + """ + try: + vec = [self.char2idx["$"]] # start token + for i in list(word): + vec.append(self.char2idx[i]) + vec.append(self.char2idx["#"]) # end token + + vec = np.asarray(vec, dtype=np.int64) + return vec + + except Exception as error: + print("XlitError: In word:", word, "Error Char not in Token:", error) + sys.exit() + + def xlitvec2word(self, vector): + """Converts vector(numpy) to string of glyphs(word)""" + char_list = [] + for i in vector: + char_list.append(self.idx2char[i]) + + word = "".join(char_list).replace("$", "").replace("#", "") # remove tokens + word = word.replace("_", "").replace("*", "") # remove tokens + return word + + +class VocabSanitizer: + def __init__(self, data_file): + """ + data_file: path to file conatining vocabulary list + """ + extension = os.path.splitext(data_file)[-1] + if extension == ".json": + self.vocab_set = set(json.load(open(data_file, encoding="utf-8"))) + elif extension == ".csv": + self.vocab_df = pd.read_csv(data_file).set_index("WORD") + self.vocab_set = set(self.vocab_df.index) + else: + print("XlitError: Only Json/CSV file extension supported") + + def reposition(self, word_list): + """Reorder Words in list""" + new_list = [] + temp_ = word_list.copy() + for v in word_list: + if v in self.vocab_set: + new_list.append(v) + temp_.remove(v) + new_list.extend(temp_) + + return new_list + + +##=============== INSTANTIATION ================================================ + + +class XlitPiston: + """ + For handling prediction & post-processing of transliteration for a single language + Class dependency: Seq2Seq, GlyphStrawboss, VocabSanitizer + Global Variables: F_DIR + """ + + def __init__( + self, + weight_path, + vocab_file, + tglyph_cfg_file, + iglyph_cfg_file="en", + device="cpu", + ): + + self.device = device + self.in_glyph_obj = GlyphStrawboss(iglyph_cfg_file) + self.tgt_glyph_obj = GlyphStrawboss(glyphs=tglyph_cfg_file) + self.voc_sanity = VocabSanitizer(vocab_file) + + self._numsym_set = set( + json.load(open(tglyph_cfg_file, encoding="utf-8"))["numsym_map"].keys() + ) + self._inchar_set = set("abcdefghijklmnopqrstuvwxyz") + self._natscr_set = set().union( + self.tgt_glyph_obj.glyphs, sum(self.tgt_glyph_obj.numsym_map.values(), []) + ) + + ## Model Config Static TODO: add defining in json support + input_dim = self.in_glyph_obj.size() + output_dim = self.tgt_glyph_obj.size() + enc_emb_dim = 300 + dec_emb_dim = 300 + enc_hidden_dim = 512 + dec_hidden_dim = 512 + rnn_type = "lstm" + enc2dec_hid = True + attention = True + enc_layers = 1 + dec_layers = 2 + m_dropout = 0 + enc_bidirect = True + enc_outstate_dim = enc_hidden_dim * (2 if enc_bidirect else 1) + + enc = Encoder( + input_dim=input_dim, + embed_dim=enc_emb_dim, + hidden_dim=enc_hidden_dim, + rnn_type=rnn_type, + layers=enc_layers, + dropout=m_dropout, + device=self.device, + bidirectional=enc_bidirect, + ) + dec = Decoder( + output_dim=output_dim, + embed_dim=dec_emb_dim, + hidden_dim=dec_hidden_dim, + rnn_type=rnn_type, + layers=dec_layers, + dropout=m_dropout, + use_attention=attention, + enc_outstate_dim=enc_outstate_dim, + device=self.device, + ) + self.model = Seq2Seq(enc, dec, pass_enc2dec_hid=enc2dec_hid, device=self.device) + self.model = self.model.to(self.device) + weights = torch.load(weight_path, map_location=torch.device(self.device)) + + self.model.load_state_dict(weights) + self.model.eval() + + def character_model(self, word, beam_width=1): + in_vec = 
torch.from_numpy(self.in_glyph_obj.word2xlitvec(word)).to(self.device) + ## change to active or passive beam + p_out_list = self.model.active_beam_inference(in_vec, beam_width=beam_width) + p_result = [ + self.tgt_glyph_obj.xlitvec2word(out.cpu().numpy()) for out in p_out_list + ] + + result = self.voc_sanity.reposition(p_result) + + # List type + return result + + def numsym_model(self, seg): + """tgt_glyph_obj.numsym_map[x] returns a list object""" + if len(seg) == 1: + return [seg] + self.tgt_glyph_obj.numsym_map[seg] + + a = [self.tgt_glyph_obj.numsym_map[n][0] for n in seg] + return [seg] + ["".join(a)] + + def _word_segementer(self, sequence): + + sequence = sequence.lower() + accepted = set().union(self._numsym_set, self._inchar_set, self._natscr_set) + # sequence = ''.join([i for i in sequence if i in accepted]) + + segment = [] + idx = 0 + seq_ = list(sequence) + while len(seq_): + # for Number-Symbol + temp = "" + while len(seq_) and seq_[0] in self._numsym_set: + temp += seq_[0] + seq_.pop(0) + if temp != "": + segment.append(temp) + + # for Target Chars + temp = "" + while len(seq_) and seq_[0] in self._natscr_set: + temp += seq_[0] + seq_.pop(0) + if temp != "": + segment.append(temp) + + # for Input-Roman Chars + temp = "" + while len(seq_) and seq_[0] in self._inchar_set: + temp += seq_[0] + seq_.pop(0) + if temp != "": + segment.append(temp) + + temp = "" + while len(seq_) and seq_[0] not in accepted: + temp += seq_[0] + seq_.pop(0) + if temp != "": + segment.append(temp) + + return segment + + def inferencer(self, sequence, beam_width=10): + + seg = self._word_segementer(sequence[:120]) + lit_seg = [] + + p = 0 + while p < len(seg): + if seg[p][0] in self._natscr_set: + lit_seg.append([seg[p]]) + p += 1 + + elif seg[p][0] in self._inchar_set: + lit_seg.append(self.character_model(seg[p], beam_width=beam_width)) + p += 1 + + elif seg[p][0] in self._numsym_set: # num & punc + lit_seg.append(self.numsym_model(seg[p])) + p += 1 + else: + lit_seg.append([seg[p]]) + p += 1 + + ## IF segment less/equal to 2 then return combinotorial, + ## ELSE only return top1 of each result concatenated + if len(lit_seg) == 1: + final_result = lit_seg[0] + + elif len(lit_seg) == 2: + final_result = [""] + for seg in lit_seg: + new_result = [] + for s in seg: + for f in final_result: + new_result.append(f + s) + final_result = new_result + + else: + new_result = [] + for seg in lit_seg: + new_result.append(seg[0]) + final_result = ["".join(new_result)] + + return final_result + + +from collections.abc import Iterable +from pydload import dload +import zipfile + +MODEL_DOWNLOAD_URL_PREFIX = "https://github.com/AI4Bharat/IndianNLP-Transliteration/releases/download/xlit_v0.5.0/" + + +def is_folder_writable(folder): + try: + os.makedirs(folder, exist_ok=True) + tmp_file = os.path.join(folder, ".write_test") + with open(tmp_file, "w") as f: + f.write("Permission Check") + os.remove(tmp_file) + return True + except: + return False + + +def is_directory_writable(path): + if os.name == "nt": + return is_folder_writable(path) + return os.access(path, os.W_OK | os.X_OK) + + +class XlitEngine: + """ + For Managing the top level tasks and applications of transliteration + Global Variables: F_DIR + """ + + def __init__( + self, lang2use="all", config_path="translit_models/default_lineup.json" + ): + + lineup = json.load(open(os.path.join(F_DIR, config_path), encoding="utf-8")) + self.lang_config = {} + if isinstance(lang2use, str): + if lang2use == "all": + self.lang_config = lineup + elif lang2use in lineup: 
+ self.lang_config[lang2use] = lineup[lang2use] + else: + raise Exception( + "XlitError: The entered Langauge code not found. Available are {}".format( + lineup.keys() + ) + ) + + elif isinstance(lang2use, Iterable): + for l in lang2use: + try: + self.lang_config[l] = lineup[l] + except: + print( + "XlitError: Language code {} not found, Skipping...".format(l) + ) + else: + raise Exception( + "XlitError: lang2use must be a list of language codes (or) string of single language code" + ) + + if is_directory_writable(F_DIR): + models_path = os.path.join(F_DIR, "translit_models") + else: + user_home = os.path.expanduser("~") + models_path = os.path.join(user_home, ".AI4Bharat_Xlit_Models") + os.makedirs(models_path, exist_ok=True) + self.download_models(models_path) + + self.langs = {} + self.lang_model = {} + for la in self.lang_config: + try: + print("Loading {}...".format(la)) + self.lang_model[la] = XlitPiston( + weight_path=os.path.join( + models_path, self.lang_config[la]["weight"] + ), + vocab_file=os.path.join(models_path, self.lang_config[la]["vocab"]), + tglyph_cfg_file=os.path.join( + models_path, self.lang_config[la]["script"] + ), + iglyph_cfg_file="en", + ) + self.langs[la] = self.lang_config[la]["name"] + except Exception as error: + print("XlitError: Failure in loading {} \n".format(la), error) + print(XlitError.loading_err.value) + + def download_models(self, models_path): + """ + Download models from GitHub Releases if not exists + """ + for l in self.lang_config: + lang_name = self.lang_config[l]["eng_name"] + lang_model_path = os.path.join(models_path, lang_name) + if not os.path.isdir(lang_model_path): + print("Downloading model for language: %s" % lang_name) + remote_url = MODEL_DOWNLOAD_URL_PREFIX + lang_name + ".zip" + downloaded_zip_path = os.path.join(models_path, lang_name + ".zip") + dload(url=remote_url, save_to_path=downloaded_zip_path, max_time=None) + + if not os.path.isfile(downloaded_zip_path): + exit( + f"ERROR: Unable to download model from {remote_url} into {models_path}" + ) + + with zipfile.ZipFile(downloaded_zip_path, "r") as zip_ref: + zip_ref.extractall(models_path) + + if os.path.isdir(lang_model_path): + os.remove(downloaded_zip_path) + else: + exit( + f"ERROR: Unable to find models in {lang_model_path} after download" + ) + return + + def translit_word(self, eng_word, lang_code="default", topk=7, beam_width=10): + if eng_word == "": + return [] + + if lang_code in self.langs: + try: + res_list = self.lang_model[lang_code].inferencer( + eng_word, beam_width=beam_width + ) + return res_list[:topk] + + except Exception as error: + print("XlitError:", traceback.format_exc()) + print(XlitError.internal_err.value) + return XlitError.internal_err + + elif lang_code == "default": + try: + res_dict = {} + for la in self.lang_model: + res = self.lang_model[la].inferencer( + eng_word, beam_width=beam_width + ) + res_dict[la] = res[:topk] + return res_dict + + except Exception as error: + print("XlitError:", traceback.format_exc()) + print(XlitError.internal_err.value) + return XlitError.internal_err + + else: + print("XlitError: Unknown Langauge requested", lang_code) + print(XlitError.lang_err.value) + return XlitError.lang_err + + def translit_sentence(self, eng_sentence, lang_code="default", beam_width=10): + if eng_sentence == "": + return [] + + if lang_code in self.langs: + try: + out_str = "" + for word in eng_sentence.split(): + res_ = self.lang_model[lang_code].inferencer( + word, beam_width=beam_width + ) + out_str = out_str + res_[0] + " " + return 
out_str[:-1] + + except Exception as error: + print("XlitError:", traceback.format_exc()) + print(XlitError.internal_err.value) + return XlitError.internal_err + + elif lang_code == "default": + try: + res_dict = {} + for la in self.lang_model: + out_str = "" + for word in eng_sentence.split(): + res_ = self.lang_model[la].inferencer( + word, beam_width=beam_width + ) + out_str = out_str + res_[0] + " " + res_dict[la] = out_str[:-1] + return res_dict + + except Exception as error: + print("XlitError:", traceback.format_exc()) + print(XlitError.internal_err.value) + return XlitError.internal_err + + else: + print("XlitError: Unknown Langauge requested", lang_code) + print(XlitError.lang_err.value) + return XlitError.lang_err + + +if __name__ == "__main__": + + available_lang = [ + "bn", + "gu", + "hi", + "kn", + "gom", + "mai", + "ml", + "mr", + "pa", + "sd", + "si", + "ta", + "te", + "ur", + ] + + reg = re.compile(r"[a-zA-Z]") + lang = "hi" + engine = XlitEngine( + lang + ) # if you don't specify lang code here, this will give results in all langs available + sent = "Hello World! ABCD क्या हाल है आपका?" + words = [ + engine.translit_word(word, topk=1)[lang][0] if reg.match(word) else word + for word in sent.split() + ] # only transliterated en words, leaves rest as it is + updated_sent = " ".join(words) + + print(updated_sent) + + # output : हेलो वर्ल्ड! क्या हाल है आपका? + + # y = engine.translit_sentence("Hello World !")['hi'] + # print(y) diff --git a/vakyansh-tts/tts_infer/tts.py b/vakyansh-tts/tts_infer/tts.py new file mode 100644 index 0000000000000000000000000000000000000000..b373de8d62ce4aeb6ba5db5a07e8b018c347217b --- /dev/null +++ b/vakyansh-tts/tts_infer/tts.py @@ -0,0 +1,158 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +from typing import Tuple +import sys +from argparse import ArgumentParser + +import torch +import numpy as np +import os +import json +import torch + +sys.path.append(os.path.join(os.path.dirname(__file__), "../src/glow_tts")) + +from scipy.io.wavfile import write +from hifi.env import AttrDict +from hifi.models import Generator + + +from text import text_to_sequence +import commons +import models +import utils + + +def check_directory(dir): + if not os.path.exists(dir): + sys.exit("Error: {} directory does not exist".format(dir)) + + +class TextToMel: + def __init__(self, glow_model_dir, device="cuda"): + self.glow_model_dir = glow_model_dir + check_directory(self.glow_model_dir) + self.device = device + self.hps, self.glow_tts_model = self.load_glow_tts() + pass + + def load_glow_tts(self): + hps = utils.get_hparams_from_dir(self.glow_model_dir) + checkpoint_path = utils.latest_checkpoint_path(self.glow_model_dir) + symbols = list(hps.data.punc) + list(hps.data.chars) + glow_tts_model = models.FlowGenerator( + len(symbols) + getattr(hps.data, "add_blank", False), + out_channels=hps.data.n_mel_channels, + **hps.model + ) # .to(self.device) + + if self.device == "cuda": + glow_tts_model.to("cuda") + + utils.load_checkpoint(checkpoint_path, glow_tts_model) + glow_tts_model.decoder.store_inverse() + _ = glow_tts_model.eval() + + return hps, glow_tts_model + + def generate_mel(self, text, noise_scale=0.667, length_scale=1.0): + symbols = list(self.hps.data.punc) + list(self.hps.data.chars) + cleaner = self.hps.data.text_cleaners + if getattr(self.hps.data, "add_blank", False): + text_norm = text_to_sequence(text, symbols, cleaner) + text_norm = commons.intersperse(text_norm, len(symbols)) + else: # If not using "add_blank" 
option during training, adding spaces at the beginning and the end of utterance improves quality + text = " " + text.strip() + " " + text_norm = text_to_sequence(text, symbols, cleaner) + + sequence = np.array(text_norm)[None, :] + + del symbols + del cleaner + del text + del text_norm + + if self.device == "cuda": + x_tst = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long() + x_tst_lengths = torch.tensor([x_tst.shape[1]]).cuda() + else: + x_tst = torch.autograd.Variable(torch.from_numpy(sequence)).long() + x_tst_lengths = torch.tensor([x_tst.shape[1]]) + + with torch.no_grad(): + (y_gen_tst, *_), *_, (attn_gen, *_) = self.glow_tts_model( + x_tst, + x_tst_lengths, + gen=True, + noise_scale=noise_scale, + length_scale=length_scale, + ) + del x_tst + del x_tst_lengths + torch.cuda.empty_cache() + return y_gen_tst + #return y_gen_tst.cpu().detach().numpy() + + +class MelToWav: + def __init__(self, hifi_model_dir, device="cuda"): + self.hifi_model_dir = hifi_model_dir + check_directory(self.hifi_model_dir) + self.device = device + self.h, self.hifi_gan_generator = self.load_hifi_gan() + pass + + def load_hifi_gan(self): + checkpoint_path = utils.latest_checkpoint_path(self.hifi_model_dir, regex="g_*") + config_file = os.path.join(self.hifi_model_dir, "config.json") + data = open(config_file).read() + json_config = json.loads(data) + h = AttrDict(json_config) + torch.manual_seed(h.seed) + + generator = Generator(h).to(self.device) + + assert os.path.isfile(checkpoint_path) + print("Loading '{}'".format(checkpoint_path)) + state_dict_g = torch.load(checkpoint_path, map_location=self.device) + print("Complete.") + + generator.load_state_dict(state_dict_g["generator"]) + + generator.eval() + generator.remove_weight_norm() + + return h, generator + + def generate_wav(self, mel): + #mel = torch.FloatTensor(mel).to(self.device) + + y_g_hat = self.hifi_gan_generator(mel.to(self.device)) # passing through vocoder + audio = y_g_hat.squeeze() + audio = audio * 32768.0 + audio = audio.cpu().detach().numpy().astype("int16") + + del y_g_hat + del mel + torch.cuda.empty_cache() + return audio, self.h.sampling_rate + + +if __name__ == "__main__": + + parser = ArgumentParser() + parser.add_argument("-m", "--model", required=True, type=str) + parser.add_argument("-g", "--gan", required=True, type=str) + parser.add_argument("-d", "--device", type=str, default="cpu") + parser.add_argument("-t", "--text", type=str, required=True) + parser.add_argument("-w", "--wav", type=str, required=True) + args = parser.parse_args() + + text_to_mel = TextToMel(glow_model_dir=args.model, device=args.device) + mel_to_wav = MelToWav(hifi_model_dir=args.gan, device=args.device) + + mel = text_to_mel.generate_mel(args.text) + audio, sr = mel_to_wav.generate_wav(mel) + + write(filename=args.wav, rate=sr, data=audio) + + pass diff --git a/vakyansh-tts/utils/data/duration.py b/vakyansh-tts/utils/data/duration.py new file mode 100644 index 0000000000000000000000000000000000000000..c3b5e112b72dd5a07ea2463f604d98bb8d961496 --- /dev/null +++ b/vakyansh-tts/utils/data/duration.py @@ -0,0 +1,33 @@ +# Usage -> python duration.py /src/folder/path + +import soundfile as sf +import sys +import os +from glob import glob +from joblib import Parallel, delayed +from tqdm import tqdm + + +def get_duration(fpath): + w = sf.SoundFile(fpath) + sr = w.samplerate + assert 22050 == sr, "Sample rate is not 22050" + return len(w) / sr + + +def main(folder, ext="wav"): + file_list = glob(folder + "/**/*." 
+ ext, recursive=True) + print(f"\n\tTotal number of wav files {len(file_list)}") + duration_list = Parallel(n_jobs=1)( + delayed(get_duration)(i) for i in tqdm(file_list) + ) + print( + f"\n\tMin Duration {min(duration_list):.2f} Max Duration {max(duration_list):.2f} in secs" + ) + print(f"\n\tTotal Duration {sum(duration_list)/3600:.2f} in hours") + + +if __name__ == "__main__": + folder = sys.argv[1] + folder = os.path.abspath(folder) + main(folder) diff --git a/vakyansh-tts/utils/data/resample.py b/vakyansh-tts/utils/data/resample.py new file mode 100644 index 0000000000000000000000000000000000000000..c77109ef4d5142cd9094f46dd186a17571071ab8 --- /dev/null +++ b/vakyansh-tts/utils/data/resample.py @@ -0,0 +1,59 @@ +import argparse +import librosa +import numpy as np +import os +import scipy +import scipy.io.wavfile +import sys + +from glob import glob +from tqdm import tqdm +from joblib import Parallel, delayed + + +def check_directories(dir_input, dir_output): + if not os.path.exists(dir_input): + sys.exit("Error: Input directory does not exist: {}".format(dir_input)) + if not os.path.exists(dir_output): + sys.exit("Error: Output directory does not exist: {}".format(dir_output)) + abs_a = os.path.abspath(dir_input) + abs_b = os.path.abspath(dir_output) + if abs_a == abs_b: + sys.exit("Error: Paths are the same: {}".format(abs_a)) + + +def resample_file(input_filename, output_filename, sample_rate): + mono = ( + True # librosa converts signal to mono by default, so I'm just surfacing this + ) + audio, existing_rate = librosa.load(input_filename, sr=sample_rate, mono=mono) + audio /= 1.414 # Scale to [-1.0, 1.0] + audio *= 32767 # Scale to int16 + audio = audio.astype(np.int16) + scipy.io.wavfile.write(output_filename, sample_rate, audio) + + +def downsample_wav_files(input_dir, output_dir, output_sample_rate): + check_directories(input_dir, output_dir) + inp_wav_paths = glob(input_dir + "/*.wav") + out_wav_paths = [ + os.path.join(output_dir, os.path.basename(p)) for p in inp_wav_paths + ] + _ = Parallel(n_jobs=-1)( + delayed(resample_file)(i, o, output_sample_rate) + for i, o in tqdm(zip(inp_wav_paths, out_wav_paths)) + ) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--input_dir", "-i", type=str, required=True) + parser.add_argument("--output_dir", "-o", type=str, required=True) + parser.add_argument("--output_sample_rate", "-s", type=int, required=True) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + downsample_wav_files(args.input_dir, args.output_dir, args.output_sample_rate) + print(f"\n\tCompleted") diff --git a/vakyansh-tts/utils/glow/prepare_iitm_data_glow.py b/vakyansh-tts/utils/glow/prepare_iitm_data_glow.py new file mode 100644 index 0000000000000000000000000000000000000000..9e1e5cb8cd85c88892371851917ec721c2c4b08e --- /dev/null +++ b/vakyansh-tts/utils/glow/prepare_iitm_data_glow.py @@ -0,0 +1,134 @@ +import os +from glob import glob +import re +import string +import argparse +import json + +import random +random.seed(42) + +def replace_extra_chars(line): + line = line.replace("(", "").replace( + ")", "" + ) # .replace('\u200d', ' ').replace('\ufeff', ' ').replace('\u200c', ' ').replace('\u200e', ' ') + # line = line.replace('“', ' ').replace('”', ' ').replace(':', ' ') + + return line.strip() + + +def write_txt(content, filename): + with open(filename, "w+", encoding="utf-8") as f: + f.write(content) + + +def save_train_test_valid_split(annotations_txt, num_samples_valid, num_samples_test): + with 
open(annotations_txt, encoding="utf-8") as f: + all_lines = [line.strip() for line in f.readlines()] + test_val_indices = random.sample( + range(len(all_lines)), num_samples_valid + num_samples_test + ) + valid_ix = test_val_indices[:num_samples_valid] + test_ix = test_val_indices[num_samples_valid:] + train = [line for i, line in enumerate(all_lines) if i not in test_val_indices] + valid = [line for i, line in enumerate(all_lines) if i in valid_ix] + test = [line for i, line in enumerate(all_lines) if i in test_ix] + + print(f"Num samples in train: {len(train)}") + print(f"Num samples in valid: {len(valid)}") + print(f"Num samples in test: {len(test)}") + + out_dir_path = "/".join(annotations_txt.split("/")[:-1]) + with open(os.path.join(out_dir_path, "train.txt"), "w+", encoding="utf-8") as f: + for line in train: + print(line, file=f) + with open(os.path.join(out_dir_path, "valid.txt"), "w+", encoding="utf-8") as f: + for line in valid: + print(line, file=f) + with open(os.path.join(out_dir_path, "test.txt"), "w+", encoding="utf-8") as f: + for line in test: + print(line, file=f) + print(f"train, test and valid txts saved in {out_dir_path}") + + +def save_txts_from_txt_done_data( + text_path, + wav_path_for_annotations_txt, + out_path_for_txts, + num_samples_valid, + num_samples_test, +): + outfile = os.path.join(out_path_for_txts, "annotations.txt") + with open(text_path) as file: + file_lines = file.readlines() + + # print(file_lines[0]) + + file_lines = [replace_extra_chars(line) for line in file_lines] + # print(file_lines[0]) + + fnames, ftexts = [], [] + for line in file_lines: + elems = line.split('"') + fnames.append(elems[0].strip()) + ftexts.append(elems[1].strip()) + + all_chars = list(set("".join(ftexts))) + punct_with_space = [i for i in all_chars if i in list(string.punctuation)] + [" "] + chars = [i for i in all_chars if i not in punct_with_space if i.strip()] + chars = "".join(chars) + punct_with_space = "".join(punct_with_space) + + with open('../../config/glow/base_blank.json', 'r') as jfile: + json_config = json.load(jfile) + + json_config["data"]["chars"] = chars + json_config["data"]["punc"] = punct_with_space + json_config["data"]["training_files"]=out_path_for_txts + '/train.txt' + json_config["data"]["validation_files"] = out_path_for_txts + '/valid.txt' + new_config_name = out_path_for_txts.split('/')[-1] + with open(f'../../config/glow/{new_config_name}.json','w+') as jfile: + json.dump(json_config, jfile) + + print(f"Characters: {chars}") + print(f"Punctuation: {punct_with_space}") + print(f"Config file is stored at ../../config/glow/{new_config_name}.json") + + outfile_f = open(outfile, "w+", encoding="utf-8") + for f, t in zip(fnames, ftexts): + print( + os.path.join(wav_path_for_annotations_txt, f) + ".wav", + t, + sep="|", + file=outfile_f, + ) + outfile_f.close() + write_txt(punct_with_space, os.path.join(out_path_for_txts, "punc.txt")) + write_txt(chars, os.path.join(out_path_for_txts, "chars.txt")) + + save_train_test_valid_split( + annotations_txt=outfile, + num_samples_valid=num_samples_valid, + num_samples_test=num_samples_test, + ) + + + + +if __name__ == "__main__": + + + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--text-path", type=str, required=True) + parser.add_argument("-o", "--output-path", type=str, required=True) + parser.add_argument("-w", "--wav-path", type=str, required=True) + parser.add_argument("-v", "--valid-samples", type=int, default = 100) + parser.add_argument("-t", "--test-samples", type=int, default = 10) + 
args = parser.parse_args() + + save_txts_from_txt_done_data( + args.text_path, + args.wav_path, + args.output_path, + args.valid_samples, + args.test_samples, + ) diff --git a/vakyansh-tts/utils/glow/prepare_iitm_data_glow_en.py b/vakyansh-tts/utils/glow/prepare_iitm_data_glow_en.py new file mode 100644 index 0000000000000000000000000000000000000000..827bdc98f2d84090cc445d786ff8fc1e5ff3d829 --- /dev/null +++ b/vakyansh-tts/utils/glow/prepare_iitm_data_glow_en.py @@ -0,0 +1,135 @@ +import os +from glob import glob +import re +import string +import argparse +import json +import random +random.seed(42) + +def replace_extra_chars(line): + line = line.replace("(", "").replace( + ")", "" + ) # .replace('\u200d', ' ').replace('\ufeff', ' ').replace('\u200c', ' ').replace('\u200e', ' ') + # line = line.replace('“', ' ').replace('”', ' ').replace(':', ' ') + + return line.strip() + + +def write_txt(content, filename): + with open(filename, "w+", encoding="utf-8") as f: + f.write(content) + + +def save_train_test_valid_split(annotations_txt, num_samples_valid, num_samples_test): + with open(annotations_txt, encoding="utf-8") as f: + all_lines = [line.strip() for line in f.readlines()] + test_val_indices = random.sample( + range(len(all_lines)), num_samples_valid + num_samples_test + ) + valid_ix = test_val_indices[:num_samples_valid] + test_ix = test_val_indices[num_samples_valid:] + train = [line for i, line in enumerate(all_lines) if i not in test_val_indices] + valid = [line for i, line in enumerate(all_lines) if i in valid_ix] + test = [line for i, line in enumerate(all_lines) if i in test_ix] + + print(f"Num samples in train: {len(train)}") + print(f"Num samples in valid: {len(valid)}") + print(f"Num samples in test: {len(test)}") + + out_dir_path = "/".join(annotations_txt.split("/")[:-1]) + with open(os.path.join(out_dir_path, "train.txt"), "w+", encoding="utf-8") as f: + for line in train: + print(line, file=f) + with open(os.path.join(out_dir_path, "valid.txt"), "w+", encoding="utf-8") as f: + for line in valid: + print(line, file=f) + with open(os.path.join(out_dir_path, "test.txt"), "w+", encoding="utf-8") as f: + for line in test: + print(line, file=f) + print(f"train, test and valid txts saved in {out_dir_path}") + + +def save_txts_from_txt_done_data( + text_path, + wav_path_for_annotations_txt, + out_path_for_txts, + num_samples_valid, + num_samples_test, +): + outfile = os.path.join(out_path_for_txts, "annotations.txt") + with open(text_path) as file: + file_lines = file.readlines() + + # print(file_lines[0]) + + file_lines = [replace_extra_chars(line) for line in file_lines] + # print(file_lines[0]) + + fnames, ftexts = [], [] + for line in file_lines: + elems = line.split('"') + fnames.append(elems[0].strip()) + ftexts.append(elems[1].strip().lower().replace('‘','\'').replace('’','\'')) + + all_chars = list(set("".join(ftexts))) + punct_with_space = [i for i in all_chars if i in list(string.punctuation)] + [" "] + chars = [i for i in all_chars if i not in punct_with_space if i.strip()] + chars = "".join(chars) + punct_with_space = "".join(punct_with_space)#.replace("'",r"\'") + + with open('../../config/glow/base_blank.json', 'r') as jfile: + json_config = json.load(jfile) + + json_config["data"]["chars"] = chars + json_config["data"]["punc"] = punct_with_space + json_config["data"]["training_files"]=out_path_for_txts + '/train.txt' + json_config["data"]["validation_files"] = out_path_for_txts + '/valid.txt' + new_config_name = out_path_for_txts.split('/')[-1] + with 
open(f'../../config/glow/{new_config_name}.json','w+') as jfile: + json.dump(json_config, jfile) + + print(f"Characters: {chars}") + print(f"Len of vocab: {len(chars)}") + print(f"Punctuation: {punct_with_space}") + print(f"Config file is stored at ../../config/glow/{new_config_name}.json") + + outfile_f = open(outfile, "w+", encoding="utf-8") + for f, t in zip(fnames, ftexts): + print( + os.path.join(wav_path_for_annotations_txt, f) + ".wav", + t, + sep="|", + file=outfile_f, + ) + outfile_f.close() + write_txt(punct_with_space, os.path.join(out_path_for_txts, "punc.txt")) + write_txt(chars, os.path.join(out_path_for_txts, "chars.txt")) + + save_train_test_valid_split( + annotations_txt=outfile, + num_samples_valid=num_samples_valid, + num_samples_test=num_samples_test, + ) + + + + +if __name__ == "__main__": + + + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--text-path", type=str, required=True) + parser.add_argument("-o", "--output-path", type=str, required=True) + parser.add_argument("-w", "--wav-path", type=str, required=True) + parser.add_argument("-v", "--valid-samples", type=int, default = 100) + parser.add_argument("-t", "--test-samples", type=int, default = 10) + args = parser.parse_args() + + save_txts_from_txt_done_data( + args.text_path, + args.wav_path, + args.output_path, + args.valid_samples, + args.test_samples, + ) diff --git a/vakyansh-tts/utils/hifi/prepare_iitm_data_hifi.py b/vakyansh-tts/utils/hifi/prepare_iitm_data_hifi.py new file mode 100644 index 0000000000000000000000000000000000000000..1e1de2e28735143aeef8ddb10bc5a4672c02564b --- /dev/null +++ b/vakyansh-tts/utils/hifi/prepare_iitm_data_hifi.py @@ -0,0 +1,64 @@ + +import glob +import random +import sys +import os +import argparse + + + + +def process_data(args): + + path = args.input_path + valid_files = args.valid_files + test_files = args.test_files + dest_path = args.dest_path + + list_paths = path.split(',') + + valid_set = [] + training_set = [] + test_set = [] + + for local_path in list_paths: + files = glob.glob(local_path+'/*.wav') + print(f"Total files: {len(files)}") + + valid_set_local = random.sample(files, valid_files) + + test_set_local = random.sample(valid_set_local, test_files) + valid_set.extend(list(set(valid_set_local) - set(test_set_local))) + test_set.extend(test_set_local) + + print(len(valid_set_local)) + + training_set_local = set(files) - set(valid_set_local) + print(len(training_set_local)) + training_set.extend(training_set_local) + + + valid_set = random.sample(valid_set, len(valid_set)) + test_set = random.sample(test_set, len(test_set)) + training_set = random.sample(training_set, len(training_set)) + + with open(os.path.join(dest_path , 'valid.txt'), mode = 'w+') as file: + file.write("\n".join(list(valid_set))) + + with open(os.path.join(dest_path , 'train.txt'), mode = 'w+') as file: + file.write("\n".join(list(training_set))) + + with open(os.path.join(dest_path , 'test.txt'), mode = 'w+') as file: + file.write("\n".join(list(test_set))) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-i','--input-path',type=str,help='path to input wav files') + parser.add_argument('-v','--valid-files',type=int,help='number of valid files') + parser.add_argument('-t','--test-files',type=int,help='number of test files') + parser.add_argument('-d','--dest-path',type=str,help='destination path to output filelists') + + args = parser.parse_args() + + process_data(args) \ No newline at end of file diff --git 
a/vakyansh-tts/utils/inference/advanced_tts.py b/vakyansh-tts/utils/inference/advanced_tts.py new file mode 100644 index 0000000000000000000000000000000000000000..ccf42704b83aee57487359f447a0966c05de704e --- /dev/null +++ b/vakyansh-tts/utils/inference/advanced_tts.py @@ -0,0 +1,135 @@ + +from tts import TextToMel, MelToWav +from transliterate import XlitEngine +from num_to_word_on_sent import normalize_nums + +import re +import numpy as np +from scipy.io.wavfile import write + +from mosestokenizer import * +from indicnlp.tokenize import sentence_tokenize +import argparse + +_INDIC = ["as", "bn", "gu", "hi", "kn", "ml", "mr", "or", "pa", "ta", "te"] +_PURAM_VIRAM_LANGUAGES = ["hi", "or", "bn", "as"] +_TRANSLITERATION_NOT_AVAILABLE_IN = ["en","or"] +#_NUM2WORDS_NOT_AVAILABLE_IN = [] + +def normalize_text(text, lang): + if lang in _PURAM_VIRAM_LANGUAGES: + text = text.replace('|', '।') + text = text.replace('.', '।') + return text + +def split_sentences(paragraph, language): + if language == "en": + with MosesSentenceSplitter(language) as splitter: + return splitter([paragraph]) + elif language in _INDIC: + return sentence_tokenize.sentence_split(paragraph, lang=language) + + + +def load_models(acoustic, vocoder, device): + text_to_mel = TextToMel(glow_model_dir=acoustic, device=device) + mel_to_wav = MelToWav(hifi_model_dir=vocoder, device=device) + return text_to_mel, mel_to_wav + + +def translit(text, lang): + reg = re.compile(r'[a-zA-Z]') + words = [engine.translit_word(word, topk=1)[lang][0] if reg.match(word) else word for word in text.split()] + updated_sent = ' '.join(words) + return updated_sent + + + +def run_tts(text, lang, args): + if lang == 'hi': + text = text.replace('।', '.') # only for hindi models + + if lang == 'en' and text[-1] != '.': + text = text + '. 
' + + if args.number_conversion == 1 and lang!='en': + print("Doing number conversion") + text_num_to_word = normalize_nums(text, lang) # converting numbers to words in lang + else: + text_num_to_word = text + + + if args.transliteration == 1 and lang not in _TRANSLITERATION_NOT_AVAILABLE_IN: + print("Doing transliteration") + text_num_to_word_and_transliterated = translit(text_num_to_word, lang) # transliterating english words to lang + else: + text_num_to_word_and_transliterated = text_num_to_word + + final_text = ' ' + text_num_to_word_and_transliterated + + mel = text_to_mel.generate_mel(final_text, args.noise_scale, args.length_scale) + audio, sr = mel_to_wav.generate_wav(mel) + return sr, audio + +def run_tts_paragraph(args): + audio_list = [] + if args.split_sentences == 1: + text = normalize_text(args.text, args.lang) + split_sentences_list = split_sentences(text, args.lang) + + for sent in split_sentences_list: + sr, audio = run_tts(sent, args.lang, args) + audio_list.append(audio) + + concatenated_audio = np.concatenate([i for i in audio_list]) + if args.wav: + write(filename=args.wav, rate=sr, data=concatenated_audio) + return (sr, concatenated_audio) + else: + sr, audio = run_tts(args.text, args.lang, args) + if args.wav: + write(filename=args.wav, rate=sr, data=audio) + return (sr, audio) + + +def load_all_models(args): + global engine + if args.lang not in _TRANSLITERATION_NOT_AVAILABLE_IN: + engine = XlitEngine(args.lang) # loading translit model globally + + global text_to_mel + global mel_to_wav + + text_to_mel, mel_to_wav = load_models(args.acoustic, args.vocoder, args.device) + + try: + args.noise_scale = float(args.noise_scale) + args.length_scale = float(args.length_scale) + except: + pass + + print(args) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-a", "--acoustic", required=True, type=str) + parser.add_argument("-v", "--vocoder", required=True, type=str) + parser.add_argument("-d", "--device", type=str, default="cpu") + parser.add_argument("-t", "--text", type=str, required=True) + parser.add_argument("-w", "--wav", type=str, required=True) + parser.add_argument("-n", "--noise-scale", default='0.667', type=str ) + parser.add_argument("-l", "--length-scale", default='1.0', type=str) + + parser.add_argument("-T", "--transliteration", default=1, type=int) + parser.add_argument("-N", "--number-conversion", default=1, type=int) + parser.add_argument("-S", "--split-sentences", default=1, type=int) + parser.add_argument("-L", "--lang", type=str, required=True) + + args = parser.parse_args() + + load_all_models(args) + run_tts_paragraph(args) + + diff --git a/vakyansh-tts/utils/inference/api.py b/vakyansh-tts/utils/inference/api.py new file mode 100644 index 0000000000000000000000000000000000000000..d6bcabd194a4531801941d5e1d248dc134ce255f --- /dev/null +++ b/vakyansh-tts/utils/inference/api.py @@ -0,0 +1,66 @@ +from starlette.responses import StreamingResponse +from tts import MelToWav, TextToMel +from advanced_tts import load_all_models, run_tts_paragraph +from typing import Optional +from pydantic import BaseModel +from fastapi import FastAPI, HTTPException +import uvicorn +import base64 +import argparse +import json +import time +from argparse import Namespace + +app = FastAPI() + + +class TextJson(BaseModel): + text: str + lang: Optional[str] = "hi" + noise_scale: Optional[float]=0.667 + length_scale: Optional[float]=1.0 + transliteration: Optional[int]=1 + number_conversion: Optional[int]=1 + split_sentences: 
Optional[int]=1 + + + + +@app.post("/TTS/") +async def tts(input: TextJson): + text = input.text + lang = input.lang + + args = Namespace(**input.dict()) + + args.wav = '../../results/api/'+str(int(time.time())) + '.wav' + + if text: + sr, audio = run_tts_paragraph(args) + else: + raise HTTPException(status_code=400, detail={"error": "No text"}) + + ## to return outpur as a file + audio = open(args.wav, mode='rb') + return StreamingResponse(audio, media_type="audio/wav") + + # with open(args.wav, "rb") as audio_file: + # encoded_bytes = base64.b64encode(audio_file.read()) + # encoded_string = encoded_bytes.decode() + # return {"encoding": "base64", "data": encoded_string, "sr": sr} + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-a", "--acoustic", required=True, type=str) + parser.add_argument("-v", "--vocoder", required=True, type=str) + parser.add_argument("-d", "--device", type=str, default="cpu") + parser.add_argument("-L", "--lang", type=str, required=True) + + args = parser.parse_args() + + load_all_models(args) + + uvicorn.run( + "api:app", host="0.0.0.0", port=6006, log_level="debug" + ) diff --git a/vakyansh-tts/utils/inference/num_to_word_on_sent.py b/vakyansh-tts/utils/inference/num_to_word_on_sent.py new file mode 100644 index 0000000000000000000000000000000000000000..ce878a8c3ee6f5ef629abeaee418d5959f7179ed --- /dev/null +++ b/vakyansh-tts/utils/inference/num_to_word_on_sent.py @@ -0,0 +1,1314 @@ +import re +import string + +# ----------------------------- indic_num.py ----------------------------- +supported_lang = {"en", "hi", "gu", "mr", "bn", "te", "ta", "kn", "or", "pa"} +# supported_lang = {'eng', 'hin', 'guj', 'mar', 'ben', 'tel', 'tam', 'kan', 'ori', 'pan'} # Three alphabet lang code + +all_num = { + "en": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"], + "hi": ["०", "१", "२", "३", "४", "५", "६", "७", "८", "९"], + "gu": ["૦", "૧", "૨", "૩", "૪", "૫", "૬", "૭", "૮", "૯"], + "mr": ["०", "१", "२", "३", "४", "५", "६", "७", "८", "९"], + "bn": ["০", "১", "২", "৩", "৪", "৫", "৬", "৭", "৮", "৯"], + "te": ["౦", "౧", "౨", "౩", "౪", "౫", "౬", "౭", "౮", "౯"], + "ta": ["0", "௧", "௨", "௩", "௪", "௫", "௬", "௭", "௮", "௯", "௰"], + "kn": ["೦", "೧", "೨", "೩", "೪", "೫", "೬", "೭", "೮", "೯"], + "or": ["୦", "୧", "୨", "୩", "୪", "୫", "୬", "୭", "୮", "୯"], + "pa": ["੦", "੧", "੨", "੩", "੪", "੫", "੬", "੭", "੮", "੯"], +} + +num_dict = dict() +num_dict["en"] = { + "0": "zero", + "1": "one", + "2": "two", + "3": "three", + "4": "four", + "5": "five", + "6": "six", + "7": "seven", + "8": "eight", + "9": "nine", + "10": "ten", + "11": "eleven", + "12": "twelve", + "13": "thirteen", + "14": "fourteen", + "15": "fifteen", + "16": "sixteen", + "17": "seventeen", + "18": "eighteen", + "19": "nineteen", + "20": "twenty", + "21": "twenty-one", + "22": "twenty-two", + "23": "twenty-three", + "24": "twenty-four", + "25": "twenty-five", + "26": "twenty-six", + "27": "twenty-seven", + "28": "twenty-eight", + "29": "twenty-nine", + "30": "thirty", + "31": "thirty-one", + "32": "thirty-two", + "33": "thirty-three", + "34": "thirty-four", + "35": "thirty-five", + "36": "thirty-six", + "37": "thirty-seven", + "38": "thirty-eight", + "39": "thirty-nine", + "40": "forty", + "41": "forty-one", + "42": "forty-two", + "43": "forty-three", + "44": "forty-four", + "45": "forty-five", + "46": "forty-six", + "47": "forty-seven", + "48": "forty-eight", + "49": "forty-nine", + "50": "fifty", + "51": "fifty-one", + "52": "fifty-two", + "53": "fifty-three", + "54": "fifty-four", + 
"55": "fifty-five", + "56": "fifty-six", + "57": "fifty-seven", + "58": "fifty-eight", + "59": "fifty-nine", + "60": "sixty", + "61": "sixty-one", + "62": "sixty-two", + "63": "sixty-three", + "64": "sixty-four", + "65": "sixty-five", + "66": "sixty-six", + "67": "sixty-seven", + "68": "sixty-eight", + "69": "sixty-nine", + "70": "seventy", + "71": "seventy-one", + "72": "seventy-two", + "73": "seventy-three", + "74": "seventy-four", + "75": "seventy-five", + "76": "seventy-six", + "77": "seventy-seven", + "78": "seventy-eight", + "79": "seventy-nine", + "80": "eighty", + "81": "eighty-one", + "82": "eighty-two", + "83": "eighty-three", + "84": "eighty-four", + "85": "eighty-five", + "86": "eighty-six", + "87": "eighty-seven", + "88": "eighty-eight", + "89": "eighty-nine", + "90": "ninety", + "91": "ninety-one", + "92": "ninety-two", + "93": "ninety-three", + "94": "ninety-four", + "95": "ninety-five", + "96": "ninety-six", + "97": "ninety-seven", + "98": "ninety-eight", + "99": "ninety-nine", + "100": "hundred", + "1000": "thousand", + "100000": "lac", + "10000000": "crore", + "1000000000": "arab", +} # English-India +num_dict["hi"] = { + "0": "शून्य", + "1": "एक", + "2": "दो", + "3": "तीन", + "4": "चार", + "5": "पाँच", + "6": "छः", + "7": "सात", + "8": "आठ", + "9": "नौ", + "10": "दस", + "11": "ग्यारह", + "12": "बारह", + "13": "तेरह", + "14": "चौदह", + "15": "पंद्रह", + "16": "सोलह", + "17": "सत्रह", + "18": "अट्ठारह", + "19": "उन्नीस", + "20": "बीस", + "21": "इक्कीस", + "22": "बाईस", + "23": "तेईस", + "24": "चौबिस", + "25": "पच्चीस", + "26": "छब्बीस", + "27": "सत्ताईस", + "28": "अट्ठाईस", + "29": "उनतीस", + "30": "तीस", + "31": "इकतीस", + "32": "बत्तीस", + "33": "तैंतीस", + "34": "चौंतीस", + "35": "पैंतीस", + "36": "छत्तीस", + "37": "सैंतीस", + "38": "अड़तीस", + "39": "उनतालीस", + "40": "चालीस", + "41": "इकतालीस", + "42": "बयालीस", + "43": "तैंतालीस", + "44": "चौंतालीस", + "45": "पैंतालीस", + "46": "छियालीस", + "47": "सैंतालीस", + "48": "अड़तालीस", + "49": "उनचास", + "50": "पचास", + "51": "इक्यावन​", + "52": "बावन", + "53": "तिरेपन", + "54": "चौवन", + "55": "पचपन", + "56": "छप्पन", + "57": "सत्तावन", + "58": "अट्ठावन", + "59": "उनसठ", + "60": "साठ", + "61": "इकसठ", + "62": "बासठ", + "63": "तिरेसठ", + "64": "चौंसठ", + "65": "पैंसठ", + "66": "छयासठ", + "67": "सरसठ​", + "68": "अड़सठ", + "69": "उनहत्तर", + "70": "सत्तर", + "71": "इकहत्तर", + "72": "बहत्तर", + "73": "तिहत्तर", + "74": "चौहत्तर", + "75": "पचहत्तर", + "76": "छिहत्तर", + "77": "सतहत्तर", + "78": "अठहत्तर", + "79": "उन्यासी", + "80": "अस्सी", + "81": "इक्यासी", + "82": "बयासी", + "83": "तिरासी", + "84": "चौरासी", + "85": "पचासी", + "86": "छियासी", + "87": "सत्तासी", + "88": "अठासी", + "89": "नवासी", + "90": "नब्बे", + "91": "इक्यानवे", + "92": "बानवे", + "93": "तिरानवे", + "94": "चौरानवे", + "95": "पचानवे", + "96": "छियानवे", + "97": "सत्तानवे", + "98": "अट्ठानवे", + "99": "निन्यानवे", + "100": "सौ", + "1000": "हज़ार", + "100000": "लाख", + "10000000": "करोड़", + "1000000000": "अरब", +} # Hindi +num_dict["gu"] = { + "0": "શૂન્ય", + "1": "એક", + "2": "બે", + "3": "ત્રણ", + "4": "ચાર", + "5": "પાંચ", + "6": "છ", + "7": "સાત", + "8": "આઠ", + "9": "નવ", + "10": "દસ", + "11": "અગિયાર", + "12": "બાર", + "13": "તેર", + "14": "ચૌદ", + "15": "પંદર", + "16": "સોળ", + "17": "સત્તર", + "18": "અઢાર", + "19": "ઓગણિસ", + "20": "વીસ", + "21": "એકવીસ", + "22": "બાવીસ", + "23": "તેવીસ", + "24": "ચોવીસ", + "25": "પચ્ચીસ", + "26": "છવીસ", + "27": "સત્તાવીસ", + "28": "અઠ્ઠાવીસ", + "29": "ઓગણત્રીસ", + "30": "ત્રીસ", + "31": "એકત્રીસ", + "32": "બત્રીસ", 
+ "33": "તેત્રીસ", + "34": "ચોત્રીસ", + "35": "પાંત્રીસ", + "36": "છત્રીસ", + "37": "સડત્રીસ", + "38": "અડત્રીસ", + "39": "ઓગણચાલીસ", + "40": "ચાલીસ", + "41": "એકતાલીસ", + "42": "બેતાલીસ", + "43": "ત્રેતાલીસ", + "44": "ચુંમાલીસ", + "45": "પિસ્તાલીસ", + "46": "છેતાલીસ", + "47": "સુડતાલીસ", + "48": "અડતાલીસ", + "49": "ઓગણપચાસ", + "50": "પચાસ", + "51": "એકાવન", + "52": "બાવન", + "53": "ત્રેપન", + "54": "ચોપન", + "55": "પંચાવન", + "56": "છપ્પન", + "57": "સત્તાવન", + "58": "અઠ્ઠાવન", + "59": "ઓગણસાઠ", + "60": "સાઈઠ", + "61": "એકસઠ", + "62": "બાસઠ", + "63": "ત્રેસઠ", + "64": "ચોસઠ", + "65": "પાંસઠ", + "66": "છાસઠ", + "67": "સડસઠ", + "68": "અડસઠ", + "69": "અગણોસિત્તેર", + "70": "સિત્તેર", + "71": "એકોતેર", + "72": "બોતેર", + "73": "તોતેર", + "74": "ચુમોતેર", + "75": "પંચોતેર", + "76": "છોતેર", + "77": "સિત્યોતેર", + "78": "ઇઠ્યોતેર", + "79": "ઓગણાએંસી", + "80": "એંસી", + "81": "એક્યાસી", + "82": "બ્યાસી", + "83": "ત્યાસી", + "84": "ચોર્યાસી", + "85": "પંચાસી", + "86": "છ્યાસી", + "87": "સિત્યાસી", + "88": "ઈઠ્યાસી", + "89": "નેવ્યાસી", + "90": "નેવું", + "91": "એકાણું", + "92": "બાણું", + "93": "ત્રાણું", + "94": "ચોરાણું", + "95": "પંચાણું", + "96": "છન્નું", + "97": "સત્તાણું", + "98": "અઠ્ઠાણું", + "99": "નવ્વાણું", + "100": "સો", + "1000": "હજાર", + "100000": "લાખ", + "1000000": "દસ લાખ", + "10000000": "કરોડ઼", +} # Gujarati +num_dict["mr"] = { + "0": "शून्य", + "1": "एक", + "2": "दोन", + "3": "तीन", + "4": "चार", + "5": "पाच", + "6": "सहा", + "7": "सात", + "8": "आठ", + "9": "नऊ", + "10": "दहा", + "11": "अकरा", + "12": "बारा", + "13": "तेरा", + "14": "चौदा", + "15": "पंधरा", + "16": "सोळा", + "17": "सतरा", + "18": "अठरा", + "19": "एकोणीस", + "20": "वीस", + "21": "एकवीस", + "22": "बावीस", + "23": "तेवीस", + "24": "चोवीस", + "25": "पंचवीस", + "26": "सव्वीस", + "27": "सत्तावीस", + "28": "अठ्ठावीस", + "29": "एकोणतीस", + "30": "तीस", + "31": "एकतीस", + "32": "बत्तीस", + "33": "तेहेतीस", + "34": "चौतीस", + "35": "पस्तीस", + "36": "छत्तीस", + "37": "सदतीस", + "38": "अडतीस", + "39": "एकोणचाळीस", + "40": "चाळीस", + "41": "एक्केचाळीस", + "42": "बेचाळीस", + "43": "त्रेचाळीस", + "44": "चव्वेचाळीस", + "45": "पंचेचाळीस", + "46": "सेहेचाळीस", + "47": "सत्तेचाळीस", + "48": "अठ्ठेचाळीस", + "49": "एकोणपन्नास", + "50": "पन्नास", + "51": "एक्कावन्न", + "52": "बावन्न", + "53": "त्रेपन्न", + "54": "चोपन्न", + "55": "पंचावन्न", + "56": "छप्पन्न", + "57": "सत्तावन्न", + "58": "अठ्ठावन्न", + "59": "एकोणसाठ", + "60": "साठ", + "61": "एकसष्ठ", + "62": "बासष्ठ", + "63": "त्रेसष्ठ", + "64": "चौसष्ठ", + "65": "पासष्ठ", + "66": "सहासष्ठ", + "67": "सदुसष्ठ", + "68": "अडुसष्ठ", + "69": "एकोणसत्तर", + "70": "सत्तर", + "71": "एक्काहत्तर", + "72": "बाहत्तर", + "73": "त्र्याहत्तर", + "74": "चौर्‍याहत्तर", + "75": "पंच्याहत्तर", + "76": "शहात्तर", + "77": "सत्याहत्तर", + "78": "अठ्ठ्याहत्तर", + "79": "एकोण ऐंशी", + "80": "ऐंशी", + "81": "एक्क्याऐंशी", + "82": "ब्याऐंशी", + "83": "त्र्याऐंशी", + "84": "चौऱ्याऐंशी", + "85": "पंच्याऐंशी", + "86": "शहाऐंशी", + "87": "सत्त्याऐंशी", + "88": "अठ्ठ्याऐंशी", + "89": "एकोणनव्वद", + "90": "नव्वद", + "91": "एक्क्याण्णव", + "92": "ब्याण्णव", + "93": "त्र्याण्णव", + "94": "चौऱ्याण्णव", + "95": "पंच्याण्णव", + "96": "शहाण्णव", + "97": "सत्त्याण्णव", + "98": "अठ्ठ्याण्णव", + "99": "नव्व्याण्णव", + "100": "शे", + "1000": "हजार", + "100000": "लाख", + "10000000": "कोटी", + "1000000000": "अब्ज", +} # Marathi +num_dict["bn"] = { + "0": "শূন্য", + "1": "এক", + "2": "দুই", + "3": "তিন", + "4": "চার", + "5": "পাঁচ", + "6": "ছয়", + "7": "সাত", + "8": "আট", + "9": "নয়", + "10": "দশ", + "11": "এগার", + 
"12": "বার", + "13": "তের", + "14": "চৌদ্দ", + "15": "পনের", + "16": "ষোল", + "17": "সতের", + "18": "আঠার", + "19": "ঊনিশ", + "20": "বিশ", + "21": "একুশ", + "22": "বাইশ", + "23": "তেইশ", + "24": "চব্বিশ", + "25": "পঁচিশ", + "26": "ছাব্বিশ", + "27": "সাতাশ", + "28": "আঠাশ", + "29": "ঊনত্রিশ", + "30": "ত্রিশ", + "31": "একত্রিশ", + "32": "বত্রিশ", + "33": "তেত্রিশ", + "34": "চৌত্রিশ", + "35": "পঁয়ত্রিশ", + "36": "ছত্রিশ", + "37": "সাঁইত্রিশ", + "38": "আটত্রিশ", + "39": "ঊনচল্লিশ", + "40": "চল্লিশ", + "41": "একচল্লিশ", + "42": "বিয়াল্লিশ", + "43": "তেতাল্লিশ", + "44": "চুয়াল্লিশ", + "45": "পঁয়তাল্লিশ", + "46": "ছেচল্লিশ", + "47": "সাতচল্লিশ", + "48": "আটচল্লিশ", + "49": "ঊনপঞ্চাশ", + "50": "পঞ্চাশ", + "51": "একান্ন", + "52": "বায়ান্ন", + "53": "তিপ্পান্ন", + "54": "চুয়ান্ন", + "55": "পঞ্চান্ন", + "56": "ছাপ্পান্ন", + "57": "সাতান্ন", + "58": "আটান্ন", + "59": "ঊনষাট", + "60": "ষাট", + "61": "একষট্টি", + "62": "বাষট্টি", + "63": "তেষট্টি", + "64": "চৌষট্টি", + "65": "পঁয়ষট্টি", + "66": "ছেষট্টি", + "67": "সাতষট্টি", + "68": "আটষট্টি", + "69": "ঊনসত্তর", + "70": "সত্তর", + "71": "একাত্তর", + "72": "বাহাত্তর", + "73": "তিয়াত্তর", + "74": "চুয়াত্তর", + "75": "পঁচাত্তর", + "76": "ছিয়াত্তর", + "77": "সাতাত্তর", + "78": "আটাত্তর", + "79": "ঊনআশি", + "80": "আশি", + "81": "একাশি", + "82": "বিরাশি", + "83": "তিরাশি", + "84": "চুরাশি", + "85": "পঁচাশি", + "86": "ছিয়াশি", + "87": "সাতাশি", + "88": "আটাশি", + "89": "ঊননব্বই", + "90": "নব্বই", + "91": "একানব্বই", + "92": "বিরানব্বই", + "93": "তিরানব্বই", + "94": "চুরানব্বই", + "95": "পঁচানব্বই", + "96": "ছিয়ানব্বই", + "97": "সাতানব্বই", + "98": "আটানব্বই", + "99": "নিরানব্বই", + "100": "শো", + "1000": "হাজার", + "100000": "লাখ", + "10000000": "কোটি", + "1000000000": "একশ’ কোটি", +} # Bengali +num_dict["te"] = { + "0": "సున్నా", + "1": "ఒకటి", + "2": "రెండు", + "3": "మూడు", + "4": "నాలుగు", + "5": "ఐదు", + "6": "ఆరు", + "7": "ఏడు", + "8": "ఎనిమిది", + "9": "తొమ్మిది", + "10": "పది", + "11": "పదకొండు", + "12": "పన్నెండు", + "13": "పదమూడు", + "14": "పద్నాలుగు", + "15": "పదిహేను", + "16": "పదహారు", + "17": "పదిహేడు", + "18": "పద్దెనిమిది", + "19": "పందొమ్మిది", + "20": "ఇరవై", + "21": "ఇరవై ఒకటి", + "22": "ఇరవై రెండు", + "23": "ఇరవై మూడు", + "24": "ఇరవై నాలుగు", + "25": "ఇరవై ఐదు", + "26": "ఇరవై ఆరు", + "27": "ఇరవై ఏడు", + "28": "ఇరవై ఎనిమిది", + "29": "ఇరవై తొమ్మిది", + "30": "ముప్పై", + "31": "ముప్పై ఒకటి", + "32": "ముప్పై రెండు", + "33": "ముప్పై మూడు", + "34": "ముప్పై నాలుగు", + "35": "ముప్పై ఐదు", + "36": "ముప్పై ఆరు", + "37": "ముప్పై ఏడు", + "38": "ముప్పై ఎనిమిది", + "39": "ముప్పై తొమ్మిది", + "40": "నలభై", + "41": "నలభై ఒకటి", + "42": "నలభై రెండు", + "43": "నలభై మూడు", + "44": "నలభై నాలుగు", + "45": "నలభై ఐదు", + "46": "నలభై ఆరు", + "47": "నలభై ఏడు", + "48": "నలభై ఎనిమిది", + "49": "నలభై తొమ్మిది", + "50": "యాభై", + "51": "యాభై ఒకటి", + "52": "యాభై రెండు", + "53": "యాభై మూడు", + "54": "యాభై నాలుగు", + "55": "యాభై ఐదు", + "56": "యాభై ఆరు", + "57": "యాభై ఏడు", + "58": "యాభై ఎనిమిది", + "59": "యాభై తొమ్మిది", + "60": "అరవై", + "61": "అరవై ఒకటి", + "62": "అరవై రెండు", + "63": "అరవై మూడు", + "64": "అరవై నాలుగు", + "65": "అరవై ఐదు", + "66": "అరవై ఆరు", + "67": "అరవై ఏడు", + "68": "అరవై ఎనిమిది", + "69": "అరవై తొమ్మిది", + "70": "డెబ్బై", + "71": "డెబ్బై ఒకటి", + "72": "డెబ్బై రెండు", + "73": "డెబ్బై మూడు", + "74": "డెబ్బై నాలుగు", + "75": "డెబ్బై ఐదు", + "76": "డెబ్బై ఆరు", + "77": "డెబ్బై ఏడు", + "78": "డెబ్బై ఎనిమిది", + "79": "డెబ్బై తొమ్మిది", + "80": "ఎనభై", + "81": "ఎనభై ఒకటి", + "82": "ఎనభై రెండు", + "83": "ఎనభై మూడు", + "84": "ఎనభై నాలుగు", 
+ "85": "ఎనభై ఐదు", + "86": "ఎనభై ఆరు", + "87": "ఎనభై ఏడు", + "88": "ఎనభై ఎనిమిది", + "89": "ఎనభై తొమ్మిది", + "90": "తొంభై", + "91": "తొంభై ఒకటి", + "92": "తొంభై రెండు", + "93": "తొంభై మూడు", + "94": "తొంభై నాలుగు", + "95": "తొంభై ఐదు", + "96": "తొంభై ఆరు", + "97": "తొంభై ఏడు", + "98": "తొంభై ఎనిమిది", + "99": "తొంభై తొమ్మిది", + "100": "వందల", + "1000": "వేల", + "100000": "లక్షల", + "10000000": "కోట్ల", + "1000000000": "బిలియన్", +} # Telugu +num_dict["ta"] = { + "0": "பூஜ்ஜியம்", + "1": "ஒன்று", + "2": "இரண்டு", + "3": "மூன்று", + "4": "நான்கு", + "5": "ஐந்து", + "6": "ஆறு", + "7": "ஏழு", + "8": "எட்டு", + "9": "ஒன்பது", + "10": "பத்து", + "11": "பதினொன்று", + "12": "பன்னிரண்டு", + "13": "பதிமூன்று", + "14": "பதினான்கு", + "15": "பதினைந்து", + "16": "பதினாறு", + "17": "பதினேழு", + "18": "பதினெட்டு", + "19": "பத்தொன்பது", + "20": "இருபது", + "21": "இருபது ஒன்று", + "22": "இருபத்து இரண்டு", + "23": "இருபத்து மூன்று", + "24": "இருபத்து நான்கு", + "25": "இருபத்து ஐந்து", + "26": "இருபத்து ஆறு", + "27": "இருபத்து ஏழு", + "28": "இருபத்து எட்டு", + "29": "இருபத்து ஒன்பது", + "30": "முப்பது", + "31": "முப்பத்து ஒன்று", + "32": "முப்பத்து இரண்டு", + "33": "முப்பத்து மூன்று", + "34": "முப்பத்து நான்கு", + "35": "முப்பத்து ஐந்து", + "36": "முப்பத்து ஆறு", + "37": "முப்பத்து ஏழு", + "38": "முப்பத்து எட்டு", + "39": "முப்பத்து ஒன்பது", + "40": "நாற்பது", + "41": "நாற்பத்து ஒன்று", + "42": "நாற்பத்து இரண்டு", + "43": "நாற்பத்து மூன்று", + "44": "நாற்பத்து நான்கு", + "45": "நாற்பத்து ஐந்து", + "46": "நாற்பத்து ஆறு", + "47": " நாற்பத்து ஏழு", + "48": "நாற்பத்து எட்டு", + "49": "நாற்பத்து ஒன்பது", + "50": "ஐம்பது", + "51": "ஐம்பத்து ஒன்று", + "52": "ஐம்பத்து இரண்டு", + "53": "ஐம்பத்து மூன்று", + "54": "ஐம்பத்து நான்கு", + "55": "ஐம்பத்து ஐந்து", + "56": "ஐம்பத்து ஆறு", + "57": "ஐம்பத்து ஏழு", + "58": "ஐம்பத்து எட்டு", + "59": "ஐம்பத்து ஒன்பது", + "60": "அறுபது", + "61": "அறுபத்து ஒன்று", + "62": "அறுபத்து இரண்டு", + "63": "அறுபத்து மூன்று", + "64": "அறுபத்து நான்கு", + "65": "அறுபத்து ஐந்து", + "66": "அறுபத்து ஆறு", + "67": "அறுபத்து ஏழு", + "68": "அறுபத்து எட்டு", + "69": "அறுபத்து ஒன்பது", + "70": "எழுபது", + "71": "எழுபத்தி ஒன்று", + "72": "எழுபத்தி இரண்டு", + "73": "எழுபத்தி முச்சக்கர", + "74": "எழுபத்தி நான்கு", + "75": "எழுபத்தி ஐந்து", + "76": "எழுபத்தி ஆறு", + "77": "எழுபத்தி ஏழு", + "78": "எழுபத்தி எட்டு", + "79": "எழுபத்தி ஒன்பது", + "80": "எண்பது", + "81": "எண்பத்தியொன்று", + "82": "எண்பத்திரண்டு", + "83": "எண்பத்திமூன்று", + "84": "என்பதினான்கு", + "85": "என்பதினைந்து", + "86": "எண்பத்திஆறு", + "87": "எண்பத்திஏழு", + "88": "எண்பத்தியெட்டு", + "89": "எண்பத்தியொன்பது", + "90": "தொன்னூறு", + "91": "தொண்ணூற்றியொன்று", + "92": "தொண்ணூற்றிரண்டு", + "93": "தொண்ணூற்றிமூன்று", + "94": "தொண்ணூற்றிநான்கு", + "95": "தொண்ணூற்றிஐந்து", + "96": "தொண்ணூற்றியாறு", + "97": "தொண்ணூற்றியேழு", + "98": "தொண்ணூற்றியெட்டு", + "99": "தொண்ணூற்றிஒன்பது", + "100": "நூறு", + "1000": "ஆயிரம்", + "100000": "இலட்சம்", + "10000000": "கோடி", + "1000000000": "பில்லியன்", +} # Tamil +num_dict["kn"] = { + "0": "ಸೊನ್ನೆ", + "1": "ಒಂದು", + "2": "ಎರಡು", + "3": "ಮೂರು", + "4": "ನಾಲ್ಕು", + "5": "ಅಯ್ದು", + "6": "ಆರು", + "7": "ಏಳು", + "8": "ಎಂಟು", + "9": "ಒಂಬತ್ತು", + "10": "ಹತ್ತು", + "11": "ಹನ್ನೊಂದು", + "12": "ಹನ್ನೆರಡು", + "13": "ಹದಿಮೂರು", + "14": "ಹದಿನಾಲ್ಕು", + "15": "ಹದಿನೈದು", + "16": "ಹದಿನಾರು", + "17": "ಹದಿನೇಳು", + "18": "ಹದಿನೆಂಟು", + "19": "ಹತ್ತೊಂಬತ್ತು", + "20": "ಇಪ್ಪತ್ತು", + "21": "ಇಪ್ಪತ್ತ್’ಒಂದು", + "22": "ಇಪ್ಪತ್ತ್’ಎರಡು", + "23": "ಇಪ್ಪತ್ತ್’ಮೂರು", + "24": "ಇಪ್ಪತ್ತ್’ನಾಲ್ಕು", + "25": "ಇಪ್ಪತ್ತ್’ಐದು", + "26": "ಇಪ್ಪತ್ತ್’ಆರು", + "27": 
"ಇಪ್ಪತ್ತ್’ಏಳು", + "28": "ಇಪ್ಪತ್ತ್’ಎಂಟು", + "29": "ಇಪ್ಪತ್ತ್’ಒಂಬತ್ತು", + "30": "ಮೂವತ್ತು", + "31": "ಮುವತ್ತ್’ಒಂದು", + "32": "ಮುವತ್ತ್’ಎರಡು", + "33": "ಮುವತ್ತ್’ಮೂರು", + "34": "ಮೂವತ್ತ್’ನಾಲ್ಕು", + "35": "ಮೂವತ್ತ್’ಐದು", + "36": "ಮೂವತ್ತ್’ಆರು", + "37": "ಮೂವತ್ತ್’ಏಳು", + "38": "ಮೂವತ್ತ್’ಎಂಟು", + "39": "ಮೂವತ್ತ್’ಒಂಬತ್ತು", + "40": "ನಲವತ್ತು", + "41": "ನಲವತ್ತೊಂದು", + "42": "ನಲವತ್ತ್ ಎರಡು", + "43": "ನಲವತ್ತ್ ಮೂರು", + "44": "ನಲವತ್ತ್ ನಾಲ್ಕು", + "45": "ನಲವತ್ತೈದು", + "46": "ನಲವತ್ತಾರು", + "47": "ನಲವತ್ತೇಳು", + "48": "ನಲವತ್ತೆಂಟು", + "49": "ನಲವತ್ತೊಂಬತ್ತು", + "50": "ಐವತ್ತು", + "51": "ಐವತ್ತೊಂದು", + "52": "ಐವತ್ತೆರಡು", + "53": "ಐವತ್ತಮೂರು", + "54": "ಐವತ್ತ್ನಾಲ್ಕು", + "55": "ಐವತ್ತೈದು", + "56": "ಐವತ್ತಾರು", + "57": "ಐವತ್ತೇಳು", + "58": "ಐವತ್ತೆಂಟು", + "59": "ಐವತ್ತೊಂಬತ್ತು", + "60": "ಅರವತ್ತು", + "61": "ಅರವತ್ತೊಂದು", + "62": "ಅರವತ್ತೆರಡು", + "63": "ಅರವತ್ತ್ ಮೂರು", + "64": "ಅರವತ್ತ್ ನಾಲ್ಕು", + "65": "ಅರವತ್ತೈದು", + "66": "ಅರವತ್ತಾರು", + "67": "ಅರವತ್ತೇಳು", + "68": "ಅರವತ್ತೆಂಟು", + "69": "ಅರವತ್ತೊಂಬತ್ತು", + "70": "ಎಪ್ಪತ್ತು", + "71": "ಎಪ್ಪತ್ತೊಂದು", + "72": "ಎಪ್ಪತ್ತೆರಡು", + "73": "ಎಪ್ಪತ್ತ್ ಮೂರು", + "74": "ಎಪ್ಪತ್ತ್ ನಾಲ್ಕು", + "75": "ಎಪ್ಪತ್ತೈದು", + "76": "ಎಪ್ಪತ್ತಾರು", + "77": "ಎಪ್ಪತ್ತೇಳು", + "78": "ಎಪ್ಪತ್ತೆಂಟು", + "79": "ಎಪ್ಪತ್ತೊಂಬತ್ತು", + "80": "ಎಂಬತ್ತು", + "81": "ಎಂಬತ್ತೊಂದು", + "82": "ಎಂಬತ್ತೆರಡು", + "83": "ಎಂಬತ್ತ್ ಮೂರು", + "84": "ಎಂಬತ್ತ್ ನಾಲ್ಕು", + "85": "ಎಂಬತ್ತೈದು", + "86": "ಎಂಬತ್ತಾರು", + "87": "ಎಂಬತ್ತೇಳು", + "88": "ಎಂಬತ್ತೆಂಟು", + "89": "ಎಂಬತ್ತೊಂಬತ್ತು", + "90": "ತೊಂಬತ್ತು", + "91": "ತೊಂಬತ್ತೊಂದು", + "92": "ತೊಂಬತ್ತೆರಡು", + "93": "ತೊಂಬತ್ತ ಮೂರು", + "94": "ತೊಂಬತ್ತ ನಾಲ್ಕು", + "95": "ತೊಂಬತ್ತೈದು", + "96": "ತೊಂಬತ್ತಾರು", + "97": "ತೊಂಬತ್ತೇಳು", + "98": "ತೊಂಬತ್ತೆಂಟು", + "99": "ತೊಂಬತ್ತೊಂಬತ್ತು", + "100": "ನೂರ", + "1000": "ಸಾವಿರದ", + "100000": "ಲಕ್ಷದ", + "10000000": "ಕೋಟಿ", + "1000000000": "ಶತಕೋಟಿ", +} # Kannada +num_dict["or"] = { + "0": "ଶୁନ୍ୟ", + "1": "ଏକ", + "2": "ଦୁଇ", + "3": "ତିନି", + "4": "ଚାରି", + "5": "ପାଞ୍ଚ", + "6": "ଛଅ", + "7": "ସାତ", + "8": "ଆଠ", + "9": "ନଅ", + "10": "ନଅ", + "11": "ଏଗାର", + "12": "ବାର", + "13": "ତେର", + "14": "ଚଉଦ", + "15": "ପନ୍ଦର", + "16": "ଷୋହଳ", + "17": "ସତର", + "18": "ଅଠର", + "19": "ଊଣାଇଶ", + "20": "କୋଡିଏ", + "21": "ଏକୋଇଶି", + "22": "ବାଇଶି", + "23": "ତେଇଶି", + "24": "ଚବିଶି", + "25": "ପଚିଶି", + "26": "ଛବିଶି", + "27": "ସତାଇଶି", + "28": "ଅଠାଇଶି", + "29": "ଅଣତିରିଶି", + "30": "ତିରିଶି", + "31": "ଏକତିରିଶି", + "32": "ବତିଶି", + "33": "ତେତିଶି", + "34": "ଚଉତିରିଶି", + "35": "ପଞ୍ଚତିରିଶି", + "36": "ଛତିଶି", + "37": "ସଂଇତିରିଶି", + "38": "ଅଠତିରିଶି", + "39": "ଅଣଚାଳିଶି", + "40": "ଚାଳିଶି", + "41": "ଏକଚାଳିଶି", + "42": "ବୟାଳିଶି", + "43": "ତେୟାଳିଶି", + "44": "ଚଉରାଳିଶି", + "45": "ପଞ୍ଚଚାଳିଶି", + "46": "ଛୟାଳିଶି", + "47": "ସତଚାଳିଶି", + "48": "ଅଠଚାଳିଶି", + "49": "ଅଣଚାଶ", + "50": "ପଚାଶ", + "51": "ଏକାବନ", + "52": "ବାଉନ", + "53": "ତେପନ", + "54": "ଚଉବନ", + "55": "ପଞ୍ଚାବନ", + "56": "ଛପନ", + "57": "ସତାବନ", + "58": "ଅଠାବନ", + "59": "ଅଣଷଠି", + "60": "ଷାଠିଏ", + "61": "ଏକଷଠି", + "62": "ବାଷଠି", + "63": "ତେଷଠି", + "64": "ଚଉଷଠି", + "65": "ପଞ୍ଚଷଠି", + "66": "ଛଅଷଠି", + "67": "ସତଷଠି", + "68": "ଅଠଷଠି", + "69": "ଅଣସ୍ତରୀ", + "70": "ସତୂରୀ", + "71": "ଏକସ୍ତରୀ", + "72": "ବାସ୍ତରୀ", + "73": "ତେସ୍ତରୀ", + "74": "ଚଉସ୍ତରୀ", + "75": "ପଞ୍ଚସ୍ତରୀ", + "76": "ଛଅସ୍ତରୀ", + "77": "ସତସ୍ତରୀ", + "78": "ଅଠସ୍ତରୀ", + "79": "ଅଣାଅଶୀ", + "80": "ଅଶୀ", + "81": "ଏକାଅଶୀ", + "82": "ବୟାଅଶୀ", + "83": "ତେୟାଅଶୀ", + "84": "ଚଉରାଅଶୀ", + "85": "ପଞ୍ଚାଅଶୀ", + "86": "ଛୟାଅଶୀ", + "87": "ସତାଅଶୀ", + "88": "ଅଠାଅଶୀ", + "89": "ଅଣାନବେ", + "90": "ନବେ", + "91": "ଏକାନବେ", + "92": "ବୟାନବେ", + "93": "ତେୟାନବେ", + "94": "ଚଉରାନବେ", + "95": "ପଞ୍ଚାନବେ", + "96": "ଛୟାନବେ", + "97": "ସତାନବେ", + "98": "ଅଠାନବେ", + "99": "ଅନେଶତ", + "100": 
"ଶହେ", + "1000": "ହଜାର", + "100000": "ଲକ୍ଷ", + "10000000": "କୋଟି", + "1000000000": "କୋଟି", +} # Oriya +num_dict["pa"] = { + "0": "ਸਿਫਰ ", + "1": "ਇੱਕ", + "2": "ਦੋ", + "3": "ਤਿੰਨ", + "4": "ਚਾਰ", + "5": "ਪੰਜ", + "6": "ਛੇ", + "7": "ਸੱਤ", + "8": "ਅੱਠ", + "9": "ਨੌਂ", + "10": "ਦੱਸ", + "11": "ਗਿਆਰਾਂ", + "12": "ਬਾਰਾਂ", + "13": "ਤੇਰਾਂ", + "14": "ਚੌਦਾਂ", + "15": "ਪੰਦਰਾਂ", + "16": "ਸੋਲ਼ਾਂ", + "17": "ਸਤਾਰਾਂ", + "18": "ਅਠਾਰਾਂ", + "19": "ਉਨੀ", + "20": "ਵੀਹ", + "21": "ਇੱਕੀ", + "22": "ਬਾਈ", + "23": "ਤੇਈ", + "24": "ਚੌਵੀ", + "25": "ਪੰਝੀ", + "26": "ਛੱਬੀ", + "27": "ਸਤਾਈ", + "28": "ਅਠਾਈ", + "29": "ਉਨੱਤੀ", + "30": "ਤੀਹ", + "31": "ਇਕੱਤੀ", + "32": "ਬੱਤੀ", + "33": "ਤੇਤੀ", + "34": "ਚੌਂਤੀ", + "35": "ਪੈਂਤੀ", + "36": "ਛੱਤੀ", + "37": "ਸੈਂਤੀ", + "38": "ਅਠੱਤੀ", + "39": "ਉਨਤਾਲੀ", + "40": "ਚਾਲੀ", + "41": "ਇਕਤਾਲੀ", + "42": "ਬਤਾਲੀ", + "43": "ਤਰਤਾਲੀ", + "44": "ਚੌਤਾਲੀ", + "45": "ਪੰਜਤਾਲੀ", + "46": "ਛਿਆਲੀ", + "47": "ਸੰਤਾਲੀ", + "48": "ਅੱਠਤਾਲੀ", + "49": "ਉਣਿੰਜਾ", + "50": "ਪੰਜਾਹ", + "51": "ਇਕਵਿੰਜਾ", + "52": "ਬਵਿੰਜਾ", + "53": "ਤਰਵਿੰਜਾ", + "54": "ਚਰਿੰਜਾ", + "55": "ਪਚਵਿੰਜਾ", + "56": "ਛਪਿੰਜਾ", + "57": "ਸਤਵਿੰਜਾ", + "58": "ਅੱਠਵਿੰਜਾ", + "59": "ਉਣਾਠ", + "60": "ਸੱਠ", + "61": "ਇਕਾਠ", + "62": "ਬਾਠ੍ਹ", + "63": "ਤਰੇਠ੍ਹ", + "64": "ਚੌਠ੍ਹ", + "65": "ਪੈਂਠ", + "66": "ਛਿਆਠ", + "67": "ਸਤਾਹਠ", + "68": "ਅੱਠਾਠ", + "69": "ਉਣੱਤਰ", + "70": "ਸੱਤਰ", + "71": "ਇਕ੍ਹੱਤਰ", + "72": "ਬਹੱਤਰ", + "73": "ਤਹੱਤਰ", + "74": "ਚੌਹੱਤਰ", + "75": "ਪੰਜੱਤਰ", + "76": "ਛਿਹੱਤਰ", + "77": "ਸਤੱਤਰ", + "78": "ਅਠੱਤਰ", + "79": "ਉਣਾਸੀ", + "80": "ਅੱਸੀ", + "81": "ਇਕਾਸੀ", + "82": "ਬਿਆਸੀ", + "83": "ਤਰਾਸੀ", + "84": "ਚਰਾਸੀ", + "85": "ਪੰਜਾਸੀ", + "86": "ਛਿਆਸੀ", + "87": "ਸਤਾਸੀ", + "88": "ਅਠਾਸੀ", + "89": "ਉਣਾਨਵੇਂ", + "90": "ਨੱਬੇ", + "91": "ਇਕਾਨਵੇਂ", + "92": "ਬਿਆਨਵੇਂ", + "93": "ਤਰਾਨਵੇਂ", + "94": "ਚਰਾਨਵੇਂ", + "95": "ਪਚਾਨਵੇਂ", + "96": "ਛਿਆਨਵੇਂ", + "97": "ਸਤਾਨਵੇਂ", + "98": "ਅਠਾਨਵੇਂ", + "99": "ਨਿੜਾਨਵੇਂ", + "100": "ਸੌ", + "1000": "ਹਜਾਰ", + "100000": "ਲੱਖ", + "10000000": "ਕਰੋੜ", + "1000000000": "ਅਰਬ", +} # Punjabi + +# --------------------------- num_to_word.py ------------------------------ +""" +Method to convert Numbers to Words +for indian languages + +Use cases:- +1) Speech recognition pre-processing +2) Language modeling Data pre-processing + +------------------------- +check indic_numbers.py to add support +for any indian language +""" + + +def language_specific_exception(words, lang, combiner): + """ + Language Specific Exception will come here + """ + + def occurs_at_end(piece): + return words[-len(piece) :] == piece + + if lang == "mr": + words = words.replace("एक" + combiner + "शे", "शंभर") + elif lang == "gu": + words = words.replace("બે" + combiner + "સો", "બસ્સો") + elif lang == "te": + exception_dict = { + "1": "ఒక", + "100": "వంద", + "100+": "వందలు", + "1000": "వెయ్యి", + "1000+": "వేలు", + "100000": "లక్ష", + "100000+": "లక్షలు", + "10000000": "కోటి", + "10000000+": "కోట్లు", + } + + test_case = ["100", "1000", "100000", "10000000"] + for test in test_case: + test_word = num_dict["te"][test] + match = num_dict["te"]["1"] + combiner + test_word + # for numbers like : 100, 1000, 100000 + if words == match: + return exception_dict[test] + # for numbers like : 200, 4000, 800000 + elif occurs_at_end(test_word): + words = words.replace(test_word, exception_dict[test + "+"]) + # for numbers like : 105, 1076, 123993 + elif not occurs_at_end(match): + replacement = exception_dict["1"] + combiner + exception_dict[test] + words = words.replace(match, replacement) + + # Exception case for 101...199 + special_case = "ఒక" + combiner + "వంద" + words = words.replace(special_case, "నూట") + elif lang == "kn": 
+ # special case for 100 + if words == ("ಒಂದು" + combiner + "ನೂರ"): + return "ನೂರು" + exception_dict = { + "ನೂರ": "ನೂರು", + "ಸಾವಿರದ": "ಸಾವಿರ", + "ಲಕ್ಷದ": "ಲಕ್ಷ", + "ಕೋಟಿಯ": "ಕೋಟಿ", + } + for expt in exception_dict: + if occurs_at_end(expt): + words = words.replace(expt, exception_dict[expt]) + return words + + +def num_to_word(num, lang, separator=", ", combiner=" "): + """ + Main Method + :param num: Number digits from any indian language + :param lang: Language Code from supported Language + :param separator: Separator character i.e. separator = '-' --> 'two hundred-sixty' + :param combiner: combine number with position i.e. combiner = '-' --> 'two-hundred sixty' + :return: UTF-8 String of numbers in words + """ + lang = lang.lower() + num = str(num) + + # Load dictionary according to language code + assert lang in supported_lang, "Language not supported" + num_dic = num_dict[lang] + + # dash default combiner for english-india + if (lang == "en") & (combiner == " "): + combiner = "-" + + # Remove punctuations from numbers + num = str(num).replace(",", "").replace(" ", "") + + # Replace native language numbers with english digits + for language in supported_lang: + for num_index in range(10): + num = num.replace(all_num[language][num_index], all_num["en"][num_index]) + + # Assert that input contains only integer number + for digit in num: + assert digit in all_num["en"], "Give proper input" + + # Process + # For Number longer than 9 digits + def all_two_digit(digits_2): + if len(digits_2) <= 1: # Provided only one/zero digit + return num_dic.get(digits_2, "") + elif digits_2 == "00": # Two Zero provided + return num_dic["0"] + separator + num_dic["0"] + elif digits_2[0] == "0": # First digit is zero + return num_dic["0"] + separator + num_dic[digits_2[1]] + else: # Both digit provided + return num_dic[digits_2] + + # For Number less than 9 digits + def two_digit(digits_2): + digits_2 = digits_2.lstrip("0") + if len(digits_2) != 0: + return num_dic[digits_2] + else: + return "" + + def all_digit(digits): + digits = digits.lstrip("0") + digit_len = len(digits) + if digit_len > 3: + num_of_digits_to_process = (digit_len % 2) + 1 + process_digits = digits[:num_of_digits_to_process] + base = str(10 ** (int(digit_len / 2) * 2 - 1)) + remain_digits = digits[num_of_digits_to_process:] + return ( + num_dic[process_digits] + + combiner + + num_dic[base] + + separator + + all_digit(remain_digits) + ) + elif len(digits) == 3: + return ( + num_dic[digits[:1]] + + combiner + + num_dic["100"] + + separator + + two_digit(digits[1:]) + ) + else: + return two_digit(digits) + + num = num.lstrip("0") + full_digit_len = len(num) + + if full_digit_len == 0: + output = num_dic["0"] + elif full_digit_len <= 9: + output = all_digit(num) + else: + iteration = round(full_digit_len / 2) + output = all_two_digit(num[:2]) # First to digit + for i in range(1, iteration): + output = ( + output + separator + all_two_digit(num[i * 2 : (i + 1) * 2]) + ) # Next two digit pairs + remaining_digits = num[iteration * 2 :] + if not all_two_digit(remaining_digits) == "": + output = ( + output + separator + all_two_digit(remaining_digits) + ) # remaining Last one/two digits + + output = output.strip(separator) + + output = language_specific_exception(output, lang, combiner) + + return output + + +# --------------------------------- num_to_word_on_a_sent --------------------------------- + + +def is_digit(word, digit_pattern): + return re.search(digit_pattern, word) + + +def remove_punct(sent): + clean = re.sub("[%s]" % 
re.escape(string.punctuation), " ", sent) + return " ".join([word for word in clean.split() if word]) + + +def normalize_nums(text, lang): + """ + text: str (eg) + lang: lang code ['en', 'hi'] + + returns: str + (eg) + """ + + if lang in supported_lang: + words = text.split() + lang_digits = [str(i) for i in range(0, 10)] + + digit_pattern = "[" + "".join(lang_digits) + "]" + num_indices = [ + ind for ind, word in enumerate(words) if is_digit(word, digit_pattern) + ] + + words_up = [ + num_to_word(word, lang, separator=" ", combiner=" ") + if ind in num_indices + else word + for ind, word in enumerate(words) + ] + return " ".join(words_up) + else: + return text + + +if __name__ == "__main__": + print(normalize_nums("रीटा के पास 16 बिल्लियाँ हैं।", "hi")) diff --git a/vakyansh-tts/utils/inference/run_gradio.py b/vakyansh-tts/utils/inference/run_gradio.py new file mode 100644 index 0000000000000000000000000000000000000000..5b4c46a79fc0b7e1760d1d321a2902e14e3020e3 --- /dev/null +++ b/vakyansh-tts/utils/inference/run_gradio.py @@ -0,0 +1,53 @@ +import gradio as gr +import argparse +import numpy as np +from argparse import Namespace +from advanced_tts import load_all_models, run_tts_paragraph + + +def hit_tts(textbox, slider_noise_scale, slider_length_scale, choice_transliteration, choice_number_conversion, choice_split_sentences): + inputs_to_gradio = {'text' : textbox, + 'noise_scale': slider_noise_scale, + 'length_scale': slider_length_scale, + 'transliteration' : 1 if choice_transliteration else 0, + 'number_conversion' : 1 if choice_number_conversion else 0, + 'split_sentences' : 1 if choice_split_sentences else 0 + } + + args = Namespace(**inputs_to_gradio) + args.wav = None + args.lang = lang + + if args.text: + sr, audio = run_tts_paragraph(args) + return (sr, audio) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-a", "--acoustic", required=True, type=str) + parser.add_argument("-v", "--vocoder", required=True, type=str) + parser.add_argument("-d", "--device", type=str, default="cpu") + parser.add_argument("-L", "--lang", type=str, required=True) + + global lang + + args = parser.parse_args() + lang = args.lang + load_all_models(args) + + textbox = gr.inputs.Textbox(placeholder="Enter Text to run", default="", label="TTS") + slider_noise_scale = gr.inputs.Slider(minimum=0, maximum=1.0, step=0.001, default=0.667, label='Enter Noise Scale') + slider_length_scale = gr.inputs.Slider(minimum=0, maximum=2.0, step=0.1, default=1.0, label='Enter Length Scale') + + choice_transliteration = gr.inputs.Checkbox(default=True, label="Transliteration") + choice_number_conversion = gr.inputs.Checkbox(default=True, label="Number Conversion") + choice_split_sentences = gr.inputs.Checkbox(default=True, label="Split Sentences") + + + + op = gr.outputs.Audio(type="numpy", label=None) + + inputs_to_gradio = [textbox, slider_noise_scale, slider_length_scale, choice_transliteration, choice_number_conversion, choice_split_sentences] + iface = gr.Interface(fn=hit_tts, inputs=inputs_to_gradio, outputs=op, theme='huggingface', title='Run TTS example') + iface.launch(share=True, enable_queue=True) \ No newline at end of file diff --git a/vakyansh-tts/utils/inference/transliterate.py b/vakyansh-tts/utils/inference/transliterate.py new file mode 100644 index 0000000000000000000000000000000000000000..5ffe9f423d95c5476f97285419dab6d409b173ba --- /dev/null +++ b/vakyansh-tts/utils/inference/transliterate.py @@ -0,0 +1,919 @@ +import torch +import torch.nn as nn +import numpy 
as np +import pandas as pd +import random +import sys +import os +import json +import enum +import traceback +import re + +#F_DIR = os.path.dirname(os.path.realpath(__file__)) +F_DIR = '../../checkpoints/' + +class XlitError(enum.Enum): + lang_err = "Unsupported language ID requested ;( Please check available languages." + string_err = "String passed is incompatible ;(" + internal_err = "Internal crash ;(" + unknown_err = "Unknown Failure" + loading_err = "Loading failed ;( Check if metadata/paths are correctly configured." + + +##=================== Network ================================================== + + +class Encoder(nn.Module): + def __init__( + self, + input_dim, + embed_dim, + hidden_dim, + rnn_type="gru", + layers=1, + bidirectional=False, + dropout=0, + device="cpu", + ): + super(Encoder, self).__init__() + + self.input_dim = input_dim # src_vocab_sz + self.enc_embed_dim = embed_dim + self.enc_hidden_dim = hidden_dim + self.enc_rnn_type = rnn_type + self.enc_layers = layers + self.enc_directions = 2 if bidirectional else 1 + self.device = device + + self.embedding = nn.Embedding(self.input_dim, self.enc_embed_dim) + + if self.enc_rnn_type == "gru": + self.enc_rnn = nn.GRU( + input_size=self.enc_embed_dim, + hidden_size=self.enc_hidden_dim, + num_layers=self.enc_layers, + bidirectional=bidirectional, + ) + elif self.enc_rnn_type == "lstm": + self.enc_rnn = nn.LSTM( + input_size=self.enc_embed_dim, + hidden_size=self.enc_hidden_dim, + num_layers=self.enc_layers, + bidirectional=bidirectional, + ) + else: + raise Exception("XlitError: unknown RNN type mentioned") + + def forward(self, x, x_sz, hidden=None): + """ + x_sz: (batch_size, 1) - Unpadded sequence lengths used for pack_pad + """ + batch_sz = x.shape[0] + # x: batch_size, max_length, enc_embed_dim + x = self.embedding(x) + + ## pack the padded data + # x: max_length, batch_size, enc_embed_dim -> for pack_pad + x = x.permute(1, 0, 2) + x = nn.utils.rnn.pack_padded_sequence(x, x_sz, enforce_sorted=False) # unpad + + # output: packed_size, batch_size, enc_embed_dim + # hidden: n_layers*num_directions, batch_size, hidden_dim | if LSTM (h_n, c_n) + output, hidden = self.enc_rnn( + x + ) # gru returns hidden state of all timesteps as well as hidden state at last timestep + + ## pad the sequence to the max length in the batch + # output: max_length, batch_size, enc_emb_dim*directions + output, _ = nn.utils.rnn.pad_packed_sequence(output) + + # output: batch_size, max_length, hidden_dim + output = output.permute(1, 0, 2) + + return output, hidden + + def get_word_embedding(self, x): + """ """ + x_sz = torch.tensor([len(x)]) + x_ = torch.tensor(x).unsqueeze(0).to(dtype=torch.long) + # x: 1, max_length, enc_embed_dim + x = self.embedding(x_) + + ## pack the padded data + # x: max_length, 1, enc_embed_dim -> for pack_pad + x = x.permute(1, 0, 2) + x = nn.utils.rnn.pack_padded_sequence(x, x_sz, enforce_sorted=False) # unpad + + # output: packed_size, 1, enc_embed_dim + # hidden: n_layers*num_directions, 1, hidden_dim | if LSTM (h_n, c_n) + output, hidden = self.enc_rnn( + x + ) # gru returns hidden state of all timesteps as well as hidden state at last timestep + + out_embed = hidden[0].squeeze() + + return out_embed + + +class Decoder(nn.Module): + def __init__( + self, + output_dim, + embed_dim, + hidden_dim, + rnn_type="gru", + layers=1, + use_attention=True, + enc_outstate_dim=None, # enc_directions * enc_hidden_dim + dropout=0, + device="cpu", + ): + super(Decoder, self).__init__() + + self.output_dim = output_dim # 
tgt_vocab_sz + self.dec_hidden_dim = hidden_dim + self.dec_embed_dim = embed_dim + self.dec_rnn_type = rnn_type + self.dec_layers = layers + self.use_attention = use_attention + self.device = device + if self.use_attention: + self.enc_outstate_dim = enc_outstate_dim if enc_outstate_dim else hidden_dim + else: + self.enc_outstate_dim = 0 + + self.embedding = nn.Embedding(self.output_dim, self.dec_embed_dim) + + if self.dec_rnn_type == "gru": + self.dec_rnn = nn.GRU( + input_size=self.dec_embed_dim + + self.enc_outstate_dim, # to concat attention_output + hidden_size=self.dec_hidden_dim, # previous Hidden + num_layers=self.dec_layers, + batch_first=True, + ) + elif self.dec_rnn_type == "lstm": + self.dec_rnn = nn.LSTM( + input_size=self.dec_embed_dim + + self.enc_outstate_dim, # to concat attention_output + hidden_size=self.dec_hidden_dim, # previous Hidden + num_layers=self.dec_layers, + batch_first=True, + ) + else: + raise Exception("XlitError: unknown RNN type mentioned") + + self.fc = nn.Sequential( + nn.Linear(self.dec_hidden_dim, self.dec_embed_dim), + nn.LeakyReLU(), + # nn.Linear(self.dec_embed_dim, self.dec_embed_dim), nn.LeakyReLU(), # removing to reduce size + nn.Linear(self.dec_embed_dim, self.output_dim), + ) + + ##----- Attention ---------- + if self.use_attention: + self.W1 = nn.Linear(self.enc_outstate_dim, self.dec_hidden_dim) + self.W2 = nn.Linear(self.dec_hidden_dim, self.dec_hidden_dim) + self.V = nn.Linear(self.dec_hidden_dim, 1) + + def attention(self, x, hidden, enc_output): + """ + x: (batch_size, 1, dec_embed_dim) -> after Embedding + enc_output: batch_size, max_length, enc_hidden_dim *num_directions + hidden: n_layers, batch_size, hidden_size | if LSTM (h_n, c_n) + """ + + ## perform addition to calculate the score + + # hidden_with_time_axis: batch_size, 1, hidden_dim + ## hidden_with_time_axis = hidden.permute(1, 0, 2) ## replaced with below 2lines + hidden_with_time_axis = ( + torch.sum(hidden, axis=0) + if self.dec_rnn_type != "lstm" + else torch.sum(hidden[0], axis=0) + ) # h_n + + hidden_with_time_axis = hidden_with_time_axis.unsqueeze(1) + + # score: batch_size, max_length, hidden_dim + score = torch.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)) + + # attention_weights: batch_size, max_length, 1 + # we get 1 at the last axis because we are applying score to self.V + attention_weights = torch.softmax(self.V(score), dim=1) + + # context_vector shape after sum == (batch_size, hidden_dim) + context_vector = attention_weights * enc_output + context_vector = torch.sum(context_vector, dim=1) + # context_vector: batch_size, 1, hidden_dim + context_vector = context_vector.unsqueeze(1) + + # attend_out (batch_size, 1, dec_embed_dim + hidden_size) + attend_out = torch.cat((context_vector, x), -1) + + return attend_out, attention_weights + + def forward(self, x, hidden, enc_output): + """ + x: (batch_size, 1) + enc_output: batch_size, max_length, dec_embed_dim + hidden: n_layer, batch_size, hidden_size | lstm: (h_n, c_n) + """ + if (hidden is None) and (self.use_attention is False): + raise Exception( + "XlitError: No use of a decoder with No attention and No Hidden" + ) + + batch_sz = x.shape[0] + + if hidden is None: + # hidden: n_layers, batch_size, hidden_dim + hid_for_att = torch.zeros( + (self.dec_layers, batch_sz, self.dec_hidden_dim) + ).to(self.device) + elif self.dec_rnn_type == "lstm": + hid_for_att = hidden[1] # c_n + + # x (batch_size, 1, dec_embed_dim) -> after embedding + x = self.embedding(x) + + if self.use_attention: + # x (batch_size, 1, 
dec_embed_dim + hidden_size) -> after attention + # aw: (batch_size, max_length, 1) + x, aw = self.attention(x, hidden, enc_output) + else: + x, aw = x, 0 + + # passing the concatenated vector to the GRU + # output: (batch_size, n_layers, hidden_size) + # hidden: n_layers, batch_size, hidden_size | if LSTM (h_n, c_n) + output, hidden = ( + self.dec_rnn(x, hidden) if hidden is not None else self.dec_rnn(x) + ) + + # output :shp: (batch_size * 1, hidden_size) + output = output.view(-1, output.size(2)) + + # output :shp: (batch_size * 1, output_dim) + output = self.fc(output) + + return output, hidden, aw + + +class Seq2Seq(nn.Module): + """ + Class dependency: Encoder, Decoder + """ + + def __init__( + self, encoder, decoder, pass_enc2dec_hid=False, dropout=0, device="cpu" + ): + super(Seq2Seq, self).__init__() + + self.encoder = encoder + self.decoder = decoder + self.device = device + self.pass_enc2dec_hid = pass_enc2dec_hid + _force_en2dec_hid_conv = False + + if self.pass_enc2dec_hid: + assert ( + decoder.dec_hidden_dim == encoder.enc_hidden_dim + ), "Hidden Dimension of encoder and decoder must be same, or unset `pass_enc2dec_hid`" + if decoder.use_attention: + assert ( + decoder.enc_outstate_dim + == encoder.enc_directions * encoder.enc_hidden_dim + ), "Set `enc_out_dim` correctly in decoder" + assert ( + self.pass_enc2dec_hid or decoder.use_attention + ), "No use of a decoder with No attention and No Hidden from Encoder" + + self.use_conv_4_enc2dec_hid = False + if ( + self.pass_enc2dec_hid + and (encoder.enc_directions * encoder.enc_layers != decoder.dec_layers) + ) or _force_en2dec_hid_conv: + if encoder.enc_rnn_type == "lstm" or encoder.enc_rnn_type == "lstm": + raise Exception( + "XlitError: conv for enc2dec_hid not implemented; Change the layer numbers appropriately" + ) + + self.use_conv_4_enc2dec_hid = True + self.enc_hid_1ax = encoder.enc_directions * encoder.enc_layers + self.dec_hid_1ax = decoder.dec_layers + self.e2d_hidden_conv = nn.Conv1d(self.enc_hid_1ax, self.dec_hid_1ax, 1) + + def enc2dec_hidden(self, enc_hidden): + """ + enc_hidden: n_layer, batch_size, hidden_dim*num_directions + TODO: Implement the logic for LSTm bsed model + """ + # hidden: batch_size, enc_layer*num_directions, enc_hidden_dim + hidden = enc_hidden.permute(1, 0, 2).contiguous() + # hidden: batch_size, dec_layers, dec_hidden_dim -> [N,C,Tstep] + hidden = self.e2d_hidden_conv(hidden) + + # hidden: dec_layers, batch_size , dec_hidden_dim + hidden_for_dec = hidden.permute(1, 0, 2).contiguous() + + return hidden_for_dec + + def active_beam_inference(self, src, beam_width=3, max_tgt_sz=50): + """Search based decoding + src: (sequence_len) + """ + + def _avg_score(p_tup): + """Used for Sorting + TODO: Dividing by length of sequence power alpha as hyperparam + """ + return p_tup[0] + + import sys + + batch_size = 1 + start_tok = src[0] + end_tok = src[-1] + src_sz = torch.tensor([len(src)]) + src_ = src.unsqueeze(0) + + # enc_output: (batch_size, padded_seq_length, enc_hidden_dim*num_direction) + # enc_hidden: (enc_layers*num_direction, batch_size, hidden_dim) + enc_output, enc_hidden = self.encoder(src_, src_sz) + + if self.pass_enc2dec_hid: + # dec_hidden: dec_layers, batch_size , dec_hidden_dim + if self.use_conv_4_enc2dec_hid: + init_dec_hidden = self.enc2dec_hidden(enc_hidden) + else: + init_dec_hidden = enc_hidden + else: + # dec_hidden -> Will be initialized to zeros internally + init_dec_hidden = None + + # top_pred[][0] = Σ-log_softmax + # top_pred[][1] = sequence torch.tensor shape: (1) + # 
top_pred[][2] = dec_hidden + top_pred_list = [(0, start_tok.unsqueeze(0), init_dec_hidden)] + + for t in range(max_tgt_sz): + cur_pred_list = [] + + for p_tup in top_pred_list: + if p_tup[1][-1] == end_tok: + cur_pred_list.append(p_tup) + continue + + # dec_hidden: dec_layers, 1, hidden_dim + # dec_output: 1, output_dim + dec_output, dec_hidden, _ = self.decoder( + x=p_tup[1][-1].view(1, 1), # dec_input: (1,1) + hidden=p_tup[2], + enc_output=enc_output, + ) + + ## π{prob} = Σ{log(prob)} -> to prevent diminishing + # dec_output: (1, output_dim) + dec_output = nn.functional.log_softmax(dec_output, dim=1) + # pred_topk.values & pred_topk.indices: (1, beam_width) + pred_topk = torch.topk(dec_output, k=beam_width, dim=1) + + for i in range(beam_width): + sig_logsmx_ = p_tup[0] + pred_topk.values[0][i] + # seq_tensor_ : (seq_len) + seq_tensor_ = torch.cat((p_tup[1], pred_topk.indices[0][i].view(1))) + + cur_pred_list.append((sig_logsmx_, seq_tensor_, dec_hidden)) + + cur_pred_list.sort(key=_avg_score, reverse=True) # Maximized order + top_pred_list = cur_pred_list[:beam_width] + + # check if end_tok of all topk + end_flags_ = [1 if t[1][-1] == end_tok else 0 for t in top_pred_list] + if beam_width == sum(end_flags_): + break + + pred_tnsr_list = [t[1] for t in top_pred_list] + + return pred_tnsr_list + + +##===================== Glyph handlers ======================================= + + +class GlyphStrawboss: + def __init__(self, glyphs="en"): + """list of letters in a language in unicode + lang: ISO Language code + glyphs: json file with script information + """ + if glyphs == "en": + # Smallcase alone + self.glyphs = [chr(alpha) for alpha in range(97, 122 + 1)] + else: + self.dossier = json.load(open(glyphs, encoding="utf-8")) + self.glyphs = self.dossier["glyphs"] + self.numsym_map = self.dossier["numsym_map"] + + self.char2idx = {} + self.idx2char = {} + self._create_index() + + def _create_index(self): + + self.char2idx["_"] = 0 # pad + self.char2idx["$"] = 1 # start + self.char2idx["#"] = 2 # end + self.char2idx["*"] = 3 # Mask + self.char2idx["'"] = 4 # apostrophe U+0027 + self.char2idx["%"] = 5 # unused + self.char2idx["!"] = 6 # unused + + # letter to index mapping + for idx, char in enumerate(self.glyphs): + self.char2idx[char] = idx + 7 # +7 token initially + + # index to letter mapping + for char, idx in self.char2idx.items(): + self.idx2char[idx] = char + + def size(self): + return len(self.char2idx) + + def word2xlitvec(self, word): + """Converts given string of gyphs(word) to vector(numpy) + Also adds tokens for start and end + """ + try: + vec = [self.char2idx["$"]] # start token + for i in list(word): + vec.append(self.char2idx[i]) + vec.append(self.char2idx["#"]) # end token + + vec = np.asarray(vec, dtype=np.int64) + return vec + + except Exception as error: + print("XlitError: In word:", word, "Error Char not in Token:", error) + sys.exit() + + def xlitvec2word(self, vector): + """Converts vector(numpy) to string of glyphs(word)""" + char_list = [] + for i in vector: + char_list.append(self.idx2char[i]) + + word = "".join(char_list).replace("$", "").replace("#", "") # remove tokens + word = word.replace("_", "").replace("*", "") # remove tokens + return word + + +class VocabSanitizer: + def __init__(self, data_file): + """ + data_file: path to file conatining vocabulary list + """ + extension = os.path.splitext(data_file)[-1] + if extension == ".json": + self.vocab_set = set(json.load(open(data_file, encoding="utf-8"))) + elif extension == ".csv": + self.vocab_df = 
pd.read_csv(data_file).set_index("WORD") + self.vocab_set = set(self.vocab_df.index) + else: + print("XlitError: Only Json/CSV file extension supported") + + def reposition(self, word_list): + """Reorder Words in list""" + new_list = [] + temp_ = word_list.copy() + for v in word_list: + if v in self.vocab_set: + new_list.append(v) + temp_.remove(v) + new_list.extend(temp_) + + return new_list + + +##=============== INSTANTIATION ================================================ + + +class XlitPiston: + """ + For handling prediction & post-processing of transliteration for a single language + Class dependency: Seq2Seq, GlyphStrawboss, VocabSanitizer + Global Variables: F_DIR + """ + + def __init__( + self, + weight_path, + vocab_file, + tglyph_cfg_file, + iglyph_cfg_file="en", + device="cpu", + ): + + self.device = device + self.in_glyph_obj = GlyphStrawboss(iglyph_cfg_file) + self.tgt_glyph_obj = GlyphStrawboss(glyphs=tglyph_cfg_file) + self.voc_sanity = VocabSanitizer(vocab_file) + + self._numsym_set = set( + json.load(open(tglyph_cfg_file, encoding="utf-8"))["numsym_map"].keys() + ) + self._inchar_set = set("abcdefghijklmnopqrstuvwxyz") + self._natscr_set = set().union( + self.tgt_glyph_obj.glyphs, sum(self.tgt_glyph_obj.numsym_map.values(), []) + ) + + ## Model Config Static TODO: add defining in json support + input_dim = self.in_glyph_obj.size() + output_dim = self.tgt_glyph_obj.size() + enc_emb_dim = 300 + dec_emb_dim = 300 + enc_hidden_dim = 512 + dec_hidden_dim = 512 + rnn_type = "lstm" + enc2dec_hid = True + attention = True + enc_layers = 1 + dec_layers = 2 + m_dropout = 0 + enc_bidirect = True + enc_outstate_dim = enc_hidden_dim * (2 if enc_bidirect else 1) + + enc = Encoder( + input_dim=input_dim, + embed_dim=enc_emb_dim, + hidden_dim=enc_hidden_dim, + rnn_type=rnn_type, + layers=enc_layers, + dropout=m_dropout, + device=self.device, + bidirectional=enc_bidirect, + ) + dec = Decoder( + output_dim=output_dim, + embed_dim=dec_emb_dim, + hidden_dim=dec_hidden_dim, + rnn_type=rnn_type, + layers=dec_layers, + dropout=m_dropout, + use_attention=attention, + enc_outstate_dim=enc_outstate_dim, + device=self.device, + ) + self.model = Seq2Seq(enc, dec, pass_enc2dec_hid=enc2dec_hid, device=self.device) + self.model = self.model.to(self.device) + weights = torch.load(weight_path, map_location=torch.device(self.device)) + + self.model.load_state_dict(weights) + self.model.eval() + + def character_model(self, word, beam_width=1): + in_vec = torch.from_numpy(self.in_glyph_obj.word2xlitvec(word)).to(self.device) + ## change to active or passive beam + p_out_list = self.model.active_beam_inference(in_vec, beam_width=beam_width) + p_result = [ + self.tgt_glyph_obj.xlitvec2word(out.cpu().numpy()) for out in p_out_list + ] + + result = self.voc_sanity.reposition(p_result) + + # List type + return result + + def numsym_model(self, seg): + """tgt_glyph_obj.numsym_map[x] returns a list object""" + if len(seg) == 1: + return [seg] + self.tgt_glyph_obj.numsym_map[seg] + + a = [self.tgt_glyph_obj.numsym_map[n][0] for n in seg] + return [seg] + ["".join(a)] + + def _word_segementer(self, sequence): + + sequence = sequence.lower() + accepted = set().union(self._numsym_set, self._inchar_set, self._natscr_set) + # sequence = ''.join([i for i in sequence if i in accepted]) + + segment = [] + idx = 0 + seq_ = list(sequence) + while len(seq_): + # for Number-Symbol + temp = "" + while len(seq_) and seq_[0] in self._numsym_set: + temp += seq_[0] + seq_.pop(0) + if temp != "": + segment.append(temp) + + # for 
Target Chars + temp = "" + while len(seq_) and seq_[0] in self._natscr_set: + temp += seq_[0] + seq_.pop(0) + if temp != "": + segment.append(temp) + + # for Input-Roman Chars + temp = "" + while len(seq_) and seq_[0] in self._inchar_set: + temp += seq_[0] + seq_.pop(0) + if temp != "": + segment.append(temp) + + temp = "" + while len(seq_) and seq_[0] not in accepted: + temp += seq_[0] + seq_.pop(0) + if temp != "": + segment.append(temp) + + return segment + + def inferencer(self, sequence, beam_width=10): + + seg = self._word_segementer(sequence[:120]) + lit_seg = [] + + p = 0 + while p < len(seg): + if seg[p][0] in self._natscr_set: + lit_seg.append([seg[p]]) + p += 1 + + elif seg[p][0] in self._inchar_set: + lit_seg.append(self.character_model(seg[p], beam_width=beam_width)) + p += 1 + + elif seg[p][0] in self._numsym_set: # num & punc + lit_seg.append(self.numsym_model(seg[p])) + p += 1 + else: + lit_seg.append([seg[p]]) + p += 1 + + ## IF segment less/equal to 2 then return combinotorial, + ## ELSE only return top1 of each result concatenated + if len(lit_seg) == 1: + final_result = lit_seg[0] + + elif len(lit_seg) == 2: + final_result = [""] + for seg in lit_seg: + new_result = [] + for s in seg: + for f in final_result: + new_result.append(f + s) + final_result = new_result + + else: + new_result = [] + for seg in lit_seg: + new_result.append(seg[0]) + final_result = ["".join(new_result)] + + return final_result + + +from collections.abc import Iterable +from pydload import dload +import zipfile + +MODEL_DOWNLOAD_URL_PREFIX = "https://github.com/AI4Bharat/IndianNLP-Transliteration/releases/download/xlit_v0.5.0/" + + +def is_folder_writable(folder): + try: + os.makedirs(folder, exist_ok=True) + tmp_file = os.path.join(folder, ".write_test") + with open(tmp_file, "w") as f: + f.write("Permission Check") + os.remove(tmp_file) + return True + except: + return False + + +def is_directory_writable(path): + if os.name == "nt": + return is_folder_writable(path) + return os.access(path, os.W_OK | os.X_OK) + + +class XlitEngine: + """ + For Managing the top level tasks and applications of transliteration + Global Variables: F_DIR + """ + + def __init__( + self, lang2use="all", config_path="translit_models/default_lineup.json" + ): + + lineup = json.load(open(os.path.join(F_DIR, config_path), encoding="utf-8")) + self.lang_config = {} + if isinstance(lang2use, str): + if lang2use == "all": + self.lang_config = lineup + elif lang2use in lineup: + self.lang_config[lang2use] = lineup[lang2use] + else: + raise Exception( + "XlitError: The entered Langauge code not found. 
Available are {}".format( + lineup.keys() + ) + ) + + elif isinstance(lang2use, Iterable): + for l in lang2use: + try: + self.lang_config[l] = lineup[l] + except: + print( + "XlitError: Language code {} not found, Skipping...".format(l) + ) + else: + raise Exception( + "XlitError: lang2use must be a list of language codes (or) string of single language code" + ) + + if is_directory_writable(F_DIR): + models_path = os.path.join(F_DIR, "translit_models") + else: + user_home = os.path.expanduser("~") + models_path = os.path.join(user_home, ".AI4Bharat_Xlit_Models") + os.makedirs(models_path, exist_ok=True) + self.download_models(models_path) + + self.langs = {} + self.lang_model = {} + for la in self.lang_config: + try: + print("Loading {}...".format(la)) + self.lang_model[la] = XlitPiston( + weight_path=os.path.join( + models_path, self.lang_config[la]["weight"] + ), + vocab_file=os.path.join(models_path, self.lang_config[la]["vocab"]), + tglyph_cfg_file=os.path.join( + models_path, self.lang_config[la]["script"] + ), + iglyph_cfg_file="en", + ) + self.langs[la] = self.lang_config[la]["name"] + except Exception as error: + print("XlitError: Failure in loading {} \n".format(la), error) + print(XlitError.loading_err.value) + + def download_models(self, models_path): + """ + Download models from GitHub Releases if not exists + """ + for l in self.lang_config: + lang_name = self.lang_config[l]["eng_name"] + lang_model_path = os.path.join(models_path, lang_name) + if not os.path.isdir(lang_model_path): + print("Downloading model for language: %s" % lang_name) + remote_url = MODEL_DOWNLOAD_URL_PREFIX + lang_name + ".zip" + downloaded_zip_path = os.path.join(models_path, lang_name + ".zip") + dload(url=remote_url, save_to_path=downloaded_zip_path, max_time=None) + + if not os.path.isfile(downloaded_zip_path): + exit( + f"ERROR: Unable to download model from {remote_url} into {models_path}" + ) + + with zipfile.ZipFile(downloaded_zip_path, "r") as zip_ref: + zip_ref.extractall(models_path) + + if os.path.isdir(lang_model_path): + os.remove(downloaded_zip_path) + else: + exit( + f"ERROR: Unable to find models in {lang_model_path} after download" + ) + return + + def translit_word(self, eng_word, lang_code="default", topk=7, beam_width=10): + if eng_word == "": + return [] + + if lang_code in self.langs: + try: + res_list = self.lang_model[lang_code].inferencer( + eng_word, beam_width=beam_width + ) + return res_list[:topk] + + except Exception as error: + print("XlitError:", traceback.format_exc()) + print(XlitError.internal_err.value) + return XlitError.internal_err + + elif lang_code == "default": + try: + res_dict = {} + for la in self.lang_model: + res = self.lang_model[la].inferencer( + eng_word, beam_width=beam_width + ) + res_dict[la] = res[:topk] + return res_dict + + except Exception as error: + print("XlitError:", traceback.format_exc()) + print(XlitError.internal_err.value) + return XlitError.internal_err + + else: + print("XlitError: Unknown Langauge requested", lang_code) + print(XlitError.lang_err.value) + return XlitError.lang_err + + def translit_sentence(self, eng_sentence, lang_code="default", beam_width=10): + if eng_sentence == "": + return [] + + if lang_code in self.langs: + try: + out_str = "" + for word in eng_sentence.split(): + res_ = self.lang_model[lang_code].inferencer( + word, beam_width=beam_width + ) + out_str = out_str + res_[0] + " " + return out_str[:-1] + + except Exception as error: + print("XlitError:", traceback.format_exc()) + 
print(XlitError.internal_err.value) + return XlitError.internal_err + + elif lang_code == "default": + try: + res_dict = {} + for la in self.lang_model: + out_str = "" + for word in eng_sentence.split(): + res_ = self.lang_model[la].inferencer( + word, beam_width=beam_width + ) + out_str = out_str + res_[0] + " " + res_dict[la] = out_str[:-1] + return res_dict + + except Exception as error: + print("XlitError:", traceback.format_exc()) + print(XlitError.internal_err.value) + return XlitError.internal_err + + else: + print("XlitError: Unknown Language requested", lang_code) + print(XlitError.lang_err.value) + return XlitError.lang_err + + +if __name__ == "__main__": + + available_lang = [ + "bn", + "gu", + "hi", + "kn", + "gom", + "mai", + "ml", + "mr", + "pa", + "sd", + "si", + "ta", + "te", + "ur", + ] + + reg = re.compile(r"[a-zA-Z]") + lang = "hi" + engine = XlitEngine( + lang + ) # if you don't specify lang code here, this will give results in all langs available + sent = "Hello World! ABCD क्या हाल है आपका?" + words = [ + engine.translit_word(word, topk=1)[lang][0] if reg.match(word) else word + for word in sent.split() + ] # only transliterated en words, leaves rest as it is + updated_sent = " ".join(words) + + print(updated_sent) + + # output : हेलो वर्ल्ड! क्या हाल है आपका? + + # y = engine.translit_sentence("Hello World !")['hi'] + # print(y) diff --git a/vakyansh-tts/utils/inference/tts.py b/vakyansh-tts/utils/inference/tts.py new file mode 100644 index 0000000000000000000000000000000000000000..dc485ec44dbf34ddbb69c15ad524c0fab189c3c5 --- /dev/null +++ b/vakyansh-tts/utils/inference/tts.py @@ -0,0 +1,167 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +from typing import Tuple +import sys +from argparse import ArgumentParser + +import torch +import numpy as np +import os +import json +import argparse + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../src/glow_tts")) + +from scipy.io.wavfile import write +from hifi.env import AttrDict +from hifi.models import Generator + + +from text import text_to_sequence +import commons +import models +import utils + + +def check_directory(dir): + if not os.path.exists(dir): + sys.exit("Error: {} directory does not exist".format(dir)) + + +class TextToMel: + def __init__(self, glow_model_dir, device="cuda"): + self.glow_model_dir = glow_model_dir + check_directory(self.glow_model_dir) + self.device = device + self.hps, self.glow_tts_model = self.load_glow_tts() + + def load_glow_tts(self): + hps = utils.get_hparams_from_dir(self.glow_model_dir) + checkpoint_path = utils.latest_checkpoint_path(self.glow_model_dir) + symbols = list(hps.data.punc) + list(hps.data.chars) + glow_tts_model = models.FlowGenerator( + len(symbols) + getattr(hps.data, "add_blank", False), + out_channels=hps.data.n_mel_channels, + **hps.model + ) # .to(self.device) + + if self.device == "cuda": + glow_tts_model.to("cuda") + + utils.load_checkpoint(checkpoint_path, glow_tts_model) + glow_tts_model.decoder.store_inverse() + _ = glow_tts_model.eval() + + return hps, glow_tts_model + + def generate_mel(self, text, noise_scale=0.667, length_scale=1.0): + print(f"Noise scale: {noise_scale} and Length scale: {length_scale}") + symbols = list(self.hps.data.punc) + list(self.hps.data.chars) + cleaner = self.hps.data.text_cleaners + if getattr(self.hps.data, "add_blank", False): + text_norm = text_to_sequence(text, symbols, cleaner) + text_norm = commons.intersperse(text_norm, len(symbols)) + else: # If not using "add_blank" option 
during training, adding spaces at the beginning and the end of utterance improves quality + text = " " + text.strip() + " " + text_norm = text_to_sequence(text, symbols, cleaner) + + sequence = np.array(text_norm)[None, :] + + del symbols + del cleaner + del text + del text_norm + + if self.device == "cuda": + x_tst = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long() + x_tst_lengths = torch.tensor([x_tst.shape[1]]).cuda() + else: + x_tst = torch.autograd.Variable(torch.from_numpy(sequence)).long() + x_tst_lengths = torch.tensor([x_tst.shape[1]]) + + with torch.no_grad(): + (y_gen_tst, *_), *_, (attn_gen, *_) = self.glow_tts_model( + x_tst, + x_tst_lengths, + gen=True, + noise_scale=noise_scale, + length_scale=length_scale, + ) + del x_tst + del x_tst_lengths + torch.cuda.empty_cache() + return y_gen_tst.cpu().detach().numpy() + + +class MelToWav: + def __init__(self, hifi_model_dir, device="cuda"): + self.hifi_model_dir = hifi_model_dir + check_directory(self.hifi_model_dir) + self.device = device + self.h, self.hifi_gan_generator = self.load_hifi_gan() + + def load_hifi_gan(self): + checkpoint_path = utils.latest_checkpoint_path(self.hifi_model_dir, regex="g_*") + config_file = os.path.join(self.hifi_model_dir, "config.json") + data = open(config_file).read() + json_config = json.loads(data) + h = AttrDict(json_config) + torch.manual_seed(h.seed) + + generator = Generator(h).to(self.device) + + assert os.path.isfile(checkpoint_path) + print("Loading '{}'".format(checkpoint_path)) + state_dict_g = torch.load(checkpoint_path, map_location=self.device) + print("Complete.") + + generator.load_state_dict(state_dict_g["generator"]) + + generator.eval() + generator.remove_weight_norm() + + return h, generator + + def generate_wav(self, mel): + mel = torch.FloatTensor(mel).to(self.device) + + y_g_hat = self.hifi_gan_generator(mel) # passing through vocoder + audio = y_g_hat.squeeze() + audio = audio * 32768.0 + audio = audio.cpu().detach().numpy().astype("int16") + + del y_g_hat + del mel + torch.cuda.empty_cache() + return audio, self.h.sampling_rate + +def restricted_float(x): + try: + x = float(x) + except ValueError: + raise argparse.ArgumentTypeError("%r not a floating-point literal" % (x,)) + + if x < 0.0 or x > 1.0: + raise argparse.ArgumentTypeError("%r not in range [0.0, 1.0]"%(x,)) + return x + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("-a", "--acoustic", required=True, type=str) + parser.add_argument("-v", "--vocoder", required=True, type=str) + parser.add_argument("-d", "--device", type=str, default="cpu") + parser.add_argument("-t", "--text", type=str, required=True) + parser.add_argument("-w", "--wav", type=str, required=True) + parser.add_argument("-n", "--noise-scale", default=0.667, type=restricted_float ) + parser.add_argument("-l", "--length-scale", default=1.0, type=float) + + args = parser.parse_args() + + text_to_mel = TextToMel(glow_model_dir=args.acoustic, device=args.device) + mel_to_wav = MelToWav(hifi_model_dir=args.vocoder, device=args.device) + + mel = text_to_mel.generate_mel(args.text, args.noise_scale, args.length_scale) + audio, sr = mel_to_wav.generate_wav(mel) + + write(filename=args.wav, rate=sr, data=audio) +