transliteration committed
Commit 0479abb
1 parent: aeeafe2

Upload 8 files
evaluate.py ADDED
@@ -0,0 +1,60 @@
import argparse
from typing import Dict

import nemo.collections.asr as nemo_asr
import torch
from omegaconf import open_dict


def evaluate_model(model_path: str, test_manifest: str, batch_size: int = 1) -> Dict:

    # Determine the device (CPU or GPU)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Restore the ASR model from the provided path
    model = nemo_asr.models.ASRModel.restore_from(restore_path=model_path)
    model.to(device)
    model.eval()

    # Update the model configuration for evaluation
    with open_dict(model.cfg):
        model.cfg.validation_ds.manifest_filepath = test_manifest
        model.cfg.validation_ds.batch_size = batch_size

    # Set up the test data using the updated configuration
    model.setup_test_data(model.cfg.validation_ds)

    wer_nums = []
    wer_denoms = []

    # Iterate through the test data
    for test_batch in model.test_dataloader():
        # Extract elements from the test batch
        test_batch = [x for x in test_batch]
        targets = test_batch[2].to(device)
        targets_lengths = test_batch[3].to(device)
        # Forward pass through the model
        log_probs, encoded_len, greedy_predictions = model(
            input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device)
        )
        # Compute Word Error Rate (WER) and store results
        model._wer.update(greedy_predictions, targets, targets_lengths)
        _, wer_num, wer_denom = model._wer.compute()
        model._wer.reset()
        wer_nums.append(wer_num.detach().cpu().numpy())
        wer_denoms.append(wer_denom.detach().cpu().numpy())
        # Free up memory by deleting variables
        del test_batch, log_probs, targets, targets_lengths, encoded_len, greedy_predictions

    # Compute the WER score
    wer_score = sum(wer_nums) / sum(wer_denoms)
    print({"WER_score": wer_score})
    return {"WER_score": wer_score}


if __name__ == "__main__":
    # Parse command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", default=None, help="Path to a model to evaluate.")
    parser.add_argument("--test_manifest", help="Path to the test manifest JSON file.")
    parser.add_argument("--batch_size", type=int, default=1, help="Batch size for evaluation.")
    args = parser.parse_args()

    evaluate_model(model_path=args.model_path, test_manifest=args.test_manifest, batch_size=args.batch_size)
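
A typical invocation, assuming a fine-tuned .nemo checkpoint (the checkpoint name below is illustrative, not part of this commit) and the test manifest shipped here, would be:

python evaluate.py --model_path asr_model_kz.nemo --test_manifest ksc/test_manifest.json --batch_size 4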
ksc/.ksc-train.json.swp ADDED
Binary file (12.3 kB).
 
ksc/test_manifest.json ADDED
@@ -0,0 +1,3 @@
{"audio_filepath": "ksc/test/crowdsourced/5f60e4ecdb745.wav", "text": "бірақ бізде онымен айналысатын қожалықтар саны саусақпен санарлық", "duration": 7.936}
{"audio_filepath": "ksc/test/crowdsourced/5f609b2ad370e.wav", "text": "солардың бірі маңғыстаулық шопан есет өтесов", "duration": 5.4613125}
{"audio_filepath": "ksc/test/crowdsourced/5f5682c1c4739.wav", "text": "иесіз жануарларды ату керек пе әлде асырау керек пе", "duration": 6.7413125}
ksc/train_manifest.json ADDED
@@ -0,0 +1,3 @@
{"audio_filepath": "ksc/train/crowdsourced/5f5861e2baed1.wav", "text": "төрт бұрышты ошақ деп атайды", "duration": 2.816}
{"audio_filepath": "ksc/train/crowdsourced/5f2b0a559b15f.wav", "text": "оны адамдардың жақтырмаушылық пен көре алмаушылықтары бағдат қаласынан кетуге мәжбүр етті", "duration": 10.0693125}
{"audio_filepath": "ksc/train/crowdsourced/5f57300c9e48b.wav", "text": "кіші сордук ресейдегі өзен", "duration": 4.352}
ksc/val_manifest.json ADDED
@@ -0,0 +1,3 @@
{"audio_filepath": "ksc/val/crowdsourced/5f5a35cfcd7b2.wav", "text": "егер мынандай жағдайда әуе компаниясы көмек бере алмаса мемлекет қол ұшын созады", "duration": 8.192}
{"audio_filepath": "ksc/val/crowdsourced/5f60ff61277a3.wav", "text": "өйткені түсіру жұмыстары басталғанға дейін актерлер екі ай бойы арнайы каскадерлік жаттығулардан өткен", "duration": 7.936}
{"audio_filepath": "ksc/val/crowdsourced/5f5a5a5a199d7.wav", "text": "беру аяқталды дауыс санау басталды", "duration": 3.072}
requirements.txt ADDED
@@ -0,0 +1,214 @@
absl-py==2.1.0
aiohttp==3.8.6
aiosignal==1.3.1
alabaster==0.7.13
antlr4-python3-runtime==4.9.3
appdirs==1.4.4
async-timeout==4.0.3
asynctest==0.13.0
attrdict==2.0.1
attrs==23.2.0
audioread==3.0.1
Babel==2.14.0
backcall==0.2.0
beautifulsoup4==4.12.3
black==19.10b0
boto3==1.33.13
botocore==1.33.13
braceexpand==0.1.7
cachetools==5.3.2
certifi==2023.11.17
cffi==1.15.1
charset-normalizer==3.3.2
click==8.0.2
colorama==0.4.6
comm==0.1.4
cycler==0.11.0
Cython==3.0.8
decorator==5.1.1
Distance==0.1.3
docker-pycreds==0.4.0
docopt==0.6.2
docutils==0.17.1
editdistance==0.6.2
exceptiongroup==1.2.0
fasttext==0.9.2
filelock==3.12.2
flake8==5.0.4
Flake8-pyproject==1.2.3
fonttools==4.38.0
frozendict==2.4.0
frozenlist==1.3.3
fsspec==2023.1.0
ftfy==6.1.1
future==0.18.3
g2p-en==2.1.0
gdown==4.7.3
gitdb==4.0.11
GitPython==3.1.41
google-auth==2.26.2
google-auth-oauthlib==0.4.6
grpcio==1.60.0
h5py==3.8.0
huggingface-hub==0.16.4
Hydra==2.5
hydra-core==1.3.2
idna==3.6
imagesize==1.4.1
importlib-metadata==4.2.0
importlib-resources==5.12.0
inflect==6.0.5
iniconfig==2.0.0
ipadic==1.0.0
ipython==7.34.0
ipywidgets==8.1.1
isort==4.3.21
jedi==0.19.1
jieba==0.42.1
Jinja2==3.1.3
jmespath==1.0.1
joblib==1.3.2
jupyterlab-widgets==3.0.9
kaldi-io==0.9.8
kaldi-python-io==1.2.2
kaldiio==2.18.0
kiwisolver==1.4.5
latexcodec==2.0.1
lazy_loader==0.3
librosa==0.10.1
llvmlite==0.39.1
lxml==5.1.0
Markdown==3.4.4
markdown-it-py==2.2.0
MarkupSafe==2.1.3
marshmallow==3.19.0
matplotlib==3.5.3
matplotlib-inline==0.1.6
mccabe==0.7.0
mdurl==0.1.2
mecab-python3==1.0.6
mpmath==1.3.0
msgpack==1.0.5
multidict==6.0.4
nemo-toolkit==1.7.0
nltk==3.8.1
numba==0.56.4
numpy==1.21.6
oauthlib==3.2.2
omegaconf==2.3.0
onnx==1.14.1
OpenCC==1.1.6
packaging==23.2
pandas==1.3.5
pangu==4.0.6.1
parameterized==0.9.0
parso==0.8.3
pathspec==0.11.2
pesq==0.0.4
pexpect==4.9.0
pickleshare==0.7.5
Pillow==9.5.0
pip-api==0.0.30
pipreqs==0.4.13
platformdirs==4.0.0
pluggy==1.2.0
pooch==1.8.0
portalocker==2.7.0
prompt-toolkit==3.0.43
protobuf==3.20.3
psutil==5.9.7
ptyprocess==0.7.0
pyannote.core==5.0.0
pyannote.database==5.0.1
pyannote.metrics==3.2.1
pyasn1==0.5.1
pyasn1-modules==0.3.0
pybind11==2.11.1
pybtex==0.24.0
pybtex-docutils==1.0.3
pycodestyle==2.9.1
pycparser==2.21
pydantic==1.10.13
pyDeprecate==0.3.1
pydub==0.25.1
pyflakes==2.5.0
Pygments==2.17.2
pyparsing==3.1.1
pypinyin==0.50.0
PySocks==1.7.1
pystoi==0.4.1
pytest==7.4.4
pytest-runner==6.0.1
python-dateutil==2.8.2
pytorch-lightning==1.5.10
pytz==2023.3.post1
pyupgrade==3.3.2
PyYAML==5.4.1
rapidfuzz==3.4.0
regex==2023.12.25
requests==2.31.0
requests-oauthlib==1.3.1
rich==13.7.0
rsa==4.9
ruamel.yaml==0.18.5
ruamel.yaml.clib==0.2.8
s3transfer==0.8.2
sacrebleu==2.4.0
sacremoses==0.0.53
safetensors==0.4.1
scikit-learn==1.0.2
scipy==1.7.3
sentencepiece==0.1.99
sentry-sdk==1.39.2
setproctitle==1.3.3
shellingham==1.5.4
six==1.16.0
smmap==5.0.1
snowballstemmer==2.2.0
sortedcontainers==2.4.0
soundfile==0.12.1
soupsieve==2.4.1
sox==1.4.1
soxr==0.3.7
Sphinx==5.3.0
sphinxcontrib-applehelp==1.0.2
sphinxcontrib-bibtex==2.6.2
sphinxcontrib-devhelp==1.0.2
sphinxcontrib-htmlhelp==2.0.0
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.3
sphinxcontrib-serializinghtml==1.1.5
sympy==1.10.1
tabulate==0.9.0
tensorboard==2.11.2
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.1
threadpoolctl==3.1.0
tokenize-rt==5.0.0
tokenizers==0.13.3
toml==0.10.2
tomli==2.0.1
torch==1.12.1+cu116
torch-stft==0.1.4
torchaudio==0.12.1+cu116
torchmetrics==0.11.4
torchvision==0.13.1+cu116
tqdm==4.66.1
traitlets==5.9.0
transformers==4.30.2
typed-ast==1.5.5
typer==0.9.0
typing_extensions==4.7.1
Unidecode==1.3.8
urllib3==1.26.18
wandb==0.16.2
wcwidth==0.2.13
webdataset==0.1.62
Werkzeug==2.2.3
wget==3.2
widgetsnbextension==4.0.9
wrapt==1.16.0
yarg==0.1.9
yarl==1.9.4
youtokentome==1.0.6
zipp==3.15.0
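
These pinned dependencies can be installed with pip install -r requirements.txt; note that the +cu116 builds of torch, torchaudio, and torchvision typically need the matching PyTorch wheel index (e.g. --extra-index-url https://download.pytorch.org/whl/cu116) in addition to PyPI.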
train.py ADDED
@@ -0,0 +1,163 @@
import argparse
import os

import nemo.collections.asr as nemo_asr
import pytorch_lightning as ptl
from nemo.utils import exp_manager, logging
from omegaconf import OmegaConf, open_dict


def train_model(train_manifest: str, val_manifest: str, accelerator: str, batch_size: int, num_epochs: int, model_save_path: str = None) -> None:

    # Load a pretrained STT QuartzNet 15x5 model
    model = nemo_asr.models.ASRModel.from_pretrained("stt_en_quartznet15x5")
    # New vocabulary for the model (Kazakh Cyrillic alphabet)
    new_vocabulary = [
        " ",
        "а",
        "б",
        "в",
        "г",
        "д",
        "е",
        "ж",
        "з",
        "и",
        "й",
        "к",
        "л",
        "м",
        "н",
        "о",
        "п",
        "р",
        "с",
        "т",
        "у",
        "ф",
        "х",
        "ц",
        "ч",
        "ш",
        "щ",
        "ъ",
        "ы",
        "ь",
        "э",
        "ю",
        "я",
        "і",
        "ғ",
        "қ",
        "ң",
        "ү",
        "ұ",
        "һ",
        "ә",
        "ө",
    ]

    with open_dict(model.cfg):
        # Set the labels and sample rate
        model.cfg.labels = new_vocabulary
        model.cfg.sample_rate = 16000

        # Train dataset
        model.cfg.train_ds.manifest_filepath = train_manifest
        model.cfg.train_ds.labels = new_vocabulary
        model.cfg.train_ds.normalize_transcripts = False
        model.cfg.train_ds.batch_size = batch_size
        model.cfg.train_ds.num_workers = 10
        model.cfg.train_ds.pin_memory = True
        model.cfg.train_ds.trim_silence = True

        # Validation dataset
        model.cfg.validation_ds.manifest_filepath = val_manifest
        model.cfg.validation_ds.labels = new_vocabulary
        model.cfg.validation_ds.normalize_transcripts = False
        model.cfg.validation_ds.batch_size = batch_size
        model.cfg.validation_ds.num_workers = 10
        model.cfg.validation_ds.pin_memory = True
        model.cfg.validation_ds.trim_silence = True

        # Optimizer and scheduler
        model.cfg.optim.lr = 0.001
        model.cfg.optim.betas = [0.8, 0.5]
        model.cfg.optim.weight_decay = 0.001
        model.cfg.optim.sched.warmup_steps = 500
        model.cfg.optim.sched.min_lr = 1e-6

    model.change_vocabulary(new_vocabulary=new_vocabulary)
    model.setup_training_data(model.cfg.train_ds)
    model.setup_validation_data(model.cfg.validation_ds)

    # Unfreeze the encoder so its parameters are updated during fine-tuning
    model.encoder.unfreeze()
    logging.info("Model encoder has been un-frozen")

    # Set up data augmentation
    model.spec_augmentation = model.from_config_dict(model.cfg.spec_augment)

    # Metrics: report character error rate and log predictions
    model._wer.use_cer = True
    model._wer.log_prediction = True

    # Trainer (use the accelerator passed on the command line)
    trainer = ptl.Trainer(
        accelerator=accelerator,
        max_epochs=num_epochs,
        accumulate_grad_batches=1,
        enable_checkpointing=False,
        logger=False,
        log_every_n_steps=100,
        check_val_every_n_epoch=1,
        precision=16,
    )

    # Attach the trainer to the model
    model.set_trainer(trainer)

    # Experiment tracking
    LANGUAGE = "kz"
    config = exp_manager.ExpManagerConfig(
        exp_dir=f"experiments/lang-{LANGUAGE}/",
        name=f"ASR-Model-Language-{LANGUAGE}",
        checkpoint_callback_params=exp_manager.CallbackParams(
            monitor="val_wer", mode="min", always_save_nemo=True, save_best_model=True,
        ),
    )
    config = OmegaConf.structured(config)
    exp_manager.exp_manager(trainer, config)

    # Final configuration
    print("-----------------------------------------------------------")
    print("Updated STT Model Configuration:")
    print(OmegaConf.to_yaml(model.cfg))
    print("-----------------------------------------------------------")

    # Fit the model
    trainer.fit(model)

    # Save the model
    if model_save_path:
        model.save_to(model_save_path)
        print(f"Model saved at path: {os.getcwd() + os.path.sep + model_save_path}")


if __name__ == "__main__":
    # Parse command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_manifest", help="Path to the train manifest JSON file.")
    parser.add_argument("--val_manifest", help="Path to the validation manifest JSON file.")
    parser.add_argument("--accelerator", help="Accelerator type to use (cpu, gpu, tpu, etc.).")
    parser.add_argument("--batch_size", type=int, default=1, help="Batch size for training.")
    parser.add_argument("--num_epochs", type=int, default=1, help="Number of epochs to train for.")
    parser.add_argument("--model_save_path", default=None, help="Path for saving the trained model.")
    args = parser.parse_args()

    train_model(
        train_manifest=args.train_manifest,
        val_manifest=args.val_manifest,
        accelerator=args.accelerator,
        batch_size=args.batch_size,
        num_epochs=args.num_epochs,
        model_save_path=args.model_save_path,
    )
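
A hypothetical fine-tuning run on the sample manifests from this commit (batch size, epoch count, and output checkpoint name are illustrative, not part of this commit):

python train.py --train_manifest ksc/train_manifest.json --val_manifest ksc/val_manifest.json --accelerator gpu --batch_size 16 --num_epochs 50 --model_save_path asr_model_kz.nemo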
transcribe.py ADDED
@@ -0,0 +1,22 @@
import argparse
from typing import Dict

import nemo.collections.asr as nemo_asr


def predict_model(model_path: str, audio_file_path: str) -> Dict:
    # Restore the ASR model from the provided path
    model = nemo_asr.models.ASRModel.restore_from(restore_path=model_path)
    # Transcribe the given audio file
    text = model.transcribe([audio_file_path])
    result = {"result": text[0]}
    print(result)
    return result


if __name__ == "__main__":
    # Parse command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", default=None, help="Path to a model to use for transcription.")
    parser.add_argument("--audio_file_path", help="Path to the audio file to transcribe.")
    args = parser.parse_args()

    predict_model(model_path=args.model_path, audio_file_path=args.audio_file_path)
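
A hypothetical transcription call, reusing the illustrative checkpoint name from the training example above and one of the audio files referenced in the test manifest:

python transcribe.py --model_path asr_model_kz.nemo --audio_file_path ksc/test/crowdsourced/5f60e4ecdb745.wav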