diff --git "a/lib/voicecraft/inference_speech_editing.ipynb" "b/lib/voicecraft/inference_speech_editing.ipynb" new file mode 100644--- /dev/null +++ "b/lib/voicecraft/inference_speech_editing.ipynb" @@ -0,0 +1,293 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"7\"\n", + "os.environ[\"USER\"] = \"YOUR_USERNAME\" # TODO change this to your username" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/pyp/miniconda3/envs/voicecraft/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "# import libs\n", + "import torch\n", + "import torchaudio\n", + "import numpy as np\n", + "import random\n", + "\n", + "from data.tokenizer import (\n", + " AudioTokenizer,\n", + " TextTokenizer,\n", + ")\n", + "\n", + "from models import voicecraft" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# install MFA models and dictionaries if you haven't done so already\n", + "!source ~/.bashrc && \\\n", + " conda activate voicecraft && \\\n", + " mfa model download dictionary english_us_arpa && \\\n", + " mfa model download acoustic english_us_arpa" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# hyperparameters for inference\n", + "left_margin = 0.08\n", + "right_margin = 0.08\n", + "codec_audio_sr = 16000\n", + "codec_sr = 50\n", + "top_k = 0\n", + "top_p = 0.8\n", + "temperature = 1\n", + "kvcache = 0\n", + "# adjust the below three arguments if 
the generation is not as good\n", + "seed = 1 # random seed magic\n", + "silence_tokens = [1388,1898,131] # if there are long silences in the generated audio, reduce stop_repetition (set below) to 3, 2 or even 1\n", + "stop_repetition = -1 # -1 means do not adjust the probability of silence tokens. if there are long silences or unnaturally stretched words, increase sample_batch_size to 2, 3 or even 4 (note: sample_batch_size is not defined in this notebook)\n", + "# what this does is make the model run sample_batch_size examples of the same audio and pick the one that's the shortest\n", + "def seed_everything(seed):\n", + " os.environ['PYTHONHASHSEED'] = str(seed)\n", + " random.seed(seed)\n", + " np.random.seed(seed)\n", + " torch.manual_seed(seed)\n", + " torch.cuda.manual_seed(seed)\n", + " torch.backends.cudnn.benchmark = False\n", + " torch.backends.cudnn.deterministic = True\n", + "seed_everything(seed)\n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "\n", + "# point to the original file or record the file\n", + "# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n", + "orig_audio = \"./demo/84_121550_000074_000000.wav\"\n", + "orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n", + "# move the audio and transcript to temp folder\n", + "temp_folder = \"./demo/temp\"\n", + "os.makedirs(temp_folder, exist_ok=True)\n", + "os.system(f\"cp {orig_audio} {temp_folder}\")\n", + "filename = os.path.splitext(orig_audio.split(\"/\")[-1])[0]\n", + "with open(f\"{temp_folder}/{filename}.txt\", \"w\") as f:\n", + " f.write(orig_transcript)\n", + "# run MFA to get the alignment\n", + "align_temp = f\"{temp_folder}/mfa_alignments\"\n", + "os.makedirs(align_temp, exist_ok=True)\n", + "# os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp}\")\n", + "# if it fails, 
it could be because the audio is too hard for the alignment model; increasing the beam size usually solves the issue\n", + "# os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\")\n", + "audio_fn = f\"{temp_folder}/{filename}.wav\"\n", + "transcript_fn = f\"{temp_folder}/{filename}.txt\"\n", + "align_fn = f\"{align_temp}/{filename}.csv\"\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "original:\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "edited:\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "editTypes_set = set(['substitution', 'insertion', 'deletion'])\n", + "# propose what you want the target modified transcript to be\n", + "target_transcript = \"But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,\"\n", + "edit_type = \"substitution\"\n", + "assert edit_type in editTypes_set, f\"Invalid edit type {edit_type}. 
Must be one of {editTypes_set}.\"\n", + "\n", + "# if you want to do a second modification on top of the first one, write down the second modification (target_transcript2, type_of_modification2)\n", + "# make sure the two modifications do not overlap; if they do, you need to combine them into one modification\n", + "\n", + "# run the script to turn user input to the format that the model can take\n", + "from edit_utils import get_span\n", + "orig_span, new_span = get_span(orig_transcript, target_transcript, edit_type)\n", + "if orig_span[0] > orig_span[1]:\n", + " raise RuntimeError(f\"example {audio_fn} failed\")\n", + "if orig_span[0] == orig_span[1]:\n", + " orig_span_save = [orig_span[0]]\n", + "else:\n", + " orig_span_save = orig_span\n", + "if new_span[0] == new_span[1]:\n", + " new_span_save = [new_span[0]]\n", + "else:\n", + " new_span_save = new_span\n", + "\n", + "orig_span_save = \",\".join([str(item) for item in orig_span_save])\n", + "new_span_save = \",\".join([str(item) for item in new_span_save])\n", + "from inference_speech_editing_scale import get_mask_interval\n", + "\n", + "start, end = get_mask_interval(align_fn, orig_span_save, edit_type)\n", + "info = torchaudio.info(audio_fn)\n", + "audio_dur = info.num_frames / info.sample_rate\n", + "morphed_span = (max(start - left_margin, 1/codec_sr), min(end + right_margin, audio_dur)) # in seconds\n", + "\n", + "# span in codec frames\n", + "mask_interval = [[round(morphed_span[0]*codec_sr), round(morphed_span[1]*codec_sr)]]\n", + "mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now\n", + "\n", + "# load model, tokenizer, and other necessary files\n", + "voicecraft_name=\"giga330M.pth\"\n", + "ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n", + "encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n", + "if not os.path.exists(ckpt_fn):\n", + " os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\\?download\\=true\")\n", + " os.system(f\"mv 
{voicecraft_name}\\?download\\=true ./pretrained_models/{voicecraft_name}\")\n", + "if not os.path.exists(encodec_fn):\n", + " os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n", + " os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n", + "ckpt = torch.load(ckpt_fn, map_location=\"cpu\")\n", + "model = voicecraft.VoiceCraft(ckpt[\"config\"])\n", + "model.load_state_dict(ckpt[\"model\"])\n", + "model.to(device)\n", + "model.eval()\n", + "\n", + "phn2num = ckpt['phn2num']\n", + "\n", + "text_tokenizer = TextTokenizer(backend=\"espeak\")\n", + "audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n", + "\n", + "# run the model to get the output\n", + "from inference_speech_editing_scale import inference_one_sample\n", + "\n", + "decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, \"codec_audio_sr\": codec_audio_sr, \"codec_sr\": codec_sr, \"silence_tokens\": silence_tokens}\n", + "orig_audio, new_audio = inference_one_sample(model, ckpt[\"config\"], phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, mask_interval, device, decode_config)\n", + " \n", + "# save segments for comparison\n", + "orig_audio, new_audio = orig_audio[0].cpu(), new_audio[0].cpu()\n", + "# logging.info(f\"length of the resynthesize orig audio: {orig_audio.shape}\")\n", + "\n", + "# display the audio\n", + "from IPython.display import Audio\n", + "print(\"original:\")\n", + "display(Audio(orig_audio, rate=codec_audio_sr))\n", + "\n", + "print(\"edited:\")\n", + "display(Audio(new_audio, rate=codec_audio_sr))\n", + "\n", + "# # save the audio\n", + "# # output_dir\n", + "# output_dir = \"./demo/generated_se\"\n", + "# os.makedirs(output_dir, exist_ok=True)\n", + "\n", + "# save_fn_new = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_new_seed{seed}.wav\"\n", + "\n", 
+ "# torchaudio.save(save_fn_new, new_audio, codec_audio_sr)\n", + "\n", + "# save_fn_orig = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_orig.wav\"\n", + "# if not os.path.isfile(save_fn_orig):\n", + "# orig_audio, orig_sr = torchaudio.load(audio_fn)\n", + "# if orig_sr != codec_audio_sr:\n", + "# orig_audio = torchaudio.transforms.Resample(orig_sr, codec_audio_sr)(orig_audio)\n", + "# torchaudio.save(save_fn_orig, orig_audio, codec_audio_sr)\n", + "\n", + "# # if you get error importing T5 in transformers\n", + "# # try \n", + "# # pip uninstall Pillow\n", + "# # pip install Pillow\n", + "# # you are likely to get warning looks like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "voicecraft", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}