File size: 3,446 Bytes

8520a55

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install --upgrade vocos encodec librosa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pprint\n",
    "import IPython.display as ipd\n",
    "import torch\n",
    "import librosa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the MARS5 English model through torch.hub; the call yields the model\n",
    "# object together with the class used to build inference configs.\n",
    "mars5, config_class = torch.hub.load(\n",
    "    'Camb-ai/mars5-tts',\n",
    "    'mars5_english',\n",
    "    trust_repo=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now that the model is loaded, pick a reference audio clip to clone from. If you want to use deep clone, also provide the clip's transcript."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# download example ref audio\n",
    "# torch.hub's downloader replaces `!wget` so this cell also runs where wget is not installed.\n",
    "REF_AUDIO_URL = 'https://github.com/Camb-ai/mars5-tts/raw/master/docs/assets/example_ref.wav'\n",
    "torch.hub.download_url_to_file(REF_AUDIO_URL, 'example.wav')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the reference clip as mono audio at the model's sample rate, then wrap it in a tensor.\n",
    "wav_np, sr = librosa.load('./example.wav', sr=mars5.sr, mono=True)\n",
    "wav = torch.from_numpy(wav_np)\n",
    "ref_transcript = \"We actually haven't managed to meet demand.\"\n",
    "\n",
    "# Play the reference back and show the transcript that goes with it.\n",
    "print(\"Reference audio:\")\n",
    "ipd.display(ipd.Audio(wav_np, rate=mars5.sr))\n",
    "print(f\"Reference transcript: {ref_transcript}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Deep clone uses the reference transcript; set to False if you don't know\n",
    "# the prompt transcript or want fast inference.\n",
    "deep_clone = True\n",
    "\n",
    "# Other inference settings (top_k, temperature, top_p, etc.) can be tuned here.\n",
    "cfg = config_class(\n",
    "    deep_clone=deep_clone,\n",
    "    rep_penalty_window=100,\n",
    "    top_k=100,\n",
    "    temperature=0.7,\n",
    "    freq_penalty=3,\n",
    ")\n",
    "\n",
    "# Synthesize the target text in the voice of the reference clip.\n",
    "ar_codes, wav_out = mars5.tts(\"The quick brown rat.\", wav, ref_transcript, cfg=cfg)\n",
    "\n",
    "print('Synthesized output audio:')\n",
    "ipd.Audio(wav_out.numpy(), rate=mars5.sr)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "All of the tunable inference settings, with their defaults, can be seen by printing a default inference config:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a config with no overrides and pretty-print it.\n",
    "default_cfg = config_class()\n",
    "pprint.pprint(default_cfg)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can also listen to the vocoded raw coarse codes, for debugging purposes:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Move the coarse codes to the CPU and insert a new axis at position 1 before vocoding.\n",
    "coarse_codes = ar_codes.cpu()[:, None]\n",
    "ar_wav = mars5.vocode(coarse_codes)\n",
    "ipd.Audio(ar_wav.numpy(), rate=mars5.sr)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "matt-py311",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}