hubsnippetai
committed on
Upload text2speech.ipynb
text2speech.ipynb
ADDED
@@ -0,0 +1,96 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "fce4ae66-1841-45b4-a0c0-6202287437d4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/hubsnippet/.local/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from .autonotebook import tqdm as notebook_tqdm\n",
      "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n",
      "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
     ]
    }
   ],
   "source": [
    "from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan\n",
    "#from datasets import load_dataset\n",
    "#import torch\n",
    "import soundfile as sf\n",
    "#from datasets import load_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "2c4a3d90-f2ac-440a-a7ff-011fab098867",
   "metadata": {},
   "outputs": [
    {
     "ename": "ImportError",
     "evalue": "\nSpeechT5Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the\ninstallation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones\nthat match your environment. Please note that you may need to restart your runtime after installation.\n",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[2], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m processor \u001b[38;5;241m=\u001b[39m \u001b[43mSpeechT5Processor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmicrosoft/speecht5_tts\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m model \u001b[38;5;241m=\u001b[39m SpeechT5ForTextToSpeech\u001b[38;5;241m.\u001b[39mfrom_pretrained(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmicrosoft/speecht5_tts\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 3\u001b[0m vocoder \u001b[38;5;241m=\u001b[39m SpeechT5HifiGan\u001b[38;5;241m.\u001b[39mfrom_pretrained(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmicrosoft/speecht5_hifigan\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
      "File \u001b[0;32m~/.local/lib/python3.11/site-packages/transformers/processing_utils.py:465\u001b[0m, in \u001b[0;36mProcessorMixin.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, **kwargs)\u001b[0m\n\u001b[1;32m 462\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m token \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 463\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m token\n\u001b[0;32m--> 465\u001b[0m args \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_arguments_from_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 466\u001b[0m processor_dict, kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39mget_processor_dict(pretrained_model_name_or_path, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 468\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39mfrom_args_and_dict(args, processor_dict, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
      "File \u001b[0;32m~/.local/lib/python3.11/site-packages/transformers/processing_utils.py:511\u001b[0m, in \u001b[0;36mProcessorMixin._get_arguments_from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, **kwargs)\u001b[0m\n\u001b[1;32m 508\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 509\u001b[0m attribute_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(transformers_module, class_name)\n\u001b[0;32m--> 511\u001b[0m args\u001b[38;5;241m.\u001b[39mappend(\u001b[43mattribute_class\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m(pretrained_model_name_or_path, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs))\n\u001b[1;32m 512\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m args\n",
      "File \u001b[0;32m~/.local/lib/python3.11/site-packages/transformers/utils/import_utils.py:1450\u001b[0m, in \u001b[0;36mDummyObject.__getattribute__\u001b[0;34m(cls, key)\u001b[0m\n\u001b[1;32m 1448\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key\u001b[38;5;241m.\u001b[39mstartswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m key \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_from_config\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 1449\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__getattribute__\u001b[39m(key)\n\u001b[0;32m-> 1450\u001b[0m \u001b[43mrequires_backends\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_backends\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/.local/lib/python3.11/site-packages/transformers/utils/import_utils.py:1438\u001b[0m, in \u001b[0;36mrequires_backends\u001b[0;34m(obj, backends)\u001b[0m\n\u001b[1;32m 1436\u001b[0m failed \u001b[38;5;241m=\u001b[39m [msg\u001b[38;5;241m.\u001b[39mformat(name) \u001b[38;5;28;01mfor\u001b[39;00m available, msg \u001b[38;5;129;01min\u001b[39;00m checks \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m available()]\n\u001b[1;32m 1437\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m failed:\n\u001b[0;32m-> 1438\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(failed))\n",
      "\u001b[0;31mImportError\u001b[0m: \nSpeechT5Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the\ninstallation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones\nthat match your environment. Please note that you may need to restart your runtime after installation.\n"
     ]
    }
   ],
   "source": [
    "processor = SpeechT5Processor.from_pretrained(\"microsoft/speecht5_tts\")\n",
    "model = SpeechT5ForTextToSpeech.from_pretrained(\"microsoft/speecht5_tts\")\n",
    "vocoder = SpeechT5HifiGan.from_pretrained(\"microsoft/speecht5_hifigan\")\n",
    "\n",
    "inputs = processor(text=\"Hello, my dog is cute.\", return_tensors=\"pt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0fd02046-1a29-4989-9031-4b7d1729a336",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load xvector containing speaker's voice characteristics from a dataset\n",
    "#embeddings_dataset = load_dataset(\"Matthijs/cmu-arctic-xvectors\", split=\"validation\")\n",
    "#speaker_embeddings = torch.tensor(embeddings_dataset[7306][\"xvector\"]).unsqueeze(0)\n",
    "\n",
    "speech = model.generate_speech(inputs[\"input_ids\"], vocoder=vocoder)\n",
    "\n",
    "sf.write(\"speech.wav\", speech.numpy(), samplerate=16000)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
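
Note on the committed notebook: cell 1's stderr shows that neither PyTorch nor SentencePiece is installed, so in cell 2 `SpeechT5Processor.from_pretrained` resolves to a dummy object and raises the `requires_backends` ImportError recorded above. Below is a minimal sketch of the working pipeline, not part of this commit; it assumes the missing backends get installed first (e.g. `pip install torch sentencepiece soundfile datasets`, an assumption about the target environment) and restores the commented-out speaker-embedding lines, since the stock SpeechT5 example passes an x-vector to `generate_speech` rather than omitting it as the committed cell does.

# Sketch only: assumes torch, sentencepiece, soundfile, and datasets are installed.
import torch
import soundfile as sf
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Load the TTS model, its processor, and the HiFi-GAN vocoder.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Tokenize the input text.
inputs = processor(text="Hello, my dog is cute.", return_tensors="pt")

# x-vector with a speaker's voice characteristics, as in the notebook's
# commented-out lines; index 7306 is the speaker the notebook intended to use.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Generate the waveform and write it out; SpeechT5's vocoder produces
# 16 kHz audio, matching the samplerate the notebook already uses.
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
sf.write("speech.wav", speech.numpy(), samplerate=16000)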