oscarwang2
committed on
Upload 17 files
Browse files- .DS_Store +0 -0
- .gitattributes +35 -35
- README.md +110 -0
- base_speakers/.DS_Store +0 -0
- base_speakers/ses/en-au.pth +3 -0
- base_speakers/ses/en-br.pth +3 -0
- base_speakers/ses/en-default.pth +3 -0
- base_speakers/ses/en-india.pth +3 -0
- base_speakers/ses/en-newest.pth +3 -0
- base_speakers/ses/en-us.pth +3 -0
- base_speakers/ses/es.pth +3 -0
- base_speakers/ses/fr.pth +3 -0
- base_speakers/ses/jp.pth +3 -0
- base_speakers/ses/kr.pth +3 -0
- base_speakers/ses/zh.pth +3 -0
- converter/checkpoint.pth +3 -0
- converter/config.json +57 -0
.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
.gitattributes
CHANGED
@@ -1,35 +1,35 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -1,3 +1,113 @@
|
|
1 |
---
|
2 |
license: mit
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
license: mit
|
3 |
+
tags:
|
4 |
+
- audio
|
5 |
+
- text-to-speech
|
6 |
+
- instant-voice-cloning
|
7 |
+
language:
|
8 |
+
- en
|
9 |
+
- zh
|
10 |
+
inference: false
|
11 |
---
|
12 |
+
|
13 |
+
# OpenVoice V2
|
14 |
+
|
15 |
+
In April 2024, we released OpenVoice V2, which includes all features in V1 and has:
|
16 |
+
|
17 |
+
1. Better Audio Quality. OpenVoice V2 adopts a different training strategy that delivers better audio quality.
|
18 |
+
|
19 |
+
2. Native Multi-lingual Support. English, Spanish, French, Chinese, Japanese and Korean are natively supported in OpenVoice V2.
|
20 |
+
|
21 |
+
3. Free Commercial Use. Starting from April 2024, both V2 and V1 are released under MIT License. Free for commercial use.
|
22 |
+
|
23 |
+
|
24 |
+
<video controls autoplay src="https://cdn-uploads.huggingface.co/production/uploads/641de0213239b631552713e4/uCHTHD9OUotgOflqDu3QK.mp4"></video>
|
25 |
+
|
26 |
+
### Features
|
27 |
+
- **Accurate Tone Color Cloning.** OpenVoice can accurately clone the reference tone color and generate speech in multiple languages and accents.
|
28 |
+
- **Flexible Voice Style Control.** OpenVoice enables granular control over voice styles, such as emotion and accent, as well as other style parameters including rhythm, pauses, and intonation.
|
29 |
+
- **Zero-shot Cross-lingual Voice Cloning.** Neither the language of the generated speech nor the language of the reference speech needs to be present in the massive-speaker multi-lingual training dataset.
|
30 |
+
|
31 |
+
### How to Use
|
32 |
+
Please see [usage](https://github.com/myshell-ai/OpenVoice/blob/main/docs/USAGE.md) for detailed instructions.
|
33 |
+
|
34 |
+
# Usage
|
35 |
+
|
36 |
+
## Table of Contents
|
37 |
+
|
38 |
+
- [Quick Use](#quick-use): directly use OpenVoice without installation.
|
39 |
+
- [Linux Install](#linux-install): for researchers and developers only.
|
40 |
+
- [V1](#openvoice-v1)
|
41 |
+
- [V2](#openvoice-v2)
|
42 |
+
- [Install on Other Platforms](#install-on-other-platforms): unofficial installation guide contributed by the community
|
43 |
+
|
44 |
+
## Quick Use
|
45 |
+
|
46 |
+
The input speech audio of OpenVoice can be in **Any Language**. OpenVoice can clone the voice in that speech audio and use it to speak in multiple languages. For quick use, we recommend trying the already deployed services:
|
47 |
+
|
48 |
+
- [British English](https://app.myshell.ai/widget/vYjqae)
|
49 |
+
- [American English](https://app.myshell.ai/widget/nEFFJf)
|
50 |
+
- [Indian English](https://app.myshell.ai/widget/V3iYze)
|
51 |
+
- [Australian English](https://app.myshell.ai/widget/fM7JVf)
|
52 |
+
- [Spanish](https://app.myshell.ai/widget/NNFFVz)
|
53 |
+
- [French](https://app.myshell.ai/widget/z2uyUz)
|
54 |
+
- [Chinese](https://app.myshell.ai/widget/fU7nUz)
|
55 |
+
- [Japanese](https://app.myshell.ai/widget/IfIB3u)
|
56 |
+
- [Korean](https://app.myshell.ai/widget/q6ZjIn)
|
57 |
+
|
58 |
+
## Linux Install
|
59 |
+
|
60 |
+
This section is only for developers and researchers who are familiar with Linux, Python and PyTorch. Clone this repo, and run
|
61 |
+
|
62 |
+
```
|
63 |
+
conda create -n openvoice python=3.9
|
64 |
+
conda activate openvoice
|
65 |
+
git clone git@github.com:myshell-ai/OpenVoice.git
|
66 |
+
cd OpenVoice
|
67 |
+
pip install -e .
|
68 |
+
```
|
69 |
+
|
70 |
+
No matter if you are using V1 or V2, the above installation is the same.
|
71 |
+
|
72 |
+
### OpenVoice V1
|
73 |
+
|
74 |
+
Download the checkpoint from [here](https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/checkpoints_1226.zip) and extract it to the `checkpoints` folder.
|
75 |
+
|
76 |
+
**1. Flexible Voice Style Control.**
|
77 |
+
Please see [`demo_part1.ipynb`](https://github.com/myshell-ai/OpenVoice/blob/main/demo_part1.ipynb) for an example of how OpenVoice enables flexible style control over the cloned voice.
|
78 |
+
|
79 |
+
**2. Cross-Lingual Voice Cloning.**
|
80 |
+
Please see [`demo_part2.ipynb`](https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb) for an example for languages seen or unseen in the MSML training set.
|
81 |
+
|
82 |
+
**3. Gradio Demo.** We provide a minimalist local gradio demo here. We strongly suggest that users look into `demo_part1.ipynb`, `demo_part2.ipynb` and the [QnA](QA.md) if they run into issues with the gradio demo. Launch a local gradio demo with `python -m openvoice_app --share`.
|
83 |
+
|
84 |
+
### OpenVoice V2
|
85 |
+
|
86 |
+
Download the checkpoint from [here](https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip) and extract it to the `checkpoints_v2` folder.
|
87 |
+
|
88 |
+
Install [MeloTTS](https://github.com/myshell-ai/MeloTTS):
|
89 |
+
```
|
90 |
+
pip install git+https://github.com/myshell-ai/MeloTTS.git
|
91 |
+
python -m unidic download
|
92 |
+
```
|
93 |
+
|
94 |
+
**Demo Usage.** Please see [`demo_part3.ipynb`](https://github.com/myshell-ai/OpenVoice/blob/main/demo_part3.ipynb) for example usage of OpenVoice V2. Now it natively supports English, Spanish, French, Chinese, Japanese and Korean.
|
95 |
+
|
96 |
+
|
97 |
+
## Install on Other Platforms
|
98 |
+
|
99 |
+
This section provides the unofficial installation guides by open-source contributors in the community:
|
100 |
+
|
101 |
+
- Windows
|
102 |
+
- [Guide](https://github.com/Alienpups/OpenVoice/blob/main/docs/USAGE_WINDOWS.md) by [@Alienpups](https://github.com/Alienpups)
|
103 |
+
- You are welcome to contribute if you have a better installation guide. We will list you here.
|
104 |
+
- Docker
|
105 |
+
- [Guide](https://github.com/StevenJSCF/OpenVoice/blob/update-docs/docs/DF_USAGE.md) by [@StevenJSCF](https://github.com/StevenJSCF)
|
106 |
+
- You are welcome to contribute if you have a better installation guide. We will list you here.
|
107 |
+
|
108 |
+
|
109 |
+
### Links
|
110 |
+
- [Github](https://github.com/myshell-ai/OpenVoice)
|
111 |
+
- [HFDemo](https://huggingface.co/spaces/myshell-ai/OpenVoiceV2)
|
112 |
+
- [Discord](https://discord.gg/myshell)
|
113 |
+
|
base_speakers/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
base_speakers/ses/en-au.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5e9782233deef51fc5289d05ad4dd4ce12b196e282eccf6b6db6256bbd02daaa
|
3 |
+
size 1701
|
base_speakers/ses/en-br.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2bf5a88025cfd10473b25d65d5c0e608338ce4533059c5f9a3383e69c812d389
|
3 |
+
size 1701
|
base_speakers/ses/en-default.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e4139de3bc2ea162f45a5a5f9559b710686c9689749b5ab8945ee5e2a082d154
|
3 |
+
size 1783
|
base_speakers/ses/en-india.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ad03d946757e95fe9e13239aa4b11071d98f22316f604f34b1a0b4bdf41cda48
|
3 |
+
size 1701
|
base_speakers/ses/en-newest.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6a3798229b1114f0e9cc137b33211809def7dda5a8a9398d5a112c0b42699177
|
3 |
+
size 1692
|
base_speakers/ses/en-us.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0d092d4af0815a4bfbc6105b65621ab68dc4c61b2f55044d8a66968a34947c32
|
3 |
+
size 1701
|
base_speakers/ses/es.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b8cece8853fb75b9f5217a1f5cda9807bac92a3e4c4547fc651e404d05deff63
|
3 |
+
size 1692
|
base_speakers/ses/fr.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8a01f6d30a73efa368c288a542a522a2bcdd4e2ec5589d8646b307cf8e2ad9ae
|
3 |
+
size 1692
|
base_speakers/ses/jp.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7b645ff428de4a57a22122318968f1e6127ac81fda2e2aa66062deccd3864416
|
3 |
+
size 1692
|
base_speakers/ses/kr.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f501479d6072741a396725bec79144653e9f4a5381b85901e29683aa169795df
|
3 |
+
size 1692
|
base_speakers/ses/zh.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2b353de562700c13faacf096ecfc0adcafd26e6704a9feef572be1279714e031
|
3 |
+
size 1692
|
converter/checkpoint.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9652c27e92b6b2a91632590ac9962ef7ae2b712e5c5b7f4c34ec55ee2b37ab9e
|
3 |
+
size 131320490
|
converter/config.json
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_version_": "v2",
|
3 |
+
"data": {
|
4 |
+
"sampling_rate": 22050,
|
5 |
+
"filter_length": 1024,
|
6 |
+
"hop_length": 256,
|
7 |
+
"win_length": 1024,
|
8 |
+
"n_speakers": 0
|
9 |
+
},
|
10 |
+
"model": {
|
11 |
+
"zero_g": true,
|
12 |
+
"inter_channels": 192,
|
13 |
+
"hidden_channels": 192,
|
14 |
+
"filter_channels": 768,
|
15 |
+
"n_heads": 2,
|
16 |
+
"n_layers": 6,
|
17 |
+
"kernel_size": 3,
|
18 |
+
"p_dropout": 0.1,
|
19 |
+
"resblock": "1",
|
20 |
+
"resblock_kernel_sizes": [
|
21 |
+
3,
|
22 |
+
7,
|
23 |
+
11
|
24 |
+
],
|
25 |
+
"resblock_dilation_sizes": [
|
26 |
+
[
|
27 |
+
1,
|
28 |
+
3,
|
29 |
+
5
|
30 |
+
],
|
31 |
+
[
|
32 |
+
1,
|
33 |
+
3,
|
34 |
+
5
|
35 |
+
],
|
36 |
+
[
|
37 |
+
1,
|
38 |
+
3,
|
39 |
+
5
|
40 |
+
]
|
41 |
+
],
|
42 |
+
"upsample_rates": [
|
43 |
+
8,
|
44 |
+
8,
|
45 |
+
2,
|
46 |
+
2
|
47 |
+
],
|
48 |
+
"upsample_initial_channel": 512,
|
49 |
+
"upsample_kernel_sizes": [
|
50 |
+
16,
|
51 |
+
16,
|
52 |
+
4,
|
53 |
+
4
|
54 |
+
],
|
55 |
+
"gin_channels": 256
|
56 |
+
}
|
57 |
+
}
|