Spaces:
Running
on
Zero
Running
on
Zero
add FAQ.md
Browse files- FAQ.md +16 -0
- README.md +9 -9
- runtime/python/Dockerfile +1 -1
- runtime/python/server.py +1 -1
- webui.py +6 -6
FAQ.md
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## ModuleNotFoundError: No module named 'matcha'
|
2 |
+
|
3 |
+
Matcha-TTS is a third-party module. Please check the `third_party` directory. If `Matcha-TTS` is missing, run `git submodule update --init --recursive`.
|
4 |
+
|
5 |
+
Run `export PYTHONPATH=third_party/AcademiCodec:third_party/Matcha-TTS` if you want to use `from cosyvoice.cli.cosyvoice import CosyVoice` in a Python script.
|
6 |
+
|
7 |
+
## cannot find resource.zip or cannot unzip resource.zip
|
8 |
+
|
9 |
+
Please make sure you have git-lfs installed, then execute:
|
10 |
+
|
11 |
+
```sh
|
12 |
+
git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
|
13 |
+
cd pretrained_models/CosyVoice-ttsfrd/
|
14 |
+
unzip resource.zip -d .
|
15 |
+
pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl
|
16 |
+
```
|
README.md
CHANGED
@@ -67,10 +67,10 @@ pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl
|
|
67 |
For zero_shot/cross_lingual inference, please use `CosyVoice-300M` model.
|
68 |
For sft inference, please use `CosyVoice-300M-SFT` model.
|
69 |
For instruct inference, please use `CosyVoice-300M-Instruct` model.
|
70 |
-
First, add `third_party/
|
71 |
|
72 |
``` sh
|
73 |
-
export PYTHONPATH=third_party/
|
74 |
```
|
75 |
|
76 |
``` python
|
@@ -78,13 +78,13 @@ from cosyvoice.cli.cosyvoice import CosyVoice
|
|
78 |
from cosyvoice.utils.file_utils import load_wav
|
79 |
import torchaudio
|
80 |
|
81 |
-
cosyvoice = CosyVoice('
|
82 |
# sft usage
|
83 |
print(cosyvoice.list_avaliable_spks())
|
84 |
output = cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女')
|
85 |
torchaudio.save('sft.wav', output['tts_speech'], 22050)
|
86 |
|
87 |
-
cosyvoice = CosyVoice('
|
88 |
# zero_shot usage
|
89 |
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|
90 |
output = cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k)
|
@@ -94,7 +94,7 @@ prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
|
|
94 |
output = cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k)
|
95 |
torchaudio.save('cross_lingual.wav', output['tts_speech'], 22050)
|
96 |
|
97 |
-
cosyvoice = CosyVoice('
|
98 |
# instruct usage
|
99 |
output = cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.')
|
100 |
torchaudio.save('instruct.wav', output['tts_speech'], 22050)
|
@@ -108,8 +108,8 @@ We support sft/zero_shot/cross_lingual/instruct inference in web demo.
|
|
108 |
Please see the demo website for details.
|
109 |
|
110 |
``` python
|
111 |
-
# change
|
112 |
-
python3 webui.py --port 50000 --model_dir
|
113 |
```
|
114 |
|
115 |
**Advanced Usage**
|
@@ -125,8 +125,8 @@ you can run following steps. Otherwise, you can just ignore this step.
|
|
125 |
``` sh
|
126 |
cd runtime/python
|
127 |
docker build -t cosyvoice:v1.0 .
|
128 |
-
# change
|
129 |
-
docker run -d --runtime=nvidia -p 50000:50000 cosyvoice:v1.0 /bin/bash -c "cd /opt/CosyVoice/CosyVoice/runtime/python && python3 server.py --port 50000 --max_conc 4 --model_dir
|
130 |
python3 client.py --port 50000 --mode <sft|zero_shot|cross_lingual|instruct>
|
131 |
```
|
132 |
|
|
|
67 |
For zero_shot/cross_lingual inference, please use `CosyVoice-300M` model.
|
68 |
For sft inference, please use `CosyVoice-300M-SFT` model.
|
69 |
For instruct inference, please use `CosyVoice-300M-Instruct` model.
|
70 |
+
First, add `third_party/Matcha-TTS` to your `PYTHONPATH`.
|
71 |
|
72 |
``` sh
|
73 |
+
export PYTHONPATH=third_party/Matcha-TTS
|
74 |
```
|
75 |
|
76 |
``` python
|
|
|
78 |
from cosyvoice.utils.file_utils import load_wav
|
79 |
import torchaudio
|
80 |
|
81 |
+
cosyvoice = CosyVoice('iic/CosyVoice-300M-SFT')
|
82 |
# sft usage
|
83 |
print(cosyvoice.list_avaliable_spks())
|
84 |
output = cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女')
|
85 |
torchaudio.save('sft.wav', output['tts_speech'], 22050)
|
86 |
|
87 |
+
cosyvoice = CosyVoice('iic/CosyVoice-300M')
|
88 |
# zero_shot usage
|
89 |
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|
90 |
output = cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k)
|
|
|
94 |
output = cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k)
|
95 |
torchaudio.save('cross_lingual.wav', output['tts_speech'], 22050)
|
96 |
|
97 |
+
cosyvoice = CosyVoice('iic/CosyVoice-300M-Instruct')
|
98 |
# instruct usage
|
99 |
output = cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.')
|
100 |
torchaudio.save('instruct.wav', output['tts_speech'], 22050)
|
|
|
108 |
Please see the demo website for details.
|
109 |
|
110 |
``` python
|
111 |
+
# change to iic/CosyVoice-300M-SFT for sft inference, or iic/CosyVoice-300M-Instruct for instruct inference
|
112 |
+
python3 webui.py --port 50000 --model_dir iic/CosyVoice-300M
|
113 |
```
|
114 |
|
115 |
**Advanced Usage**
|
|
|
125 |
``` sh
|
126 |
cd runtime/python
|
127 |
docker build -t cosyvoice:v1.0 .
|
128 |
+
# change iic/CosyVoice-300M to iic/CosyVoice-300M-Instruct if you want to use instruct inference
|
129 |
+
docker run -d --runtime=nvidia -p 50000:50000 cosyvoice:v1.0 /bin/bash -c "cd /opt/CosyVoice/CosyVoice/runtime/python && python3 server.py --port 50000 --max_conc 4 --model_dir iic/CosyVoice-300M && sleep infinity"
|
130 |
python3 client.py --port 50000 --mode <sft|zero_shot|cross_lingual|instruct>
|
131 |
```
|
132 |
|
runtime/python/Dockerfile
CHANGED
@@ -12,4 +12,4 @@ RUN apt install git-lfs && git lfs install
|
|
12 |
RUN cd CosyVoice && git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
|
13 |
RUN cd CosyVoice/pretrained_models/CosyVoice-ttsfrd && unzip resource.zip -d . && pip3 install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl
|
14 |
RUN cd CosyVoice/runtime/python && python3 -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. cosyvoice.proto
|
15 |
-
CMD ["/bin/bash", "-c", "cd /opt/CosyVoice/CosyVoice/runtime/python && . ./path/sh && python3 server.py --port 50000 --max_conc 4 --model_dir
|
|
|
12 |
RUN cd CosyVoice && git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
|
13 |
RUN cd CosyVoice/pretrained_models/CosyVoice-ttsfrd && unzip resource.zip -d . && pip3 install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl
|
14 |
RUN cd CosyVoice/runtime/python && python3 -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. cosyvoice.proto
|
15 |
+
CMD ["/bin/bash", "-c", "cd /opt/CosyVoice/CosyVoice/runtime/python && . ./path/sh && python3 server.py --port 50000 --max_conc 4 --model_dir iic/CosyVoice-300M && sleep infinity"]
|
runtime/python/server.py
CHANGED
@@ -79,7 +79,7 @@ if __name__ == '__main__':
|
|
79 |
parser.add_argument('--model_dir',
|
80 |
type=str,
|
81 |
required=True,
|
82 |
-
default='
|
83 |
help='local path or modelscope repo id')
|
84 |
args = parser.parse_args()
|
85 |
main()
|
|
|
79 |
parser.add_argument('--model_dir',
|
80 |
type=str,
|
81 |
required=True,
|
82 |
+
default='iic/CosyVoice-300M',
|
83 |
help='local path or modelscope repo id')
|
84 |
args = parser.parse_args()
|
85 |
main()
|
webui.py
CHANGED
@@ -74,20 +74,20 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
|
|
74 |
prompt_wav = prompt_wav_record
|
75 |
else:
|
76 |
prompt_wav = None
|
77 |
-
# if instruct mode, please make sure that model is
|
78 |
if mode_checkbox_group in ['自然语言控制']:
|
79 |
if cosyvoice.frontend.instruct is False:
|
80 |
-
gr.Warning('您正在使用自然语言控制模式, {}模型不支持此模式, 请使用
|
81 |
return (target_sr, default_data)
|
82 |
if instruct_text == '':
|
83 |
gr.Warning('您正在使用自然语言控制模式, 请输入instruct文本')
|
84 |
return (target_sr, default_data)
|
85 |
if prompt_wav is not None or prompt_text != '':
|
86 |
gr.Info('您正在使用自然语言控制模式, prompt音频/prompt文本会被忽略')
|
87 |
-
# if cross_lingual mode, please make sure that model is
|
88 |
if mode_checkbox_group in ['跨语种复刻']:
|
89 |
if cosyvoice.frontend.instruct is True:
|
90 |
-
gr.Warning('您正在使用跨语种复刻模式, {}模型不支持此模式, 请使用
|
91 |
return (target_sr, default_data)
|
92 |
if instruct_text != '':
|
93 |
gr.Info('您正在使用跨语种复刻模式, instruct文本会被忽略')
|
@@ -138,7 +138,7 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
|
|
138 |
|
139 |
def main():
|
140 |
with gr.Blocks() as demo:
|
141 |
-
gr.Markdown("### 代码库 [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) 预训练模型 [CosyVoice-300M](https://www.modelscope.cn/models/
|
142 |
gr.Markdown("#### 请输入需要合成的文本,选择推理模式,并按照提示步骤进行操作")
|
143 |
|
144 |
tts_text = gr.Textbox(label="输入合成文本", lines=1, value="我是通义实验室语音团队全新推出的生成式语音大模型,提供舒适自然的语音合成能力。")
|
@@ -176,7 +176,7 @@ if __name__ == '__main__':
|
|
176 |
default=8000)
|
177 |
parser.add_argument('--model_dir',
|
178 |
type=str,
|
179 |
-
default='
|
180 |
help='local path or modelscope repo id')
|
181 |
args = parser.parse_args()
|
182 |
cosyvoice = CosyVoice(args.model_dir)
|
|
|
74 |
prompt_wav = prompt_wav_record
|
75 |
else:
|
76 |
prompt_wav = None
|
77 |
+
# if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
|
78 |
if mode_checkbox_group in ['自然语言控制']:
|
79 |
if cosyvoice.frontend.instruct is False:
|
80 |
+
gr.Warning('您正在使用自然语言控制模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M-Instruct模型'.format(args.model_dir))
|
81 |
return (target_sr, default_data)
|
82 |
if instruct_text == '':
|
83 |
gr.Warning('您正在使用自然语言控制模式, 请输入instruct文本')
|
84 |
return (target_sr, default_data)
|
85 |
if prompt_wav is not None or prompt_text != '':
|
86 |
gr.Info('您正在使用自然语言控制模式, prompt音频/prompt文本会被忽略')
|
87 |
+
# if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
|
88 |
if mode_checkbox_group in ['跨语种复刻']:
|
89 |
if cosyvoice.frontend.instruct is True:
|
90 |
+
gr.Warning('您正在使用跨语种复刻模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M模型'.format(args.model_dir))
|
91 |
return (target_sr, default_data)
|
92 |
if instruct_text != '':
|
93 |
gr.Info('您正在使用跨语种复刻模式, instruct文本会被忽略')
|
|
|
138 |
|
139 |
def main():
|
140 |
with gr.Blocks() as demo:
|
141 |
+
gr.Markdown("### 代码库 [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) 预训练模型 [CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) [CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) [CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)")
|
142 |
gr.Markdown("#### 请输入需要合成的文本,选择推理模式,并按照提示步骤进行操作")
|
143 |
|
144 |
tts_text = gr.Textbox(label="输入合成文本", lines=1, value="我是通义实验室语音团队全新推出的生成式语音大模型,提供舒适自然的语音合成能力。")
|
|
|
176 |
default=8000)
|
177 |
parser.add_argument('--model_dir',
|
178 |
type=str,
|
179 |
+
default='iic/CosyVoice-300M',
|
180 |
help='local path or modelscope repo id')
|
181 |
args = parser.parse_args()
|
182 |
cosyvoice = CosyVoice(args.model_dir)
|