Spaces:
Running
on
Zero
Running
on
Zero
add train cfg in flow matching
Browse files
- README.md +4 -2
- cosyvoice/flow/flow_matching.py +7 -0
- runtime/python/Dockerfile +1 -2
- runtime/python/{fastapi_client.py → fastapi/fastapi_client.py} +0 -0
- runtime/python/{fastapi_server.py → fastapi/fastapi_server.py} +0 -0
- runtime/python/{client.py → grpc/client.py} +3 -3
- runtime/python/{cosyvoice.proto → grpc/cosyvoice.proto} +0 -0
- runtime/python/{server.py → grpc/server.py} +2 -3
README.md
CHANGED
@@ -131,8 +131,10 @@ you can run following steps. Otherwise, you can just ignore this step.
|
|
131 |
cd runtime/python
|
132 |
docker build -t cosyvoice:v1.0 .
|
133 |
# change iic/CosyVoice-300M to iic/CosyVoice-300M-Instruct if you want to use instruct inference
|
134 |
-
|
135 |
-
python3
|
|
|
|
|
136 |
```
|
137 |
|
138 |
## Discussion & Communication
|
|
|
131 |
cd runtime/python
|
132 |
docker build -t cosyvoice:v1.0 .
|
133 |
# change iic/CosyVoice-300M to iic/CosyVoice-300M-Instruct if you want to use instruct inference
|
134 |
+
# for grpc usage
|
135 |
+
docker run -d --runtime=nvidia -p 50000:50000 cosyvoice:v1.0 /bin/bash -c "cd /opt/CosyVoice/CosyVoice/runtime/python/grpc && python3 server.py --port 50000 --max_conc 4 --model_dir iic/CosyVoice-300M && sleep infinity"
|
136 |
+
python3 grpc/client.py --port 50000 --mode <sft|zero_shot|cross_lingual|instruct>
|
137 |
+
# for fastapi usage
|
138 |
```
|
139 |
|
140 |
## Discussion & Communication
|
cosyvoice/flow/flow_matching.py
CHANGED
@@ -126,6 +126,13 @@ class ConditionalCFM(BASECFM):
|
|
126 |
y = (1 - (1 - self.sigma_min) * t) * z + t * x1
|
127 |
u = x1 - (1 - self.sigma_min) * z
|
128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
|
130 |
loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
|
131 |
return loss, y
|
|
|
126 |
y = (1 - (1 - self.sigma_min) * t) * z + t * x1
|
127 |
u = x1 - (1 - self.sigma_min) * z
|
128 |
|
129 |
+
# during training, we randomly drop condition to trade off mode coverage and sample fidelity
|
130 |
+
if self.training_cfg_rate > 0:
|
131 |
+
cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
|
132 |
+
mu = mu * cfg_mask.view(-1, 1, 1)
|
133 |
+
spks = spks * cfg_mask.view(-1, 1)
|
134 |
+
cond = cond * cfg_mask.view(-1, 1, 1)
|
135 |
+
|
136 |
pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
|
137 |
loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
|
138 |
return loss, y
|
runtime/python/Dockerfile
CHANGED
@@ -10,5 +10,4 @@ RUN git lfs install
|
|
10 |
RUN git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
|
11 |
# here we use python==3.10 because we cannot find an image which have both python3.8 and torch2.0.1-cu118 installed
|
12 |
RUN cd CosyVoice && pip3 install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
|
13 |
-
RUN cd CosyVoice/runtime/python && python3 -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. cosyvoice.proto
|
14 |
-
CMD ["/bin/bash", "-c", "cd /opt/CosyVoice/CosyVoice/runtime/python && python3 server.py --port 50000 --max_conc 4 --model_dir iic/CosyVoice-300M && sleep infinity"]
|
|
|
10 |
RUN git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
|
11 |
# here we use python==3.10 because we cannot find an image which have both python3.8 and torch2.0.1-cu118 installed
|
12 |
RUN cd CosyVoice && pip3 install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
|
13 |
+
RUN cd CosyVoice/runtime/python/grpc && python3 -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. cosyvoice.proto
|
|
runtime/python/{fastapi_client.py → fastapi/fastapi_client.py}
RENAMED
File without changes
|
runtime/python/{fastapi_server.py → fastapi/fastapi_server.py}
RENAMED
File without changes
|
runtime/python/{client.py → grpc/client.py}
RENAMED
@@ -14,8 +14,8 @@
|
|
14 |
import os
|
15 |
import sys
|
16 |
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
|
17 |
-
sys.path.append('{}/../..'.format(ROOT_DIR))
|
18 |
-
sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
|
19 |
import logging
|
20 |
import argparse
|
21 |
import torchaudio
|
@@ -90,7 +90,7 @@ if __name__ == "__main__":
|
|
90 |
default='希望你以后能够做的比我还好呦。')
|
91 |
parser.add_argument('--prompt_wav',
|
92 |
type=str,
|
93 |
-
default='../../zero_shot_prompt.wav')
|
94 |
parser.add_argument('--instruct_text',
|
95 |
type=str,
|
96 |
default='Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.')
|
|
|
14 |
import os
|
15 |
import sys
|
16 |
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
|
17 |
+
sys.path.append('{}/../../..'.format(ROOT_DIR))
|
18 |
+
sys.path.append('{}/../../../third_party/Matcha-TTS'.format(ROOT_DIR))
|
19 |
import logging
|
20 |
import argparse
|
21 |
import torchaudio
|
|
|
90 |
default='希望你以后能够做的比我还好呦。')
|
91 |
parser.add_argument('--prompt_wav',
|
92 |
type=str,
|
93 |
+
default='../../../zero_shot_prompt.wav')
|
94 |
parser.add_argument('--instruct_text',
|
95 |
type=str,
|
96 |
default='Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.')
|
runtime/python/{cosyvoice.proto → grpc/cosyvoice.proto}
RENAMED
File without changes
|
runtime/python/{server.py → grpc/server.py}
RENAMED
@@ -14,8 +14,8 @@
|
|
14 |
import os
|
15 |
import sys
|
16 |
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
|
17 |
-
sys.path.append('{}/../..'.format(ROOT_DIR))
|
18 |
-
sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
|
19 |
from concurrent import futures
|
20 |
import argparse
|
21 |
import cosyvoice_pb2
|
@@ -77,7 +77,6 @@ if __name__ == '__main__':
|
|
77 |
default=4)
|
78 |
parser.add_argument('--model_dir',
|
79 |
type=str,
|
80 |
-
required=True,
|
81 |
default='iic/CosyVoice-300M',
|
82 |
help='local path or modelscope repo id')
|
83 |
args = parser.parse_args()
|
|
|
14 |
import os
|
15 |
import sys
|
16 |
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
|
17 |
+
sys.path.append('{}/../../..'.format(ROOT_DIR))
|
18 |
+
sys.path.append('{}/../../../third_party/Matcha-TTS'.format(ROOT_DIR))
|
19 |
from concurrent import futures
|
20 |
import argparse
|
21 |
import cosyvoice_pb2
|
|
|
77 |
default=4)
|
78 |
parser.add_argument('--model_dir',
|
79 |
type=str,
|
|
|
80 |
default='iic/CosyVoice-300M',
|
81 |
help='local path or modelscope repo id')
|
82 |
args = parser.parse_args()
|