CosyVoice committed on
Commit
44aea80
•
1 Parent(s): c7d9754

add train cfg in flow matching

Browse files
README.md CHANGED
@@ -131,8 +131,10 @@ you can run following steps. Otherwise, you can just ignore this step.
131
  cd runtime/python
132
  docker build -t cosyvoice:v1.0 .
133
  # change iic/CosyVoice-300M to iic/CosyVoice-300M-Instruct if you want to use instruct inference
134
- docker run -d --runtime=nvidia -p 50000:50000 cosyvoice:v1.0 /bin/bash -c "cd /opt/CosyVoice/CosyVoice/runtime/python && python3 server.py --port 50000 --max_conc 4 --model_dir iic/CosyVoice-300M && sleep infinity"
135
- python3 client.py --port 50000 --mode <sft|zero_shot|cross_lingual|instruct>
 
 
136
  ```
137
 
138
  ## Discussion & Communication
 
131
  cd runtime/python
132
  docker build -t cosyvoice:v1.0 .
133
  # change iic/CosyVoice-300M to iic/CosyVoice-300M-Instruct if you want to use instruct inference
134
+ # for grpc usage
135
+ docker run -d --runtime=nvidia -p 50000:50000 cosyvoice:v1.0 /bin/bash -c "cd /opt/CosyVoice/CosyVoice/runtime/python/grpc && python3 server.py --port 50000 --max_conc 4 --model_dir iic/CosyVoice-300M && sleep infinity"
136
+ python3 grpc/client.py --port 50000 --mode <sft|zero_shot|cross_lingual|instruct>
137
+ # for fastapi usage
138
  ```
139
 
140
  ## Discussion & Communication
cosyvoice/flow/flow_matching.py CHANGED
@@ -126,6 +126,13 @@ class ConditionalCFM(BASECFM):
126
  y = (1 - (1 - self.sigma_min) * t) * z + t * x1
127
  u = x1 - (1 - self.sigma_min) * z
128
 
 
 
 
 
 
 
 
129
  pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
130
  loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
131
  return loss, y
 
126
  y = (1 - (1 - self.sigma_min) * t) * z + t * x1
127
  u = x1 - (1 - self.sigma_min) * z
128
 
129
+ # during training, we randomly drop condition to trade off mode coverage and sample fidelity
130
+ if self.training_cfg_rate > 0:
131
+ cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
132
+ mu = mu * cfg_mask.view(-1, 1, 1)
133
+ spks = spks * cfg_mask.view(-1, 1)
134
+ cond = cond * cfg_mask.view(-1, 1, 1)
135
+
136
  pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
137
  loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
138
  return loss, y
runtime/python/Dockerfile CHANGED
@@ -10,5 +10,4 @@ RUN git lfs install
10
  RUN git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
11
  # here we use python==3.10 because we cannot find an image which have both python3.8 and torch2.0.1-cu118 installed
12
  RUN cd CosyVoice && pip3 install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
13
- RUN cd CosyVoice/runtime/python && python3 -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. cosyvoice.proto
14
- CMD ["/bin/bash", "-c", "cd /opt/CosyVoice/CosyVoice/runtime/python && python3 server.py --port 50000 --max_conc 4 --model_dir iic/CosyVoice-300M && sleep infinity"]
 
10
  RUN git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
11
  # here we use python==3.10 because we cannot find an image which have both python3.8 and torch2.0.1-cu118 installed
12
  RUN cd CosyVoice && pip3 install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
13
+ RUN cd CosyVoice/runtime/python/grpc && python3 -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. cosyvoice.proto
 
runtime/python/{fastapi_client.py → fastapi/fastapi_client.py} RENAMED
File without changes
runtime/python/{fastapi_server.py → fastapi/fastapi_server.py} RENAMED
File without changes
runtime/python/{client.py → grpc/client.py} RENAMED
@@ -14,8 +14,8 @@
14
  import os
15
  import sys
16
  ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
17
- sys.path.append('{}/../..'.format(ROOT_DIR))
18
- sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
19
  import logging
20
  import argparse
21
  import torchaudio
@@ -90,7 +90,7 @@ if __name__ == "__main__":
90
  default='希望你以后能够做的比我还好呦。')
91
  parser.add_argument('--prompt_wav',
92
  type=str,
93
- default='../../zero_shot_prompt.wav')
94
  parser.add_argument('--instruct_text',
95
  type=str,
96
  default='Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.')
 
14
  import os
15
  import sys
16
  ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
17
+ sys.path.append('{}/../../..'.format(ROOT_DIR))
18
+ sys.path.append('{}/../../../third_party/Matcha-TTS'.format(ROOT_DIR))
19
  import logging
20
  import argparse
21
  import torchaudio
 
90
  default='希望你以后能够做的比我还好呦。')
91
  parser.add_argument('--prompt_wav',
92
  type=str,
93
+ default='../../../zero_shot_prompt.wav')
94
  parser.add_argument('--instruct_text',
95
  type=str,
96
  default='Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.')
runtime/python/{cosyvoice.proto → grpc/cosyvoice.proto} RENAMED
File without changes
runtime/python/{server.py → grpc/server.py} RENAMED
@@ -14,8 +14,8 @@
14
  import os
15
  import sys
16
  ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
17
- sys.path.append('{}/../..'.format(ROOT_DIR))
18
- sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
19
  from concurrent import futures
20
  import argparse
21
  import cosyvoice_pb2
@@ -77,7 +77,6 @@ if __name__ == '__main__':
77
  default=4)
78
  parser.add_argument('--model_dir',
79
  type=str,
80
- required=True,
81
  default='iic/CosyVoice-300M',
82
  help='local path or modelscope repo id')
83
  args = parser.parse_args()
 
14
  import os
15
  import sys
16
  ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
17
+ sys.path.append('{}/../../..'.format(ROOT_DIR))
18
+ sys.path.append('{}/../../../third_party/Matcha-TTS'.format(ROOT_DIR))
19
  from concurrent import futures
20
  import argparse
21
  import cosyvoice_pb2
 
77
  default=4)
78
  parser.add_argument('--model_dir',
79
  type=str,
 
80
  default='iic/CosyVoice-300M',
81
  help='local path or modelscope repo id')
82
  args = parser.parse_args()