feat: base project
Browse files- .gitignore +5 -0
- app.py +2101 -0
- audios/.gitignore +0 -0
- config.py +204 -0
- i18n.py +28 -0
- i18n/en_US.json +126 -0
- i18n/es_ES.json +132 -0
- i18n/it_IT.json +130 -0
- i18n/ja_JP.json +126 -0
- i18n/locale_diff.py +45 -0
- i18n/ru-RU.json +130 -0
- i18n/tr_TR.json +130 -0
- i18n/zh_CN.json +132 -0
- i18n/zh_HK.json +132 -0
- i18n/zh_SG.json +132 -0
- i18n/zh_TW.json +132 -0
- lib/infer_pack/attentions.py +417 -0
- lib/infer_pack/commons.py +166 -0
- lib/infer_pack/models.py +1142 -0
- lib/infer_pack/models_onnx.py +819 -0
- lib/infer_pack/modules.py +522 -0
- lib/infer_pack/modules/F0Predictor/DioF0Predictor.py +90 -0
- lib/infer_pack/modules/F0Predictor/F0Predictor.py +16 -0
- lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py +86 -0
- lib/infer_pack/modules/F0Predictor/PMF0Predictor.py +97 -0
- lib/infer_pack/modules/F0Predictor/__init__.py +0 -0
- lib/infer_pack/transforms.py +209 -0
- packages.txt +3 -0
- requirements.txt +22 -0
- utils.py +151 -0
- vc_infer_pipeline.py +646 -0
.gitignore
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__/
|
2 |
+
weights/
|
3 |
+
TEMP/
|
4 |
+
logs/
|
5 |
+
csvdb/
|
app.py
ADDED
@@ -0,0 +1,2101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import subprocess, torch, os, traceback, sys, warnings, shutil, numpy as np
|
2 |
+
from mega import Mega
|
3 |
+
os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
|
4 |
+
import threading
|
5 |
+
from time import sleep
|
6 |
+
from subprocess import Popen
|
7 |
+
import faiss
|
8 |
+
from random import shuffle
|
9 |
+
import json, datetime, requests
|
10 |
+
from gtts import gTTS
|
11 |
+
# Resolve the working directory once; the relative resources below hang off it.
now_dir = os.getcwd()
sys.path.append(now_dir)
tmp = os.path.join(now_dir, "TEMP")

# Remove the previous scratch directory and any infer_pack copy found under
# the runtime's site-packages before recreating the working directories.
for _stale_dir in (tmp, "%s/runtime/Lib/site-packages/infer_pack" % (now_dir)):
    shutil.rmtree(_stale_dir, ignore_errors=True)

for _required_dir in (
    tmp,
    os.path.join(now_dir, "logs"),
    os.path.join(now_dir, "weights"),
):
    os.makedirs(_required_dir, exist_ok=True)

# Point the TEMP environment variable at the freshly created scratch directory.
os.environ["TEMP"] = tmp
warnings.filterwarnings("ignore")
torch.manual_seed(114514)
|
22 |
+
from i18n import I18nAuto
|
23 |
+
|
24 |
+
import signal
|
25 |
+
|
26 |
+
import math
|
27 |
+
|
28 |
+
from utils import load_audio, CSVutil
|
29 |
+
|
30 |
+
# A ``global`` statement at module level is a no-op; kept to mirror the names
# that the rest of the module treats as shared settings.
global DoFormant, Quefrency, Timbre

# First run: create the csvdb directory together with its two (empty) CSV files.
if not os.path.isdir('csvdb/'):
    os.makedirs('csvdb')
    for _csv_path in ("csvdb/formanting.csv", "csvdb/stop.csv"):
        with open(_csv_path, 'w'):
            pass

try:
    DoFormant, Quefrency, Timbre = CSVutil('csvdb/formanting.csv', 'r', 'formanting')
    # The flag is read back as text: map 'true'/'false' (any case) to a bool
    # and leave every other value untouched.
    _flag_text = DoFormant.lower()
    if _flag_text == 'true':
        DoFormant = True
    elif _flag_text == 'false':
        DoFormant = False
except (ValueError, TypeError, IndexError):
    # Unreadable or empty settings file: fall back to defaults and persist them.
    DoFormant, Quefrency, Timbre = False, 1.0, 1.0
    CSVutil('csvdb/formanting.csv', 'w+', 'formanting', DoFormant, Quefrency, Timbre)
|
46 |
+
|
47 |
+
#from MDXNet import MDXNetDereverb
|
48 |
+
|
49 |
+
# Check if we're in a Google Colab environment
|
50 |
+
if os.path.exists('/content/'):
    print("\n-------------------------------\nRVC v2 Easy GUI (Colab Edition)\n-------------------------------\n")

    print("-------------------------------")
    _hubert_path = '/content/Retrieval-based-Voice-Conversion-WebUI/hubert_base.pt'
    # Only download the checkpoint when it is not already on disk.
    if os.path.exists(_hubert_path):
        print("File /content/Retrieval-based-Voice-Conversion-WebUI/hubert_base.pt already exists. No need to download.")
    else:
        print("File /content/Retrieval-based-Voice-Conversion-WebUI/hubert_base.pt does not exist. Starting download.")

        # Stream the response to disk in chunks instead of holding the whole
        # checkpoint in memory, and use a timeout so a stalled connection
        # cannot hang start-up forever.
        with requests.get(
            'https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt',
            stream=True,
            timeout=60,
        ) as response:
            if response.status_code == 200:
                # Write to a side file first and rename on success, so an
                # interrupted download is never mistaken for a complete file
                # by the existence check above on the next start.
                _partial_path = _hubert_path + ".part"
                with open(_partial_path, 'wb') as f:
                    for _chunk in response.iter_content(chunk_size=1 << 20):
                        if _chunk:
                            f.write(_chunk)
                os.replace(_partial_path, _hubert_path)
                print("Download complete. File saved to /content/Retrieval-based-Voice-Conversion-WebUI/hubert_base.pt.")
            else:
                # Report the failing HTTP status; nothing is written in this case.
                print("Failed to download file. Status code: " + str(response.status_code) + ".")
else:
    print("\n-------------------------------\nRVC v2 Easy GUI (Local Edition)\n-------------------------------\n")
    print("-------------------------------\nNot running on Google Colab, skipping download.")
|
77 |
+
|
78 |
+
def formant_apply(qfrency, tmbre):
    """Persist the given formant settings with formant shifting switched on.

    Writes ``True, qfrency, tmbre`` to ``csvdb/formanting.csv`` through
    ``CSVutil`` and returns two update dicts that echo the values back.

    Args:
        qfrency: Quefrency value to store.
        tmbre: Timbre value to store.

    Returns:
        A 2-tuple of ``{"value": ..., "__type__": "update"}`` dicts, for
        ``qfrency`` and ``tmbre`` respectively.
    """
    # NOTE(review): this function never declared ``global``, so the
    # module-level DoFormant/Quefrency/Timbre are not modified here; only the
    # CSV file is. Confirm that is the intended source of truth.
    CSVutil('csvdb/formanting.csv', 'w+', 'formanting', True, qfrency, tmbre)

    return tuple({"value": setting, "__type__": "update"} for setting in (qfrency, tmbre))
|
85 |
+
|
86 |
+
def get_fshift_presets():
    """Collect the ``.txt`` preset files found under ``./formantshiftcfg/``.

    Returns:
        A list of preset paths with backslashes replaced by forward slashes,
        or the empty string ``''`` when no preset file exists.
    """
    found = [
        os.path.join(folder, fname).replace('\\', '/')
        for folder, _, fnames in os.walk("./formantshiftcfg/")
        for fname in fnames
        if fname.endswith(".txt")
    ]
    # Callers receive '' (not an empty list) when nothing was found.
    return found if found else ''
|
97 |
+
|
98 |
+
|
99 |
+
|
100 |
+
def formant_enabled(cbox, qfrency, tmbre, frmntapply, formantpreset, formant_refresh_button):
    """Switch formant shifting on or off and toggle the related controls.

    The on/off flag is written to ``csvdb/formanting.csv`` together with
    ``qfrency`` and ``tmbre``. ``frmntapply``, ``formantpreset`` and
    ``formant_refresh_button`` are accepted but not read.

    Returns:
        A tuple of update dicts: the first sets ``value`` to the new flag,
        the remaining ones set ``visible`` to the same flag.
    """
    enabled = bool(cbox)
    CSVutil('csvdb/formanting.csv', 'w+', 'formanting', enabled, qfrency, tmbre)

    # NOTE(review): the enabled path returns five visibility updates while the
    # disabled path returns six. That asymmetry is preserved exactly; confirm
    # which count matches the outputs wired to this callback.
    visibility_count = 5 if enabled else 6
    return ({"value": enabled, "__type__": "update"},) + tuple(
        {"visible": enabled, "__type__": "update"} for _ in range(visibility_count)
    )
|
133 |
+
|
134 |
+
|
135 |
+
|
136 |
+
def preset_apply(preset, qfer, tmbr):
    """Load a formant preset file and apply it.

    When ``preset`` is non-empty, its first line is used as the quefrency and
    its second line as the timbre; both are passed to ``formant_apply``. When
    ``preset`` is empty the incoming ``qfer``/``tmbr`` are returned unchanged.

    Args:
        preset: Path of the preset file, or ``''`` for "no preset".
        qfer: Current quefrency value (fallback).
        tmbr: Current timbre value (fallback).

    Returns:
        A 2-tuple of ``{"value": ..., "__type__": "update"}`` dicts.

    Raises:
        IndexError: If the preset file has fewer than two lines.
        OSError: If the preset file cannot be opened.
    """
    if str(preset) != '':
        with open(str(preset), 'r') as p:
            content = p.readlines()
        # Strip both lines. Previously only the first one lost its newline,
        # so the timbre value carried a trailing '\n' into the settings file
        # and the UI.
        qfer, tmbr = content[0].strip(), content[1].strip()

        formant_apply(qfer, tmbr)
    return ({"value": qfer, "__type__": "update"}, {"value": tmbr, "__type__": "update"})
|
146 |
+
|
147 |
+
def update_fshift_presets(preset, qfrency, tmbre):
    """Refresh the preset list and re-apply the currently selected preset.

    Args:
        preset: Path of the selected preset file, or ``''`` for none.
        qfrency: Current quefrency value, returned as-is when no preset is set.
        tmbre: Current timbre value, returned as-is when no preset is set.

    Returns:
        A 3-tuple of update dicts: the refreshed preset ``choices`` followed
        by the quefrency and timbre ``value`` updates.

    Raises:
        IndexError: If the preset file has fewer than two lines.
        OSError: If the preset file cannot be opened.
    """
    # The previous version first overwrote qfrency/tmbre with the *update
    # dicts* returned by preset_apply(), so with an empty preset the returned
    # "value" entries were nested dicts rather than numbers; with a preset set
    # the file was read and applied twice. The preset is now handled once.
    if str(preset) != '':
        with open(str(preset), 'r') as p:
            content = p.readlines()
        # First line is the quefrency, second the timbre; drop line endings.
        qfrency, tmbre = content[0].strip(), content[1].strip()

        formant_apply(qfrency, tmbre)
    return (
        {"choices": get_fshift_presets(), "__type__": "update"},
        {"value": qfrency, "__type__": "update"},
        {"value": tmbre, "__type__": "update"},
    )
|
164 |
+
|
165 |
+
i18n = I18nAuto()
#i18n.print()
# Determine whether an NVIDIA card usable for training / accelerated inference is present.
ngpu = torch.cuda.device_count()
gpu_infos = []
mem = []
if_gpu_ok = False

# Name fragments that mark a device as supported. The numeric fragments are
# matched against the raw name, the others against the upper-cased name.
# Example matches: A10, A100, V100, A40, P40, M40, K80, A4500.
_GPU_NUMERIC_TAGS = ("10", "16", "20", "30", "40", "70", "80", "90")
_GPU_UPPER_TAGS = ("A2", "A3", "A4", "P4", "A50", "A60", "M4", "T4", "TITAN")

if torch.cuda.is_available() and ngpu != 0:
    for i in range(ngpu):
        gpu_name = torch.cuda.get_device_name(i)
        _upper_name = gpu_name.upper()
        if any(tag in gpu_name for tag in _GPU_NUMERIC_TAGS) or any(
            tag in _upper_name for tag in _GPU_UPPER_TAGS
        ):
            if_gpu_ok = True  # at least one usable NVIDIA card
            gpu_infos.append("%s\t%s" % (i, gpu_name))
            # Total device memory in GiB, nudged by 0.4 before truncation.
            mem.append(
                int(
                    torch.cuda.get_device_properties(i).total_memory
                    / 1024
                    / 1024
                    / 1024
                    + 0.4
                )
            )

if if_gpu_ok and gpu_infos:
    gpu_info = "\n".join(gpu_infos)
    default_batch_size = min(mem) // 2
else:
    gpu_info = i18n("很遗憾您这没有能用的显卡来支持您训练")
    default_batch_size = 1

# Dash-separated device indices. Take everything before the tab: the previous
# ``info[0]`` kept only the first character and so truncated indices >= 10.
gpus = "-".join(info.split("\t")[0] for info in gpu_infos)
|
214 |
+
from lib.infer_pack.models import (
|
215 |
+
SynthesizerTrnMs256NSFsid,
|
216 |
+
SynthesizerTrnMs256NSFsid_nono,
|
217 |
+
SynthesizerTrnMs768NSFsid,
|
218 |
+
SynthesizerTrnMs768NSFsid_nono,
|
219 |
+
)
|
220 |
+
import soundfile as sf
|
221 |
+
from fairseq import checkpoint_utils
|
222 |
+
import gradio as gr
|
223 |
+
import logging
|
224 |
+
from vc_infer_pipeline import VC
|
225 |
+
from config import Config
|
226 |
+
|
227 |
+
config = Config()
|
228 |
+
# from trainset_preprocess_pipeline import PreProcess
|
229 |
+
logging.getLogger("numba").setLevel(logging.WARNING)
|
230 |
+
|
231 |
+
hubert_model = None
|
232 |
+
|
233 |
+
def load_hubert():
    """Load ``hubert_base.pt`` into the module-level ``hubert_model``.

    The first model of the loaded ensemble is moved to ``config.device``,
    cast to half or full precision according to ``config.is_half`` and
    switched to evaluation mode.
    """
    global hubert_model
    loaded = checkpoint_utils.load_model_ensemble_and_task(
        ["hubert_base.pt"],
        suffix="",
    )
    # The first element of the result holds the models; use the first one.
    hubert_model = loaded[0][0]
    hubert_model = hubert_model.to(config.device)
    hubert_model = hubert_model.half() if config.is_half else hubert_model.float()
    hubert_model.eval()
|
246 |
+
|
247 |
+
|
248 |
+
weight_root = "weights"
index_root = "logs"

# Every ``.pth`` file directly inside the weights directory.
names = [fname for fname in os.listdir(weight_root) if fname.endswith(".pth")]

# Every ``.index`` file anywhere below the logs directory, skipping files whose
# name contains "trained".
index_paths = [
    "%s/%s" % (folder, fname)
    for folder, _, fnames in os.walk(index_root, topdown=False)
    for fname in fnames
    if fname.endswith(".index") and "trained" not in fname
]
|
259 |
+
|
260 |
+
|
261 |
+
|
262 |
+
def vc_single(
    sid,
    input_audio_path,
    f0_up_key,
    f0_file,
    f0_method,
    file_index,
    # file_index2,
    # file_big_npy,
    index_rate,
    filter_radius,
    resample_sr,
    rms_mix_rate,
    protect,
    crepe_hop_length,
):  # spk_item, input_audio0, vc_transform0,f0_file,f0method0
    """Convert a single audio file with the currently loaded voice model.

    The audio is loaded at 16 kHz, scaled down if its peak exceeds 0.95, and
    passed to ``vc.pipeline`` together with the module-level model state
    (``hubert_model``, ``net_g``, ``tgt_sr``, ``version``, ``cpt``).

    Args:
        sid: Speaker id forwarded to the pipeline.
        input_audio_path: Path of the audio to convert; ``None`` aborts early.
        f0_up_key: Pitch shift, converted with ``int()``.
        f0_file: Optional f0 file forwarded as ``f0_file=``.
        f0_method: Pitch extraction method name forwarded to the pipeline.
        file_index: Index file path; surrounding spaces, quotes and newlines
            are stripped and "trained" is replaced by "added".
        index_rate, filter_radius, rms_mix_rate, protect, crepe_hop_length:
            Forwarded unchanged to the pipeline.
        resample_sr: Requested output sample rate; used for the returned
            audio when it is >= 16000 and differs from the model's rate.

    Returns:
        ``("You need to upload an audio", None)`` when no input is given;
        ``(status_text, (sample_rate, audio))`` on success;
        ``(traceback_text, (None, None))`` when anything fails.
    """
    global tgt_sr, net_g, vc, hubert_model, version
    if input_audio_path is None:
        return "You need to upload an audio", None
    f0_up_key = int(f0_up_key)
    try:
        audio = load_audio(input_audio_path, 16000, DoFormant, Quefrency, Timbre)
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1:
            audio /= audio_max
        times = [0, 0, 0]
        if hubert_model is None:
            load_hubert()
        if_f0 = cpt.get("f0", 1)
        # Guard against user mistakes: trim stray quotes/whitespace and swap
        # "trained" for "added" automatically.
        file_index = (
            file_index.strip(" ")
            .strip('"')
            .strip("\n")
            .strip('"')
            .strip(" ")
            .replace("trained", "added")
        )
        audio_opt = vc.pipeline(
            hubert_model,
            net_g,
            sid,
            audio,
            input_audio_path,
            times,
            f0_up_key,
            f0_method,
            file_index,
            # file_big_npy,
            index_rate,
            if_f0,
            filter_radius,
            tgt_sr,
            resample_sr,
            rms_mix_rate,
            version,
            protect,
            crepe_hop_length,
            f0_file=f0_file,
        )
        # Keep the output rate in a local. Assigning to the global ``tgt_sr``
        # here (as before) overwrote the loaded model's sample rate, so every
        # later call handed the wrong model rate to the pipeline.
        if resample_sr >= 16000 and tgt_sr != resample_sr:
            output_sr = resample_sr
        else:
            output_sr = tgt_sr
        index_info = (
            "Using index:%s." % file_index
            if os.path.exists(file_index)
            else "Index not used."
        )
        return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
            index_info,
            times[0],
            times[1],
            times[2],
        ), (output_sr, audio_opt)
    except Exception:
        # Report the failure as text to the caller instead of raising.
        info = traceback.format_exc()
        print(info)
        return info, (None, None)
|
343 |
+
|
344 |
+
|
345 |
+
def vc_multi(
    sid,
    dir_path,
    opt_root,
    paths,
    f0_up_key,
    f0_method,
    file_index,
    file_index2,
    # file_big_npy,
    index_rate,
    filter_radius,
    resample_sr,
    rms_mix_rate,
    protect,
    format1,
    crepe_hop_length,
):
    """Convert a batch of audio files, yielding a growing status report.

    Inputs come from ``dir_path`` (every entry of that directory) when it is
    non-empty, otherwise from ``paths`` (objects exposing ``.name``). Each file
    is converted with ``vc_single`` and written into ``opt_root``: directly
    for "wav"/"flac", otherwise as a ``.wav`` that is then converted by
    ``ffmpeg``. ``file_index2`` is accepted but not used.

    Yields:
        The newline-joined ``"<input name>-><status>"`` lines collected so
        far, after every file and once more at the end. If the batch itself
        fails, a single traceback string is yielded instead.
    """
    try:
        # Tolerate pasted paths wrapped in spaces, quotes or newlines.
        dir_path = dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        os.makedirs(opt_root, exist_ok=True)
        try:
            if dir_path != "":
                paths = [os.path.join(dir_path, name) for name in os.listdir(dir_path)]
            else:
                paths = [path.name for path in paths]
        except Exception:
            # Directory listing failed: fall back to the uploaded files.
            traceback.print_exc()
            paths = [path.name for path in paths]
        infos = []
        for path in paths:
            source_name = os.path.basename(path)
            info, opt = vc_single(
                sid,
                path,
                f0_up_key,
                None,
                f0_method,
                file_index,
                # file_big_npy,
                index_rate,
                filter_radius,
                resample_sr,
                rms_mix_rate,
                protect,
                crepe_hop_length,
            )
            if "Success" in info:
                try:
                    tgt_sr, audio_opt = opt
                    if format1 in ["wav", "flac"]:
                        sf.write(
                            "%s/%s.%s" % (opt_root, source_name, format1),
                            audio_opt,
                            tgt_sr,
                        )
                    else:
                        wav_path = "%s/%s.wav" % (opt_root, source_name)
                        sf.write(
                            wav_path,
                            audio_opt,
                            tgt_sr,
                        )
                        if os.path.exists(wav_path):
                            # Argument list instead of a shell string, so paths
                            # containing spaces or shell metacharacters are
                            # passed to ffmpeg intact. Argument order is
                            # unchanged from the original command line.
                            subprocess.run(
                                [
                                    "ffmpeg",
                                    "-i",
                                    wav_path,
                                    "-vn",
                                    wav_path[:-4] + ".%s" % format1,
                                    "-q:a",
                                    "2",
                                    "-y",
                                ]
                            )
                except Exception:
                    info += traceback.format_exc()
            # Always report under the input file name (the intermediate .wav
            # path no longer leaks into the report).
            infos.append("%s->%s" % (source_name, info))
            yield "\n".join(infos)
        yield "\n".join(infos)
    except Exception:
        # ``Exception`` rather than a bare ``except``: a bare clause would also
        # catch GeneratorExit and then yield, which is an error when the
        # consumer closes this generator early.
        yield traceback.format_exc()
|
422 |
+
|
423 |
+
# Only one voice can be loaded globally per tab.
def _build_synthesizer(checkpoint):
    """Instantiate the synthesizer class matching ``checkpoint``.

    The class is chosen from the checkpoint's ``version`` ("v1" by default)
    and ``f0`` flag (1 by default). Returns ``None`` for any other version.
    """
    uses_f0 = checkpoint.get("f0", 1) == 1
    model_version = checkpoint.get("version", "v1")
    if model_version == "v1":
        if uses_f0:
            return SynthesizerTrnMs256NSFsid(*checkpoint["config"], is_half=config.is_half)
        return SynthesizerTrnMs256NSFsid_nono(*checkpoint["config"])
    if model_version == "v2":
        if uses_f0:
            return SynthesizerTrnMs768NSFsid(*checkpoint["config"], is_half=config.is_half)
        return SynthesizerTrnMs768NSFsid_nono(*checkpoint["config"])
    return None


def get_vc(sid):
    """Load the voice model named ``sid`` or unload the current one.

    Args:
        sid: File name of a checkpoint inside ``weight_root``; ``""`` or
            ``[]`` requests unloading.

    Returns:
        An update dict. After loading it also carries ``"maximum"`` set to
        the checkpoint's speaker count.

    Raises:
        ValueError: If the checkpoint's version is neither "v1" nor "v2".
    """
    global n_spk, tgt_sr, net_g, vc, cpt, version, hubert_model
    if sid == "" or sid == []:
        # Because of polling, only clean up when a model was actually loaded
        # before (i.e. sid switched from "some model" to "no model").
        if hubert_model is not None:
            print("clean_empty_cache")
            previous_cpt = globals().get("cpt")
            # Rebinding to None drops the references; unlike ``del`` it cannot
            # fail when one of the globals was never set.
            hubert_model = net_g = n_spk = vc = tgt_sr = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            # Original author's note: without the extra round-trip below the
            # clean-up is incomplete. Skipped when no checkpoint is held,
            # which previously crashed on ``cpt.get``.
            if previous_cpt is not None:
                version = previous_cpt.get("version", "v1")
                scratch_model = _build_synthesizer(previous_cpt)
                del scratch_model, previous_cpt
            cpt = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        return {"visible": False, "__type__": "update"}
    person = "%s/%s" % (weight_root, sid)
    print("loading %s" % person)
    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # trusted sources.
    cpt = torch.load(person, map_location="cpu")
    tgt_sr = cpt["config"][-1]
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
    version = cpt.get("version", "v1")
    net_g = _build_synthesizer(cpt)
    if net_g is None:
        # Previously an unknown version fell through to a stale or unbound
        # ``net_g`` and failed later with an unrelated error.
        raise ValueError("Unsupported model version: %r" % (version,))
    del net_g.enc_q
    print(net_g.load_state_dict(cpt["weight"], strict=False))
    net_g.eval().to(config.device)
    if config.is_half:
        net_g = net_g.half()
    else:
        net_g = net_g.float()
    vc = VC(tgt_sr, config)
    n_spk = cpt["config"][-3]
    return {"visible": False, "maximum": n_spk, "__type__": "update"}
|
483 |
+
|
484 |
+
|
485 |
+
def change_choices():
    """Re-scan the weights and index directories.

    Returns:
        Two update dicts whose ``choices`` are, respectively, the sorted
        ``.pth`` file names in ``weight_root`` and the sorted ``.index`` paths
        below ``index_root`` (files whose name contains "trained" excluded).
    """
    model_names = [fname for fname in os.listdir(weight_root) if fname.endswith(".pth")]
    found_indexes = [
        "%s/%s" % (folder, fname)
        for folder, _, fnames in os.walk(index_root, topdown=False)
        for fname in fnames
        if fname.endswith(".index") and "trained" not in fname
    ]
    models_update = {"choices": sorted(model_names), "__type__": "update"}
    indexes_update = {"choices": sorted(found_indexes), "__type__": "update"}
    return models_update, indexes_update
|
499 |
+
|
500 |
+
|
501 |
+
def clean():
    """Return an update dict that resets a component's value to ``""``."""
    reset_update = dict(value="", __type__="update")
    return reset_update
|
503 |
+
|
504 |
+
|
505 |
+
# Maps the sample-rate labels used as keys to their value in Hz.
sr_dict = dict(
    zip(
        ("32k", "40k", "48k"),
        (32000, 40000, 48000),
    )
)
|
510 |
+
|
511 |
+
|
512 |
+
def if_done(done, p):
    """Block until process ``p`` has exited, then set ``done[0]`` to ``True``.

    ``p.poll()`` is checked every half second; ``None`` means "still running".
    """
    while p.poll() is None:
        sleep(0.5)
    done[0] = True
|
519 |
+
|
520 |
+
|
521 |
+
def if_done_multi(done, ps):
    """Block until every process in ``ps`` has exited, then set ``done[0]``.

    ``poll()`` returning ``None`` means the process is still running; as long
    as any single process is unfinished, wait half a second and check again.
    """
    while any(proc.poll() is None for proc in ps):
        sleep(0.5)
    done[0] = True
|
534 |
+
|
535 |
+
|
536 |
+
def preprocess_dataset(trainset_dir, exp_dir, sr, n_p):
    """Run the dataset preprocessing script and stream its log file.

    Args:
        trainset_dir: Dataset directory passed to the script.
        exp_dir: Experiment name; output goes to ``<now_dir>/logs/<exp_dir>``.
        sr: Sample-rate label, looked up in ``sr_dict``.
        n_p: Process count passed to the script.

    Yields:
        The full current content of ``preprocess.log``, about once per second
        while the subprocess runs and one final time after it has exited.
    """
    sr = sr_dict[sr]
    exp_path = "%s/logs/%s" % (now_dir, exp_dir)
    log_path = "%s/logs/%s/preprocess.log" % (now_dir, exp_dir)
    os.makedirs(exp_path, exist_ok=True)
    # Create (or truncate) the log so the polling loop below can open it.
    with open(log_path, "w"):
        pass
    cmd = (
        config.python_cmd
        + " trainset_preprocess_pipeline_print.py %s %s %s %s/logs/%s "
        % (trainset_dir, sr, n_p, now_dir, exp_dir)
        + str(config.noparallel)
    )
    print(cmd)
    # NOTE(review): the command is a shell string built from caller-supplied
    # values without quoting; paths containing spaces will not survive.
    p = Popen(cmd, shell=True)  # , stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir
    # Original author's note: through Gradio, the process output could only be
    # read in one go after it finished, so progress is polled from the log
    # file on a timer instead.
    done = [False]
    watcher = threading.Thread(target=if_done, args=(done, p))
    watcher.start()
    while True:
        with open(log_path, "r") as f:
            yield (f.read())
        sleep(1)
        if done[0]:
            break
    with open(log_path, "r") as f:
        log = f.read()
    print(log)
    yield log
|
568 |
+
|
569 |
+
# but2.click(extract_f0,[gpus6,np7,f0method8,if_f0_3,trainset_dir4],[info2])
|
570 |
+
def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, echl):
    """Run pitch extraction (optional) and feature extraction, streaming the log.

    Args:
        gpus: Dash-separated GPU ids; one feature-extraction process is
            started per id.
        n_p: Process count passed to the f0 script.
        f0method: Pitch method name passed to the f0 script.
        if_f0: When truthy, ``extract_f0_print.py`` is run first.
        exp_dir: Experiment name; logs live in ``<now_dir>/logs/<exp_dir>``.
        version19: Version string passed to the feature script.
        echl: Extra value passed as the last argument of the f0 script.

    Yields:
        The full current content of ``extract_f0_feature.log``, about once per
        second while each stage runs and once more after the stage finishes.
    """
    gpus = gpus.split("-")
    log_path = "%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir)
    os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True)
    # Create (or truncate) the log so the polling helper can open it.
    with open(log_path, "w"):
        pass

    def _stream_log(done):
        # Original author's note: through Gradio, the process output could
        # only be read in one go after it finished, so progress is polled
        # from the log file on a timer instead.
        while True:
            with open(log_path, "r") as f:
                yield (f.read())
            sleep(1)
            if done[0]:
                break
        with open(log_path, "r") as f:
            log = f.read()
        print(log)
        yield log

    if if_f0:
        cmd = config.python_cmd + " extract_f0_print.py %s/logs/%s %s %s %s" % (
            now_dir,
            exp_dir,
            n_p,
            f0method,
            echl,
        )
        print(cmd)
        p = Popen(cmd, shell=True, cwd=now_dir)  # , stdin=PIPE, stdout=PIPE,stderr=PIPE
        done = [False]
        threading.Thread(target=if_done, args=(done, p)).start()
        yield from _stream_log(done)

    # Start one process per part. The feature script receives, in order:
    # device, number of parts, part index, GPU id, experiment dir, version.
    leng = len(gpus)
    ps = []
    for idx, n_g in enumerate(gpus):
        cmd = (
            config.python_cmd
            + " extract_feature_print.py %s %s %s %s %s/logs/%s %s"
            % (
                config.device,
                leng,
                idx,
                n_g,
                now_dir,
                exp_dir,
                version19,
            )
        )
        print(cmd)
        ps.append(
            Popen(cmd, shell=True, cwd=now_dir)
        )  # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
    done = [False]
    threading.Thread(target=if_done_multi, args=(done, ps)).start()
    yield from _stream_log(done)
|
654 |
+
|
655 |
+
|
656 |
+
def change_sr2(sr2, if_f0_3, version19):
    """Resolve the pretrained generator/discriminator paths for a sample rate.

    Returns a 3-tuple ``(generator_path, discriminator_path, update)`` where a
    path is ``""`` when the corresponding file does not exist on disk, and
    ``update`` is a visibility-update dict for a UI component.
    """
    folder = "pretrained" + ("" if version19 == "v1" else "_v2")
    prefix = "f0" if if_f0_3 else ""
    candidates = [
        "%s/%s%s%s.pth" % (folder, prefix, kind, sr2) for kind in ("G", "D")
    ]
    # Probe both files first, then report the missing ones (generator first).
    present = [os.access(candidate, os.F_OK) for candidate in candidates]
    for candidate, found in zip(candidates, present):
        if not found:
            print(candidate, "not exist, will not use pretrained model")
    return (
        candidates[0] if present[0] else "",
        candidates[1] if present[1] else "",
        {"visible": True, "__type__": "update"},
    )
|
670 |
+
|
671 |
+
def change_version19(sr2, if_f0_3, version19):
    """Resolve the pretrained generator/discriminator paths for a model version.

    Returns ``(generator_path, discriminator_path)``; each entry is ``""`` when
    the corresponding file does not exist on disk.
    """
    folder = "pretrained" + ("" if version19 == "v1" else "_v2")
    prefix = "f0" if if_f0_3 else ""
    candidates = [
        "%s/%s%s%s.pth" % (folder, prefix, kind, sr2) for kind in ("G", "D")
    ]
    # Probe both files first, then report the missing ones (generator first).
    present = [os.access(candidate, os.F_OK) for candidate in candidates]
    for candidate, found in zip(candidates, present):
        if not found:
            print(candidate, "not exist, will not use pretrained model")
    return (
        candidates[0] if present[0] else "",
        candidates[1] if present[1] else "",
    )
|
684 |
+
|
685 |
+
|
686 |
+
def change_f0(if_f0_3, sr2, version19):  # f0method8,pretrained_G14,pretrained_D15
    """Resolve pretrained model paths when the pitch-guidance switch changes.

    Returns ``(update, generator_path, discriminator_path)``. ``update`` is a
    visibility-update dict whose ``visible`` flag follows ``if_f0_3``; each
    path is ``""`` when that file does not exist on disk.

    The existence check is made against the very file that is returned.
    Previously the non-f0 branch returned ``G*.pth``/``D*.pth`` but decided
    based on whether ``f0G*.pth``/``f0D*.pth`` existed.
    """
    path_str = "" if version19 == "v1" else "_v2"
    f0_str = "f0" if if_f0_3 else ""
    resolved = []
    for kind in ("G", "D"):
        candidate = "pretrained%s/%s%s%s.pth" % (path_str, f0_str, kind, sr2)
        if os.access(candidate, os.F_OK):
            resolved.append(candidate)
        else:
            print(candidate, "not exist, will not use pretrained model")
            resolved.append("")
    return (
        {"visible": bool(if_f0_3), "__type__": "update"},
        resolved[0],
        resolved[1],
    )
|
705 |
+
|
706 |
+
|
707 |
+
global log_interval
|
708 |
+
|
709 |
+
|
710 |
+
def set_log_interval(exp_dir, batch_size12):
    """Derive a logging interval from the number of ``.wav`` files.

    Counts the ``.wav`` entries in ``<exp_dir>/1_16k_wavs`` and returns
    ``ceil(count / batch_size12)``, plus one when that value exceeds 1.
    Returns 1 when the folder is missing or contains no ``.wav`` files.
    """
    folder_path = os.path.join(exp_dir, "1_16k_wavs")
    if not os.path.isdir(folder_path):
        return 1

    sample_size = sum(
        1 for entry in os.listdir(folder_path) if entry.endswith(".wav")
    )
    if sample_size == 0:
        return 1

    interval = math.ceil(sample_size / batch_size12)
    return interval + 1 if interval > 1 else interval
|
723 |
+
|
724 |
+
# but3.click(click_train,[exp_dir1,sr2,if_f0_3,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16])
|
725 |
+
def click_train(
    exp_dir1,
    sr2,
    if_f0_3,
    spk_id5,
    save_epoch10,
    total_epoch11,
    batch_size12,
    if_save_latest13,
    pretrained_G14,
    pretrained_D15,
    gpus16,
    if_cache_gpu17,
    if_save_every_weights18,
    version19,
):
    """Write the training filelist and run the training script to completion.

    Builds ``<now_dir>/logs/<exp_dir1>/filelist.txt`` from the files common to
    the experiment's data folders (plus two "mute" entries), launches
    ``train_nsf_sim_cache_sid_load_pretrain.py`` as a subprocess, stores its
    pid in the module-level ``PID`` and blocks until it exits.

    Returns a 3-tuple: a status message and two visibility-update dicts.
    """
    global PID

    def as_flag(value):
        # Only a value equal to True counts as "on", as in the original UI wiring.
        return 1 if value == True else 0  # noqa: E712

    CSVutil('csvdb/stop.csv', 'w+', 'formanting', False)

    # Generate the filelist.
    exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
    os.makedirs(exp_dir, exist_ok=True)
    fea_dim = 256 if version19 == "v1" else 768
    gt_wavs_dir = "%s/0_gt_wavs" % (exp_dir)
    feature_dir = "%s/3_feature%s" % (exp_dir, fea_dim)

    log_interval = set_log_interval(exp_dir, batch_size12)

    # One (directory, file suffix) pair per filelist column.
    columns = [(gt_wavs_dir, ".wav"), (feature_dir, ".npy")]
    if if_f0_3:
        columns.append(("%s/2a_f0" % (exp_dir), ".wav.npy"))
        columns.append(("%s/2b-f0nsf" % (exp_dir), ".wav.npy"))

    # Keep only the samples that are present in every required folder.
    names = set.intersection(
        *[
            {entry.split(".")[0] for entry in os.listdir(directory)}
            for directory, _ in columns
        ]
    )
    opt = []
    for name in names:
        fields = [
            "%s/%s%s" % (directory.replace("\\", "\\\\"), name, suffix)
            for directory, suffix in columns
        ]
        fields.append("%s" % spk_id5)
        opt.append("|".join(fields))

    # Two "mute" samples are always appended.
    mute_root = "%s/logs/mute" % now_dir
    mute_fields = [
        "%s/0_gt_wavs/mute%s.wav" % (mute_root, sr2),
        "%s/3_feature%s/mute.npy" % (mute_root, fea_dim),
    ]
    if if_f0_3:
        mute_fields.append("%s/2a_f0/mute.wav.npy" % mute_root)
        mute_fields.append("%s/2b-f0nsf/mute.wav.npy" % mute_root)
    mute_fields.append("%s" % spk_id5)
    opt.extend(["|".join(mute_fields)] * 2)

    shuffle(opt)
    with open("%s/filelist.txt" % exp_dir, "w") as f:
        f.write("\n".join(opt))
    print("write filelist done")

    print("use gpus:", gpus16)
    if pretrained_G14 == "":
        print("no pretrained Generator")
    if pretrained_D15 == "":
        print("no pretrained Discriminator")

    # Optional arguments are simply omitted when unset. The previous no-GPU
    # branch injected a literal "\b" (backspace) into the shell command instead.
    args = [
        "-e %s" % exp_dir1,
        "-sr %s" % sr2,
        "-f0 %s" % (1 if if_f0_3 else 0),
        "-bs %s" % batch_size12,
    ]
    if gpus16:
        args.append("-g %s" % gpus16)
    args.append("-te %s" % total_epoch11)
    args.append("-se %s" % save_epoch10)
    if pretrained_G14 != "":
        args.append("-pg %s" % pretrained_G14)
    if pretrained_D15 != "":
        args.append("-pd %s" % pretrained_D15)
    args.extend(
        [
            "-l %s" % as_flag(if_save_latest13),
            "-c %s" % as_flag(if_cache_gpu17),
            "-sw %s" % as_flag(if_save_every_weights18),
            "-v %s" % version19,
            "-li %s" % log_interval,
        ]
    )
    # NOTE(review): the command is a shell string built from UI values
    # (experiment name, pretrained paths); values are not quoted or escaped.
    cmd = (
        config.python_cmd
        + " train_nsf_sim_cache_sid_load_pretrain.py "
        + " ".join(args)
    )
    print(cmd)
    p = Popen(cmd, shell=True, cwd=now_dir)
    PID = p.pid
    p.wait()
    return ("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", {"visible": False, "__type__": "update"}, {"visible": True, "__type__": "update"})
|
866 |
+
|
867 |
+
|
868 |
+
# but4.click(train_index, [exp_dir1], info3)
|
869 |
+
def train_index(exp_dir1, version19):
    """Build the faiss IVF index for an experiment, yielding progress text.

    This is a generator: every yielded value is the accumulated status text.
    Loads every array in the experiment's feature folder, shuffles the rows,
    saves them as ``total_fea.npy``, then trains and fills an ``IVF<n>,Flat``
    index and writes both the trained and the populated index files.

    When the feature folder is missing or empty, a single message asking the
    user to run feature extraction first is yielded. (It used to be passed to
    ``return``, which in a generator never reaches the consumer.)
    """
    exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
    os.makedirs(exp_dir, exist_ok=True)
    fea_dim = 256 if version19 == "v1" else 768
    feature_dir = "%s/3_feature%s" % (exp_dir, fea_dim)
    if not os.path.exists(feature_dir):
        yield "请先进行特征提取!"
        return
    listdir_res = sorted(os.listdir(feature_dir))
    if not listdir_res:
        yield "请先进行特征提取!"
        return

    npys = [np.load("%s/%s" % (feature_dir, name)) for name in listdir_res]
    big_npy = np.concatenate(npys, 0)
    # Shuffle the rows before saving/training.
    big_npy_idx = np.arange(big_npy.shape[0])
    np.random.shuffle(big_npy_idx)
    big_npy = big_npy[big_npy_idx]
    np.save("%s/total_fea.npy" % exp_dir, big_npy)

    n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
    infos = []
    infos.append("%s,%s" % (big_npy.shape, n_ivf))
    yield "\n".join(infos)

    index = faiss.index_factory(fea_dim, "IVF%s,Flat" % n_ivf)
    infos.append("training")
    yield "\n".join(infos)
    index_ivf = faiss.extract_index_ivf(index)
    index_ivf.nprobe = 1
    index.train(big_npy)
    faiss.write_index(
        index,
        "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
    )

    infos.append("adding")
    yield "\n".join(infos)
    # Add the vectors in fixed-size batches.
    batch_size_add = 8192
    for i in range(0, big_npy.shape[0], batch_size_add):
        index.add(big_npy[i : i + batch_size_add])
    faiss.write_index(
        index,
        "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
    )
    infos.append(
        "成功构建索引,added_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (n_ivf, index_ivf.nprobe, exp_dir1, version19)
    )
    yield "\n".join(infos)
|
926 |
+
|
927 |
+
|
928 |
+
# but5.click(train1key, [exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17], info3)
|
929 |
+
def train1key(
    exp_dir1,
    sr2,
    if_f0_3,
    trainset_dir4,
    spk_id5,
    np7,
    f0method8,
    save_epoch10,
    total_epoch11,
    batch_size12,
    if_save_latest13,
    pretrained_G14,
    pretrained_D15,
    gpus16,
    if_cache_gpu17,
    if_save_every_weights18,
    version19,
    echl
):
    """Run the whole training pipeline, yielding the accumulated status log.

    Steps, in order: dataset preprocessing, optional pitch extraction, feature
    extraction (one subprocess per ``-``-separated entry of ``gpus16``),
    filelist generation, model training, and faiss index construction. Each
    external step is a subprocess that is waited on before continuing.

    This is a generator; every yielded value is the full status text so far.
    """
    infos = []

    def get_info_str(strr):
        infos.append(strr)
        return "\n".join(infos)

    def print_log(path):
        # Echo a step's log file to the console.
        with open(path, "r") as f:
            print(f.read())

    def as_flag(value):
        # Only a value equal to True counts as "on", as in the original UI wiring.
        return 1 if value == True else 0  # noqa: E712

    fea_dim = 256 if version19 == "v1" else 768
    model_log_dir = "%s/logs/%s" % (now_dir, exp_dir1)
    preprocess_log_path = "%s/preprocess.log" % model_log_dir
    extract_f0_feature_log_path = "%s/extract_f0_feature.log" % model_log_dir
    gt_wavs_dir = "%s/0_gt_wavs" % model_log_dir
    feature_dir = "%s/3_feature%s" % (model_log_dir, fea_dim)

    os.makedirs(model_log_dir, exist_ok=True)

    ######### step1: preprocess the dataset
    open(preprocess_log_path, "w").close()
    cmd = (
        config.python_cmd
        + " trainset_preprocess_pipeline_print.py %s %s %s %s "
        % (trainset_dir4, sr_dict[sr2], np7, model_log_dir)
        + str(config.noparallel)
    )
    yield get_info_str(i18n("step1:正在处理数据"))
    yield get_info_str(cmd)
    p = Popen(cmd, shell=True)
    p.wait()
    print_log(preprocess_log_path)

    ######### step2a: extract pitch
    # Truncate the log; the handle is closed right away (it used to be leaked).
    with open(extract_f0_feature_log_path, "w"):
        pass
    if if_f0_3:
        yield get_info_str("step2a:正在提取音高")
        cmd = config.python_cmd + " extract_f0_print.py %s %s %s %s" % (
            model_log_dir,
            np7,
            f0method8,
            echl
        )
        yield get_info_str(cmd)
        p = Popen(cmd, shell=True, cwd=now_dir)
        p.wait()
        print_log(extract_f0_feature_log_path)
    else:
        yield get_info_str(i18n("step2a:无需提取音高"))

    ####### step2b: extract features
    # The i18n key below was mis-encoded in this file; restored to the intact text.
    yield get_info_str(i18n("step2b:正在提取特征"))
    gpus = gpus16.split("-")
    leng = len(gpus)
    ps = []
    for idx, n_g in enumerate(gpus):
        cmd = config.python_cmd + " extract_feature_print.py %s %s %s %s %s %s" % (
            config.device,
            leng,
            idx,
            n_g,
            model_log_dir,
            version19,
        )
        yield get_info_str(cmd)
        ps.append(Popen(cmd, shell=True, cwd=now_dir))
    for p in ps:
        p.wait()
    print_log(extract_f0_feature_log_path)

    ####### step3a: train the model
    yield get_info_str(i18n("step3a:正在训练模型"))
    # Generate the filelist: one (directory, file suffix) pair per column.
    columns = [(gt_wavs_dir, ".wav"), (feature_dir, ".npy")]
    if if_f0_3:
        columns.append(("%s/2a_f0" % model_log_dir, ".wav.npy"))
        columns.append(("%s/2b-f0nsf" % model_log_dir, ".wav.npy"))
    # Keep only the samples that are present in every required folder.
    names = set.intersection(
        *[
            {entry.split(".")[0] for entry in os.listdir(directory)}
            for directory, _ in columns
        ]
    )
    opt = []
    for name in names:
        fields = [
            "%s/%s%s" % (directory.replace("\\", "\\\\"), name, suffix)
            for directory, suffix in columns
        ]
        fields.append("%s" % spk_id5)
        opt.append("|".join(fields))

    # Two "mute" samples are always appended.
    mute_root = "%s/logs/mute" % now_dir
    mute_fields = [
        "%s/0_gt_wavs/mute%s.wav" % (mute_root, sr2),
        "%s/3_feature%s/mute.npy" % (mute_root, fea_dim),
    ]
    if if_f0_3:
        mute_fields.append("%s/2a_f0/mute.wav.npy" % mute_root)
        mute_fields.append("%s/2b-f0nsf/mute.wav.npy" % mute_root)
    mute_fields.append("%s" % spk_id5)
    opt.extend(["|".join(mute_fields)] * 2)

    shuffle(opt)
    with open("%s/filelist.txt" % model_log_dir, "w") as f:
        f.write("\n".join(opt))
    yield get_info_str("write filelist done")

    # Optional arguments are omitted when unset.
    args = [
        "-e %s" % exp_dir1,
        "-sr %s" % sr2,
        "-f0 %s" % (1 if if_f0_3 else 0),
        "-bs %s" % batch_size12,
    ]
    if gpus16:
        args.append("-g %s" % gpus16)
    args.append("-te %s" % total_epoch11)
    args.append("-se %s" % save_epoch10)
    if pretrained_G14 != "":
        args.append("-pg %s" % pretrained_G14)
    if pretrained_D15 != "":
        args.append("-pd %s" % pretrained_D15)
    args.extend(
        [
            "-l %s" % as_flag(if_save_latest13),
            "-c %s" % as_flag(if_cache_gpu17),
            "-sw %s" % as_flag(if_save_every_weights18),
            "-v %s" % version19,
        ]
    )
    # NOTE(review): shell command assembled from unquoted UI values.
    cmd = (
        config.python_cmd
        + " train_nsf_sim_cache_sid_load_pretrain.py "
        + " ".join(args)
    )
    yield get_info_str(cmd)
    p = Popen(cmd, shell=True, cwd=now_dir)
    p.wait()
    yield get_info_str(i18n("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log"))

    ####### step3b: build the index
    npys = [
        np.load("%s/%s" % (feature_dir, name))
        for name in sorted(os.listdir(feature_dir))
    ]
    big_npy = np.concatenate(npys, 0)

    # Shuffle the rows before saving/training.
    big_npy_idx = np.arange(big_npy.shape[0])
    np.random.shuffle(big_npy_idx)
    big_npy = big_npy[big_npy_idx]
    np.save("%s/total_fea.npy" % model_log_dir, big_npy)

    n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
    yield get_info_str("%s,%s" % (big_npy.shape, n_ivf))
    index = faiss.index_factory(fea_dim, "IVF%s,Flat" % n_ivf)
    yield get_info_str("training index")
    index_ivf = faiss.extract_index_ivf(index)
    index_ivf.nprobe = 1
    index.train(big_npy)
    faiss.write_index(
        index,
        "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (model_log_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
    )
    yield get_info_str("adding index")
    batch_size_add = 8192
    for i in range(0, big_npy.shape[0], batch_size_add):
        index.add(big_npy[i : i + batch_size_add])
    faiss.write_index(
        index,
        "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (model_log_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
    )
    yield get_info_str(
        "成功构建索引, added_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (n_ivf, index_ivf.nprobe, exp_dir1, version19)
    )
    yield get_info_str(i18n("全流程结束!"))
|
1164 |
+
|
1165 |
+
|
1166 |
+
def whethercrepeornah(radio):
    """Return a visibility update that is shown only for the crepe methods."""
    is_crepe = radio in ('mangio-crepe', 'mangio-crepe-tiny')
    return {"visible": is_crepe, "__type__": "update"}
|
1169 |
+
|
1170 |
+
# ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__])
|
1171 |
+
def change_info_(ckpt_path):
    """Read sample rate, f0 flag and version from the checkpoint's train.log.

    Looks for ``train.log`` in the same directory as ``ckpt_path`` and parses
    the last tab-separated field of its first line as a Python literal.

    Returns ``(sample_rate, str(if_f0), version)`` on success, or three
    no-op update dicts when the log is missing or cannot be parsed.
    """
    # Build the sibling path from the directory. The previous
    # str.replace(basename, "train.log") rewrote every occurrence of the
    # basename, including ones inside directory names.
    log_path = os.path.join(os.path.dirname(ckpt_path), "train.log")
    if not os.path.exists(log_path):
        return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}
    try:
        with open(log_path, "r") as f:
            first_line = f.read().strip("\n").split("\n")[0]
        # SECURITY NOTE(review): eval() executes whatever the log file contains.
        # Only safe while train.log is written by this project itself;
        # consider ast.literal_eval once the log format is confirmed.
        info = eval(first_line.split("\t")[-1])
        sr, f0 = info["sample_rate"], info["if_f0"]
        version = "v2" if ("version" in info and info["version"] == "v2") else "v1"
        return sr, str(f0), version
    except Exception:
        traceback.print_exc()
        return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}
|
1188 |
+
|
1189 |
+
|
1190 |
+
from lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM
|
1191 |
+
|
1192 |
+
|
1193 |
+
def export_onnx(ModelPath, ExportedPath, MoeVS=True):
    """Export a saved checkpoint to an ONNX file.

    Loads the checkpoint on CPU, instantiates ``SynthesizerTrnMsNSFsidM`` in
    fp32, traces it with random dummy inputs and writes the ONNX graph to
    ``ExportedPath``. ``MoeVS`` is accepted but not used here.

    Returns the string ``"Finished"``.
    """
    checkpoint = torch.load(ModelPath, map_location="cpu")
    # Replace the stored speaker count with the real embedding-table size.
    checkpoint["config"][-3] = checkpoint["weight"]["emb_g.weight"].shape[0]  # n_spk
    model_version = checkpoint.get("version", "v1")
    # Hidden-unit width: 256 for v1 models, 768 otherwise.
    feature_width = 256 if model_version == "v1" else 768

    frames = 200
    # Dummy inputs, created in the same order as the input names below.
    dummy_inputs = (
        torch.rand(1, frames, feature_width),  # hidden units
        torch.tensor([frames]).long(),  # hidden-unit length
        torch.randint(size=(1, frames), low=5, high=255),  # pitch
        torch.rand(1, frames),  # nsf pitch
        torch.LongTensor([0]),  # speaker id
        torch.rand(1, 192, frames),  # noise
    )

    export_device = "cpu"  # device used while exporting

    # Exported in fp32.
    net_g = SynthesizerTrnMsNSFsidM(
        *checkpoint["config"], is_half=False, version=model_version
    )
    net_g.load_state_dict(checkpoint["weight"], strict=False)

    torch.onnx.export(
        net_g,
        tuple(tensor.to(export_device) for tensor in dummy_inputs),
        ExportedPath,
        dynamic_axes={
            "phone": [1],
            "pitch": [1],
            "pitchf": [1],
            "rnd": [2],
        },
        do_constant_folding=False,
        opset_version=16,
        verbose=False,
        input_names=["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"],
        output_names=["audio"],
    )
    return "Finished"
|
1241 |
+
|
1242 |
+
#region RVC WebUI App
|
1243 |
+
|
1244 |
+
def get_presets():
    """Return the ``name`` of every preset in ``../inference-presets.json``."""
    with open('../inference-presets.json', 'r') as file:
        presets = json.load(file)['presets']
    return [preset['name'] for preset in presets]
|
1253 |
+
|
1254 |
+
def change_choices2():
    """Rescan ``./audios`` and return updated dropdown choices.

    Returns a pair of update dicts: the first carries the sorted list of audio
    file paths (forward slashes), the second is a no-op update.
    """
    extensions = ('.wav', '.mp3', '.ogg', '.flac', '.m4a', '.aac', '.mp4')
    found = sorted(
        os.path.join('./audios', entry).replace('\\', '/')
        for entry in os.listdir("./audios")
        if entry.endswith(extensions)
    )
    return {"choices": found, "__type__": "update"}, {"__type__": "update"}
|
1260 |
+
|
1261 |
+
# Initial scan of ./audios: collect the audio files available at start-up.
audio_files = []
for filename in os.listdir("./audios"):
    if not filename.endswith(('.wav', '.mp3', '.ogg', '.flac', '.m4a', '.aac', '.mp4')):
        continue
    audio_files.append(os.path.join('./audios', filename).replace('\\', '/'))
|
1265 |
+
|
1266 |
+
def get_index():
    """Return the path of an ``.index`` file for the first model, or ``''``.

    The first model is the first entry of ``sorted(names)``; its log folder is
    ``./logs/<name without extension>``. ``''`` is returned on every path
    where no index is found (no models, no log folder, no ``.index`` file),
    so callers never receive ``None``.
    """
    if check_for_name() == '':
        return ''
    chosen_model = sorted(names)[0].split(".")[0]
    logs_path = "./logs/" + chosen_model
    if not os.path.exists(logs_path):
        return ''
    for file in os.listdir(logs_path):
        if file.endswith(".index"):
            return os.path.join(logs_path, file)
    return ''
|
1277 |
+
|
1278 |
+
def get_indexes():
    """Return every ``.index`` file path found under ``./logs/``.

    Returns a list of paths, or ``''`` when none is found.
    """
    found = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk("./logs/")
        for filename in filenames
        if filename.endswith(".index")
    ]
    return found if found else ''
|
1288 |
+
|
1289 |
+
def get_name():
    """Return the first audio file path in sorted order, or ``''`` if none."""
    return sorted(audio_files)[0] if len(audio_files) > 0 else ''
|
1294 |
+
|
1295 |
+
def save_to_wav(record_button):
    """Move a recorded file into ``./audios`` under a timestamped ``.wav`` name.

    ``record_button`` is the path of the file to move. Returns the new path,
    or ``None`` when ``record_button`` is ``None``.
    """
    if record_button is None:
        return None
    stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    new_path = './audios/' + stamp + '.wav'
    shutil.move(record_button, new_path)
    return new_path
|
1304 |
+
|
1305 |
+
def save_to_wav2(dropbox):
    """Move an uploaded file into ``./audios`` and return its new path.

    ``dropbox`` is an object whose ``name`` attribute is the uploaded file path.
    """
    uploaded = dropbox.name
    shutil.move(uploaded, './audios')
    kept_name = os.path.basename(uploaded)
    return os.path.join('./audios', kept_name)
|
1309 |
+
|
1310 |
+
def match_index(sid0):
    """Return an ``.index`` file path for the selected model, or ``''``.

    The model's log folder is ``./logs/<sid0 up to the first dot>``. ``''`` is
    returned both when the folder is missing and when it contains no
    ``.index`` file (the latter case used to return ``None``).
    """
    folder = sid0.split(".")[0]
    parent_dir = "./logs/" + folder
    if not os.path.exists(parent_dir):
        return ''
    for filename in os.listdir(parent_dir):
        if filename.endswith(".index"):
            return os.path.join(parent_dir, filename)
    return ''
|
1320 |
+
|
1321 |
+
def check_for_name():
    """Return the first model name in sorted order, or ``''`` if there is none."""
    return sorted(names)[0] if len(names) > 0 else ''
|
1326 |
+
|
1327 |
+
def download_from_url(url, model):
    """Download a model archive and install its weights and index.

    Downloads into ``./zips`` (``gdown`` for Google Drive links, ``Mega`` for
    mega.nz links, ``wget`` otherwise), unpacks every ``.zip`` into
    ``./unzips``, then copies ``.index`` files to ``./logs/<model>`` and the
    first-level weight file (a ``.pth`` without ``G_``/``D_`` in its name) to
    ``./weights/<model>.pth``.

    Returns a short status message. The temporary folders are removed on every
    exit path once the download has been attempted.
    """
    if url == '':
        return "URL cannot be left empty."
    if model == '':
        return "You need to name your model. For example: My-Model"
    url = url.strip()

    # Start from empty scratch folders.
    for directory in ("zips", "unzips"):
        if os.path.exists(directory):
            shutil.rmtree(directory)
    os.makedirs("zips", exist_ok=True)
    os.makedirs("unzips", exist_ok=True)

    archive_path = './zips/' + model + '.zip'
    try:
        if "drive.google.com" in url:
            subprocess.run(["gdown", url, "--fuzzy", "-O", archive_path])
        elif "mega.nz" in url:
            m = Mega()
            m.download_url(url, './zips')
        else:
            subprocess.run(["wget", url, "-O", archive_path])

        # A download that produced nothing used to be reported as "Success.".
        archives = [name for name in os.listdir("./zips") if name.endswith(".zip")]
        if not archives:
            return "No zipfile found."
        for name in archives:
            shutil.unpack_archive(os.path.join("./zips/", name), "./unzips", 'zip')

        for root, dirs, files in os.walk('./unzips'):
            for file in files:
                file_path = os.path.join(root, file)
                if file.endswith(".index"):
                    # exist_ok: the log folder may already exist (or the
                    # archive may carry several index files).
                    os.makedirs(f'./logs/{model}', exist_ok=True)
                    shutil.copy2(file_path, f'./logs/{model}')
                elif "G_" not in file and "D_" not in file and file.endswith(".pth"):
                    shutil.copy(file_path, f'./weights/{model}.pth')
        return "Success."
    except Exception:
        # Keep the user-facing message, but leave the cause in the console.
        traceback.print_exc()
        return "There's been an error."
    finally:
        shutil.rmtree("zips", ignore_errors=True)
        shutil.rmtree("unzips", ignore_errors=True)
|
1368 |
+
def success_message(face):
    """Return an upload confirmation for ``face.name`` and the string 'None'."""
    confirmation = '%s has been uploaded.' % face.name
    return confirmation, 'None'
|
1370 |
+
def mouth(size, face, voice, faces):
    """Run Wav2Lip inference and return the result path with a status text.

    ``size`` selects the resize factor (``'Half'`` -> 2, anything else -> 1).
    ``faces`` selects a bundled face video by name; for ``'None'`` or any
    unrecognised name the uploaded ``face.name`` is used (an unrecognised name
    used to leave the face path undefined and crash).
    ``voice`` is the path of the audio file to lip-sync.
    """
    resize_factor = 2 if size == 'Half' else 1
    bundled_faces = {
        'Ben Shapiro': '/content/wav2lip-HD/inputs/ben-shapiro-10.mp4',
        'Andrew Tate': '/content/wav2lip-HD/inputs/tate-7.mp4',
    }
    character = bundled_faces.get(faces)
    if character is None:
        character = face.name

    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed through verbatim.
    command = [
        "python", "inference.py",
        "--checkpoint_path", "checkpoints/wav2lip.pth",
        "--face", str(character),
        "--audio", str(voice),
        "--pads", "0", "20", "0", "0",
        "--outfile", "/content/wav2lip-HD/outputs/result.mp4",
        "--fps", "24",
        "--resize_factor", str(resize_factor),
    ]
    process = subprocess.Popen(command, cwd='/content/wav2lip-HD/Wav2Lip-master')
    process.communicate()  # wait for inference to finish
    return '/content/wav2lip-HD/outputs/result.mp4', 'Animation completed.'
|
1393 |
+
# ElevenLabs voices offered in the UI, as (display name, voice id) pairs.
_ELEVEN_VOICE_TABLE = (
    ('Adam', 'pNInz6obpgDQGcFmaJgB'),
    ('Antoni', 'ErXwobaYiN019PkySvjV'),
    ('Josh', 'TxGEqnHWrfWFTfGW9XjX'),
    ('Arnold', 'VR6AewLTigWG4xSOukaG'),
    ('Sam', 'yoZ06aMxZJJ28mfd3POQ'),
    ('Bella', 'EXAVITQu4vr4xnSDxMaL'),
    ('Rachel', '21m00Tcm4TlvDq8ikWAM'),
    ('Domi', 'AZnzlk1XvdvUeBnXmlld'),
    ('Elli', 'MF3mGyEYCl7XYWbV9V6O'),
)
eleven_voices = [voice_name for voice_name, _ in _ELEVEN_VOICE_TABLE]
eleven_voices_ids = [voice_id for _, voice_id in _ELEVEN_VOICE_TABLE]
# Display name -> voice id lookup.
chosen_voice = dict(_ELEVEN_VOICE_TABLE)
|
1396 |
+
|
1397 |
+
def stoptraining(mim):
    """Ask the running training process to stop when ``mim`` equals 1.

    Writes the stop flag through ``CSVutil`` and sends ``SIGTERM`` to the
    process id stored in ``PID``; any failure is printed, not raised.
    Returns two visibility-update dicts, or ``None`` when ``mim`` is not 1.
    """
    if int(mim) != 1:
        return None
    try:
        CSVutil('csvdb/stop.csv', 'w+', 'stop', 'True')
        os.kill(PID, signal.SIGTERM)
    except Exception as e:
        print(f"Couldn't click due to {e}")
    hide = {"visible": False, "__type__": "update"}
    show = {"visible": True, "__type__": "update"}
    return (hide, show)
|
1408 |
+
|
1409 |
+
|
1410 |
+
def elevenTTS(xiapi, text, id, lang):
    """Synthesize ``text`` to speech and store the audio under ``./audios``.

    Uses the ElevenLabs HTTP API when both an API key (``xiapi``) and a voice
    name (``id``) are given, otherwise falls back to gTTS. The resulting mp3
    is handed to ``save_to_wav``; its return value is returned twice.
    """
    if xiapi == '' or id == '':
        # No key or no voice selected: fall back to gTTS.
        tts = gTTS(text, lang=lang)
        tts.save('./temp_gTTS.mp3')
        aud_path = save_to_wav('./temp_gTTS.mp3')
        return aud_path, aud_path

    choice = chosen_voice[id]
    CHUNK_SIZE = 1024
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{choice}"
    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": xiapi
    }
    # English uses the monolingual model, everything else the multilingual one.
    model_id = "eleven_monolingual_v1" if lang == 'en' else "eleven_multilingual_v1"
    data = {
        "text": text,
        "model_id": model_id,
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5
        }
    }

    response = requests.post(url, json=data, headers=headers)
    with open('./temp_eleven.mp3', 'wb') as f:
        for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
            if chunk:
                f.write(chunk)
    aud_path = save_to_wav('./temp_eleven.mp3')
    return aud_path, aud_path
|
1451 |
+
|
1452 |
+
def upload_to_dataset(files, dir):
    """Copy uploaded files into a dataset folder and report how many.

    ``files`` is an iterable of objects with a ``name`` path attribute;
    ``dir`` is the target folder (``./dataset`` when empty), created if it
    does not exist. Returns a status message.
    """
    target = dir if dir != '' else './dataset'
    if not os.path.exists(target):
        os.makedirs(target)
    count = 0
    for count, uploaded in enumerate(files, start=1):
        shutil.copy2(uploaded.name, target)
    return f' {count} files uploaded to {target}.'
|
1463 |
+
|
1464 |
+
def zip_downloader(model):
    """Collect the downloadable files for a trained voice model.

    Args:
        model: Voice name; the weights are expected at
            ``./weights/{model}.pth`` and the index under ``./logs/{model}``.

    Returns:
        A 2-tuple ``(files, message)``:

        * weights missing: ``({"__type__": "update"}, <error message>)``;
        * weights and an "added" ``.index`` file found:
          ``([weights_path, index_path], "Done")``;
        * weights only: ``(weights_path, "Could not find Index file.")``.
    """
    weights_path = f'./weights/{model}.pth'
    if not os.path.exists(weights_path):
        return {"__type__": "update"}, f'Make sure the Voice Name is correct. I could not find {model}.pth'

    logs_dir = f'./logs/{model}'
    log_file = None
    # The logs folder may not exist even though the weights do; treat that
    # the same as "no index found" instead of raising FileNotFoundError.
    if os.path.isdir(logs_dir):
        for file in os.listdir(logs_dir):
            if file.endswith('.index') and 'added' in file:
                # As before, the last matching entry wins.
                log_file = file

    if log_file is not None:
        return [weights_path, f'{logs_dir}/{log_file}'], "Done"
    return weights_path, "Could not find Index file."
|
1476 |
+
|
1477 |
+
with gr.Blocks(theme=gr.themes.Base(), title='Mangio-RVC-Web 💻') as app:
|
1478 |
+
with gr.Tabs():
|
1479 |
+
with gr.TabItem("Inference"):
|
1480 |
+
gr.HTML("<h1> Easy GUI v2 (rejekts) - adapted to Mangio-RVC-Fork 💻 [With extra features and fixes by kalomaze & alexlnkp]</h1>")
|
1481 |
+
|
1482 |
+
# Inference Preset Row
|
1483 |
+
# with gr.Row():
|
1484 |
+
# mangio_preset = gr.Dropdown(label="Inference Preset", choices=sorted(get_presets()))
|
1485 |
+
# mangio_preset_name_save = gr.Textbox(
|
1486 |
+
# label="Your preset name"
|
1487 |
+
# )
|
1488 |
+
# mangio_preset_save_btn = gr.Button('Save Preset', variant="primary")
|
1489 |
+
|
1490 |
+
# Other RVC stuff
|
1491 |
+
with gr.Row():
|
1492 |
+
sid0 = gr.Dropdown(label="1.Choose your Model.", choices=sorted(names), value=check_for_name())
|
1493 |
+
refresh_button = gr.Button("Refresh", variant="primary")
|
1494 |
+
if check_for_name() != '':
|
1495 |
+
get_vc(sorted(names)[0])
|
1496 |
+
vc_transform0 = gr.Number(label="Optional: You can change the pitch here or leave it at 0.", value=0)
|
1497 |
+
#clean_button = gr.Button(i18n("卸载音色省显存"), variant="primary")
|
1498 |
+
spk_item = gr.Slider(
|
1499 |
+
minimum=0,
|
1500 |
+
maximum=2333,
|
1501 |
+
step=1,
|
1502 |
+
label=i18n("请选择说话人id"),
|
1503 |
+
value=0,
|
1504 |
+
visible=False,
|
1505 |
+
interactive=True,
|
1506 |
+
)
|
1507 |
+
#clean_button.click(fn=clean, inputs=[], outputs=[sid0])
|
1508 |
+
sid0.change(
|
1509 |
+
fn=get_vc,
|
1510 |
+
inputs=[sid0],
|
1511 |
+
outputs=[spk_item],
|
1512 |
+
)
|
1513 |
+
but0 = gr.Button("Convert", variant="primary")
|
1514 |
+
with gr.Row():
|
1515 |
+
with gr.Column():
|
1516 |
+
with gr.Row():
|
1517 |
+
dropbox = gr.File(label="Drop your audio here & hit the Reload button.")
|
1518 |
+
with gr.Row():
|
1519 |
+
record_button=gr.Audio(source="microphone", label="OR Record audio.", type="filepath")
|
1520 |
+
with gr.Row():
|
1521 |
+
input_audio0 = gr.Dropdown(
|
1522 |
+
label="2.Choose your audio.",
|
1523 |
+
value="./audios/someguy.mp3",
|
1524 |
+
choices=audio_files
|
1525 |
+
)
|
1526 |
+
dropbox.upload(fn=save_to_wav2, inputs=[dropbox], outputs=[input_audio0])
|
1527 |
+
dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
|
1528 |
+
refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
|
1529 |
+
record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
|
1530 |
+
record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])
|
1531 |
+
with gr.Row():
|
1532 |
+
with gr.Accordion('Text To Speech', open=False):
|
1533 |
+
with gr.Column():
|
1534 |
+
lang = gr.Radio(label='Chinese & Japanese do not work with ElevenLabs currently.',choices=['en','es','fr','pt','zh-CN','de','hi','ja'], value='en')
|
1535 |
+
api_box = gr.Textbox(label="Enter your API Key for ElevenLabs, or leave empty to use GoogleTTS", value='')
|
1536 |
+
elevenid=gr.Dropdown(label="Voice:", choices=eleven_voices)
|
1537 |
+
with gr.Column():
|
1538 |
+
tfs = gr.Textbox(label="Input your Text", interactive=True, value="This is a test.")
|
1539 |
+
tts_button = gr.Button(value="Speak")
|
1540 |
+
tts_button.click(fn=elevenTTS, inputs=[api_box,tfs, elevenid, lang], outputs=[record_button, input_audio0])
|
1541 |
+
with gr.Row():
|
1542 |
+
with gr.Accordion('Wav2Lip', open=False):
|
1543 |
+
with gr.Row():
|
1544 |
+
size = gr.Radio(label='Resolution:',choices=['Half','Full'])
|
1545 |
+
face = gr.UploadButton("Upload A Character",type='file')
|
1546 |
+
faces = gr.Dropdown(label="OR Choose one:", choices=['None','Ben Shapiro','Andrew Tate'])
|
1547 |
+
with gr.Row():
|
1548 |
+
preview = gr.Textbox(label="Status:",interactive=False)
|
1549 |
+
face.upload(fn=success_message,inputs=[face], outputs=[preview, faces])
|
1550 |
+
with gr.Row():
|
1551 |
+
animation = gr.Video(type='filepath')
|
1552 |
+
refresh_button2.click(fn=change_choices2, inputs=[], outputs=[input_audio0, animation])
|
1553 |
+
with gr.Row():
|
1554 |
+
animate_button = gr.Button('Animate')
|
1555 |
+
|
1556 |
+
with gr.Column():
|
1557 |
+
with gr.Accordion("Index Settings", open=False):
|
1558 |
+
file_index1 = gr.Dropdown(
|
1559 |
+
label="3. Path to your added.index file (if it didn't automatically find it.)",
|
1560 |
+
choices=get_indexes(),
|
1561 |
+
value=get_index(),
|
1562 |
+
interactive=True,
|
1563 |
+
)
|
1564 |
+
sid0.change(fn=match_index, inputs=[sid0],outputs=[file_index1])
|
1565 |
+
refresh_button.click(
|
1566 |
+
fn=change_choices, inputs=[], outputs=[sid0, file_index1]
|
1567 |
+
)
|
1568 |
+
# file_big_npy1 = gr.Textbox(
|
1569 |
+
# label=i18n("特征文件路径"),
|
1570 |
+
# value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
|
1571 |
+
# interactive=True,
|
1572 |
+
# )
|
1573 |
+
index_rate1 = gr.Slider(
|
1574 |
+
minimum=0,
|
1575 |
+
maximum=1,
|
1576 |
+
label=i18n("检索特征占比"),
|
1577 |
+
value=0.66,
|
1578 |
+
interactive=True,
|
1579 |
+
)
|
1580 |
+
vc_output2 = gr.Audio(
|
1581 |
+
label="Output Audio (Click on the Three Dots in the Right Corner to Download)",
|
1582 |
+
type='filepath',
|
1583 |
+
interactive=False,
|
1584 |
+
)
|
1585 |
+
animate_button.click(fn=mouth, inputs=[size, face, vc_output2, faces], outputs=[animation, preview])
|
1586 |
+
with gr.Accordion("Advanced Settings", open=False):
|
1587 |
+
f0method0 = gr.Radio(
|
1588 |
+
label="Optional: Change the Pitch Extraction Algorithm.\nExtraction methods are sorted from 'worst quality' to 'best quality'.\nmangio-crepe may or may not be better than rmvpe in cases where 'smoothness' is more important, but rmvpe is the best overall.",
|
1589 |
+
choices=["pm", "dio", "crepe-tiny", "mangio-crepe-tiny", "crepe", "harvest", "mangio-crepe", "rmvpe"], # Fork Feature. Add Crepe-Tiny
|
1590 |
+
value="rmvpe",
|
1591 |
+
interactive=True,
|
1592 |
+
)
|
1593 |
+
|
1594 |
+
crepe_hop_length = gr.Slider(
|
1595 |
+
minimum=1,
|
1596 |
+
maximum=512,
|
1597 |
+
step=1,
|
1598 |
+
label="Mangio-Crepe Hop Length. Higher numbers will reduce the chance of extreme pitch changes but lower numbers will increase accuracy. 64-192 is a good range to experiment with.",
|
1599 |
+
value=120,
|
1600 |
+
interactive=True,
|
1601 |
+
visible=False,
|
1602 |
+
)
|
1603 |
+
f0method0.change(fn=whethercrepeornah, inputs=[f0method0], outputs=[crepe_hop_length])
|
1604 |
+
filter_radius0 = gr.Slider(
|
1605 |
+
minimum=0,
|
1606 |
+
maximum=7,
|
1607 |
+
label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"),
|
1608 |
+
value=3,
|
1609 |
+
step=1,
|
1610 |
+
interactive=True,
|
1611 |
+
)
|
1612 |
+
resample_sr0 = gr.Slider(
|
1613 |
+
minimum=0,
|
1614 |
+
maximum=48000,
|
1615 |
+
label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
|
1616 |
+
value=0,
|
1617 |
+
step=1,
|
1618 |
+
interactive=True,
|
1619 |
+
visible=False
|
1620 |
+
)
|
1621 |
+
rms_mix_rate0 = gr.Slider(
|
1622 |
+
minimum=0,
|
1623 |
+
maximum=1,
|
1624 |
+
label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"),
|
1625 |
+
value=0.21,
|
1626 |
+
interactive=True,
|
1627 |
+
)
|
1628 |
+
protect0 = gr.Slider(
|
1629 |
+
minimum=0,
|
1630 |
+
maximum=0.5,
|
1631 |
+
label=i18n("保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"),
|
1632 |
+
value=0.33,
|
1633 |
+
step=0.01,
|
1634 |
+
interactive=True,
|
1635 |
+
)
|
1636 |
+
formanting = gr.Checkbox(
|
1637 |
+
value=bool(DoFormant),
|
1638 |
+
label="[EXPERIMENTAL] Formant shift inference audio",
|
1639 |
+
info="Used for male to female and vice-versa conversions",
|
1640 |
+
interactive=True,
|
1641 |
+
visible=True,
|
1642 |
+
)
|
1643 |
+
|
1644 |
+
formant_preset = gr.Dropdown(
|
1645 |
+
value='',
|
1646 |
+
choices=get_fshift_presets(),
|
1647 |
+
label="browse presets for formanting",
|
1648 |
+
visible=bool(DoFormant),
|
1649 |
+
)
|
1650 |
+
formant_refresh_button = gr.Button(
|
1651 |
+
value='\U0001f504',
|
1652 |
+
visible=bool(DoFormant),
|
1653 |
+
variant='primary',
|
1654 |
+
)
|
1655 |
+
#formant_refresh_button = ToolButton( elem_id='1')
|
1656 |
+
#create_refresh_button(formant_preset, lambda: {"choices": formant_preset}, "refresh_list_shiftpresets")
|
1657 |
+
|
1658 |
+
qfrency = gr.Slider(
|
1659 |
+
value=Quefrency,
|
1660 |
+
info="Default value is 1.0",
|
1661 |
+
label="Quefrency for formant shifting",
|
1662 |
+
minimum=0.0,
|
1663 |
+
maximum=16.0,
|
1664 |
+
step=0.1,
|
1665 |
+
visible=bool(DoFormant),
|
1666 |
+
interactive=True,
|
1667 |
+
)
|
1668 |
+
tmbre = gr.Slider(
|
1669 |
+
value=Timbre,
|
1670 |
+
info="Default value is 1.0",
|
1671 |
+
label="Timbre for formant shifting",
|
1672 |
+
minimum=0.0,
|
1673 |
+
maximum=16.0,
|
1674 |
+
step=0.1,
|
1675 |
+
visible=bool(DoFormant),
|
1676 |
+
interactive=True,
|
1677 |
+
)
|
1678 |
+
|
1679 |
+
formant_preset.change(fn=preset_apply, inputs=[formant_preset, qfrency, tmbre], outputs=[qfrency, tmbre])
|
1680 |
+
frmntbut = gr.Button("Apply", variant="primary", visible=bool(DoFormant))
|
1681 |
+
formanting.change(fn=formant_enabled,inputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button],outputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button])
|
1682 |
+
frmntbut.click(fn=formant_apply,inputs=[qfrency, tmbre], outputs=[qfrency, tmbre])
|
1683 |
+
formant_refresh_button.click(fn=update_fshift_presets,inputs=[formant_preset, qfrency, tmbre],outputs=[formant_preset, qfrency, tmbre])
|
1684 |
+
with gr.Row():
|
1685 |
+
vc_output1 = gr.Textbox("")
|
1686 |
+
f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"), visible=False)
|
1687 |
+
|
1688 |
+
but0.click(
|
1689 |
+
vc_single,
|
1690 |
+
[
|
1691 |
+
spk_item,
|
1692 |
+
input_audio0,
|
1693 |
+
vc_transform0,
|
1694 |
+
f0_file,
|
1695 |
+
f0method0,
|
1696 |
+
file_index1,
|
1697 |
+
# file_index2,
|
1698 |
+
# file_big_npy1,
|
1699 |
+
index_rate1,
|
1700 |
+
filter_radius0,
|
1701 |
+
resample_sr0,
|
1702 |
+
rms_mix_rate0,
|
1703 |
+
protect0,
|
1704 |
+
crepe_hop_length
|
1705 |
+
],
|
1706 |
+
[vc_output1, vc_output2],
|
1707 |
+
)
|
1708 |
+
|
1709 |
+
with gr.Accordion("Batch Conversion",open=False):
|
1710 |
+
with gr.Row():
|
1711 |
+
with gr.Column():
|
1712 |
+
vc_transform1 = gr.Number(
|
1713 |
+
label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0
|
1714 |
+
)
|
1715 |
+
opt_input = gr.Textbox(label=i18n("指定输出文件夹"), value="opt")
|
1716 |
+
f0method1 = gr.Radio(
|
1717 |
+
label=i18n(
|
1718 |
+
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU"
|
1719 |
+
),
|
1720 |
+
choices=["pm", "harvest", "crepe", "rmvpe"],
|
1721 |
+
value="rmvpe",
|
1722 |
+
interactive=True,
|
1723 |
+
)
|
1724 |
+
filter_radius1 = gr.Slider(
|
1725 |
+
minimum=0,
|
1726 |
+
maximum=7,
|
1727 |
+
label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"),
|
1728 |
+
value=3,
|
1729 |
+
step=1,
|
1730 |
+
interactive=True,
|
1731 |
+
)
|
1732 |
+
with gr.Column():
|
1733 |
+
file_index3 = gr.Textbox(
|
1734 |
+
label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
|
1735 |
+
value="",
|
1736 |
+
interactive=True,
|
1737 |
+
)
|
1738 |
+
file_index4 = gr.Dropdown(
|
1739 |
+
label=i18n("自动检测index路径,下拉式选择(dropdown)"),
|
1740 |
+
choices=sorted(index_paths),
|
1741 |
+
interactive=True,
|
1742 |
+
)
|
1743 |
+
refresh_button.click(
|
1744 |
+
fn=lambda: change_choices()[1],
|
1745 |
+
inputs=[],
|
1746 |
+
outputs=file_index4,
|
1747 |
+
)
|
1748 |
+
# file_big_npy2 = gr.Textbox(
|
1749 |
+
# label=i18n("特征文件路径"),
|
1750 |
+
# value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
|
1751 |
+
# interactive=True,
|
1752 |
+
# )
|
1753 |
+
index_rate2 = gr.Slider(
|
1754 |
+
minimum=0,
|
1755 |
+
maximum=1,
|
1756 |
+
label=i18n("检索特征占比"),
|
1757 |
+
value=1,
|
1758 |
+
interactive=True,
|
1759 |
+
)
|
1760 |
+
with gr.Column():
|
1761 |
+
resample_sr1 = gr.Slider(
|
1762 |
+
minimum=0,
|
1763 |
+
maximum=48000,
|
1764 |
+
label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
|
1765 |
+
value=0,
|
1766 |
+
step=1,
|
1767 |
+
interactive=True,
|
1768 |
+
)
|
1769 |
+
rms_mix_rate1 = gr.Slider(
|
1770 |
+
minimum=0,
|
1771 |
+
maximum=1,
|
1772 |
+
label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"),
|
1773 |
+
value=1,
|
1774 |
+
interactive=True,
|
1775 |
+
)
|
1776 |
+
protect1 = gr.Slider(
|
1777 |
+
minimum=0,
|
1778 |
+
maximum=0.5,
|
1779 |
+
label=i18n(
|
1780 |
+
"保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"
|
1781 |
+
),
|
1782 |
+
value=0.33,
|
1783 |
+
step=0.01,
|
1784 |
+
interactive=True,
|
1785 |
+
)
|
1786 |
+
with gr.Column():
|
1787 |
+
dir_input = gr.Textbox(
|
1788 |
+
label=i18n("输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)"),
|
1789 |
+
value="E:\codes\py39\\test-20230416b\\todo-songs",
|
1790 |
+
)
|
1791 |
+
inputs = gr.File(
|
1792 |
+
file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹")
|
1793 |
+
)
|
1794 |
+
with gr.Row():
|
1795 |
+
format1 = gr.Radio(
|
1796 |
+
label=i18n("导出文件格式"),
|
1797 |
+
choices=["wav", "flac", "mp3", "m4a"],
|
1798 |
+
value="flac",
|
1799 |
+
interactive=True,
|
1800 |
+
)
|
1801 |
+
but1 = gr.Button(i18n("转换"), variant="primary")
|
1802 |
+
vc_output3 = gr.Textbox(label=i18n("输出信息"))
|
1803 |
+
but1.click(
|
1804 |
+
vc_multi,
|
1805 |
+
[
|
1806 |
+
spk_item,
|
1807 |
+
dir_input,
|
1808 |
+
opt_input,
|
1809 |
+
inputs,
|
1810 |
+
vc_transform1,
|
1811 |
+
f0method1,
|
1812 |
+
file_index3,
|
1813 |
+
file_index4,
|
1814 |
+
# file_big_npy2,
|
1815 |
+
index_rate2,
|
1816 |
+
filter_radius1,
|
1817 |
+
resample_sr1,
|
1818 |
+
rms_mix_rate1,
|
1819 |
+
protect1,
|
1820 |
+
format1,
|
1821 |
+
crepe_hop_length,
|
1822 |
+
],
|
1823 |
+
[vc_output3],
|
1824 |
+
)
|
1825 |
+
but1.click(fn=lambda: easy_uploader.clear())
|
1826 |
+
with gr.TabItem("Download Model"):
|
1827 |
+
with gr.Row():
|
1828 |
+
url=gr.Textbox(label="Enter the URL to the Model:")
|
1829 |
+
with gr.Row():
|
1830 |
+
model = gr.Textbox(label="Name your model:")
|
1831 |
+
download_button=gr.Button("Download")
|
1832 |
+
with gr.Row():
|
1833 |
+
status_bar=gr.Textbox(label="")
|
1834 |
+
download_button.click(fn=download_from_url, inputs=[url, model], outputs=[status_bar])
|
1835 |
+
with gr.Row():
|
1836 |
+
gr.Markdown(
|
1837 |
+
"""
|
1838 |
+
Original RVC:https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI
|
1839 |
+
Mangio's RVC Fork:https://github.com/Mangio621/Mangio-RVC-Fork
|
1840 |
+
❤️ If you like the EasyGUI, help me keep it.❤️
|
1841 |
+
https://paypal.me/lesantillan
|
1842 |
+
"""
|
1843 |
+
)
|
1844 |
+
|
1845 |
+
def has_two_files_in_pretrained_folder():
    """Return True when ``./pretrained/`` exists and holds at least two entries."""
    folder = "./pretrained/"
    # Short-circuit keeps listdir() from running on a missing folder.
    return os.path.exists(folder) and len(os.listdir(folder)) >= 2
|
1853 |
+
|
1854 |
+
if has_two_files_in_pretrained_folder():
|
1855 |
+
print("Pretrained weights are downloaded. Training tab enabled!\n-------------------------------")
|
1856 |
+
with gr.TabItem("Train", visible=False):
|
1857 |
+
with gr.Row():
|
1858 |
+
with gr.Column():
|
1859 |
+
exp_dir1 = gr.Textbox(label="Voice Name:", value="My-Voice")
|
1860 |
+
sr2 = gr.Radio(
|
1861 |
+
label=i18n("目标采样率"),
|
1862 |
+
choices=["40k", "48k"],
|
1863 |
+
value="40k",
|
1864 |
+
interactive=True,
|
1865 |
+
visible=False
|
1866 |
+
)
|
1867 |
+
if_f0_3 = gr.Radio(
|
1868 |
+
label=i18n("模型是否带音高指导(唱歌一定要, 语音可以不要)"),
|
1869 |
+
choices=[True, False],
|
1870 |
+
value=True,
|
1871 |
+
interactive=True,
|
1872 |
+
visible=False
|
1873 |
+
)
|
1874 |
+
version19 = gr.Radio(
|
1875 |
+
label="RVC version",
|
1876 |
+
choices=["v1", "v2"],
|
1877 |
+
value="v2",
|
1878 |
+
interactive=True,
|
1879 |
+
visible=False,
|
1880 |
+
)
|
1881 |
+
np7 = gr.Slider(
|
1882 |
+
minimum=0,
|
1883 |
+
maximum=config.n_cpu,
|
1884 |
+
step=1,
|
1885 |
+
label="# of CPUs for data processing (Leave as it is)",
|
1886 |
+
value=config.n_cpu,
|
1887 |
+
interactive=True,
|
1888 |
+
visible=True
|
1889 |
+
)
|
1890 |
+
trainset_dir4 = gr.Textbox(label="Path to your dataset (audios, not zip):", value="./dataset")
|
1891 |
+
easy_uploader = gr.Files(label='OR Drop your audios here. They will be uploaded in your dataset path above.',file_types=['audio'])
|
1892 |
+
but1 = gr.Button("1. Process The Dataset", variant="primary")
|
1893 |
+
info1 = gr.Textbox(label="Status (wait until it says 'end preprocess'):", value="")
|
1894 |
+
easy_uploader.upload(fn=upload_to_dataset, inputs=[easy_uploader, trainset_dir4], outputs=[info1])
|
1895 |
+
but1.click(
|
1896 |
+
preprocess_dataset, [trainset_dir4, exp_dir1, sr2, np7], [info1]
|
1897 |
+
)
|
1898 |
+
with gr.Column():
|
1899 |
+
spk_id5 = gr.Slider(
|
1900 |
+
minimum=0,
|
1901 |
+
maximum=4,
|
1902 |
+
step=1,
|
1903 |
+
label=i18n("请指定说话人id"),
|
1904 |
+
value=0,
|
1905 |
+
interactive=True,
|
1906 |
+
visible=False
|
1907 |
+
)
|
1908 |
+
with gr.Accordion('GPU Settings', open=False, visible=False):
|
1909 |
+
gpus6 = gr.Textbox(
|
1910 |
+
label=i18n("以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2"),
|
1911 |
+
value=gpus,
|
1912 |
+
interactive=True,
|
1913 |
+
visible=False
|
1914 |
+
)
|
1915 |
+
gpu_info9 = gr.Textbox(label=i18n("显卡信息"), value=gpu_info)
|
1916 |
+
f0method8 = gr.Radio(
|
1917 |
+
label=i18n(
|
1918 |
+
"选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢"
|
1919 |
+
),
|
1920 |
+
choices=["harvest","crepe", "mangio-crepe", "rmvpe"], # Fork feature: Crepe on f0 extraction for training.
|
1921 |
+
value="rmvpe",
|
1922 |
+
interactive=True,
|
1923 |
+
)
|
1924 |
+
|
1925 |
+
extraction_crepe_hop_length = gr.Slider(
|
1926 |
+
minimum=1,
|
1927 |
+
maximum=512,
|
1928 |
+
step=1,
|
1929 |
+
label=i18n("crepe_hop_length"),
|
1930 |
+
value=128,
|
1931 |
+
interactive=True,
|
1932 |
+
visible=False,
|
1933 |
+
)
|
1934 |
+
f0method8.change(fn=whethercrepeornah, inputs=[f0method8], outputs=[extraction_crepe_hop_length])
|
1935 |
+
but2 = gr.Button("2. Pitch Extraction", variant="primary")
|
1936 |
+
info2 = gr.Textbox(label="Status(Check the Colab Notebook's cell output):", value="", max_lines=8)
|
1937 |
+
but2.click(
|
1938 |
+
extract_f0_feature,
|
1939 |
+
[gpus6, np7, f0method8, if_f0_3, exp_dir1, version19, extraction_crepe_hop_length],
|
1940 |
+
[info2],
|
1941 |
+
)
|
1942 |
+
with gr.Row():
|
1943 |
+
with gr.Column():
|
1944 |
+
total_epoch11 = gr.Slider(
|
1945 |
+
minimum=1,
|
1946 |
+
maximum=5000,
|
1947 |
+
step=10,
|
1948 |
+
label="Total # of training epochs (IF you choose a value too high, your model will sound horribly overtrained.):",
|
1949 |
+
value=250,
|
1950 |
+
interactive=True,
|
1951 |
+
)
|
1952 |
+
butstop = gr.Button(
|
1953 |
+
"Stop Training",
|
1954 |
+
variant='primary',
|
1955 |
+
visible=False,
|
1956 |
+
)
|
1957 |
+
but3 = gr.Button("3. Train Model", variant="primary", visible=True)
|
1958 |
+
|
1959 |
+
but3.click(fn=stoptraining, inputs=[gr.Number(value=0, visible=False)], outputs=[but3, butstop])
|
1960 |
+
butstop.click(fn=stoptraining, inputs=[gr.Number(value=1, visible=False)], outputs=[butstop, but3])
|
1961 |
+
|
1962 |
+
|
1963 |
+
but4 = gr.Button("4.Train Index", variant="primary")
|
1964 |
+
info3 = gr.Textbox(label="Status(Check the Colab Notebook's cell output):", value="", max_lines=10)
|
1965 |
+
with gr.Accordion("Training Preferences (You can leave these as they are)", open=False):
|
1966 |
+
#gr.Markdown(value=i18n("step3: 填写训练设置, 开始训练模型和索引"))
|
1967 |
+
with gr.Column():
|
1968 |
+
save_epoch10 = gr.Slider(
|
1969 |
+
minimum=1,
|
1970 |
+
maximum=200,
|
1971 |
+
step=1,
|
1972 |
+
label="Backup every X amount of epochs:",
|
1973 |
+
value=10,
|
1974 |
+
interactive=True,
|
1975 |
+
)
|
1976 |
+
batch_size12 = gr.Slider(
|
1977 |
+
minimum=1,
|
1978 |
+
maximum=40,
|
1979 |
+
step=1,
|
1980 |
+
label="Batch Size (LEAVE IT unless you know what you're doing!):",
|
1981 |
+
value=default_batch_size,
|
1982 |
+
interactive=True,
|
1983 |
+
)
|
1984 |
+
if_save_latest13 = gr.Checkbox(
|
1985 |
+
label="Save only the latest '.ckpt' file to save disk space.",
|
1986 |
+
value=True,
|
1987 |
+
interactive=True,
|
1988 |
+
)
|
1989 |
+
if_cache_gpu17 = gr.Checkbox(
|
1990 |
+
label="Cache all training sets to GPU memory. Caching small datasets (less than 10 minutes) can speed up training, but caching large datasets will consume a lot of GPU memory and may not provide much speed improvement.",
|
1991 |
+
value=False,
|
1992 |
+
interactive=True,
|
1993 |
+
)
|
1994 |
+
if_save_every_weights18 = gr.Checkbox(
|
1995 |
+
label="Save a small final model to the 'weights' folder at each save point.",
|
1996 |
+
value=True,
|
1997 |
+
interactive=True,
|
1998 |
+
)
|
1999 |
+
zip_model = gr.Button('5. Download Model')
|
2000 |
+
zipped_model = gr.Files(label='Your Model and Index file can be downloaded here:')
|
2001 |
+
zip_model.click(fn=zip_downloader, inputs=[exp_dir1], outputs=[zipped_model, info3])
|
2002 |
+
with gr.Group():
|
2003 |
+
with gr.Accordion("Base Model Locations:", open=False, visible=False):
|
2004 |
+
pretrained_G14 = gr.Textbox(
|
2005 |
+
label=i18n("加载预训练底模G路径"),
|
2006 |
+
value="pretrained_v2/f0G40k.pth",
|
2007 |
+
interactive=True,
|
2008 |
+
)
|
2009 |
+
pretrained_D15 = gr.Textbox(
|
2010 |
+
label=i18n("加载预训练底模D路径"),
|
2011 |
+
value="pretrained_v2/f0D40k.pth",
|
2012 |
+
interactive=True,
|
2013 |
+
)
|
2014 |
+
gpus16 = gr.Textbox(
|
2015 |
+
label=i18n("以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2"),
|
2016 |
+
value=gpus,
|
2017 |
+
interactive=True,
|
2018 |
+
)
|
2019 |
+
sr2.change(
|
2020 |
+
change_sr2,
|
2021 |
+
[sr2, if_f0_3, version19],
|
2022 |
+
[pretrained_G14, pretrained_D15, version19],
|
2023 |
+
)
|
2024 |
+
version19.change(
|
2025 |
+
change_version19,
|
2026 |
+
[sr2, if_f0_3, version19],
|
2027 |
+
[pretrained_G14, pretrained_D15],
|
2028 |
+
)
|
2029 |
+
if_f0_3.change(
|
2030 |
+
change_f0,
|
2031 |
+
[if_f0_3, sr2, version19],
|
2032 |
+
[f0method8, pretrained_G14, pretrained_D15],
|
2033 |
+
)
|
2034 |
+
but5 = gr.Button(i18n("一键训练"), variant="primary", visible=False)
|
2035 |
+
but3.click(
|
2036 |
+
click_train,
|
2037 |
+
[
|
2038 |
+
exp_dir1,
|
2039 |
+
sr2,
|
2040 |
+
if_f0_3,
|
2041 |
+
spk_id5,
|
2042 |
+
save_epoch10,
|
2043 |
+
total_epoch11,
|
2044 |
+
batch_size12,
|
2045 |
+
if_save_latest13,
|
2046 |
+
pretrained_G14,
|
2047 |
+
pretrained_D15,
|
2048 |
+
gpus16,
|
2049 |
+
if_cache_gpu17,
|
2050 |
+
if_save_every_weights18,
|
2051 |
+
version19,
|
2052 |
+
],
|
2053 |
+
[
|
2054 |
+
info3,
|
2055 |
+
butstop,
|
2056 |
+
but3,
|
2057 |
+
],
|
2058 |
+
)
|
2059 |
+
but4.click(train_index, [exp_dir1, version19], info3)
|
2060 |
+
but5.click(
|
2061 |
+
train1key,
|
2062 |
+
[
|
2063 |
+
exp_dir1,
|
2064 |
+
sr2,
|
2065 |
+
if_f0_3,
|
2066 |
+
trainset_dir4,
|
2067 |
+
spk_id5,
|
2068 |
+
np7,
|
2069 |
+
f0method8,
|
2070 |
+
save_epoch10,
|
2071 |
+
total_epoch11,
|
2072 |
+
batch_size12,
|
2073 |
+
if_save_latest13,
|
2074 |
+
pretrained_G14,
|
2075 |
+
pretrained_D15,
|
2076 |
+
gpus16,
|
2077 |
+
if_cache_gpu17,
|
2078 |
+
if_save_every_weights18,
|
2079 |
+
version19,
|
2080 |
+
extraction_crepe_hop_length
|
2081 |
+
],
|
2082 |
+
info3,
|
2083 |
+
)
|
2084 |
+
|
2085 |
+
else:
|
2086 |
+
print(
|
2087 |
+
"Pretrained weights not downloaded. Disabling training tab.\n"
|
2088 |
+
"Wondering how to train a voice? Visit here for the RVC model training guide: https://t.ly/RVC_Training_Guide\n"
|
2089 |
+
"-------------------------------\n"
|
2090 |
+
)
|
2091 |
+
|
2092 |
+
if config.iscolab or config.paperspace: # Share gradio link for colab and paperspace (FORK FEATURE)
|
2093 |
+
app.queue(concurrency_count=511, max_size=1022).launch(share=True, quiet=True)
|
2094 |
+
else:
|
2095 |
+
app.queue(concurrency_count=511, max_size=1022).launch(
|
2096 |
+
server_name="0.0.0.0",
|
2097 |
+
inbrowser=not config.noautoopen,
|
2098 |
+
server_port=config.listen_port,
|
2099 |
+
quiet=True,
|
2100 |
+
)
|
2101 |
+
#endregion
|
audios/.gitignore
ADDED
File without changes
|
config.py
ADDED
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import sys
|
3 |
+
import torch
|
4 |
+
import json
|
5 |
+
from multiprocessing import cpu_count
|
6 |
+
|
7 |
+
global usefp16
|
8 |
+
usefp16 = False
|
9 |
+
|
10 |
+
|
11 |
+
def use_fp32_config():
    """Align the training configs' ``fp16_run`` flag with the GPU capability.

    Without CUDA nothing is rewritten: a notice is printed and ``(False, 0)``
    is returned. With CUDA, the major compute capability of ``cuda:0`` is
    read; fp16 is enabled when it is >= 7 and disabled otherwise. Each of
    ``configs/32k.json``, ``configs/40k.json`` and ``configs/48k.json`` is
    re-saved with ``train.fp16_run`` set accordingly (when that key exists),
    and ``trainset_preprocess_pipeline_print.py`` has the text ``"3.0"``
    replaced by ``"3.7"`` (fp16) or ``"3.7"`` by ``"3.0"`` (fp32).

    Returns:
        ``(usefp16, device_capability)``: the fp16 decision and the major
        compute capability (``0`` when CUDA is unavailable).
    """
    fp16_enabled = False
    capability_major = 0

    if not torch.cuda.is_available():
        print(
            "CUDA is not available. Make sure you have an NVIDIA GPU and CUDA installed."
        )
        return (fp16_enabled, capability_major)

    gpu = torch.device("cuda:0")  # Assuming you have only one GPU (index 0).
    capability_major = torch.cuda.get_device_capability(gpu)[0]
    fp16_enabled = capability_major >= 7

    flag_text = "true" if fp16_enabled else "false"
    for config_file in ["32k.json", "40k.json", "48k.json"]:
        config_path = f"configs/{config_file}"
        with open(config_path, "r") as src:
            data = json.load(src)

        if "train" in data and "fp16_run" in data["train"]:
            data["train"]["fp16_run"] = fp16_enabled

        with open(config_path, "w") as dst:
            json.dump(data, dst, indent=4)

        print(f"Set fp16_run to {flag_text} in {config_file}")

    # The preprocessing script carries a literal that is toggled by text
    # substitution: 3.7 for fp16 runs, 3.0 for fp32 runs.
    old_text, new_text = ("3.0", "3.7") if fp16_enabled else ("3.7", "3.0")
    script_path = "trainset_preprocess_pipeline_print.py"
    with open(script_path, "r", encoding="utf-8") as src:
        strr = src.read()
    with open(script_path, "w", encoding="utf-8") as dst:
        dst.write(strr.replace(old_text, new_text))

    return (fp16_enabled, capability_major)
|
71 |
+
|
72 |
+
|
73 |
+
class Config:
    """Runtime configuration: command-line options plus device-dependent settings.

    Constructing an instance parses ``sys.argv`` and probes the available
    compute device (CUDA, then MPS, then CPU).
    """

    def __init__(self):
        self.device = "cuda:0"
        self.is_half = True
        self.n_cpu = 0  # 0 means "not set yet"; device_config() fills it in
        self.gpu_name = None
        self.gpu_mem = None  # stays None unless a CUDA device is found
        (
            self.python_cmd,
            self.listen_port,
            self.iscolab,
            self.noparallel,
            self.noautoopen,
            self.paperspace,
            self.is_cli,
        ) = self.arg_parse()

        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

    @staticmethod
    def arg_parse() -> tuple:
        """Parse the command line.

        Returns:
            ``(pycmd, port, colab, noparallel, noautoopen, paperspace, is_cli)``.
            A port outside 0..65535 is replaced by the default 7865.
        """
        exe = sys.executable or "python"
        parser = argparse.ArgumentParser()
        parser.add_argument("--port", type=int, default=7865, help="Listen port")
        parser.add_argument("--pycmd", type=str, default=exe, help="Python command")
        parser.add_argument("--colab", action="store_true", help="Launch in colab")
        parser.add_argument(
            "--noparallel", action="store_true", help="Disable parallel processing"
        )
        parser.add_argument(
            "--noautoopen",
            action="store_true",
            help="Do not open in browser automatically",
        )
        parser.add_argument(  # Fork Feature. Paperspace integration for web UI
            "--paperspace",
            action="store_true",
            help="Note that this argument just shares a gradio link for the web UI. Thus can be used on other non-local CLI systems.",
        )
        parser.add_argument(  # Fork Feature. Embed a CLI into the infer-web.py
            "--is_cli",
            action="store_true",
            help="Use the CLI instead of setting up a gradio UI. This flag will launch an RVC text interface where you can execute functions from infer-web.py!",
        )
        cmd_opts = parser.parse_args()

        cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865

        return (
            cmd_opts.pycmd,
            cmd_opts.port,
            cmd_opts.colab,
            cmd_opts.noparallel,
            cmd_opts.noautoopen,
            cmd_opts.paperspace,
            cmd_opts.is_cli,
        )

    # has_mps is only available in nightly pytorch (for now) and macOS 12.3+.
    # check `getattr` and try it for compatibility
    @staticmethod
    def has_mps() -> bool:
        """Return True only when an MPS device exists and a tensor can be moved to it."""
        # Older torch builds have no `torch.backends.mps` attribute at all.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is None or not mps_backend.is_available():
            return False
        try:
            torch.zeros(1).to(torch.device("mps"))
            return True
        except Exception:
            return False

    def device_config(self) -> tuple:
        """Pick the compute device and derive the four ``x_*`` tuning values.

        Side effects: may update ``self.device``, ``self.is_half``,
        ``self.gpu_name``, ``self.gpu_mem`` and ``self.n_cpu``; calls
        ``use_fp32_config()``; and, for CUDA devices with at most 4 GB,
        rewrites ``trainset_preprocess_pipeline_print.py``.

        Returns:
            ``(x_pad, x_query, x_center, x_max)``.
        """
        if torch.cuda.is_available():
            i_device = int(self.device.split(":")[-1])
            self.gpu_name = torch.cuda.get_device_name(i_device)
            if (
                ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
                or "P40" in self.gpu_name.upper()
                or "1060" in self.gpu_name
                or "1070" in self.gpu_name
                or "1080" in self.gpu_name
            ):
                print("Found GPU", self.gpu_name, ", force to fp32")
                self.is_half = False
            else:
                print("Found GPU", self.gpu_name)
                use_fp32_config()
            # Total device memory in GiB; the +0.4 nudges values just below
            # a whole number up before int() truncates.
            self.gpu_mem = int(
                torch.cuda.get_device_properties(i_device).total_memory
                / 1024
                / 1024
                / 1024
                + 0.4
            )
            if self.gpu_mem <= 4:
                # utf-8 to match the other rewrites of this same script.
                with open(
                    "trainset_preprocess_pipeline_print.py", "r", encoding="utf-8"
                ) as f:
                    strr = f.read().replace("3.7", "3.0")
                with open(
                    "trainset_preprocess_pipeline_print.py", "w", encoding="utf-8"
                ) as f:
                    f.write(strr)
        elif self.has_mps():
            print("No supported Nvidia GPU found, use MPS instead")
            self.device = "mps"
            self.is_half = False
            use_fp32_config()
        else:
            print("No supported Nvidia GPU found, use CPU instead")
            self.device = "cpu"
            self.is_half = False
            use_fp32_config()

        if self.n_cpu == 0:
            self.n_cpu = cpu_count()

        if self.is_half:
            # Settings for 6 GB of VRAM
            x_pad = 3
            x_query = 10
            x_center = 60
            x_max = 65
        else:
            # Settings for 5 GB of VRAM
            x_pad = 1
            x_query = 6
            x_center = 38
            x_max = 41

        # Low-memory cards override either preset above.
        if self.gpu_mem is not None and self.gpu_mem <= 4:
            x_pad = 1
            x_query = 5
            x_center = 30
            x_max = 32

        return x_pad, x_query, x_center, x_max
|
i18n.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import locale
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
|
5 |
+
|
6 |
+
def load_language_list(language):
    """Read ``./i18n/<language>.json`` (UTF-8) and return the parsed JSON.

    The path is relative to the current working directory. Errors raised by
    ``open`` or ``json.load`` propagate to the caller.
    """
    locale_file = f"./i18n/{language}.json"
    with open(locale_file, "r", encoding="utf-8") as handle:
        return json.load(handle)
|
10 |
+
|
11 |
+
|
12 |
+
class I18nAuto:
    """Callable translator backed by a JSON table under ``./i18n``.

    Calling an instance with a key returns the mapped string, or the key
    itself when the table has no entry for it.
    """

    def __init__(self, language=None):
        """Choose a language and load its table.

        ``None`` or ``"Auto"`` selects the first element of
        ``locale.getdefaultlocale()``. Whatever name results, ``"en_US"`` is
        used instead when ``./i18n/<name>.json`` does not exist.
        """
        if language in ("Auto", None):
            # getlocale can't identify the system's language ((None, None))
            system_locale = locale.getdefaultlocale()
            language = system_locale[0]

        table_path = f"./i18n/{language}.json"
        self.language = language if os.path.exists(table_path) else "en_US"
        self.language_map = load_language_list(self.language)

    def __call__(self, key):
        """Return the translation for ``key``, or ``key`` when unmapped."""
        try:
            return self.language_map[key]
        except KeyError:
            return key

    def print(self):
        """Print the name of the language in use."""
        print(f"Use Language: {self.language}")
|
i18n/en_US.json
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"很遗憾您这没有能用的显卡来支持您训练": "Unfortunately, there is no compatible GPU available to support your training.",
|
3 |
+
"是": "Yes",
|
4 |
+
"step1:正在处理数据": "Step 1: Processing data",
|
5 |
+
"step2a:无需提取音高": "Step 2a: Skipping pitch extraction",
|
6 |
+
"step2b:正在提取特征": "Step 2b: Extracting features",
|
7 |
+
"step3a:正在训练模型": "Step 3a: Model training started",
|
8 |
+
"训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Training complete. You can check the training logs in the console or the 'train.log' file under the experiment folder.",
|
9 |
+
"全流程结束!": "All processes have been completed!",
|
10 |
+
"本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>使用需遵守的协议-LICENSE.txt</b>.": "This software is open source under the MIT license. The author does not have any control over the software. Users who use the software and distribute the sounds exported by the software are solely responsible. <br>If you do not agree with this clause, you cannot use or reference any codes and files within the software package. See the root directory <b>Agreement-LICENSE.txt</b> for details.",
|
11 |
+
"模型推理": "Model Inference",
|
12 |
+
"推理音色": "Inferencing voice:",
|
13 |
+
"刷新音色列表和索引路径": "Refresh voice list and index path",
|
14 |
+
"卸载音色省显存": "Unload voice to save GPU memory:",
|
15 |
+
"请选择说话人id": "Select Speaker/Singer ID:",
|
16 |
+
"男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Recommended +12 key for male to female conversion, and -12 key for female to male conversion. If the sound range goes too far and the voice is distorted, you can also adjust it to the appropriate range by yourself.",
|
17 |
+
"变调(整数, 半音数量, 升八度12降八度-12)": "Transpose (integer, number of semitones, raise by an octave: 12, lower by an octave: -12):",
|
18 |
+
"输入待处理音频文件路径(默认是正确格式示例)": "Enter the path of the audio file to be processed (default is the correct format example):",
|
19 |
+
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "Select the pitch extraction algorithm ('pm': faster extraction but lower-quality speech; 'harvest': better bass but extremely slow; 'crepe': better quality but GPU intensive):",
|
20 |
+
"crepe_hop_length": "Mangio-Crepe Hop Length (Only applies to mangio-crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
|
21 |
+
"特征检索库文件路径": "Feature search database file path",
|
22 |
+
">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "If >=3: apply median filtering to the harvested pitch results. The value represents the filter radius and can reduce breathiness.",
|
23 |
+
"特征检索库文件路径,为空则使用下拉的选择结果": "Path to the feature index file. Leave blank to use the selected result from the dropdown:",
|
24 |
+
"自动检测index路径,下拉式选择(dropdown)": "Auto-detect index path and select from the dropdown:",
|
25 |
+
"特征文件路径": "Path to feature file:",
|
26 |
+
"检索特征占比": "Search feature ratio:",
|
27 |
+
"后处理重采样至最终采样率,0为不进行重采样": "Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling:",
|
28 |
+
"输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Use the volume envelope of the input to replace or mix with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is used:",
|
29 |
+
"保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy:",
|
30 |
+
"F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0 curve file (optional). One pitch per line. Replaces the default F0 and pitch modulation:",
|
31 |
+
"转换": "Convert",
|
32 |
+
"输出信息": "Output information",
|
33 |
+
"输出音频(右下角三个点,点了可以下载)": "Export audio (click on the three dots in the lower right corner to download)",
|
34 |
+
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Batch conversion. Enter the folder containing the audio files to be converted or upload multiple audio files. The converted audio will be output in the specified folder (default: 'opt').",
|
35 |
+
"指定输出文件夹": "Specify output folder:",
|
36 |
+
"输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Enter the path of the audio folder to be processed (copy it from the address bar of the file manager):",
|
37 |
+
"也可批量输入音频文件, 二选一, 优先读文件夹": "You can also input audio files in batches. Choose one of the two options. Priority is given to reading from the folder.",
|
38 |
+
"导出文件格式": "Export file format",
|
39 |
+
"伴奏人声分离&去混响&去回声": "Vocals/Accompaniment Separation & Reverberation Removal",
|
40 |
+
"输入待处理音频文件夹路径": "Enter the path of the audio folder to be processed:",
|
41 |
+
"模型": "Model",
|
42 |
+
"指定输出主人声文件夹": "Specify the output folder for vocals:",
|
43 |
+
"指定输出非主人声文件夹": "Specify the output folder for accompaniment:",
|
44 |
+
"训练": "Train",
|
45 |
+
"step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Step 1: Fill in the experimental configuration. Experimental data is stored in the 'logs' folder, with each experiment having a separate folder. Manually enter the experiment name path, which contains the experimental configuration, logs, and trained model files.",
|
46 |
+
"输入实验名": "Enter the experiment name:",
|
47 |
+
"目标采样率": "Target sample rate:",
|
48 |
+
"模型是否带音高指导(唱歌一定要, 语音可以不要)": "Whether the model has pitch guidance (required for singing, optional for speech):",
|
49 |
+
"版本": "Version",
|
50 |
+
"提取音高和处理数据使用的CPU进程数": "Number of CPU processes used for pitch extraction and data processing:",
|
51 |
+
"step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Step 2a: Automatically traverse all files in the training folder that can be decoded into audio and perform slice normalization. Generates 2 wav folders in the experiment directory. Currently, only single-singer/speaker training is supported.",
|
52 |
+
"输入训练文件夹路径": "Enter the path of the training folder:",
|
53 |
+
"请指定说话人id": "Please specify the speaker/singer ID:",
|
54 |
+
"处理数据": "Process data",
|
55 |
+
"step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Step 2b: Use CPU to extract pitch (if the model has pitch), use GPU to extract features (select GPU index):",
|
56 |
+
"以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Enter the GPU index(es) separated by '-', e.g., 0-1-2 to use GPU 0, 1, and 2:",
|
57 |
+
"显卡信息": "GPU Information",
|
58 |
+
"选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢": "Select the pitch extraction algorithm ('pm': faster extraction but lower-quality speech; 'dio': improved speech but slower extraction; 'harvest': better quality but slower extraction):",
|
59 |
+
"特征提取": "Feature extraction",
|
60 |
+
"step3: 填写训练设置, 开始训练模型和索引": "Step 3: Fill in the training settings and start training the model and index",
|
61 |
+
"保存频率save_every_epoch": "Save frequency (save_every_epoch):",
|
62 |
+
"总训练轮数total_epoch": "Total training epochs (total_epoch):",
|
63 |
+
"每张显卡的batch_size": "Batch size per GPU:",
|
64 |
+
"是否仅保存最新的ckpt文件以节省硬盘空间": "Save only the latest '.ckpt' file to save disk space:",
|
65 |
+
"否": "No",
|
66 |
+
"是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Cache all training sets to GPU memory. Caching small datasets (less than 10 minutes) can speed up training, but caching large datasets will consume a lot of GPU memory and may not provide much speed improvement:",
|
67 |
+
"是否在每次保存时间点将最终小模型保存至weights文件夹": "Save a small final model to the 'weights' folder at each save point:",
|
68 |
+
"加载预训练底模G路径": "Load pre-trained base model G path:",
|
69 |
+
"加载预训练底模D路径": "Load pre-trained base model D path:",
|
70 |
+
"训练模型": "Train model",
|
71 |
+
"训练特征索引": "Train feature index",
|
72 |
+
"一键训练": "One-click training",
|
73 |
+
"ckpt处理": "ckpt Processing",
|
74 |
+
"模型融合, 可用于测试音色融合": "Model fusion, can be used to test timbre fusion",
|
75 |
+
"A模型路径": "Path to Model A:",
|
76 |
+
"B模型路径": "Path to Model B:",
|
77 |
+
"A模型权重": "Weight (w) for Model A:",
|
78 |
+
"模型是否带音高指导": "Whether the model has pitch guidance:",
|
79 |
+
"要置入的模型信息": "Model information to be placed:",
|
80 |
+
"保存的模型名不带后缀": "Saved model name (without extension):",
|
81 |
+
"模型版本型号": "Model architecture version:",
|
82 |
+
"融合": "Fusion",
|
83 |
+
"修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modify model information (only supported for small model files extracted from the 'weights' folder)",
|
84 |
+
"模型路径": "Path to Model:",
|
85 |
+
"要改的模型信息": "Model information to be modified:",
|
86 |
+
"保存的文件名, 默认空为和源文件同名": "Save file name (default: same as the source file):",
|
87 |
+
"修改": "Modify",
|
88 |
+
"查看模型信息(仅支持weights文件夹下提取的小模型文件)": "View model information (only supported for small model files extracted from the 'weights' folder)",
|
89 |
+
"查看": "View",
|
90 |
+
"模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Model extraction (enter the path of the large file model under the 'logs' folder). This is useful if you want to stop training halfway and manually extract and save a small model file, or if you want to test an intermediate model:",
|
91 |
+
"保存名": "Save name:",
|
92 |
+
"模型是否带音高指导,1是0否": "Whether the model has pitch guidance (1: yes, 0: no):",
|
93 |
+
"提取": "Extract",
|
94 |
+
"Onnx导出": "Export Onnx",
|
95 |
+
"RVC模型路径": "RVC Model Path:",
|
96 |
+
"Onnx输出路径": "Onnx Export Path:",
|
97 |
+
"MoeVS模型": "MoeVS Model",
|
98 |
+
"导出Onnx模型": "Export Onnx Model",
|
99 |
+
"常见问题解答": "FAQ (Frequently Asked Questions)",
|
100 |
+
"招募音高曲线前端编辑器": "Recruiting front-end editors for pitch curves",
|
101 |
+
"加开发群联系我xxxxx": "Join the development group and contact me at xxxxx",
|
102 |
+
"点击查看交流、问题反馈群号": "Click to view the communication and problem feedback group number",
|
103 |
+
"xxxxx": "xxxxx",
|
104 |
+
"加载模型": "Load model",
|
105 |
+
"Hubert模型": "Hubert Model",
|
106 |
+
"选择.pth文件": "Select the .pth file",
|
107 |
+
"选择.index文件": "Select the .index file",
|
108 |
+
"选择.npy文件": "Select the .npy file",
|
109 |
+
"输入设备": "Input device",
|
110 |
+
"输出设备": "Output device",
|
111 |
+
"音频设备(请使用同种类驱动)": "Audio device (please use the same type of driver)",
|
112 |
+
"响应阈值": "Response threshold",
|
113 |
+
"音调设置": "Pitch settings",
|
114 |
+
"Index Rate": "Index Rate",
|
115 |
+
"常规设置": "General settings",
|
116 |
+
"采样长度": "Sample length",
|
117 |
+
"淡入淡出长度": "Fade length",
|
118 |
+
"额外推理时长": "Extra inference time",
|
119 |
+
"输入降噪": "Input noise reduction",
|
120 |
+
"输出降噪": "Output noise reduction",
|
121 |
+
"性能设置": "Performance settings",
|
122 |
+
"开始音频转换": "Start audio conversion",
|
123 |
+
"停止音频转换": "Stop audio conversion",
|
124 |
+
"推理时间(ms):": "Inference time (ms):",
|
125 |
+
"人声伴奏分离批量处理, 使用UVR5模型。 <br>合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 <br>模型分为三类: <br>1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点; <br>2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型; <br> 3、去混响、去延迟模型(by FoxJoy):<br> (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;<br> (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。<br>去混响/去延迟,附:<br>1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;<br>2、MDX-Net-Dereverb模型挺慢的;<br>3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。":"Batch processing for vocal accompaniment separation using the UVR5 model.<br>Example of a valid folder path format: D:\\path\\to\\input\\folder (copy it from the file manager address bar).<br>The model is divided into three categories:<br>1. Preserve vocals: Choose this option for audio without harmonies. It preserves vocals better than HP5. It includes two built-in models: HP2 and HP3. HP3 may slightly leak accompaniment but preserves vocals slightly better than HP2.<br>2. Preserve main vocals only: Choose this option for audio with harmonies. It may weaken the main vocals. It includes one built-in model: HP5.<br>3. De-reverb and de-delay models (by FoxJoy):<br> (1) MDX-Net: The best choice for stereo reverb removal but cannot remove mono reverb;<br> (234) DeEcho: Removes delay effects. Aggressive mode removes more thoroughly than Normal mode. DeReverb additionally removes reverb and can remove mono reverb, but not very effectively for heavily reverberated high-frequency content.<br>De-reverb/de-delay notes:<br>1. The processing time for the DeEcho-DeReverb model is approximately twice as long as the other two DeEcho models.<br>2. The MDX-Net-Dereverb model is quite slow.<br>3. The recommended cleanest configuration is to apply MDX-Net first and then DeEcho-Aggressive."
|
126 |
+
}
|
i18n/es_ES.json
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"很遗憾您这没有能用的显卡来支持您训练": "Lamentablemente, no tiene una tarjeta gráfica adecuada para soportar su entrenamiento",
|
3 |
+
"是": "Sí",
|
4 |
+
"step1:正在处理数据": "Paso 1: Procesando datos",
|
5 |
+
"step2a:无需提取音高": "Paso 2a: No es necesario extraer el tono",
|
6 |
+
"step2b:正在提取特征": "Paso 2b: Extrayendo características",
|
7 |
+
"step3a:正在训练模型": "Paso 3a: Entrenando el modelo",
|
8 |
+
"训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Entrenamiento finalizado, puede ver el registro de entrenamiento en la consola o en el archivo train.log en la carpeta del experimento",
|
9 |
+
"全流程结束!": "¡Todo el proceso ha terminado!",
|
10 |
+
"本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.": "Este software es de código abierto bajo la licencia MIT, el autor no tiene ningún control sobre el software, y aquellos que usan el software y difunden los sonidos exportados por el software son los únicos responsables.<br>Si no está de acuerdo con esta cláusula, no puede utilizar ni citar ningún código ni archivo del paquete de software. Consulte el directorio raíz <b>Agreement-LICENSE.txt</b> para obtener más información.",
|
11 |
+
"模型推理": "inferencia del modelo",
|
12 |
+
"推理音色": "inferencia de voz",
|
13 |
+
"刷新音色列表和索引路径": "Actualizar la lista de timbres e índice de rutas",
|
14 |
+
"卸载音色省显存": "Descargue la voz para ahorrar memoria GPU",
|
15 |
+
"请选择说话人id": "Seleccione el ID del hablante",
|
16 |
+
"男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Tecla +12 recomendada para conversión de voz de hombre a mujer, tecla -12 para conversión de voz de mujer a hombre. Si el rango de tono es demasiado amplio y causa distorsión, ajústelo usted mismo a un rango adecuado.",
|
17 |
+
"变调(整数, 半音数量, 升八度12降八度-12)": "Cambio de tono (entero, número de semitonos, subir una octava +12 o bajar una octava -12)",
|
18 |
+
"输入待处理音频文件路径(默认是正确格式示例)": "Ingrese la ruta del archivo del audio que se procesará (el formato predeterminado es el ejemplo correcto)",
|
19 |
+
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "Seleccione el algoritmo para la extracción de tono. Use 'pm' para acelerar las voces cantadas, o use 'harvest' para mejorar las voces bajas, pero es extremadamente lento.",
|
20 |
+
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
|
21 |
+
"特征检索库文件路径": "Ruta del archivo de la base de datos de búsqueda de características",
|
22 |
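+
">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "Si es >=3: aplica un filtro de mediana a los resultados de tono obtenidos con harvest. El valor representa el radio del filtro y puede reducir la ronquera.",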
|
23 |
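+
"特征检索库文件路径,为空则使用下拉的选择结果": "Ruta del archivo de índice de características. Déjelo en blanco para usar el resultado seleccionado en el menú desplegable",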
|
24 |
+
"自动检测index路径,下拉式选择(dropdown)": "Detectar automáticamente la ruta del índice y seleccionarla en el menú desplegable",
|
25 |
+
"特征文件路径": "Ruta del archivo de características",
|
26 |
+
"检索特征占比": "Proporción de función de búsqueda",
|
27 |
+
"后处理重采样至最终采样率,0为不进行重采样": "Remuestreo posterior al proceso a la tasa de muestreo final, 0 significa no remuestrear",
|
28 |
+
"输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Proporción de fusión para reemplazar el sobre de volumen de entrada con el sobre de volumen de salida, cuanto más cerca de 1, más se utiliza el sobre de salida",
|
29 |
+
"保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Proteger las consonantes claras y la respiración, prevenir artefactos como la distorsión de sonido electrónico, 0.5 no está activado, reducir aumentará la protección pero puede reducir el efecto del índice",
|
30 |
+
"F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "Archivo de curva F0, opcional, un tono por línea, en lugar de F0 predeterminado y cambio de tono",
|
31 |
+
"转换": "Conversión",
|
32 |
+
"输出信息": "Información de salida",
|
33 |
+
"输出音频(右下角三个点,点了可以下载)": "Salida de audio (haga clic en los tres puntos en la esquina inferior derecha para descargar)",
|
34 |
+
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversión por lotes, ingrese la carpeta que contiene los archivos de audio para convertir o cargue varios archivos de audio. El audio convertido se emitirá en la carpeta especificada (por defecto: 'opt').",
|
35 |
+
"指定输出文件夹": "Especificar carpeta de salida",
|
36 |
+
"输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Ingrese la ruta a la carpeta de audio que se procesará (simplemente cópiela desde la barra de direcciones del administrador de archivos)",
|
37 |
+
"也可批量输入音频文件, 二选一, 优先读文件夹": "También se pueden ingresar múltiples archivos de audio, cualquiera de las dos opciones, con prioridad dada a la carpeta",
|
38 |
+
"导出文件格式": "Formato de archivo de exportación",
|
39 |
+
"伴奏人声分离&去混响&去回声": "Separación de voz acompañante & eliminación de reverberación & eco",
|
40 |
+
"人声伴奏分离批量处理, 使用UVR5模型。 <br>合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 <br>模型分为三类: <br>1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点; <br>2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型; <br> 3、去混响、去延迟模型(by FoxJoy):<br> (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;<br> (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。<br>去混响/去延迟,附:<br>1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;<br>2、MDX-Net-Dereverb模型挺慢的;<br>3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Procesamiento por lotes para la separación de acompañamiento vocal utilizando el modelo UVR5.<br>Ejemplo de formato de ruta de carpeta válido: D:\\ruta\\a\\la\\carpeta\\de\\entrada (copiar desde la barra de direcciones del administrador de archivos).<br>El modelo se divide en tres categorías:<br>1. Preservar voces: Elija esta opción para audio sin armonías. Preserva las voces mejor que HP5. Incluye dos modelos incorporados: HP2 y HP3. HP3 puede filtrar ligeramente el acompañamiento pero conserva las voces un poco mejor que HP2.<br>2. Preservar solo voces principales: Elija esta opción para audio con armonías. Puede debilitar las voces principales. Incluye un modelo incorporado: HP5.<br>3. Modelos de des-reverberación y des-retardo (por FoxJoy):<br> (1) MDX-Net: La mejor opción para la eliminación de reverberación estéreo pero no puede eliminar la reverberación mono;<br> (234) DeEcho: Elimina efectos de retardo. El modo Agresivo elimina más a fondo que el modo Normal. DeReverb adicionalmente elimina la reverberación y puede eliminar la reverberación mono, pero no muy efectivamente para contenido de alta frecuencia fuertemente reverberado.<br>Notas de des-reverberación/des-retardo:<br>1. El tiempo de procesamiento para el modelo DeEcho-DeReverb es aproximadamente el doble que los otros dos modelos DeEcho.<br>2. El modelo MDX-Net-Dereverb es bastante lento.<br>3. 
La configuración más limpia recomendada es aplicar primero MDX-Net y luego DeEcho-Agresivo.",
|
41 |
+
"输入待处理音频文件夹路径": "Ingrese la ruta a la carpeta de audio que se procesará",
|
42 |
+
"模型": "Modelo",
|
43 |
+
"指定输出主人声文件夹": "Especifique la carpeta de salida para la voz principal",
|
44 |
+
"指定输出非主人声文件夹": "Especifique la carpeta de salida para las voces no principales",
|
45 |
+
"训练": "Entrenamiento",
|
46 |
+
"step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "paso 1: Complete la configuración del experimento. Los datos del experimento se almacenan en el directorio 'logs', con cada experimento en una carpeta separada. La ruta del nombre del experimento debe ingresarse manualmente y debe contener la configuración del experimento, los registros y los archivos del modelo entrenado.",
|
47 |
+
"输入实验名": "Ingrese el nombre del modelo",
|
48 |
+
"目标采样率": "Tasa de muestreo objetivo",
|
49 |
+
"模型是否带音高指导(唱歌一定要, 语音可以不要)": "Si el modelo tiene guía de tono (necesaria para cantar, pero no para hablar)",
|
50 |
+
"版本": "Versión",
|
51 |
+
"提取音高和处理数据使用的CPU进程数": "Número de procesos de CPU utilizados para extraer el tono y procesar los datos",
|
52 |
+
"step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "paso 2a: recorra automáticamente la carpeta de capacitación y corte y normalice todos los archivos de audio que se pueden decodificar en audio. Se generarán dos carpetas 'wav' en el directorio del experimento. Actualmente, solo se admite la capacitación de una sola persona.",
|
53 |
+
"输入训练文件夹路径": "Introduzca la ruta de la carpeta de entrenamiento",
|
54 |
+
"请指定说话人id": "Especifique el ID del hablante",
|
55 |
+
"处理数据": "Procesar datos",
|
56 |
+
"step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "paso 2b: use la CPU para extraer el tono (si el modelo tiene guía de tono) y la GPU para extraer características (seleccione el número de tarjeta).",
|
57 |
+
"以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Separe los números de identificación de la GPU con '-' al ingresarlos. Por ejemplo, '0-1-2' significa usar GPU 0, GPU 1 y GPU 2.",
|
58 |
+
"显卡信息": "información de la GPU",
|
59 |
+
"选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢": "Seleccione el algoritmo de extracción de tono: utilice 'pm' para un procesamiento más rápido de la voz cantada, 'dio' para un discurso de alta calidad pero un procesamiento más lento y 'harvest' para obtener la mejor calidad pero un procesamiento más lento.",
|
60 |
+
"特征提取": "Extracción de características",
|
61 |
+
"step3: 填写训练设置, 开始训练模型和索引": "Paso 3: complete la configuración de entrenamiento y comience a entrenar el modelo y el índice.",
|
62 |
+
"保存频率save_every_epoch": "Frecuencia de guardado (save_every_epoch)",
|
63 |
+
"总训练轮数total_epoch": "Total de épocas de entrenamiento (total_epoch)",
|
64 |
+
"每张显卡的batch_size": "Tamaño del lote (batch_size) por tarjeta gráfica",
|
65 |
+
"是否仅保存最新的ckpt文件以节省硬盘空间": "Si guardar solo el archivo ckpt más reciente para ahorrar espacio en disco",
|
66 |
+
"否": "No",
|
67 |
+
"是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Si almacenar en caché todos los conjuntos de entrenamiento en la memoria de la GPU. Los conjuntos de datos pequeños (menos de 10 minutos) se pueden almacenar en caché para acelerar el entrenamiento, pero el almacenamiento en caché de conjuntos de datos grandes puede causar errores de memoria en la GPU y no aumenta la velocidad de manera significativa.",
|
68 |
+
"是否在每次保存时间点将最终小模型保存至weights文件夹": "¿Guardar el pequeño modelo final en la carpeta 'weights' en cada punto de guardado?",
|
69 |
+
"加载预训练底模G路径": "Cargue la ruta G del modelo base preentrenada.",
|
70 |
+
"加载预训练底模D路径": "Cargue la ruta del modelo D base preentrenada.",
|
71 |
+
"训练模型": "Entrenar Modelo",
|
72 |
+
"训练特征索引": "Índice de características del Entrenamiento",
|
73 |
+
"一键训练": "Entrenamiento con un clic. (No funciona en este fork)",
|
74 |
+
"ckpt处理": "Procesamiento de ckpt",
|
75 |
+
"模型融合, 可用于测试音色融合": "Fusión de modelos, se puede utilizar para fusionar diferentes voces",
|
76 |
+
"A模型路径": "Modelo A ruta.",
|
77 |
+
"B模型路径": "Modelo B ruta.",
|
78 |
+
"A模型权重": "Peso (w) del modelo A.",
|
79 |
+
"模型是否带音高指导": "Si el modelo tiene guía de tono.",
|
80 |
+
"要置入的模型信息": "Información del modelo a colocar.",
|
81 |
+
"保存的模型名不带后缀": "Nombre del modelo guardado sin extensión.",
|
82 |
+
"模型版本型号": "Versión y modelo del modelo",
|
83 |
+
"融合": "Fusión.",
|
84 |
+
"修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modificar la información del modelo (solo admite archivos de modelos pequeños extraídos en la carpeta de pesos).",
|
85 |
+
"模型路径": "Ruta del modelo",
|
86 |
+
"要改的模型信息": "Información del modelo a modificar",
|
87 |
+
"保存的文件名, 默认空为和源文件同名": "Nombre del archivo que se guardará, el valor predeterminado es el mismo que el nombre del archivo de origen",
|
88 |
+
"修改": "Modificar",
|
89 |
+
"查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Ver información del modelo (solo aplicable a archivos de modelos pequeños extraídos de la carpeta 'pesos')",
|
90 |
+
"查看": "Ver",
|
91 |
+
"模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Extracción de modelo (ingrese la ruta de un archivo de modelo grande en la carpeta 'logs'), aplicable cuando desea extraer un archivo de modelo pequeño después de entrenar a mitad de camino y no se guardó automáticamente, o cuando desea probar un modelo intermedio",
|
92 |
+
"保存名": "Guardar nombre",
|
93 |
+
"模型是否带音高指导,1是0否": "Si el modelo tiene guía de tono, 1 para sí, 0 para no",
|
94 |
+
"提取": "Extraer",
|
95 |
+
"Onnx导出": "Exportar Onnx",
|
96 |
+
"RVC模型路径": "Ruta del modelo RVC",
|
97 |
+
"Onnx输出路径": "Ruta de salida Onnx",
|
98 |
+
"导出Onnx模型": "Exportar modelo Onnx",
|
99 |
+
"常见问题解答": "Preguntas frecuentes",
|
100 |
+
"招募音高曲线前端编辑器": "Reclutar editores front-end para curvas de tono",
|
101 |
+
"加开发群联系我xxxxx": "Únase al grupo de desarrollo para contactarme en xxxxx",
|
102 |
+
"点击查看交流、问题反馈群号": "Haga clic para ver el número de grupo de comunicación y comentarios sobre problemas",
|
103 |
+
"xxxxx": "xxxxx",
|
104 |
+
"加载模型": "Cargar modelo",
|
105 |
+
"Hubert模型": "Modelo de Hubert ",
|
106 |
+
"选择.pth文件": "Seleccionar archivo .pth",
|
107 |
+
"选择.index文件": "Seleccionar archivo .index",
|
108 |
+
"选择.npy文件": "Seleccionar archivo .npy",
|
109 |
+
"输入设备": "Dispositivo de entrada",
|
110 |
+
"输出设备": "Dispositivo de salida",
|
111 |
+
"音频设备(请使用同种类驱动)": "Dispositivo de audio (utilice el mismo tipo de controlador)",
|
112 |
+
"响应阈值": "Umbral de respuesta",
|
113 |
+
"音调设置": "Ajuste de tono",
|
114 |
+
"Index Rate": "Tasa de índice",
|
115 |
+
"常规设置": "Configuración general",
|
116 |
+
"采样长度": "Longitud de muestreo",
|
117 |
+
"淡入淡出长度": "Duración del fundido de entrada/salida",
|
118 |
+
"额外推理时长": "Tiempo de inferencia adicional",
|
119 |
+
"输入降噪": "Reducción de ruido de entrada",
|
120 |
+
"输出降噪": "Reducción de ruido de salida",
|
121 |
+
"性能设置": "Configuración de rendimiento",
|
122 |
+
"开始音频转换": "Iniciar conversión de audio",
|
123 |
+
"停止音频转换": "Detener la conversión de audio",
|
124 |
+
"推理时间(ms):": "Inferir tiempo (ms):",
|
125 |
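+
"请选择pth文件": "Seleccione un archivo .pth",
|
126 |
+
"请选择index文件": "Seleccione un archivo .index",
|
127 |
+
"hubert模型路径不可包含中文": "La ruta del modelo Hubert no puede contener caracteres chinos",
|
128 |
+
"pth文件路径不可包含中文": "La ruta del archivo .pth no puede contener caracteres chinos",
|
129 |
+
"index文件路径不可包含中文": "La ruta del archivo .index no puede contener caracteres chinos",
|
130 |
+
"音高算法": "Algoritmo de tono",
|
131 |
+
"harvest进程数": "Número de procesos de harvest"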
|
132 |
+
}
|
i18n/it_IT.json
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"很遗憾您这没有能用的显卡来支持您训练": "Sfortunatamente, non è disponibile alcuna GPU compatibile per supportare l'addestramento.",
|
3 |
+
"是": "SÌ",
|
4 |
+
"step1:正在处理数据": "Passaggio 1: elaborazione dei dati",
|
5 |
+
"step2a:无需提取音高": "Passaggio 2a: salto dell'estrazione del tono",
|
6 |
+
"step2b:正在提取特征": "Passaggio 2b: estrazione delle funzionalità",
|
7 |
+
"step3a:正在训练模型": "Passaggio 3a: è iniziato l'addestramento del modello",
|
8 |
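+
"训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Addestramento completato. Puoi controllare i log di addestramento nella console o il file 'train.log' nella cartella dell'esperimento.",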
|
9 |
+
"全流程结束!": "Tutti i processi sono stati completati!",
|
10 |
+
"本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.": "Questo software è open source con licenza MIT. L'autore non ha alcun controllo sul software. Chi utilizza il software e chi diffonde i suoni esportati dal software ne è l'unico responsabile. <br>Se non si accetta questa clausola, non è possibile utilizzare o fare riferimento a codici e file all'interno del pacchetto software. Vedere <b>LICENSE</b> nella directory principale per i dettagli.",
|
11 |
+
"模型推理": "Inferenza del modello",
|
12 |
+
"推理音色": "Voce di inferenza:",
|
13 |
+
"刷新音色列表和索引路径": "Aggiorna l'elenco delle voci e il percorso dell'indice",
|
14 |
+
"卸载音色省显存": "Scarica la voce per risparmiare memoria della GPU:",
|
15 |
+
"请选择说话人id": "Seleziona ID locutore/cantante:",
|
16 |
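+
"男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Tonalità +12 consigliata per la conversione da maschio a femmina e tonalità -12 per la conversione da femmina a maschio. Se l'estensione vocale va troppo oltre e la voce risulta distorta, puoi anche regolarla manualmente sull'intervallo appropriato.",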
|
17 |
+
"变调(整数, 半音数量, 升八度12降八度-12)": "Trasposizione (numero intero, numero di semitoni, alza di un'ottava: 12, abbassa di un'ottava: -12):",
|
18 |
+
"输入待处理音频文件路径(默认是正确格式示例)": "Immettere il percorso del file audio da elaborare (l'impostazione predefinita è l'esempio di formato corretto):",
|
19 |
+
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "Seleziona l'algoritmo di estrazione del tono (\"pm\": estrazione più veloce ma risultato di qualità inferiore; \"harvest\": bassi migliori ma estremamente lenti; \"crepe\": qualità migliore ma utilizzo intensivo della GPU):",
|
20 |
+
">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "Se >=3: applica il filtro mediano ai risultati del tono estratti con harvest. Il valore rappresenta il raggio del filtro e può ridurre la voce soffiata.",
|
21 |
+
"特征检索库文件路径,为空则使用下拉的选择结果": "Percorso del file di indice delle caratteristiche. Lascia vuoto per utilizzare il risultato selezionato dal menu a tendina:",
|
22 |
+
"自动检测index路径,下拉式选择(dropdown)": "Rileva automaticamente il percorso dell'indice e seleziona dal menu a tendina:",
|
23 |
+
"特征文件路径": "Percorso del file delle caratteristiche:",
|
24 |
+
"检索特征占比": "Rapporto funzionalità di ricerca (controlla la forza dell'accento, troppo alto ha artefatti):",
|
25 |
+
"后处理重采样至最终采样率,0为不进行重采样": "Ricampiona l'audio di output in post-elaborazione alla frequenza di campionamento finale. Imposta 0 per non eseguire il ricampionamento:",
|
26 |
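+
"输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Usa l'inviluppo del volume dell'ingresso per sostituire o miscelare l'inviluppo del volume dell'uscita. Più il rapporto è vicino a 1, più viene utilizzato l'inviluppo dell'uscita:",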
|
27 |
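+
"保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Proteggi le consonanti senza voce e i suoni del respiro per evitare artefatti come il tearing nella musica elettronica. Imposta 0,5 per disabilitare. Diminuisci il valore per aumentare la protezione, ma ciò potrebbe ridurre la precisione dell'indicizzazione:",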
|
28 |
+
"F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "File curva F0 (opzionale). Un tono per riga. Sostituisce l'F0 predefinito e la modulazione del tono:",
|
29 |
+
"转换": "Convertire",
|
30 |
+
"输出信息": "Informazioni sull'uscita",
|
31 |
+
"输出音频(右下角三个点,点了可以下载)": "Esporta audio (clicca sui tre puntini in basso a destra per scaricarlo)",
|
32 |
+
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversione massiva. Inserisci il percorso della cartella che contiene i file da convertire o carica più file audio. I file convertiti finiranno nella cartella specificata. (default: opt) ",
|
33 |
+
"指定输出文件夹": "Specifica la cartella di output:",
|
34 |
+
"输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Immettere il percorso della cartella audio da elaborare (copiarlo dalla barra degli indirizzi del file manager):",
|
35 |
+
"也可批量输入音频文件, 二选一, 优先读文件夹": "Puoi anche inserire file audio in massa. Scegli una delle due opzioni. La lettura dalla cartella ha la priorità.",
|
36 |
+
"导出文件格式": "Formato file di esportazione",
|
37 |
+
"伴奏人声分离&去混响&去回声": "Separazione voce/accompagnamento",
|
38 |
+
"人声伴奏分离批量处理, 使用UVR5模型。 <br>合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 <br>模型分为三类: <br>1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点; <br>2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型; <br> 3、去混响、去延迟模型(by FoxJoy):<br> (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;<br> (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。<br>去混响/去延迟,附:<br>1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;<br>2、MDX-Net-Dereverb模型挺慢的;<br>3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Elaborazione batch per la separazione dell'accompagnamento vocale utilizzando il modello UVR5.<br>Esempio di un formato di percorso di cartella valido: D:\\path\\to\\input\\folder (copialo dalla barra degli indirizzi del file manager).<br>Il modello è suddiviso in tre categorie:<br>1. Conserva la voce: scegli questa opzione per l'audio senza armonie. Conserva la voce meglio di HP5. Include due modelli integrati: HP2 e HP3. HP3 può lasciar passare leggermente l'accompagnamento, ma conserva la voce leggermente meglio di HP2.<br>2. Mantieni solo la voce principale: scegli questa opzione per l'audio con armonie. Può indebolire la voce principale. Include un modello integrato: HP5.<br>3. Modelli di de-riverbero e de-delay (di FoxJoy):<br> (1) MDX-Net: la scelta migliore per la rimozione del riverbero stereo ma non può rimuovere il riverbero mono;<br> (234) DeEcho: rimuove gli effetti di ritardo. La modalità Aggressive rimuove in modo più completo rispetto alla modalità Normal. DeReverb rimuove inoltre il riverbero e può rimuovere il riverbero mono, ma non è molto efficace sui contenuti ad alta frequenza fortemente riverberati.<br>Note di de-riverbero/de-delay:<br>1. Il tempo di elaborazione per il modello DeEcho-DeReverb è circa il doppio rispetto agli altri due modelli DeEcho.<br>2. Il modello MDX-Net-Dereverb è piuttosto lento.<br>3. La configurazione più pulita consigliata consiste nell'applicare prima MDX-Net e poi DeEcho-Aggressive.",
|
39 |
+
"输入待处理音频文件夹路径": "Immettere il percorso della cartella audio da elaborare:",
|
40 |
+
"模型": "Modello",
|
41 |
+
"指定输出主人声文件夹": "Specifica la cartella di output per le voci:",
|
42 |
+
"指定输出非主人声文件夹": "Specificare la cartella di output per l'accompagnamento:",
|
43 |
+
"训练": "Addestramento",
|
44 |
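+
"step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Passaggio 1: compilare la configurazione sperimentale. I dati sperimentali sono archiviati nella cartella 'logs', con una cartella per ogni esperimento. È necessario inserire manualmente il percorso del nome dell'esperimento, che contiene la configurazione sperimentale, i log e i file del modello addestrato.",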
|
45 |
+
"输入实验名": "Inserisci il nome dell'esperimento:",
|
46 |
+
"目标采样率": "Frequenza di campionamento target:",
|
47 |
+
"模型是否带音高指导(唱歌一定要, 语音可以不要)": "Se il modello ha una guida del tono (necessario per il canto, facoltativo per il parlato):",
|
48 |
+
"版本": "Versione",
|
49 |
+
"提取音高和处理数据使用的CPU进程数": "Numero di processi CPU utilizzati per l'estrazione del tono e l'elaborazione dei dati:",
|
50 |
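+
"step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Passaggio 2a: attraversa automaticamente tutti i file nella cartella di addestramento che possono essere decodificati in audio ed esegui la normalizzazione delle sezioni. Genera 2 cartelle wav nella directory dell'esperimento. Al momento è supportato solo l'addestramento di un singolo locutore/cantante.",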
|
51 |
+
"输入训练文件夹路径": "Inserisci il percorso della cartella di addestramento:",
|
52 |
+
"请指定说话人id": "Si prega di specificare l'ID del locutore/cantante:",
|
53 |
+
"处理数据": "Processa dati",
|
54 |
+
"step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Passaggio 2b: utilizzare la CPU per estrarre il tono (se il modello ha il tono), utilizzare la GPU per estrarre le caratteristiche (selezionare l'indice GPU):",
|
55 |
+
"以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Inserisci gli indici GPU separati da '-', ad esempio 0-1-2 per utilizzare GPU 0, 1 e 2:",
|
56 |
+
"显卡信息": "Informazioni GPU",
|
57 |
+
"选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢": "Seleziona l'algoritmo di estrazione del tono (\"pm\": estrazione più rapida ma parlato di qualità inferiore; \"dio\": parlato migliorato ma estrazione più lenta; \"harvest\": migliore qualità ma estrazione più lenta):",
|
58 |
+
"特征提取": "Estrazione delle caratteristiche",
|
59 |
+
"step3: 填写训练设置, 开始训练模型和索引": "Passaggio 3: compilare le impostazioni di addestramento e avviare l'addestramento del modello e dell'indice",
|
60 |
+
"保存频率save_every_epoch": "Frequenza di salvataggio (save_every_epoch):",
|
61 |
+
"总训练轮数total_epoch": "Epoch totali di addestramento (total_epoch):",
|
62 |
+
"每张显卡的batch_size": "Dimensione batch per GPU:",
|
63 |
+
"是否仅保存最新的ckpt文件以节省硬盘空间": "Salva solo l'ultimo file '.ckpt' per risparmiare spazio su disco:",
|
64 |
+
"否": "NO",
|
65 |
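+
"是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Memorizza nella cache tutti i set di addestramento nella memoria della GPU. La memorizzazione nella cache di piccoli set di dati (meno di 10 minuti) può velocizzare l'addestramento, mentre con set di dati di grandi dimensioni si consuma molta memoria della GPU senza un aumento apprezzabile della velocità:",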
|
66 |
+
"是否在每次保存时间点将最终小模型保存至weights文件夹": "Salva un piccolo modello finale nella cartella \"weights\" in ogni punto di salvataggio:",
|
67 |
+
"加载预训练底模G路径": "Carica il percorso G del modello base pre-addestrato:",
|
68 |
+
"加载预训练底模D路径": "Carica il percorso D del modello base pre-addestrato:",
|
69 |
+
"训练模型": "Addestra modello",
|
70 |
+
"训练特征索引": "Addestra indice delle caratteristiche",
|
71 |
+
"一键训练": "Addestramento con un clic",
|
72 |
+
"ckpt处理": "Elaborazione ckpt",
|
73 |
+
"模型融合, 可用于测试音色融合": "Fusione dei modelli, può essere utilizzata per testare la fusione timbrica",
|
74 |
+
"A模型路径": "Percorso per il modello A:",
|
75 |
+
"B模型路径": "Percorso per il modello B:",
|
76 |
+
"A模型权重": "Peso (w) per il modello A:",
|
77 |
+
"模型是否带音高指导": "Se il modello ha una guida del tono:",
|
78 |
+
"要置入的模型信息": "Informazioni sul modello da posizionare:",
|
79 |
+
"保存的模型名不带后缀": "Nome del modello salvato (senza estensione):",
|
80 |
+
"模型版本型号": "Versione dell'architettura del modello:",
|
81 |
+
"融合": "Fusione",
|
82 |
+
"修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modifica le informazioni sul modello (supportato solo per i file di modello di piccole dimensioni estratti dalla cartella 'weights')",
|
83 |
+
"模型路径": "Percorso al modello:",
|
84 |
+
"要改的模型信息": "Informazioni sul modello da modificare:",
|
85 |
+
"保存的文件名, 默认空为和源文件同名": "Salva il nome del file (predefinito: uguale al file di origine):",
|
86 |
+
"修改": "Modificare",
|
87 |
+
"查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Visualizza le informazioni sul modello (supportato solo per file di modello piccoli estratti dalla cartella 'weights')",
|
88 |
+
"查看": "Visualizzazione",
|
89 |
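+
"模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Estrazione del modello (inserire il percorso del modello di file di grandi dimensioni nella cartella \"logs\"). Utile se si desidera interrompere l'addestramento a metà ed estrarre e salvare manualmente un file di modello di piccole dimensioni, o se si desidera testare un modello intermedio:",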
|
90 |
+
"保存名": "Salva nome:",
|
91 |
+
"模型是否带音高指导,1是0否": "Se il modello ha una guida del tono (1: sì, 0: no):",
|
92 |
+
"提取": "Estrai",
|
93 |
+
"Onnx导出": "Esporta Onnx",
|
94 |
+
"RVC模型路径": "Percorso modello RVC:",
|
95 |
+
"Onnx输出路径": "Percorso di esportazione Onnx:",
|
96 |
+
"导出Onnx模型": "Esporta modello Onnx",
|
97 |
+
"常见问题解答": "FAQ (Domande frequenti)",
|
98 |
+
"招募音高曲线前端编辑器": "Reclutamento di redattori front-end per curve di tono",
|
99 |
+
"加开发群联系我xxxxx": "Unisciti al gruppo di sviluppo e contattami a xxxxx",
|
100 |
+
"点击查看交流、问题反馈群号": "Fare clic per visualizzare il numero del gruppo di comunicazione e feedback sui problemi",
|
101 |
+
"xxxxx": "xxxxx",
|
102 |
+
"加载模型": "Carica modello",
|
103 |
+
"Hubert模型": "Modello Hubert",
|
104 |
+
"选择.pth文件": "Seleziona il file .pth",
|
105 |
+
"选择.index文件": "Seleziona il file .index",
|
106 |
+
"选择.npy文件": "Seleziona il file .npy",
|
107 |
+
"输入设备": "Dispositivo di input",
|
108 |
+
"输出设备": "Dispositivo di uscita",
|
109 |
+
"音频设备(请使用同种类驱动)": "Dispositivo audio (utilizzare lo stesso tipo di driver)",
|
110 |
+
"响应阈值": "Soglia di risposta",
|
111 |
+
"音调设置": "Impostazioni del tono",
|
112 |
+
"Index Rate": "Tasso di indice",
|
113 |
+
"常规设置": "Impostazioni generali",
|
114 |
+
"采样长度": "Lunghezza del campione",
|
115 |
+
"淡入淡出长度": "Lunghezza dissolvenza",
|
116 |
+
"额外推理时长": "Tempo di inferenza extra",
|
117 |
+
"输入降噪": "Riduzione del rumore in ingresso",
|
118 |
+
"输出降噪": "Riduzione del rumore in uscita",
|
119 |
+
"性能设置": "Impostazioni delle prestazioni",
|
120 |
+
"开始音频转换": "Avvia la conversione audio",
|
121 |
+
"停止音频转换": "Arresta la conversione audio",
|
122 |
+
"推理时间(ms):": "Tempo di inferenza (ms):",
|
123 |
+
"请选择pth文件": "Seleziona il file pth",
|
124 |
+
"请选择index文件": "请选择index文件",
|
125 |
+
"hubert模型路径不可包含中文": "Il percorso del modello hubert non può contenere caratteri cinesi",
|
126 |
+
"pth文件路径不可包含中文": "Il percorso del file pth non può contenere caratteri cinesi",
|
127 |
+
"index文件路径不可包含中文": "index文件路径不可包含中文",
|
128 |
+
"音高算法": "音高算法",
|
129 |
+
"harvest进程数": "harvest进程数"
|
130 |
+
}
|
i18n/ja_JP.json
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"很遗憾您这没有能用的显卡来支持您训练": "トレーニングに対応したGPUが動作しないのは残念です。",
|
3 |
+
"是": "はい",
|
4 |
+
"step1:正在处理数据": "step1:処理中のデータ",
|
5 |
+
"step2a:无需提取音高": "step2a:ピッチの抽出は不要",
|
6 |
+
"step2b:正在提取特征": "step2b:抽出される特徴量",
|
7 |
+
"step3a:正在训练模型": "step3a:トレーニング中のモデル",
|
8 |
+
"训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "トレーニング終了時に、トレーニングログやフォルダ内のtrain.logを確認することができます",
|
9 |
+
"全流程结束!": "全工程が完了!",
|
10 |
+
"本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>使用需遵守的协议-LICENSE.txt</b>.": "本ソフトウェアはMITライセンスに基づくオープンソースであり、作者は本ソフトウェアに対していかなる強制力も持ちません。本ソフトウェアの利用者および本ソフトウェアから派生した音源(成果物)を配布する者は、本ソフトウェアに対して自身で責任を負うものとします。 <br>この条項に同意しない場合、パッケージ内のコードやファイルを使用や参照を禁じます。詳しくは<b>使用需遵守的协议-LICENSE.txt</b>をご覧ください.",
|
11 |
+
"模型推理": "モデル推論",
|
12 |
+
"推理音色": "音源推論",
|
13 |
+
"刷新音色列表和索引路径": "音源リストとインデックスパスの更新",
|
14 |
+
"卸载音色省显存": "音源を削除してメモリを節約",
|
15 |
+
"请选择说话人id": "話者IDを選択してください",
|
16 |
+
"男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性から女性へは+12キーをお勧めします。女性から男性へは-12キーをお勧めします。音域が広すぎて音質が劣化した場合は、適切な音域に自分で調整することもできます。",
|
17 |
+
"变调(整数, 半音数量, 升八度12降八度-12)": "ピッチ変更(整数、半音数、1オクターブ上げる場合は12、1オクターブ下げる場合は-12)",
|
18 |
+
"输入待处理音频文件路径(默认是正确格式示例)": "処理対象音声ファイルのパスを入力してください(デフォルトは正しいフォーマットの例です)",
|
19 |
+
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU",
|
20 |
+
"crepe_hop_length": "Crepeのホップ長(crepeにのみ適用):ホップ長とは、話者が大きく異なるピッチへ移るまでにかかる時間を指します。ホップ長を短くすると推論に時間がかかりますが、ピッチの精度が上がります。",
|
21 |
+
"特征检索库文件路径": "特徴量検索データベースのファイルパス",
|
22 |
+
">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "3以上の場合、harvestピッチの認識結果にメディアンフィルタを適用します。値はフィルター半径で、かすれ声を軽減できます。",
|
23 |
+
"特征检索库文件路径,为空则使用下拉的选择结果": "特徴検索ライブラリへのパス 空の場合はドロップダウンで選択",
|
24 |
+
"自动检测index路径,下拉式选择(dropdown)": "インデックスパスの自動検出 ドロップダウンで選択",
|
25 |
+
"特征文件路径": "特徴量ファイルのパス",
|
26 |
+
"检索特征占比": "検索特徴率",
|
27 |
+
"后处理重采样至最终采样率,0为不进行重采样": "最終的なサンプリングレートへのポストプロセッシングのリサンプリング リサンプリングしない場合は0",
|
28 |
+
"输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "入力ソースの音量エンベロープと出力音量エンベロープの融合率 1に近づくほど、出力音量エンベロープの割合が高くなる",
|
29 |
+
"保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果",
|
30 |
+
"F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0(基本周波数)カーブファイル(オプション、1行に1ピッチ、デフォルトのF0とピッチの上げ下げを置き換えます。)",
|
31 |
+
"转换": "変換",
|
32 |
+
"输出信息": "出力情報",
|
33 |
+
"输出音频(右下角三个点,点了可以下载)": "出力音声(右下の三点をクリックしてダウンロードできます)",
|
34 |
+
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "一括変換、変換する音声フォルダを入力、または複数の音声ファイルをアップロードし、指定したフォルダ(デフォルトのopt)に変換した音声を出力します。",
|
35 |
+
"指定输出文件夹": "出力フォルダを指定してください",
|
36 |
+
"输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "処理対象音声フォルダーのパスを入力してください(ファイルマネージャのアドレスバーからコピーしてください)",
|
37 |
+
"也可批量输入音频文件, 二选一, 优先读文件夹": "複数の音声ファイルを一括で入力することもできますが、フォルダーを優先して読み込みます",
|
38 |
+
"导出文件格式": "导出文件格式",
|
39 |
+
"伴奏人声分离&去混响&去回声": "伴奏人声分离&去混响&去回声",
|
40 |
+
"输入待处理音频文件夹路径": "処理するオーディオファイルのフォルダパスを入力してください",
|
41 |
+
"模型": "モデル",
|
42 |
+
"指定输出主人声文件夹": "指定输出主人声文件夹",
|
43 |
+
"指定输出非主人声文件夹": "指定输出非主人声文件夹",
|
44 |
+
"训练": "トレーニング",
|
45 |
+
"step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "ステップ1:実験設定を入力します。実験データはlogsに保存され、各実験にはフォルダーがあります。実験名のパスを手動で入力する必要があり、実験設定、ログ、トレーニングされたモデルファイルが含まれます。",
|
46 |
+
"输入实验名": "モデル名",
|
47 |
+
"目标采样率": "目標サンプリングレート",
|
48 |
+
"模型是否带音高指导(唱歌一定要, 语音可以不要)": "モデルに音高ガイドがあるかどうか(歌唱には必要ですが、音声には必要ありません)",
|
49 |
+
"版本": "バージョン",
|
50 |
+
"提取音高和处理数据使用的CPU进程数": "ピッチの抽出やデータ処理に使用するCPUスレッド数",
|
51 |
+
"step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "ステップ2a: 訓練フォルダー内のすべての音声ファイルを自動的に探索し、スライスと正規化を行い、2つのwavフォルダーを実験ディレクトリに生成します。現在は一人でのトレーニングのみをサポートしています。",
|
52 |
+
"输入训练文件夹路径": "トレーニング用フォルダのパスを入力してください",
|
53 |
+
"请指定说话人id": "話者IDを指定してください",
|
54 |
+
"处理数据": "データ処理",
|
55 |
+
"step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "ステップ2b: CPUを使用して音高を抽出する(モデルに音高がある場合)、GPUを使用して特徴を抽出する(GPUの番号を選択する)",
|
56 |
+
"以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "ハイフンで区切って使用するGPUの番号を入力します。例えば0-1-2はGPU0、GPU1、GPU2を使用します",
|
57 |
+
"显卡信息": "GPU情報",
|
58 |
+
"选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢": "音高抽出アルゴリズムの選択:歌声を入力する場合は、pmを使用して速度を上げることができます。CPUが低い場合はdioを使用して速度を上げることができます。harvestは品質が高く、精度が高いですが、遅いです。",
|
59 |
+
"特征提取": "特徴抽出",
|
60 |
+
"step3: 填写训练设置, 开始训练模型和索引": "ステップ3: トレーニング設定を入力して、モデルとインデックスのトレーニングを開始します",
|
61 |
+
"保存频率save_every_epoch": "エポックごとの保存頻度",
|
62 |
+
"总训练轮数total_epoch": "総エポック数",
|
63 |
+
"每张显卡的batch_size": "GPUごとのバッチサイズ",
|
64 |
+
"是否仅保存最新的ckpt文件以节省硬盘空间": "ハードディスク容量を節約するため、最新のckptファイルのみを保存するかどうか",
|
65 |
+
"否": "いいえ",
|
66 |
+
"是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "すべてのトレーニングデータをメモリにキャッシュするかどうか。10分以下の小さなデータはキャッシュしてトレーニングを高速化できますが、大きなデータをキャッシュするとメモリが破裂し、あまり速度が上がりません。",
|
67 |
+
"是否在每次保存时间点将最终小模型保存至weights文件夹": "各保存時点の小モデルを全部weightsフォルダに保存するかどうか",
|
68 |
+
"加载预训练底模G路径": "事前学習済みのGモデルのパス",
|
69 |
+
"加载预训练底模D路径": "事前学習済みのDモデルのパス",
|
70 |
+
"训练模型": "モデルのトレーニング",
|
71 |
+
"训练特征索引": "特徴インデックスのトレーニング",
|
72 |
+
"一键训练": "ワンクリックトレーニング(このフォークでは動作しません)",
|
73 |
+
"ckpt处理": "ckptファイルの処理",
|
74 |
+
"模型融合, 可用于测试音色融合": "モデルのマージ、音源のマージテストに使用できます",
|
75 |
+
"A模型路径": "Aモデルのパス",
|
76 |
+
"B模型路径": "Bモデルのパス",
|
77 |
+
"A模型权重": "Aモデルの重み",
|
78 |
+
"模型是否带音高指导": "モデルに音高ガイドを付けるかどうか",
|
79 |
+
"要置入的模型信息": "挿入するモデル情報",
|
80 |
+
"保存的模型名不带后缀": "拡張子のない保存するモデル名",
|
81 |
+
"模型版本型号": "モデルのバージョン",
|
82 |
+
"融合": "フュージョン",
|
83 |
+
"修改模型信息(仅支持weights文件夹下提取的小模型文件)": "モデル情報の修正(weightsフォルダから抽出された小さなモデルファイルのみ対応)",
|
84 |
+
"模型路径": "モデルパス",
|
85 |
+
"要改的模型信息": "変更するモデル情報",
|
86 |
+
"保存的文件名, 默认空为和源文件同名": "保存するファイル名、デフォルトでは空欄で元のファイル名と同じ名前になります",
|
87 |
+
"修改": "変更",
|
88 |
+
"查看模型信息(仅支持weights文件夹下提取的小模型文件)": "モデル情報を表示する(小さいモデルファイルはweightsフォルダーからのみサポートされています)",
|
89 |
+
"查看": "表示",
|
90 |
+
"模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "モデル抽出(ログフォルダー内の大きなファイルのモデルパスを入力)、モデルを半分までトレーニングし、自動的に小さいファイルモデルを保存しなかったり、中間モデルをテストしたい場合に適用されます。",
|
91 |
+
"保存名": "保存ファイル名",
|
92 |
+
"模型是否带音高指导,1是0否": "モデルに音高ガイドを付けるかどうか、1は付ける、0は付けない",
|
93 |
+
"提取": "抽出",
|
94 |
+
"Onnx导出": "Onnxエクスポート",
|
95 |
+
"RVC模型路径": "RVCモデルパス",
|
96 |
+
"Onnx输出路径": "Onnx出力パス",
|
97 |
+
"MoeVS模型": "MoeSS?",
|
98 |
+
"导出Onnx模型": "Onnxに変換",
|
99 |
+
"常见问题解答": "よくある質問",
|
100 |
+
"招募音高曲线前端编辑器": "音高曲線フロントエンドエディターを募集",
|
101 |
+
"加开发群联系我xxxxx": "開発グループに参加して私に連絡してくださいxxxxx",
|
102 |
+
"点击查看交流、问题反馈群号": "クリックして交流、問題フィードバックグループ番号を表示",
|
103 |
+
"xxxxx": "xxxxx",
|
104 |
+
"加载模型": "モデルをロード",
|
105 |
+
"Hubert模型": "Hubertモデル",
|
106 |
+
"选择.pth文件": ".pthファイルを選択",
|
107 |
+
"选择.index文件": ".indexファイルを選択",
|
108 |
+
"选择.npy文件": ".npyファイルを選択",
|
109 |
+
"输入设备": "入力デバイス",
|
110 |
+
"输出设备": "出力デバイス",
|
111 |
+
"音频设备(请使用同种类驱动)": "オーディオデバイス(同じ種類のドライバーを使用してください)",
|
112 |
+
"响应阈值": "反応閾値",
|
113 |
+
"音调设置": "音程設定",
|
114 |
+
"Index Rate": "Index Rate",
|
115 |
+
"常规设置": "一般設定",
|
116 |
+
"采样长度": "サンプル長",
|
117 |
+
"淡入淡出长度": "フェードイン/フェードアウト長",
|
118 |
+
"额外推理时长": "追加推論時間",
|
119 |
+
"输入降噪": "入力ノイズの低減",
|
120 |
+
"输出降噪": "出力ノイズの低減",
|
121 |
+
"性能设置": "パフォーマンス設定",
|
122 |
+
"开始音频转换": "音声変換を開始",
|
123 |
+
"停止音频转换": "音声変換を停止",
|
124 |
+
"推理时间(ms):": "推論時間(ms):",
|
125 |
+
"人声伴奏分离批量处理, 使用UVR5模型。 <br>合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 <br>模型分为三类: <br>1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点; <br>2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型; <br> 3、去混响、去延迟模型(by FoxJoy):<br> (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;<br> (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。<br>去混响/去延迟,附:<br>1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;<br>2、MDX-Net-Dereverb模型挺慢的;<br>3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。":"UVR5モデルを使用したボーカル伴奏の分離バッチ処理。<br>有効なフォルダーパスフォーマットの例: D:\\path\\to\\input\\folder (ファイルマネージャのアドレスバーからコピーします)。<br>モデルは三つのカテゴリに分かれています:<br>1. ボーカルを保持: ハーモニーのないオーディオに対してこれを選択します。HP5よりもボーカルをより良く保持します。HP2とHP3の二つの内蔵モデルが含まれています。HP3は伴奏をわずかに漏らす可能性がありますが、HP2よりもわずかにボーカルをより良く保持します。<br>2. 主なボーカルのみを保持: ハーモニーのあるオーディオに対してこれを選択します。主なボーカルを弱める可能性があります。HP5の一つの内蔵モデルが含まれています。<br>3. ディリバーブとディレイモデル (by FoxJoy):<br> (1) MDX-Net: ステレオリバーブの除去に最適な選択肢ですが、モノリバーブは除去できません;<br> (234) DeEcho: ディレイ効果を除去します。AggressiveモードはNormalモードよりも徹底的に除去します。DeReverbはさらにリバーブを除去し、モノリバーブを除去することができますが、高周波のリバーブが強い内容に対しては非常に効果的ではありません。<br>ディリバーブ/ディレイに関する注意点:<br>1. DeEcho-DeReverbモデルの処理時間は、他の二つのDeEchoモデルの約二倍です。<br>2. MDX-Net-Dereverbモデルは非常に遅いです。<br>3. 推奨される最もクリーンな設定は、最初にMDX-Netを適用し、その後にDeEcho-Aggressiveを適用することです。"
|
126 |
+
}
|
i18n/locale_diff.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
from collections import OrderedDict
|
4 |
+
|
5 |
+
# Reference locale: every other locale file is aligned to its key set and order.
standard_file = "zh_CN.json"

# Directory that holds the locale JSON files (the script is run from i18n/).
dir_path = "./"


def align_with_standard(standard_data, lang_data):
    """Return a copy of *lang_data* aligned to the keys of *standard_data*.

    The result contains exactly the keys of ``standard_data``, in the same
    order. A key that ``lang_data`` already has keeps its translation; a key
    it lacks is filled with the key itself as a placeholder; keys that exist
    only in ``lang_data`` are dropped.

    Args:
        standard_data: Mapping loaded from the reference locale file.
        lang_data: Mapping loaded from the locale file being synchronised.

    Returns:
        A new ``OrderedDict``; neither argument is modified.
    """
    # One ordered pass over the reference keys replaces the original
    # add-missing / delete-extra / sort-by-index sequence, whose sort key
    # rebuilt and scanned the reference key list for every item (O(n^2)).
    return OrderedDict((key, lang_data.get(key, key)) for key in standard_data)


def main():
    """Rewrite every non-reference ``*.json`` file in ``dir_path`` in place."""
    # Every JSON file next to the reference file is treated as a locale.
    languages = [
        name
        for name in os.listdir(dir_path)
        if name.endswith(".json") and name != standard_file
    ]

    # Join with dir_path so reading agrees with the directory that was listed;
    # the original opened bare file names and only worked from inside dir_path.
    with open(os.path.join(dir_path, standard_file), "r", encoding="utf-8") as f:
        standard_data = json.load(f, object_pairs_hook=OrderedDict)

    for lang_file in languages:
        lang_path = os.path.join(dir_path, lang_file)

        with open(lang_path, "r", encoding="utf-8") as f:
            lang_data = json.load(f, object_pairs_hook=OrderedDict)

        lang_data = align_with_standard(standard_data, lang_data)

        # ensure_ascii=False keeps the non-ASCII keys and values readable.
        with open(lang_path, "w", encoding="utf-8") as f:
            json.dump(lang_data, f, ensure_ascii=False, indent=4)
            f.write("\n")


if __name__ == "__main__":
    main()
|
i18n/ru-RU.json
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"很遗憾您这没有能用的显卡来支持您训练": "К сожалению, у вас нет видеокарты, которая поддерживает тренировку модели.",
|
3 |
+
"是": "Да",
|
4 |
+
"step1:正在处理数据": "Шаг 1: Переработка данных",
|
5 |
+
"step2a:无需提取音高": "Шаг 2а: Пропуск вытаскивания тональности",
|
6 |
+
"step2b:正在提取特征": "Шаг 2б: Вытаскивание черт",
|
7 |
+
"step3a:正在训练模型": "Шаг 3а: Тренировка модели начата",
|
8 |
+
"训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Тренировка завершена. Вы можете проверить логи тренировки в консоли или в файле 'train.log' в папке модели.",
|
9 |
+
"全流程结束!": "Все процессы завершены!",
|
10 |
+
"本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.": "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.",
|
11 |
+
"模型推理": "Обработка модели",
|
12 |
+
"推理音色": "Обработка голоса:",
|
13 |
+
"刷新音色列表和索引路径": "Обновить список голосов и индексов",
|
14 |
+
"卸载音色省显存": "Выгрузить голос для сохранения памяти видеокарты:",
|
15 |
+
"请选择说话人id": "Выбери айди голоса:",
|
16 |
+
"男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Рекомендовано +12 для конвертирования мужского голоса в женский и -12 для конвертирования женского в мужской. Если диапазон голоса слишком велик и голос искажается, значение можно изменить на свой вкус.",
|
17 |
+
"变调(整数, 半音数量, 升八度12降八度-12)": "Высота голоса (число, полутоны, поднять на октаву: 12, понизить на октаву: -12):",
|
18 |
+
"输入待处理音频文件路径(默认是正确格式示例)": "Введите путь к аудиофайлу, который хотите переработать (по умолчанию введён правильный формат):",
|
19 |
+
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "Выберите алгоритм вытаскивания тональности ('pm': быстрое извлечение, но качество речи хуже; 'harvest': басы лучше, но очень медленный; 'crepe': лучшее качество, но сильно использует видеокарту):",
|
20 |
+
">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "Если >=3: применить медианную фильтрацию к вытащенным тональностям. Значение контролирует радиус фильтра и может уменьшить излишнее дыхание.",
|
21 |
+
"特征检索库文件路径,为空则使用下拉的选择结果": "Путь к файлу индекса черт. Оставьте пустым, чтобы использовать выбранный результат из списка:",
|
22 |
+
"自动检测index路径,下拉式选择(dropdown)": "Автоматически найти путь к индексу и выбрать его из списка:",
|
23 |
+
"特征文件路径": "Путь к файлу черт:",
|
24 |
+
"检索特征占比": "Соотношение поиска черт:",
|
25 |
+
"后处理重采样至最终采样率,0为不进行重采样": "Изменить частоту дискретизации в выходном файле на финальную. Поставьте 0, чтобы ничего не изменялось:",
|
26 |
+
"输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Использовать громкость входного файла для замены или перемешивания с громкостью выходного файла. Чем ближе соотношение к 1, тем больше используется звука из выходного файла:",
|
27 |
+
"保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Защитить глухие согласные и звуки дыхания для предотвращения артефактов, например разрывание в электронной музыке. Поставьте на 0.5, чтобы выключить. Уменьшите значение для повышения защиты, но при этом может ухудшиться аккуратность индексирования:",
|
28 |
+
"F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "Файл дуги F0 (не обязательно). Одна тональность на каждую строчку. Заменяет обычный F0 и модуляцию тональности:",
|
29 |
+
"转换": "Конвертировать",
|
30 |
+
"输出信息": "Выходная информация",
|
31 |
+
"输出音频(右下角三个点,点了可以下载)": "Экспортировать аудиофайл (нажми на три точки в правом нижнем углу для загрузки)",
|
32 |
+
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Конвертировать пачкой. Введите путь к папке, в которой находятся файлы для конвертирования или выложите несколько аудиофайлов. Сконвертированные файлы будут сохранены в указанной папке (по умолчанию 'opt').",
|
33 |
+
"指定输出文件夹": "Укажите выходную папку:",
|
34 |
+
"输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Введите путь к папке с аудио для переработки:",
|
35 |
+
"也可批量输入音频文件, 二选一, 优先读文件夹": "Вы также можете выложить аудиофайлы пачкой. Выберите одно из двух. Приоритет отдаётся считыванию из папки.",
|
36 |
+
"导出文件格式": "Формат выходного файла",
|
37 |
+
"伴奏人声分离&去混响&去回声": "Отделение вокала/инструментала и убирание эхо",
|
38 |
+
"人声伴奏分离批量处理, 使用UVR5模型。 <br>合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 <br>模型分为三类: <br>1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点; <br>2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型; <br> 3、去混响、去延迟模型(by FoxJoy):<br> (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;<br> (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。<br>去混响/去延迟,附:<br>1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;<br>2、MDX-Net-Dereverb模型挺慢的;<br>3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Пакетная обработка для разделения вокального сопровождения с использованием модели UVR5.<br>Пример допустимого формата пути к папке: D:\\path\\to\\input\\folder<br> Модель разделена на три категории:<br>1. Сохранить вокал: выберите этот вариант для звука без гармоний. Он сохраняет вокал лучше, чем HP5. Он включает в себя две встроенные модели: HP2 и HP3. HP3 может немного пропускать инструментал, но сохраняет вокал немного лучше, чем HP2.<br>2. Сохранить только основной вокал: выберите этот вариант для звука с гармониями. Это может ослабить основной вокал. Он включает одну встроенную модель: HP5.<br>3. Модели удаления реверберации и задержки (от FoxJoy):<br> (1) MDX-Net: лучший выбор для удаления стереореверберации, но он не может удалить монореверберацию;<br> (234) DeEcho: удаляет эффекты задержки. Агрессивный режим удаляет более тщательно, чем Нормальный режим. DeReverb дополнительно удаляет реверберацию и может удалять монореверберацию, но не очень эффективно для сильно реверберированного высокочастотного контента.<br>Примечания по удалению реверберации/задержки:<br>1. Время обработки для модели DeEcho-DeReverb примерно в два раза больше, чем для двух других моделей DeEcho.<br>2. Модель MDX-Net-Dereverb довольно медленная.<br>3. Рекомендуемая самая чистая конфигурация — сначала применить MDX-Net, а затем DeEcho-Aggressive.",
|
39 |
+
"输入待处理音频文件夹路径": "Введите путь к папке с аудиофайлами для переработки:",
|
40 |
+
"模型": "Модели",
|
41 |
+
"指定输出主人声文件夹": "Введите путь к папке для вокала:",
|
42 |
+
"指定输出非主人声文件夹": "Введите путь к папке для инструментала:",
|
43 |
+
"训练": "Тренировка",
|
44 |
+
"step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Шаг 1: Заполните настройки модели. Данные модели сохранены в папку 'logs' и для каждой модели создаётся отдельная папка. Введите вручную путь к настройкам для модели, в которой находятся логи и тренировочные файлы.",
|
45 |
+
"输入实验名": "Введите название модели:",
|
46 |
+
"目标采样率": "Частота дискретизации модели:",
|
47 |
+
"模型是否带音高指导(唱歌一定要, 语音可以不要)": "Наведение по тональности у модели (обязательно для пения, необязательно для речи):",
|
48 |
+
"版本": "Версия",
|
49 |
+
"提取音高和处理数据使用的CPU进程数": "Число процессов ЦП, используемое для вытаскивания тональностей и обработки данных:",
|
50 |
+
"step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Шаг 2а: Автоматически пройтись по всем аудиофайлам в папке тренировки и нормализировать куски. Создаст 2 папки wav в папке модели. В данный момент поддерживается тренировка только одного голоса.",
|
51 |
+
"输入训练文件夹路径": "Введите путь к папке тренировки:",
|
52 |
+
"请指定说话人id": "Введите айди голоса:",
|
53 |
+
"处理数据": "Переработать данные",
|
54 |
+
"step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Шаг 2б: Вытащить тональности с помощью процессора (если в модели есть тональности), вытащить черты с помощью видеокарты (выберите какой):",
|
55 |
+
"以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Введите, какие(-ую) видеокарты(-у) хотите использовать через '-', например 0-1-2, чтобы использовать видеокарту 0, 1 и 2:",
|
56 |
+
"显卡信息": "Информация о видеокартах",
|
57 |
+
"选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢": "Выберите алгоритм вытаскивания тональности ('pm': быстрое извлечение, но качество речи хуже; 'dio': речь лучше, но извлечение медленнее; 'harvest': лучшее качество, но извлечение ещё медленнее):",
|
58 |
+
"特征提取": "Вытаскивание черт",
|
59 |
+
"step3: 填写训练设置, 开始训练模型和索引": "Шаг 3: Заполните остальные настройки тренировки и начните тренировать модель и индекс",
|
60 |
+
"保存频率save_every_epoch": "Частота сохранения (save_every_epoch):",
|
61 |
+
"总训练轮数total_epoch": "Полное количество эпох (total_epoch):",
|
62 |
+
"每张显卡的batch_size": "Размер пачки для видеокарты:",
|
63 |
+
"是否仅保存最新的ckpt文件以节省硬盘空间": "Сохранять только последний файл '.ckpt', чтобы сохранить место на диске:",
|
64 |
+
"否": "Нет",
|
65 |
+
"是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Кэшировать все тренировочные сеты в видеопамять. Кэширование маленьких датасетов (меньше 10 минут) может ускорить тренировку, но кэширование больших, наоборот, займёт много видеопамяти и не сильно ускорит тренировку:",
|
66 |
+
"是否在每次保存时间点将最终小模型保存至weights文件夹": "Сохранять маленькую финальную модель в папку 'weights' на каждой точке сохранения:",
|
67 |
+
"加载预训练底模G路径": "Путь к натренированной базовой модели G:",
|
68 |
+
"加载预训练底模D路径": "Путь к натренированной базовой модели D:",
|
69 |
+
"训练模型": "Тренировать модель",
|
70 |
+
"训练特征索引": "Тренировать индекс черт",
|
71 |
+
"一键训练": "Тренировка одним нажатием",
|
72 |
+
"ckpt处理": "Обработка ckpt",
|
73 |
+
"模型融合, 可用于测试音色融合": "Слияние моделей, может быть использовано для проверки слияния тембра",
|
74 |
+
"A模型路径": "Путь к модели А:",
|
75 |
+
"B模型路径": "Путь к модели Б:",
|
76 |
+
"A模型权重": "Вес (w) модели А:",
|
77 |
+
"模型是否带音高指导": "Есть ли у модели наведение по тональности (1: да, 0: нет):",
|
78 |
+
"要置入的模型信息": "Информация о модели:",
|
79 |
+
"保存的模型名不带后缀": "Название сохранённой модели (без расширения):",
|
80 |
+
"模型版本型号": "Версия архитектуры модели:",
|
81 |
+
"融合": "Слияние",
|
82 |
+
"修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Модифицировать информацию о модели (поддерживается только для маленьких моделей, взятых из папки 'weights')",
|
83 |
+
"模型路径": "Путь к папке:",
|
84 |
+
"要改的模型信息": "Информация о модели, которую нужно модифицировать:",
|
85 |
+
"保存的文件名, 默认空为和源文件同名": "Название сохранённого файла (по умолчанию такое же, как и входного):",
|
86 |
+
"修改": "Модифицировать",
|
87 |
+
"查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Просмотреть информацию о модели (поддерживается только для маленьких моделей, взятых из папки 'weights')",
|
88 |
+
"查看": "Просмотр",
|
89 |
+
"模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Вытаскивание модели (введите путь к большому файлу модели в папке 'logs'). Полезно, если Вам нужно завершить тренировку и вручную достать и сохранить маленький файл модели, или если Вам нужно проверить незаконченную модель:",
|
90 |
+
"保存名": "Имя сохранённого файла:",
|
91 |
+
"模型是否带音高指导,1是0否": "Есть ли у модели наведение по тональности (1: да, 0: нет):",
|
92 |
+
"提取": "Вытащить",
|
93 |
+
"Onnx导出": "Экспортировать Onnx",
|
94 |
+
"RVC模型路径": "Путь к модели RVC:",
|
95 |
+
"Onnx输出路径": "Путь для экспортированного Onnx:",
|
96 |
+
"导出Onnx模型": "Экспортировать Onnx модель",
|
97 |
+
"常见问题解答": "ЧаВО (Часто задаваемые вопросы)",
|
98 |
+
"招募音高曲线前端编辑器": "Использование фронтенд редакторов для тональных дуг",
|
99 |
+
"加开发群联系我xxxxx": "Присоединитесь к группе разработки и свяжитесь со мной по xxxxx",
|
100 |
+
"点击查看交流、问题反馈群号": "Нажмите, чтобы просмотреть номер группы коммуникации и отзывах о проблемах",
|
101 |
+
"xxxxx": "xxxxx",
|
102 |
+
"加载模型": "Загрузить модель",
|
103 |
+
"Hubert模型": "Модель Hubert",
|
104 |
+
"选择.pth文件": "Выбрать файл .pth",
|
105 |
+
"选择.index文件": "Выбрать файл .index",
|
106 |
+
"选择.npy文件": "Выбрать файл .npy",
|
107 |
+
"输入设备": "Входное устройство",
|
108 |
+
"输出设备": "Выходное устройство",
|
109 |
+
"音频设备(请使用同种类驱动)": "Аудио устройство (пожалуйста, используйте такой же тип драйвера)",
|
110 |
+
"响应阈值": "Порог ответа",
|
111 |
+
"音调设置": "Настройки тональности",
|
112 |
+
"Index Rate": "Темп индекса",
|
113 |
+
"常规设置": "Основные настройки",
|
114 |
+
"采样长度": "Длина сэмпла",
|
115 |
+
"淡入淡出长度": "Длина затухания",
|
116 |
+
"额外推理时长": "Доп. время переработки",
|
117 |
+
"输入降噪": "Уменьшение шума во входной информации",
|
118 |
+
"输出降噪": "Уменьшение шума в выходной информации",
|
119 |
+
"性能设置": "Настройки быстроты",
|
120 |
+
"开始音频转换": "Начать конвертацию аудио",
|
121 |
+
"停止音频转换": "Закончить конвертацию аудио",
|
122 |
+
"推理时间(ms):": "Время переработки (мс):",
|
123 |
+
"请选择pth文件": "请选择pth文件",
|
124 |
+
"请选择index文件": "请选择index文件",
|
125 |
+
"hubert模型路径不可包含中文": "hubert模型路径不可包含中文",
|
126 |
+
"pth文件路径不可包含中文": "pth文件路径不可包含中文",
|
127 |
+
"index文件路径不可包含中文": "index文件路径不可包含中文",
|
128 |
+
"音高算法": "音高算法",
|
129 |
+
"harvest进程数": "harvest进程数"
|
130 |
+
}
|
i18n/tr_TR.json
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"很遗憾您这没有能用的显卡来支持您训练": "Maalesef, eğitiminizi desteklemek için uyumlu bir GPU bulunmamaktadır.",
|
3 |
+
"是": "Evet",
|
4 |
+
"step1:正在处理数据": "Adım 1: Veri işleme",
|
5 |
+
"step2a:无需提取音高": "Adım 2a: Pitch çıkartma adımını atlama",
|
6 |
+
"step2b:正在提取特征": "Adım 2b: Özelliklerin çıkarılması",
|
7 |
+
"step3a:正在训练模型": "Adım 3a: Model eğitimi başladı",
|
8 |
+
"训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Eğitim tamamlandı. Eğitim günlüklerini konsolda veya deney klasörü altındaki train.log dosyasında kontrol edebilirsiniz.",
|
9 |
+
"全流程结束!": "Tüm işlemler tamamlandı!",
|
10 |
+
"本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.": "Bu yazılım, MIT lisansı altında açık kaynaklıdır. Yazarın yazılım üzerinde herhangi bir kontrolü yoktur. Yazılımı kullanan ve yazılım tarafından dışa aktarılan sesleri dağıtan kullanıcılar sorumludur. <br>Eğer bu maddeyle aynı fikirde değilseniz, yazılım paketi içindeki herhangi bir kod veya dosyayı kullanamaz veya referans göremezsiniz. Detaylar için kök dizindeki <b>Agreement-LICENSE.txt</b> dosyasına bakınız.",
|
11 |
+
"模型推理": "Model çıkartma (Inference)",
|
12 |
+
"推理音色": "Ses çıkartma (Inference):",
|
13 |
+
"刷新音色列表和索引路径": "Ses listesini ve indeks yolunu yenile",
|
14 |
+
"卸载音色省显存": "GPU bellek kullanımını azaltmak için sesi kaldır",
|
15 |
+
"请选择说话人id": "Konuşmacı/Şarkıcı No seçin:",
|
16 |
+
"男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Erkekten kadına çevirmek için +12 tuş önerilir, kadından erkeğe çevirmek için ise -12 tuş önerilir. Eğer ses aralığı çok fazla genişler ve ses bozulursa, isteğe bağlı olarak uygun aralığa kendiniz de ayarlayabilirsiniz.",
|
17 |
+
"变调(整数, 半音数量, 升八度12降八度-12)": "Transpoze et (tamsayı, yarıton sayısıyla; bir oktav yükseltmek için: 12, bir oktav düşürmek için: -12):",
|
18 |
+
"输入待处理音频文件路径(默认是正确格式示例)": "İşlenecek ses dosyasının yolunu girin (varsayılan doğru format örneğidir):",
|
19 |
+
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "Pitch algoritmasını seçin ('pm': daha hızlı çıkarır ancak daha düşük kaliteli konuşma; 'harvest': daha iyi konuşma sesi ancak son derece yavaş; 'crepe': daha da iyi kalite ancak GPU yoğunluğu gerektirir):",
|
20 |
+
">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "Eğer >=3 ise, elde edilen pitch sonuçlarına median filtreleme uygula. Bu değer, filtre yarıçapını temsil eder ve nefesliliği azaltabilir.",
|
21 |
+
"特征检索库文件路径,为空则使用下拉的选择结果": "Özellik indeksi dosyasının yolunu belirtin. Seçilen sonucu kullanmak için boş bırakın veya açılır menüden seçim yapın.",
|
22 |
+
"自动检测index路径,下拉式选择(dropdown)": "İndeks yolunu otomatik olarak tespit et ve açılır menüden seçim yap.",
|
23 |
+
"特征文件路径": "Özellik dosyasının yolu:",
|
24 |
+
"检索特征占比": "Arama özelliği oranı (vurgu gücünü kontrol eder, çok yüksek olması sanal etkilere neden olur)",
|
25 |
+
"后处理重采样至最终采样率,0为不进行重采样": "Son işleme aşamasında çıktı sesini son örnekleme hızına yeniden örnekle. 0 değeri için yeniden örnekleme yapılmaz:",
|
26 |
+
"输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Sesin hacim zarfını ayarlayın. 0'a yakın değerler, sesin orijinal vokallerin hacmine benzer olmasını sağlar. Düşük bir değerle ses gürültüsünü maskeleyebilir ve hacmi daha doğal bir şekilde duyulabilir hale getirebilirsiniz. 1'e yaklaştıkça sürekli bir yüksek ses seviyesi elde edilir:",
|
27 |
+
"保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Sessiz ünsüzleri ve nefes seslerini koruyarak elektronik müzikte yırtılma gibi sanal hataların oluşmasını engeller. 0.5 olarak ayarlandığında devre dışı kalır. Değerin azaltılması korumayı artırabilir, ancak indeksleme doğruluğunu azaltabilir:",
|
28 |
+
"F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0 eğrisi dosyası (isteğe bağlı). Her satırda bir pitch değeri bulunur. Varsayılan F0 ve pitch modülasyonunu değiştirir:",
|
29 |
+
"转换": "Dönüştür",
|
30 |
+
"输出信息": "Çıkış bilgisi",
|
31 |
+
"输出音频(右下角三个点,点了可以下载)": "Ses dosyasını dışa aktar (indirmek için sağ alt köşedeki üç noktaya tıklayın)",
|
32 |
+
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Toplu dönüştür. Dönüştürülecek ses dosyalarının bulunduğu klasörü girin veya birden çok ses dosyasını yükleyin. Dönüştürülen ses dosyaları belirtilen klasöre ('opt' varsayılan olarak) dönüştürülecektir",
|
33 |
+
"指定输出文件夹": "Çıkış klasörünü belirt:",
|
34 |
+
"输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "İşlenecek ses klasörünün yolunu girin (dosya yöneticisinin adres çubuğundan kopyalayın):",
|
35 |
+
"也可批量输入音频文件, 二选一, 优先读文件夹": "Toplu olarak ses dosyalarını da girebilirsiniz. İki seçenekten birini seçin. Öncelik klasörden okumaya verilir.",
|
36 |
+
"导出文件格式": "Dışa aktarma dosya formatı",
|
37 |
+
"伴奏人声分离&去混响&去回声": "Vokal/Müzik Ayrıştırma ve Yankı Giderme",
|
38 |
+
"人声伴奏分离批量处理, 使用UVR5模型。 <br>合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 <br>模型分为三类: <br>1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点; <br>2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型; <br> 3、去混响、去延迟模型(by FoxJoy):<br> (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;<br> (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。<br>去混响/去延迟,附:<br>1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;<br>2、MDX-Net-Dereverb模型挺慢的;<br>3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Batch işleme kullanarak vokal eşlik ayrımı için UVR5 modeli kullanılır.<br>Geçerli bir klasör yol formatı örneği: D:\\path\\to\\input\\folder (dosya yöneticisi adres çubuğundan kopyalanır).<br>Model üç kategoriye ayrılır:<br>1. Vokalleri koru: Bu seçeneği, harmoni içermeyen sesler için kullanın. HP5'ten daha iyi bir şekilde vokalleri korur. İki dahili model içerir: HP2 ve HP3. HP3, eşlik sesini hafifçe sızdırabilir, ancak vokalleri HP2'den biraz daha iyi korur.<br>2. Sadece ana vokalleri koru: Bu seçeneği, harmoni içeren sesler için kullanın. Ana vokalleri zayıflatabilir. Bir dahili model içerir: HP5.<br>3. Reverb ve gecikme modelleri (FoxJoy tarafından):<br> (1) MDX-Net: Stereo reverb'i kaldırmak için en iyi seçenek, ancak mono reverb'i kaldıramaz;<br> (234) DeEcho: Gecikme efektlerini kaldırır. Agresif mod, Normal moda göre daha kapsamlı bir şekilde kaldırma yapar. DeReverb ayrıca reverb'i kaldırır ve mono reverb'i kaldırabilir, ancak yoğun yankılı yüksek frekanslı içerikler için çok etkili değildir.<br>Reverb/gecikme notları:<br>1. DeEcho-DeReverb modelinin işleme süresi diğer iki DeEcho modeline göre yaklaşık olarak iki kat daha uzundur.<br>2. MDX-Net-Dereverb modeli oldukça yavaştır.<br>3. Tavsiye edilen en temiz yapılandırma önce MDX-Net'i uygulamak ve ardından DeEcho-Aggressive uygulamaktır.",
|
39 |
+
"输入待处理音频文件夹路径": "İşlenecek ses klasörünün yolunu girin:",
|
40 |
+
"模型": "Model",
|
41 |
+
"指定输出主人声文件夹": "Vokal için çıkış klasörünü belirtin:",
|
42 |
+
"指定输出非主人声文件夹": "Müzik ve diğer sesler için çıkış klasörünü belirtin:",
|
43 |
+
"训练": "Eğitim",
|
44 |
+
"step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Adım 1: Deneysel yapılandırmayı doldurun. Deneysel veriler 'logs' klasöründe saklanır ve her bir deney için ayrı bir klasör vardır. Deneysel adı yolu manuel olarak girin; bu yol, deneysel yapılandırmayı, günlükleri ve eğitilmiş model dosyalarını içerir.",
|
45 |
+
"输入实验名": "Deneysel adı girin:",
|
46 |
+
"目标采样率": "Hedef örnekleme oranı:",
|
47 |
+
"模型是否带音高指导(唱歌一定要, 语音可以不要)": "Modelin ses yüksekliği (Pitch) rehberliği içerip içermediği (şarkı söyleme için şarttır, konuşma için isteğe bağlıdır):",
|
48 |
+
"版本": "Sürüm",
|
49 |
+
"提取音高和处理数据使用的CPU进程数": "Ses yüksekliği çıkartmak (Pitch) ve verileri işlemek için kullanılacak CPU işlemci sayısı:",
|
50 |
+
"step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Adım 2a: Eğitim klasöründe ses dosyalarını otomatik olarak gezinerek dilimleme normalizasyonu yapın. Deney dizini içinde 2 wav klasörü oluşturur. Şu anda sadece tek kişilik eğitim desteklenmektedir.",
|
51 |
+
"输入训练文件夹路径": "Eğitim klasörünün yolunu girin:",
|
52 |
+
"请指定说话人id": "Lütfen konuşmacı/sanatçı no belirtin:",
|
53 |
+
"处理数据": "Verileri işle",
|
54 |
+
"step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Adım 2b: Ses yüksekliği (Pitch) çıkartmak için CPU kullanın (eğer model ses yüksekliği içeriyorsa), özellikleri çıkartmak için GPU kullanın (GPU indeksini seçin):",
|
55 |
+
"以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "GPU indekslerini '-' ile ayırarak girin, örneğin 0-1-2, GPU 0, 1 ve 2'yi kullanmak için:",
|
56 |
+
"显卡信息": "GPU Bilgisi",
|
57 |
+
"选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢": "Ses yüksekliği (Pitch) çıkartma algoritmasını seçin ('pm': daha hızlı çıkartma, ancak düşük kaliteli konuşma; 'dio': geliştirilmiş konuşma kalitesi, ancak daha yavaş çıkartma; 'harvest': daha iyi kalite, ancak daha da yavaş çıkartma):",
|
58 |
+
"特征提取": "Özellik çıkartma",
|
59 |
+
"step3: 填写训练设置, 开始训练模型和索引": "Adım 3: Eğitim ayarlarını doldurun ve modeli ve dizini eğitmeye başlayın",
|
60 |
+
"保存频率save_every_epoch": "Kaydetme sıklığı (save_every_epoch):",
|
61 |
+
"总训练轮数total_epoch": "Toplam eğitim turu (total_epoch):",
|
62 |
+
"每张显卡的batch_size": "Her GPU için yığın boyutu (batch_size):",
|
63 |
+
"是否仅保存最新的ckpt文件以节省硬盘空间": "Sadece en son '.ckpt' dosyasını kaydet:",
|
64 |
+
"否": "Hayır",
|
65 |
+
"是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Tüm eğitim verilerini GPU belleğine önbelleğe alıp almayacağınızı belirtin. Küçük veri setlerini (10 dakikadan az) önbelleğe almak eğitimi hızlandırabilir, ancak büyük veri setlerini önbelleğe almak çok fazla GPU belleği tüketir ve çok fazla hız artışı sağlamaz:",
|
66 |
+
"是否在每次保存时间点将最终小模型保存至weights文件夹": "Her kaydetme noktasında son küçük bir modeli 'weights' klasörüne kaydetmek için:",
|
67 |
+
"加载预训练底模G路径": "Önceden eğitilmiş temel G modelini yükleme yolu:",
|
68 |
+
"加载预训练底模D路径": "Önceden eğitilmiş temel D modelini yükleme yolu:",
|
69 |
+
"训练模型": "Modeli Eğit",
|
70 |
+
"训练特征索引": "Özellik Dizinini Eğit",
|
71 |
+
"一键训练": "Tek Tuşla Eğit",
|
72 |
+
"ckpt处理": "ckpt İşleme",
|
73 |
+
"模型融合, 可用于测试音色融合": "Model birleştirme, ses rengi birleştirmesi için kullanılabilir",
|
74 |
+
"A模型路径": "A Modeli Yolu:",
|
75 |
+
"B模型路径": "B Modeli Yolu:",
|
76 |
+
"A模型权重": "A Modeli Ağırlığı:",
|
77 |
+
"模型是否带音高指导": "Modelin ses yüksekliği rehberi içerip içermediği:",
|
78 |
+
"要置入的模型信息": "Eklemek için model bilgileri:",
|
79 |
+
"保存的模型名不带后缀": "Kaydedilecek model adı (uzantı olmadan):",
|
80 |
+
"模型版本型号": "Model mimari versiyonu:",
|
81 |
+
"融合": "Birleştir",
|
82 |
+
"修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Model bilgilerini düzenle (sadece 'weights' klasöründen çıkarılan küçük model dosyaları desteklenir)",
|
83 |
+
"模型路径": "Model Yolu:",
|
84 |
+
"要改的模型信息": "Düzenlenecek model bilgileri:",
|
85 |
+
"保存的文件名, 默认空为和源文件同名": "Kaydedilecek dosya adı (varsayılan: kaynak dosya ile aynı):",
|
86 |
+
"修改": "Düzenle",
|
87 |
+
"查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Model bilgilerini görüntüle (sadece 'weights' klasöründen çıkarılan küçük model dosyaları desteklenir)",
|
88 |
+
"查看": "Görüntüle",
|
89 |
+
"模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Model çıkartma (büyük dosya modeli yolunu 'logs' klasöründe girin). Bu, eğitimi yarıda bırakmak istediğinizde ve manuel olarak küçük bir model dosyası çıkartmak ve kaydetmek istediğinizde veya bir ara modeli test etmek istediğinizde kullanışlıdır:",
|
90 |
+
"保存名": "Kaydetme Adı:",
|
91 |
+
"模型是否带音高指导,1是0否": "Modelin ses yüksekliği rehberi içerip içermediği (1: evet, 0: hayır):",
|
92 |
+
"提取": "Çıkart",
|
93 |
+
"Onnx导出": "Onnx Dışa Aktar",
|
94 |
+
"RVC模型路径": "RVC Model Yolu:",
|
95 |
+
"Onnx输出路径": "Onnx Dışa Aktarım Yolu:",
|
96 |
+
"导出Onnx模型": "Onnx Modeli Dışa Aktar",
|
97 |
+
"常见问题解答": "Sıkça Sorulan Sorular (SSS)",
|
98 |
+
"招募音高曲线前端编辑器": "Ses yükseklik eğrisi ön uç düzenleyicisi için işe alım",
|
99 |
+
"加开发群联系我xxxxx": "Geliştirme grubuna katılın ve benimle iletişime geçin: xxxxx",
|
100 |
+
"点击查看交流、问题反馈群号": "İletişim ve sorun geri bildirim grup numarasını görüntülemek için tıklayın",
|
101 |
+
"xxxxx": "xxxxx",
|
102 |
+
"加载模型": "Model yükle",
|
103 |
+
"Hubert模型": "Hubert Modeli",
|
104 |
+
"选择.pth文件": ".pth dosyası seç",
|
105 |
+
"选择.index文件": ".index dosyası seç",
|
106 |
+
"选择.npy文件": ".npy dosyası seç",
|
107 |
+
"输入设备": "Giriş cihazı",
|
108 |
+
"输出设备": "Çıkış cihazı",
|
109 |
+
"音频设备(请使用同种类驱动)": "Ses cihazı (aynı tür sürücüyü kullanın)",
|
110 |
+
"响应阈值": "Tepki eşiği",
|
111 |
+
"音调设置": "Pitch ayarları",
|
112 |
+
"Index Rate": "Index Oranı",
|
113 |
+
"常规设置": "Genel ayarlar",
|
114 |
+
"采样长度": "Örnekleme uzunluğu",
|
115 |
+
"淡入淡出长度": "Geçiş (Fade) uzunluğu",
|
116 |
+
"额外推理时长": "Ekstra çıkartma süresi",
|
117 |
+
"输入降噪": "Giriş gürültü azaltma",
|
118 |
+
"输出降噪": "Çıkış gürültü azaltma",
|
119 |
+
"性能设置": "Performans ayarları",
|
120 |
+
"开始音频转换": "Ses dönüştürmeyi başlat",
|
121 |
+
"停止音频转换": "Ses dönüştürmeyi durdur",
|
122 |
+
"推理时间(ms):": "Çıkarsama süresi (ms):",
|
123 |
+
"请选择pth文件": "Lütfen .pth dosyası seçin",
|
124 |
+
"请选择index文件": "Lütfen .index dosyası seçin",
|
125 |
+
"hubert模型路径不可包含中文": "hubert modeli yolu Çince karakter içeremez",
|
126 |
+
"pth文件路径不可包含中文": ".pth dosya yolu Çince karakter içeremez",
|
127 |
+
"index文件路径不可包含中文": ".index dosya yolu Çince karakter içeremez",
|
128 |
+
"音高算法": "音高算法",
|
129 |
+
"harvest进程数": "harvest进程数"
|
130 |
+
}
|
i18n/zh_CN.json
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练",
|
3 |
+
"是": "是",
|
4 |
+
"step1:正在处理数据": "step1:正在处理数据",
|
5 |
+
"step2a:无需提取音高": "step2a:无需提取音高",
|
6 |
+
"step2b:正在提取特征": "step2b:正在提取特征",
|
7 |
+
"step3a:正在训练模型": "step3a:正在训练模型",
|
8 |
+
"训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log",
|
9 |
+
"全流程结束!": "全流程结束!",
|
10 |
+
"本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.": "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.",
|
11 |
+
"模型推理": "模型推理",
|
12 |
+
"推理音色": "推理音色",
|
13 |
+
"刷新音色列表和索引路径": "刷新音色列表和索引路径",
|
14 |
+
"卸载音色省显存": "卸载音色省显存",
|
15 |
+
"请选择说话人id": "请选择说话人id",
|
16 |
+
"男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ",
|
17 |
+
"变调(整数, 半音数量, 升八度12降八度-12)": "变调(整数, 半音数量, 升八度12降八度-12)",
|
18 |
+
"输入待处理音频文件路径(默认是正确格式示例)": "输入待处理音频文件路径(默认是正确格式示例)",
|
19 |
+
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU",
|
20 |
+
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
|
21 |
+
"特征检索库文件路径": "特征检索库文件路径",
|
22 |
+
"特征检索库文件路径,为空则使用下拉的选择结果": "特征检索库文件路径,为空则使用下拉的选择结果",
|
23 |
+
">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音",
|
24 |
+
"自动检测index路径,下拉式选择(dropdown)": "自动检测index路径,下拉式选择(dropdown)",
|
25 |
+
"特征文件路径": "特征文件路径",
|
26 |
+
"检索特征占比": "检索特征占比",
|
27 |
+
"后处理重采样至最终采样率,0为不进行重采样": "后处理重采样至最终采样率,0为不进行重采样",
|
28 |
+
"输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络",
|
29 |
+
"保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果",
|
30 |
+
"F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调",
|
31 |
+
"转换": "转换",
|
32 |
+
"输出信息": "输出信息",
|
33 |
+
"输出音频(右下角三个点,点了可以下载)": "输出音频(右下角三个点,点了可以下载)",
|
34 |
+
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ",
|
35 |
+
"指定输出文件夹": "指定输出文件夹",
|
36 |
+
"输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)",
|
37 |
+
"也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹",
|
38 |
+
"导出文件格式": "导出文件格式",
|
39 |
+
"伴奏人声分离&去混响&去回声": "伴奏人声分离&去混响&去回声",
|
40 |
+
"人声伴奏分离批量处理, 使用UVR5模型。 <br>合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 <br>模型分为三类: <br>1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点; <br>2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型; <br> 3、去混响、去延迟模型(by FoxJoy):<br> (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;<br> (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。<br>去混响/去延迟,附:<br>1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;<br>2、MDX-Net-Dereverb模型挺慢的;<br>3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "人声伴奏分离批量处理, 使用UVR5模型。 <br>合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 <br>模型分为三类: <br>1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点; <br>2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型; <br> 3、去混响、去延迟模型(by FoxJoy):<br> (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;<br> (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。<br>去混响/去延迟,附:<br>1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;<br>2、MDX-Net-Dereverb模型挺慢的;<br>3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。",
|
41 |
+
"输入待处理音频文件夹路径": "输入待处理音频文件夹路径",
|
42 |
+
"模型": "模型",
|
43 |
+
"指定输出主人声文件夹": "指定输出主人声文件夹",
|
44 |
+
"指定输出非主人声文件夹": "指定输出非主人声文件夹",
|
45 |
+
"训练": "训练",
|
46 |
+
"step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ",
|
47 |
+
"输入实验名": "输入实验名",
|
48 |
+
"目标采样率": "目标采样率",
|
49 |
+
"模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否带音高指导(唱歌一定要, 语音可以不要)",
|
50 |
+
"版本": "版本",
|
51 |
+
"提取音高和处理数据使用的CPU进程数": "提取音高和处理数据使用的CPU进程数",
|
52 |
+
"step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ",
|
53 |
+
"输入训练文件夹路径": "输入训练文件夹路径",
|
54 |
+
"请指定说话人id": "请指定说话人id",
|
55 |
+
"处理数据": "处理数据",
|
56 |
+
"step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)",
|
57 |
+
"以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2",
|
58 |
+
"显卡信息": "显卡信息",
|
59 |
+
"选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢",
|
60 |
+
"特征提取": "特征提取",
|
61 |
+
"step3: 填写训练设置, 开始训练模型和索引": "step3: 填写训练设置, 开始训练模型和索引",
|
62 |
+
"保存频率save_every_epoch": "保存频率save_every_epoch",
|
63 |
+
"总训练轮数total_epoch": "总训练轮数total_epoch",
|
64 |
+
"每张显卡的batch_size": "每张显卡的batch_size",
|
65 |
+
"是否仅保存最新的ckpt文件以节省硬盘空间": "是否仅保存最新的ckpt文件以节省硬盘空间",
|
66 |
+
"否": "否",
|
67 |
+
"是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速",
|
68 |
+
"是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存时间点将最终小模型保存至weights文件夹",
|
69 |
+
"加载预训练底模G路径": "加载预训练底模G路径",
|
70 |
+
"加载预训练底模D路径": "加载预训练底模D路径",
|
71 |
+
"训练模型": "训练模型",
|
72 |
+
"训练特征索引": "训练特征索引",
|
73 |
+
"一键训练": "一键训练",
|
74 |
+
"ckpt处理": "ckpt处理",
|
75 |
+
"模型融合, 可用于测试音色融合": "模型融合, 可用于测试音色融合",
|
76 |
+
"A模型路径": "A模型路径",
|
77 |
+
"B模型路径": "B模型路径",
|
78 |
+
"A模型权重": "A模型权重",
|
79 |
+
"模型是否带音高指导": "模型是否带音高指导",
|
80 |
+
"要置入的模型信息": "要置入的模型信息",
|
81 |
+
"保存的模型名不带后缀": "保存的模型名不带后缀",
|
82 |
+
"模型版本型号": "模型版本型号",
|
83 |
+
"融合": "融合",
|
84 |
+
"修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型信息(仅支持weights文件夹下提取的小模型文件)",
|
85 |
+
"模型路径": "模型路径",
|
86 |
+
"要改的模型信息": "要改的模型信息",
|
87 |
+
"保存的文件名, 默认空为和源文件同名": "保存的文件名, 默认空为和源文件同名",
|
88 |
+
"修改": "修改",
|
89 |
+
"查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型信息(仅支持weights文件夹下提取的小模型文件)",
|
90 |
+
"查看": "查看",
|
91 |
+
"模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况",
|
92 |
+
"保存名": "保存名",
|
93 |
+
"模型是否带音高指导,1是0否": "模型是否带音高指导,1是0否",
|
94 |
+
"提取": "提取",
|
95 |
+
"Onnx导出": "Onnx导出",
|
96 |
+
"RVC模型路径": "RVC模型路径",
|
97 |
+
"Onnx输出路径": "Onnx输出路径",
|
98 |
+
"导出Onnx模型": "导出Onnx模型",
|
99 |
+
"常见问题解答": "常见问题解答",
|
100 |
+
"招募音高曲线前端编辑器": "招募音高曲线前端编辑器",
|
101 |
+
"加开发群联系我xxxxx": "加开发群联系我xxxxx",
|
102 |
+
"点击查看交流、问题反馈群号": "点击查看交流、问题反馈群号",
|
103 |
+
"xxxxx": "xxxxx",
|
104 |
+
"加载模型": "加载模型",
|
105 |
+
"Hubert模型": "Hubert模型",
|
106 |
+
"选择.pth文件": "选择.pth文件",
|
107 |
+
"选择.index文件": "选择.index文件",
|
108 |
+
"选择.npy文件": "选择.npy文件",
|
109 |
+
"输入设备": "输入设备",
|
110 |
+
"输出设备": "输出设备",
|
111 |
+
"音频设备(请使用同种类驱动)": "音频设备(请使用同种类驱动)",
|
112 |
+
"响应阈值": "响应阈值",
|
113 |
+
"音调设置": "音调设置",
|
114 |
+
"Index Rate": "Index Rate",
|
115 |
+
"常规设置": "常规设置",
|
116 |
+
"采样长度": "采样长度",
|
117 |
+
"淡入淡出长度": "淡入淡出长度",
|
118 |
+
"额外推理时长": "额外推理时长",
|
119 |
+
"输入降噪": "输入降噪",
|
120 |
+
"输出降噪": "输出降噪",
|
121 |
+
"性能设置": "性能设置",
|
122 |
+
"开始音频转换": "开始音频转换",
|
123 |
+
"停止音频转换": "停止音频转换",
|
124 |
+
"推理时间(ms):": "推理时间(ms):",
|
125 |
+
"请选择pth文件": "请选择pth文件",
|
126 |
+
"请选择index文件": "请选择index文件",
|
127 |
+
"hubert模型路径不可包含中文": "hubert模型路径不可包含中文",
|
128 |
+
"pth文件路径不可包含中文": "pth文件路径不可包含中文",
|
129 |
+
"index文件路径不可包含中文": "index文件路径不可包含中文",
|
130 |
+
"音高算法": "音高算法",
|
131 |
+
"harvest进程数": "harvest进程数"
|
132 |
+
}
|
i18n/zh_HK.json
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练",
|
3 |
+
"是": "是",
|
4 |
+
"step1:正在处理数据": "step1:正在处理数据",
|
5 |
+
"step2a:无需提取音高": "step2a:无需提取音高",
|
6 |
+
"step2b:正在提取特征": "step2b:正在提取特征",
|
7 |
+
"step3a:正在训练模型": "step3a:正在训练模型",
|
8 |
+
"训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log",
|
9 |
+
"全流程结束!": "全流程结束!",
|
10 |
+
"本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。<br>如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄<b>使用需遵守的協議-LICENSE.txt</b>。",
|
11 |
+
"模型推理": "模型推理",
|
12 |
+
"推理音色": "推理音色",
|
13 |
+
"刷新音色列表和索引路径": "刷新音色列表和索引路徑",
|
14 |
+
"卸载音色省显存": "卸載音色節省 VRAM",
|
15 |
+
"请选择说话人id": "請選擇說話人ID",
|
16 |
+
"男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。",
|
17 |
+
"变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
|
18 |
+
"输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)",
|
19 |
+
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU",
|
20 |
+
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
|
21 |
+
"特征检索库文件路径": "特徵檢索庫檔案路徑",
|
22 |
+
"自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)",
|
23 |
+
">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波,數值為濾波半徑,使用可以削弱啞音",
|
24 |
+
"特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果",
|
25 |
+
"特征文件路径": "特徵檔案路徑",
|
26 |
+
"检索特征占比": "檢索特徵佔比",
|
27 |
+
"后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣",
|
28 |
+
"输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡",
|
29 |
+
"保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲,防止電音撕裂等artifact,拉滿0.5不開啟,調低加大保護力度但可能降低索引效果",
|
30 |
+
"F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調",
|
31 |
+
"转换": "轉換",
|
32 |
+
"输出信息": "輸出訊息",
|
33 |
+
"输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)",
|
34 |
+
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。",
|
35 |
+
"指定输出文件夹": "指定輸出資料夾",
|
36 |
+
"输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)",
|
37 |
+
"也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量輸入音頻檔案,二選一,優先讀資料夾",
|
38 |
+
"导出文件格式": "導出檔格式",
|
39 |
+
"伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲",
|
40 |
+
"人声伴奏分离批量处理, 使用UVR5模型。 <br>合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 <br>模型分为三类: <br>1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点; <br>2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型; <br> 3、去混响、去延迟模型(by FoxJoy):<br> (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;<br> (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。<br>去混响/去延迟,附:<br>1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;<br>2、MDX-Net-Dereverb模型挺慢的;<br>3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。<br>有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。<br>模型分為三類:<br>1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;<br>2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。<br>3. 消除混響和延遲模型(由FoxJoy提供):<br> (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;<br> (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。<br>消除混響/延遲注意事項:<br>1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;<br>2. MDX-Net-Dereverb模型相當慢;<br>3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。",
|
41 |
+
"输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑",
|
42 |
+
"模型": "模型",
|
43 |
+
"指定输出主人声文件夹": "指定输出主人声文件夹",
|
44 |
+
"指定输出非主人声文件夹": "指定输出非主人声文件夹",
|
45 |
+
"训练": "訓練",
|
46 |
+
"step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。",
|
47 |
+
"输入实验名": "輸入實驗名稱",
|
48 |
+
"目标采样率": "目標取樣率",
|
49 |
+
"模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)",
|
50 |
+
"版本": "版本",
|
51 |
+
"提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數",
|
52 |
+
"step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。",
|
53 |
+
"输入训练文件夹路径": "輸入訓練檔案夾路徑",
|
54 |
+
"请指定说话人id": "請指定說話人id",
|
55 |
+
"处理数据": "處理資料",
|
56 |
+
"step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)",
|
57 |
+
"以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2",
|
58 |
+
"显卡信息": "顯示卡資訊",
|
59 |
+
"选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢": "選擇音高提取算法:輸入歌聲可用pm提速,高品質語音但CPU差可用dio提速,harvest品質更好但較慢",
|
60 |
+
"特征提取": "特徵提取",
|
61 |
+
"step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引",
|
62 |
+
"保存频率save_every_epoch": "保存頻率save_every_epoch",
|
63 |
+
"总训练轮数total_epoch": "總訓練輪數total_epoch",
|
64 |
+
"每张显卡的batch_size": "每张显卡的batch_size",
|
65 |
+
"是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間",
|
66 |
+
"否": "否",
|
67 |
+
"是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練,大數據緩存會爆 VRAM 也加不了多少速度",
|
68 |
+
"是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾",
|
69 |
+
"加载预训练底模G路径": "加載預訓練底模G路徑",
|
70 |
+
"加载预训练底模D路径": "加載預訓練底模D路徑",
|
71 |
+
"训练模型": "訓練模型",
|
72 |
+
"训练特征索引": "訓練特徵索引",
|
73 |
+
"一键训练": "一鍵訓練",
|
74 |
+
"ckpt处理": "ckpt處理",
|
75 |
+
"模型融合, 可用于测试音色融合": "模型融合,可用於測試音色融合",
|
76 |
+
"A模型路径": "A模型路徑",
|
77 |
+
"B模型路径": "B模型路徑",
|
78 |
+
"A模型权重": "A模型權重",
|
79 |
+
"模型是否带音高指导": "模型是否帶音高指導",
|
80 |
+
"要置入的模型信息": "要置入的模型資訊",
|
81 |
+
"保存的模型名不带后缀": "儲存的模型名不帶副檔名",
|
82 |
+
"模型版本型号": "模型版本型號",
|
83 |
+
"融合": "融合",
|
84 |
+
"修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)",
|
85 |
+
"模型路径": "模型路徑",
|
86 |
+
"要改的模型信息": "要改的模型資訊",
|
87 |
+
"保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名",
|
88 |
+
"修改": "修改",
|
89 |
+
"查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)",
|
90 |
+
"查看": "查看",
|
91 |
+
"模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑),適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型,或者想測試中間模型的情況",
|
92 |
+
"保存名": "儲存名",
|
93 |
+
"模型是否带音高指导,1是0否": "模型是否帶音高指導,1是0否",
|
94 |
+
"提取": "提取",
|
95 |
+
"Onnx导出": "Onnx导出",
|
96 |
+
"RVC模型路径": "RVC模型路径",
|
97 |
+
"Onnx输出路径": "Onnx输出路径",
|
98 |
+
"导出Onnx模型": "导出Onnx模型",
|
99 |
+
"常见问题解答": "常見問題解答",
|
100 |
+
"招募音高曲线前端编辑器": "招募音高曲線前端編輯器",
|
101 |
+
"加开发群联系我xxxxx": "加開發群聯繫我xxxxx",
|
102 |
+
"点击查看交流、问题反馈群号": "點擊查看交流、問題反饋群號",
|
103 |
+
"xxxxx": "xxxxx",
|
104 |
+
"加载模型": "載入模型",
|
105 |
+
"Hubert模型": "Hubert 模型",
|
106 |
+
"选择.pth文件": "選擇 .pth 檔案",
|
107 |
+
"选择.index文件": "選擇 .index 檔案",
|
108 |
+
"选择.npy文件": "選擇 .npy 檔案",
|
109 |
+
"输入设备": "輸入設備",
|
110 |
+
"输出设备": "輸出設備",
|
111 |
+
"音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)",
|
112 |
+
"响应阈值": "響應閾值",
|
113 |
+
"音调设置": "音調設定",
|
114 |
+
"Index Rate": "Index Rate",
|
115 |
+
"常规设置": "一般設定",
|
116 |
+
"采样长度": "取樣長度",
|
117 |
+
"淡入淡出长度": "淡入淡出長度",
|
118 |
+
"额外推理时长": "額外推理時長",
|
119 |
+
"输入降噪": "輸入降噪",
|
120 |
+
"输出降噪": "輸出降噪",
|
121 |
+
"性能设置": "效能設定",
|
122 |
+
"开始音频转换": "開始音訊轉換",
|
123 |
+
"停止音频转换": "停止音訊轉換",
|
124 |
+
"推理时间(ms):": "推理時間(ms):",
|
125 |
+
"请选择pth文件": "请选择pth文件",
|
126 |
+
"请选择index文件": "请选择index文件",
|
127 |
+
"hubert模型路径不可包含中文": "hubert模型路径不可包含中文",
|
128 |
+
"pth文件路径不可包含中文": "pth文件路径不可包含中文",
|
129 |
+
"index文件路径不可包含中文": "index文件路径不可包含中文",
|
130 |
+
"音高算法": "音高算法",
|
131 |
+
"harvest进程数": "harvest进程数"
|
132 |
+
}
|
i18n/zh_SG.json
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练",
|
3 |
+
"是": "是",
|
4 |
+
"step1:正在处理数据": "step1:正在处理数据",
|
5 |
+
"step2a:无需提取音高": "step2a:无需提取音高",
|
6 |
+
"step2b:正在提取特征": "step2b:正在提取特征",
|
7 |
+
"step3a:正在训练模型": "step3a:正在训练模型",
|
8 |
+
"训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log",
|
9 |
+
"全流程结束!": "全流程结束!",
|
10 |
+
"本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。<br>如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄<b>使用需遵守的協議-LICENSE.txt</b>。",
|
11 |
+
"模型推理": "模型推理",
|
12 |
+
"推理音色": "推理音色",
|
13 |
+
"刷新音色列表和索引路径": "刷新音色列表和索引路徑",
|
14 |
+
"卸载音色省显存": "卸載音色節省 VRAM",
|
15 |
+
"请选择说话人id": "請選擇說話人ID",
|
16 |
+
"男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。",
|
17 |
+
"变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
|
18 |
+
"输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)",
|
19 |
+
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU",
|
20 |
+
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
|
21 |
+
"特征检索库文件路径": "特徵檢索庫檔案路徑",
|
22 |
+
"特征文件路径": "特徵檔案路徑",
|
23 |
+
">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波,數值為濾波半徑,使用可以削弱啞音",
|
24 |
+
"特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果",
|
25 |
+
"自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)",
|
26 |
+
"检索特征占比": "檢索特徵佔比",
|
27 |
+
"后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣",
|
28 |
+
"输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡",
|
29 |
+
"保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲,防止電音撕裂等artifact,拉滿0.5不開啟,調低加大保護力度但可能降低索引效果",
|
30 |
+
"F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調",
|
31 |
+
"转换": "轉換",
|
32 |
+
"输出信息": "輸出訊息",
|
33 |
+
"输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)",
|
34 |
+
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。",
|
35 |
+
"指定输出文件夹": "指定輸出資料夾",
|
36 |
+
"输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)",
|
37 |
+
"也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量輸入音頻檔案,二選一,優先讀資料夾",
|
38 |
+
"导出文件格式": "導出檔格式",
|
39 |
+
"伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲",
|
40 |
+
"人声伴奏分离批量处理, 使用UVR5模型。 <br>合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 <br>模型分为三类: <br>1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点; <br>2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型; <br> 3、去混响、去延迟模型(by FoxJoy):<br> (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;<br> (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。<br>去混响/去延迟,附:<br>1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;<br>2、MDX-Net-Dereverb模型挺慢的;<br>3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。<br>有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。<br>模型分為三類:<br>1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;<br>2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。<br>3. 消除混響和延遲模型(由FoxJoy提供):<br> (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;<br> (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。<br>消除混響/延遲注意事項:<br>1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;<br>2. MDX-Net-Dereverb模型相當慢;<br>3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。",
|
41 |
+
"输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑",
|
42 |
+
"模型": "模型",
|
43 |
+
"指定输出主人声文件夹": "指定输出主人声文件夹",
|
44 |
+
"指定输出非主人声文件夹": "指定输出非主人声文件夹",
|
45 |
+
"训练": "訓練",
|
46 |
+
"step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。",
|
47 |
+
"输入实验名": "輸入實驗名稱",
|
48 |
+
"目标采样率": "目標取樣率",
|
49 |
+
"模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)",
|
50 |
+
"版本": "版本",
|
51 |
+
"提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數",
|
52 |
+
"step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。",
|
53 |
+
"输入训练文件夹路径": "輸入訓練檔案夾路徑",
|
54 |
+
"请指定说话人id": "請指定說話人id",
|
55 |
+
"处理数据": "處理資料",
|
56 |
+
"step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)",
|
57 |
+
"以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2",
|
58 |
+
"显卡信息": "顯示卡資訊",
|
59 |
+
"选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢": "選擇音高提取算法:輸入歌聲可用pm提速,高品質語音但CPU差可用dio提速,harvest品質更好但較慢",
|
60 |
+
"特征提取": "特徵提取",
|
61 |
+
"step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引",
|
62 |
+
"保存频率save_every_epoch": "保存頻率save_every_epoch",
|
63 |
+
"总训练轮数total_epoch": "總訓練輪數total_epoch",
|
64 |
+
"每张显卡的batch_size": "每张显卡的batch_size",
|
65 |
+
"是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間",
|
66 |
+
"否": "否",
|
67 |
+
"是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練,大數據緩存會爆 VRAM 也加不了多少速度",
|
68 |
+
"是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾",
|
69 |
+
"加载预训练底模G路径": "加載預訓練底模G路徑",
|
70 |
+
"加载预训练底模D路径": "加載預訓練底模D路徑",
|
71 |
+
"训练模型": "訓練模型",
|
72 |
+
"训练特征索引": "訓練特徵索引",
|
73 |
+
"一键训练": "一鍵訓練",
|
74 |
+
"ckpt处理": "ckpt處理",
|
75 |
+
"模型融合, 可用于测试音色融合": "模型融合,可用於測試音色融合",
|
76 |
+
"A模型路径": "A模型路徑",
|
77 |
+
"B模型路径": "B模型路徑",
|
78 |
+
"A模型权重": "A模型權重",
|
79 |
+
"模型是否带音高指导": "模型是否帶音高指導",
|
80 |
+
"要置入的模型信息": "要置入的模型資訊",
|
81 |
+
"保存的模型名不带后缀": "儲存的模型名不帶副檔名",
|
82 |
+
"模型版本型号": "模型版本型號",
|
83 |
+
"融合": "融合",
|
84 |
+
"修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)",
|
85 |
+
"模型路径": "模型路徑",
|
86 |
+
"要改的模型信息": "要改的模型資訊",
|
87 |
+
"保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名",
|
88 |
+
"修改": "修改",
|
89 |
+
"查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)",
|
90 |
+
"查看": "查看",
|
91 |
+
"模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑),適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型,或者想測試中間模型的情況",
|
92 |
+
"保存名": "儲存名",
|
93 |
+
"模型是否带音高指导,1是0否": "模型是否帶音高指導,1是0否",
|
94 |
+
"提取": "提取",
|
95 |
+
"Onnx导出": "Onnx导出",
|
96 |
+
"RVC模型路径": "RVC模型路径",
|
97 |
+
"Onnx输出路径": "Onnx输出路径",
|
98 |
+
"导出Onnx模型": "导出Onnx模型",
|
99 |
+
"常见问题解答": "常見問題解答",
|
100 |
+
"招募音高曲线前端编辑器": "招募音高曲線前端編輯器",
|
101 |
+
"加开发群联系我xxxxx": "加開發群聯繫我xxxxx",
|
102 |
+
"点击查看交流、问题反馈群号": "點擊查看交流、問題反饋群號",
|
103 |
+
"xxxxx": "xxxxx",
|
104 |
+
"加载模型": "載入模型",
|
105 |
+
"Hubert模型": "Hubert 模型",
|
106 |
+
"选择.pth文件": "選擇 .pth 檔案",
|
107 |
+
"选择.index文件": "選擇 .index 檔案",
|
108 |
+
"选择.npy文件": "選擇 .npy 檔案",
|
109 |
+
"输入设备": "輸入設備",
|
110 |
+
"输出设备": "輸出設備",
|
111 |
+
"音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)",
|
112 |
+
"响应阈值": "響應閾值",
|
113 |
+
"音调设置": "音調設定",
|
114 |
+
"Index Rate": "Index Rate",
|
115 |
+
"常规设置": "一般設定",
|
116 |
+
"采样长度": "取樣長度",
|
117 |
+
"淡入淡出长度": "淡入淡出長度",
|
118 |
+
"额外推理时长": "額外推理時長",
|
119 |
+
"输入降噪": "輸入降噪",
|
120 |
+
"输出降噪": "輸出降噪",
|
121 |
+
"性能设置": "效能設定",
|
122 |
+
"开始音频转换": "開始音訊轉換",
|
123 |
+
"停止音频转换": "停止音訊轉換",
|
124 |
+
"推理时间(ms):": "推理時間(ms):",
|
125 |
+
"请选择pth文件": "请选择pth文件",
|
126 |
+
"请选择index文件": "请选择index文件",
|
127 |
+
"hubert模型路径不可包含中文": "hubert模型路径不可包含中文",
|
128 |
+
"pth文件路径不可包含中文": "pth文件路径不可包含中文",
|
129 |
+
"index文件路径不可包含中文": "index文件路径不可包含中文",
|
130 |
+
"音高算法": "音高算法",
|
131 |
+
"harvest进程数": "harvest进程数"
|
132 |
+
}
|
i18n/zh_TW.json
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练",
|
3 |
+
"是": "是",
|
4 |
+
"step1:正在处理数据": "step1:正在处理数据",
|
5 |
+
"step2a:无需提取音高": "step2a:无需提取音高",
|
6 |
+
"step2b:正在提取特征": "step2b:正在提取特征",
|
7 |
+
"step3a:正在训练模型": "step3a:正在训练模型",
|
8 |
+
"训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log",
|
9 |
+
"全流程结束!": "全流程结束!",
|
10 |
+
"本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。<br>如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄<b>使用需遵守的協議-LICENSE.txt</b>。",
|
11 |
+
"模型推理": "模型推理",
|
12 |
+
"推理音色": "推理音色",
|
13 |
+
"刷新音色列表和索引路径": "刷新音色列表和索引路徑",
|
14 |
+
"卸载音色省显存": "卸載音色節省 VRAM",
|
15 |
+
"请选择说话人id": "請選擇說話人ID",
|
16 |
+
"男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。",
|
17 |
+
"变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
|
18 |
+
"输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)",
|
19 |
+
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU",
|
20 |
+
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
|
21 |
+
"特征检索库文件路径": "特徵檢索庫檔案路徑",
|
22 |
+
"特征文件路径": "特徵檔案路徑",
|
23 |
+
"自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)",
|
24 |
+
">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波,數值為濾波半徑,使用可以削弱啞音",
|
25 |
+
"特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果",
|
26 |
+
"检索特征占比": "檢索特徵佔比",
|
27 |
+
"后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣",
|
28 |
+
"输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡",
|
29 |
+
"保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲,防止電音撕裂等artifact,拉滿0.5不開啟,調低加大保護力度但可能降低索引效果",
|
30 |
+
"F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調",
|
31 |
+
"转换": "轉換",
|
32 |
+
"输出信息": "輸出訊息",
|
33 |
+
"输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)",
|
34 |
+
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。",
|
35 |
+
"指定输出文件夹": "指定輸出資料夾",
|
36 |
+
"输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)",
|
37 |
+
"也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量輸入音頻檔案,二選一,優先讀資料夾",
|
38 |
+
"导出文件格式": "導出檔格式",
|
39 |
+
"伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲",
|
40 |
+
"人声伴奏分离批量处理, 使用UVR5模型。 <br>合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 <br>模型分为三类: <br>1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点; <br>2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型; <br> 3、去混响、去延迟模型(by FoxJoy):<br> (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;<br> (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。<br>去混响/去延迟,附:<br>1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;<br>2、MDX-Net-Dereverb模型挺慢的;<br>3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。<br>有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。<br>模型分為三類:<br>1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;<br>2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。<br>3. 消除混響和延遲模型(由FoxJoy提供):<br> (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;<br> (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。<br>消除混響/延遲注意事項:<br>1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;<br>2. MDX-Net-Dereverb模型相當慢;<br>3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。",
|
41 |
+
"输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑",
|
42 |
+
"模型": "模型",
|
43 |
+
"指定输出主人声文件夹": "指定输出主人声文件夹",
|
44 |
+
"指定输出非主人声文件夹": "指定输出非主人声文件夹",
|
45 |
+
"训练": "訓練",
|
46 |
+
"step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。",
|
47 |
+
"输入实验名": "輸入實驗名稱",
|
48 |
+
"目标采样率": "目標取樣率",
|
49 |
+
"模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)",
|
50 |
+
"版本": "版本",
|
51 |
+
"提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數",
|
52 |
+
"step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。",
|
53 |
+
"输入训练文件夹路径": "輸入訓練檔案夾路徑",
|
54 |
+
"请指定说话人id": "請指定說話人id",
|
55 |
+
"处理数据": "處理資料",
|
56 |
+
"step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)",
|
57 |
+
"以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2",
|
58 |
+
"显卡信息": "顯示卡資訊",
|
59 |
+
"选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢": "選擇音高提取算法:輸入歌聲可用pm提速,高品質語音但CPU差可用dio提速,harvest品質更好但較慢",
|
60 |
+
"特征提取": "特徵提取",
|
61 |
+
"step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引",
|
62 |
+
"保存频率save_every_epoch": "保存頻率save_every_epoch",
|
63 |
+
"总训练轮数total_epoch": "總訓練輪數total_epoch",
|
64 |
+
"每张显卡的batch_size": "每张显卡的batch_size",
|
65 |
+
"是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間",
|
66 |
+
"否": "否",
|
67 |
+
"是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練,大數據緩存會爆 VRAM 也加不了多少速度",
|
68 |
+
"是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾",
|
69 |
+
"加载预训练底模G路径": "加載預訓練底模G路徑",
|
70 |
+
"加载预训练底模D路径": "加載預訓練底模D路徑",
|
71 |
+
"训练模型": "訓練模型",
|
72 |
+
"训练特征索引": "訓練特徵索引",
|
73 |
+
"一键训练": "一鍵訓練",
|
74 |
+
"ckpt处理": "ckpt處理",
|
75 |
+
"模型融合, 可用于测试音色融合": "模型融合,可用於測試音色融合",
|
76 |
+
"A模型路径": "A模型路徑",
|
77 |
+
"B模型路径": "B模型路徑",
|
78 |
+
"A模型权重": "A模型權重",
|
79 |
+
"模型是否带音高指导": "模型是否帶音高指導",
|
80 |
+
"要置入的模型信息": "要置入的模型資訊",
|
81 |
+
"保存的模型名不带后缀": "儲存的模型名不帶副檔名",
|
82 |
+
"模型版本型号": "模型版本型號",
|
83 |
+
"融合": "融合",
|
84 |
+
"修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)",
|
85 |
+
"模型路径": "模型路徑",
|
86 |
+
"要改的模型信息": "要改的模型資訊",
|
87 |
+
"保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名",
|
88 |
+
"修改": "修改",
|
89 |
+
"查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)",
|
90 |
+
"查看": "查看",
|
91 |
+
"模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑),適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型,或者想測試中間模型的情況",
|
92 |
+
"保存名": "儲存名",
|
93 |
+
"模型是否带音高指导,1是0否": "模型是否帶音高指導,1是0否",
|
94 |
+
"提取": "提取",
|
95 |
+
"Onnx导出": "Onnx导出",
|
96 |
+
"RVC模型路径": "RVC模型路径",
|
97 |
+
"Onnx输出路径": "Onnx输出路径",
|
98 |
+
"导出Onnx模型": "导出Onnx模型",
|
99 |
+
"常见问题解答": "常見問題解答",
|
100 |
+
"招募音高曲线前端编辑器": "招募音高曲線前端編輯器",
|
101 |
+
"加开发群联系我xxxxx": "加開發群聯繫我xxxxx",
|
102 |
+
"点击查看交流、问题反馈群号": "點擊查看交流、問題反饋群號",
|
103 |
+
"xxxxx": "xxxxx",
|
104 |
+
"加载模型": "載入模型",
|
105 |
+
"Hubert模型": "Hubert 模型",
|
106 |
+
"选择.pth文件": "選擇 .pth 檔案",
|
107 |
+
"选择.index文件": "選擇 .index 檔案",
|
108 |
+
"选择.npy文件": "選擇 .npy 檔案",
|
109 |
+
"输入设备": "輸入設備",
|
110 |
+
"输出设备": "輸出設備",
|
111 |
+
"音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)",
|
112 |
+
"响应阈值": "響應閾值",
|
113 |
+
"音调设置": "音調設定",
|
114 |
+
"Index Rate": "Index Rate",
|
115 |
+
"常规设置": "一般設定",
|
116 |
+
"采样长度": "取樣長度",
|
117 |
+
"淡入淡出长度": "淡入淡出長度",
|
118 |
+
"额外推理时长": "額外推理時長",
|
119 |
+
"输入降噪": "輸入降噪",
|
120 |
+
"输出降噪": "輸出降噪",
|
121 |
+
"性能设置": "效能設定",
|
122 |
+
"开始音频转换": "開始音訊轉換",
|
123 |
+
"停止音频转换": "停止音訊轉換",
|
124 |
+
"推理时间(ms):": "推理時間(ms):",
|
125 |
+
"请选择pth文件": "请选择pth文件",
|
126 |
+
"请选择index文件": "请选择index文件",
|
127 |
+
"hubert模型路径不可包含中文": "hubert模型路径不可包含中文",
|
128 |
+
"pth文件路径不可包含中文": "pth文件路径不可包含中文",
|
129 |
+
"index文件路径不可包含中文": "index文件路径不可包含中文",
|
130 |
+
"音高算法": "音高算法",
|
131 |
+
"harvest进程数": "harvest进程数"
|
132 |
+
}
|
lib/infer_pack/attentions.py
ADDED
@@ -0,0 +1,417 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import copy
|
2 |
+
import math
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
from torch import nn
|
6 |
+
from torch.nn import functional as F
|
7 |
+
|
8 |
+
from lib.infer_pack import commons
|
9 |
+
from lib.infer_pack import modules
|
10 |
+
from lib.infer_pack.modules import LayerNorm
|
11 |
+
|
12 |
+
|
13 |
+
class Encoder(nn.Module):
    """Stack of self-attention and feed-forward sub-layers.

    Each of the ``n_layers`` layers applies ``MultiHeadAttention`` (queries,
    keys and values all taken from the same input) and then an ``FFN``. The
    output of every sub-layer goes through dropout, is added to the sub-layer
    input, and the sum is passed through a ``LayerNorm``.

    Args:
        hidden_channels: channel count of the input and of every layer output.
        filter_channels: inner channel count of each ``FFN``.
        n_heads: number of attention heads.
        n_layers: number of (attention, FFN) layers.
        kernel_size: convolution kernel size used inside each ``FFN``.
        p_dropout: dropout probability (attention, FFN and residual branches).
        window_size: forwarded to ``MultiHeadAttention`` as its relative
            position window.
        **kwargs: accepted and ignored.
    """

    def __init__(
        self,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size=1,
        p_dropout=0.0,
        window_size=10,
        **kwargs
    ):
        super().__init__()
        # Hyper-parameters are kept as plain attributes.
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()

        # Modules are created layer by layer (attention, norm, FFN, norm) so
        # the order in which random initial weights are drawn is stable.
        for _ in range(n_layers):
            attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                window_size=window_size,
            )
            self.attn_layers.append(attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))

            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        """Run the layer stack on ``x``; ``x_mask`` is multiplied in before and after.

        NOTE(review): shapes look like x = [b, hidden_channels, t] and
        x_mask = [b, 1, t] -- confirm against the callers.
        """
        # Pairwise mask: outer product of the mask with itself over its last axis.
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask

        layers = zip(
            self.attn_layers,
            self.norm_layers_1,
            self.ffn_layers,
            self.norm_layers_2,
        )
        for attention, norm_after_attn, feed_forward, norm_after_ffn in layers:
            # Self-attention sub-layer with residual connection.
            x = norm_after_attn(x + self.drop(attention(x, x, attn_mask)))
            # Feed-forward sub-layer with residual connection.
            x = norm_after_ffn(x + self.drop(feed_forward(x, x_mask)))

        return x * x_mask
|
74 |
+
|
75 |
+
|
76 |
+
class Decoder(nn.Module):
    """Stack of decoder layers: self-attention, encoder-decoder attention, FFN.

    Every layer applies three sub-layers in order: ``MultiHeadAttention`` over
    the decoder stream itself, ``MultiHeadAttention`` with queries from the
    decoder stream and keys/values from the encoder output, and a causal
    ``FFN``. Each sub-layer output goes through dropout, is added to the
    sub-layer input, and the sum is passed through a ``LayerNorm``.

    Args:
        hidden_channels: channel count of the input and of every layer output.
        filter_channels: inner channel count of each ``FFN``.
        n_heads: number of attention heads.
        n_layers: number of decoder layers.
        kernel_size: convolution kernel size used inside each ``FFN``.
        p_dropout: dropout probability.
        proximal_bias: forwarded to the self-attention layers.
        proximal_init: forwarded to the self-attention layers.
        **kwargs: accepted and ignored.
    """

    def __init__(
        self,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size=1,
        p_dropout=0.0,
        proximal_bias=False,
        proximal_init=True,
        **kwargs
    ):
        super().__init__()
        # Hyper-parameters are kept as plain attributes.
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()

        # Modules are created layer by layer, in sub-layer order, so the order
        # in which random initial weights are drawn is stable.
        for _ in range(n_layers):
            self_attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                proximal_bias=proximal_bias,
                proximal_init=proximal_init,
            )
            self.self_attn_layers.append(self_attention)
            self.norm_layers_0.append(LayerNorm(hidden_channels))

            cross_attention = MultiHeadAttention(
                hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
            )
            self.encdec_attn_layers.append(cross_attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))

            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
                causal=True,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """
        x: decoder input
        h: encoder output

        ``x_mask`` / ``h_mask`` are the masks belonging to ``x`` / ``h``.
        """
        # NOTE(review): commons.subsequent_mask is defined outside this file;
        # presumably it yields a causal (lower-triangular) mask -- confirm.
        target_length = x_mask.size(2)
        self_attn_mask = commons.subsequent_mask(target_length).to(
            device=x.device, dtype=x.dtype
        )
        # Pairwise mask between decoder positions and encoder positions.
        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask

        layers = zip(
            self.self_attn_layers,
            self.norm_layers_0,
            self.encdec_attn_layers,
            self.norm_layers_1,
            self.ffn_layers,
            self.norm_layers_2,
        )
        for self_attn, norm_0, cross_attn, norm_1, feed_forward, norm_2 in layers:
            # Self-attention over the decoder stream.
            x = norm_0(x + self.drop(self_attn(x, x, self_attn_mask)))
            # Attention from the decoder stream onto the encoder output.
            x = norm_1(x + self.drop(cross_attn(x, h, encdec_attn_mask)))
            # Causal feed-forward network.
            x = norm_2(x + self.drop(feed_forward(x, x_mask)))

        return x * x_mask
|
160 |
+
|
161 |
+
|
162 |
+
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention on ``[b, channels, t]`` tensors.

    Queries are projected from ``x`` and keys/values from ``c`` with 1x1
    ``Conv1d`` layers. Optional features, all restricted to self-attention
    (equal query and key lengths):

    * ``window_size``: learned relative-position embeddings are added to the
      logits and to the output; offsets outside the window get a zero
      embedding.
    * ``proximal_bias``: ``-log(1 + |i - j|)`` is added to the logits.
    * ``block_length``: logits outside a band of that half-width are masked.

    Args:
        channels: input channel count; must be divisible by ``n_heads``.
        out_channels: channel count produced by the output projection.
        n_heads: number of attention heads.
        p_dropout: dropout probability applied to the attention weights.
        window_size: relative-position window, or ``None`` to disable.
        heads_share: if true, one relative embedding table is shared by all
            heads; otherwise each head has its own.
        block_length: half-width of the local attention band, or ``None``.
        proximal_bias: enable the proximity bias described above.
        proximal_init: start the key projection as a copy of the query
            projection.

    The attention weights of the latest ``forward`` call are kept in
    ``self.attn``.
    """

    def __init__(
        self,
        channels,
        out_channels,
        n_heads,
        p_dropout=0.0,
        window_size=None,
        heads_share=True,
        block_length=None,
        proximal_bias=False,
        proximal_init=False,
    ):
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        self.attn = None

        # Per-head channel count.
        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            # One embedding per relative offset in [-window_size, window_size].
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            self.emb_rel_k = nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )
            self.emb_rel_v = nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )

        nn.init.xavier_uniform_(self.conv_q.weight)
        nn.init.xavier_uniform_(self.conv_k.weight)
        nn.init.xavier_uniform_(self.conv_v.weight)
        if proximal_init:
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from ``x`` (queries) to ``c`` (keys and values).

        Positions where ``attn_mask == 0`` are masked out. Returns the
        projected output; the attention weights are stored in ``self.attn``.
        """
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        x = self.conv_o(x)
        return x

    def attention(self, query, key, value, mask=None):
        """Compute attention for already-projected ``[b, d, t]`` tensors.

        Returns ``(output, p_attn)`` with ``output`` of shape ``[b, d, t_t]``
        and ``p_attn`` of shape ``[b, n_h, t_t, t_s]``.
        """
        # reshape [b, d, t] -> [b, n_h, t, d_k]
        b, d, t_s, t_t = (*key.size(), query.size(2))
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        # Scale the queries once; the content logits and the relative-position
        # logits both use the same scaled tensor.
        scaled_query = query / math.sqrt(self.k_channels)
        scores = torch.matmul(scaled_query, key.transpose(-2, -1))
        if self.window_size is not None:
            assert (
                t_s == t_t
            ), "Relative attention is only available for self-attention."
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(
                scaled_query, key_relative_embeddings
            )
            scores_local = self._relative_position_to_absolute_position(rel_logits)
            scores = scores + scores_local
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            scores = scores + self._attention_bias_proximal(t_s).to(
                device=scores.device, dtype=scores.dtype
            )
        if mask is not None:
            # -1e4 rather than -inf: large enough to zero the softmax weight.
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert (
                    t_s == t_t
                ), "Local attention is only available for self-attention."
                # Band matrix: ones within block_length of the diagonal.
                block_mask = (
                    torch.ones_like(scores)
                    .triu(-self.block_length)
                    .tril(self.block_length)
                )
                scores = scores.masked_fill(block_mask == 0, -1e4)
        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(
                self.emb_rel_v, t_s
            )
            output = output + self._matmul_with_relative_values(
                relative_weights, value_relative_embeddings
            )
        output = (
            output.transpose(2, 3).contiguous().view(b, d, t_t)
        )  # [b, n_h, t_t, d_k] -> [b, d, t_t]
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        ret = torch.matmul(x, y.unsqueeze(0))
        return ret

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
        return ret

    def _get_relative_embeddings(self, relative_embeddings, length):
        """Return the ``2 * length - 1`` embeddings needed for ``length`` positions.

        The table holds ``2 * window_size + 1`` rows. When ``length`` exceeds
        ``window_size + 1`` it is zero-padded on both sides; otherwise the
        centred slice is taken.
        """
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = F.pad(
                relative_embeddings,
                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
            )
        else:
            padded_relative_embeddings = relative_embeddings
        used_relative_embeddings = padded_relative_embeddings[
            :, slice_start_position:slice_end_position
        ]
        return used_relative_embeddings

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Concat columns of pad to shift from relative to absolute indexing.
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        x_flat = x.view([batch, heads, length * 2 * length])
        x_flat = F.pad(
            x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
        )

        # Reshape and slice out the padded elements.
        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
            :, :, :length, length - 1 :
        ]
        return x_final

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # Pad along the column axis.
        x = F.pad(
            x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
        )
        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
        # add 0's in the beginning that will skew the elements after reshape
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
        return x_final

    def _attention_bias_proximal(self, length):
        """Bias for self-attention to encourage attention to close positions.
        Args:
          length: an integer scalar.
        Returns:
          a Tensor with shape [1, 1, length, length]
        """
        r = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
|
360 |
+
|
361 |
+
|
362 |
+
class FFN(nn.Module):
    """Two-convolution feed-forward block with masking around each convolution.

    ``conv_1`` maps ``in_channels`` to ``filter_channels`` and ``conv_2`` maps
    ``filter_channels`` to ``out_channels``. The input of each convolution is
    multiplied by the mask and padded, and the final output is multiplied by
    the mask again.

    Args:
        in_channels: input channel count.
        out_channels: output channel count.
        filter_channels: hidden channel count between the two convolutions.
        kernel_size: kernel size of both convolutions.
        p_dropout: dropout probability applied after the activation.
        activation: ``"gelu"`` selects ``x * sigmoid(1.702 * x)``; any other
            value selects ReLU.
        causal: if true all padding goes on the left; otherwise it is split
            between both sides.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        filter_channels,
        kernel_size,
        p_dropout=0.0,
        activation=None,
        causal=False,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        # Choose the padding routine once, at construction time.
        self.padding = self._causal_padding if causal else self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Apply conv -> activation -> dropout -> conv, masking at every step."""
        hidden = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            # Sigmoid-based GELU approximation.
            hidden = hidden * torch.sigmoid(1.702 * hidden)
        else:
            hidden = torch.relu(hidden)
        hidden = self.drop(hidden)
        projected = self.conv_2(self.padding(hidden * x_mask))
        return projected * x_mask

    def _causal_padding(self, x):
        """Pad ``kernel_size - 1`` steps on the left of the last axis only."""
        if self.kernel_size != 1:
            left = self.kernel_size - 1
            pad_spec = [[0, 0], [0, 0], [left, 0]]
            x = F.pad(x, commons.convert_pad_shape(pad_spec))
        return x

    def _same_padding(self, x):
        """Pad both sides of the last axis so the convolution keeps its length."""
        if self.kernel_size != 1:
            left = (self.kernel_size - 1) // 2
            right = self.kernel_size // 2
            pad_spec = [[0, 0], [0, 0], [left, right]]
            x = F.pad(x, commons.convert_pad_shape(pad_spec))
        return x
|
lib/infer_pack/commons.py
ADDED
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
from torch import nn
|
5 |
+
from torch.nn import functional as F
|
6 |
+
|
7 |
+
|
8 |
+
def init_weights(m, mean=0.0, std=0.01):
    """Re-initialise convolution weights in place from N(mean, std).

    Only modules whose class name contains "Conv" are touched, so this is
    suitable as a callback for ``nn.Module.apply``.
    """
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)
|
12 |
+
|
13 |
+
|
14 |
+
def get_padding(kernel_size, dilation=1):
    """Return the padding computed as ``(kernel_size * dilation - dilation) / 2``, truncated to int."""
    span = kernel_size * dilation - dilation
    return int(span / 2)
|
16 |
+
|
17 |
+
|
18 |
+
def convert_pad_shape(pad_shape):
    """Flatten per-axis ``[before, after]`` pairs into a flat list, last axis first.

    Callers in this file pass the result as the pad argument of ``F.pad``.
    """
    return [amount for pair in reversed(pad_shape) for amount in pair]
|
22 |
+
|
23 |
+
|
24 |
+
def kl_divergence(m_p, logs_p, m_q, logs_q):
    """KL(P||Q), computed element-wise.

    The expression is the closed-form KL divergence between Gaussians whose
    means are ``m_*`` and whose standard deviations are ``exp(logs_*)``.

    Args:
      m_p, logs_p: mean and log standard deviation of P.
      m_q, logs_q: mean and log standard deviation of Q.
    Returns:
      A tensor with the broadcast shape of the four inputs.
    """
    log_ratio = (logs_q - logs_p) - 0.5
    scaled = (
        0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
    )
    # Out-of-place addition: an in-place `+=` raises when the means broadcast
    # to a larger shape than the log-std terms.
    return log_ratio + scaled
|
31 |
+
|
32 |
+
|
33 |
+
def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    # Rescale uniform samples from [0, 1) so neither logarithm sees 0 or 1.
    uniform = torch.rand(shape) * 0.99998 + 0.00001
    inner = -torch.log(uniform)
    return -torch.log(inner)
|
37 |
+
|
38 |
+
|
39 |
+
def rand_gumbel_like(x):
    """Return Gumbel noise with the same size, dtype and device as ``x``."""
    return rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
|
42 |
+
|
43 |
+
|
44 |
+
def slice_segments(x, ids_str, segment_size=4):
    """Cut one window of ``segment_size`` steps from the last axis of each item.

    Args:
      x: 3-D tensor indexed as ``x[item, :, step]``.
      ids_str: per-item start index of the window.
      segment_size: window length.
    Returns:
      Tensor shaped like ``x[:, :, :segment_size]`` holding the windows.
    """
    template = x[:, :, :segment_size]
    out = torch.zeros_like(template)
    for item in range(x.size(0)):
        start = ids_str[item]
        out[item] = x[item, :, start : start + segment_size]
    return out
|
51 |
+
|
52 |
+
|
53 |
+
def slice_segments2(x, ids_str, segment_size=4):
    """Variant of ``slice_segments`` that windows the second axis.

    Args:
      x: tensor indexed as ``x[item, step]``.
      ids_str: per-item start index of the window.
      segment_size: window length.
    Returns:
      Tensor shaped like ``x[:, :segment_size]`` holding the windows.
    """
    template = x[:, :segment_size]
    out = torch.zeros_like(template)
    for item in range(x.size(0)):
        start = ids_str[item]
        out[item] = x[item, start : start + segment_size]
    return out
|
60 |
+
|
61 |
+
|
62 |
+
def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Cut a randomly positioned window from each item of a 3-D tensor.

    Args:
      x: tensor of size (b, d, t).
      x_lengths: usable length per item; defaults to the full length ``t``.
      segment_size: window length.
    Returns:
      ``(segments, ids_str)``: the windows and the start index of each.
    """
    batch, _, total = x.size()
    usable = total if x_lengths is None else x_lengths
    max_start = usable - segment_size + 1
    # Uniform in [0, max_start); the cast to long truncates to an index.
    uniform = torch.rand([batch]).to(device=x.device)
    starts = (uniform * max_start).to(dtype=torch.long)
    return slice_segments(x, starts, segment_size), starts
|
70 |
+
|
71 |
+
|
72 |
+
def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """Build a sinusoidal timing signal.

    Half of the channels carry sines and half carry cosines of the position,
    at timescales spaced geometrically from ``min_timescale`` to
    ``max_timescale``. An odd ``channels`` gets one trailing zero channel.

    Args:
      length: number of positions.
      channels: number of channels in the returned signal.
      min_timescale: smallest timescale.
      max_timescale: largest timescale.
    Returns:
      A float tensor of shape [1, channels, length].
    """
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    # Clamp the denominator: with a single timescale (channels 2 or 3) the
    # unguarded `num_timescales - 1` is zero and the division raises.
    log_timescale_increment = math.log(
        float(max_timescale) / float(min_timescale)
    ) / max(num_timescales - 1, 1)
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
    )
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    # Pad the channel axis with one zero row when `channels` is odd.
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal
|
86 |
+
|
87 |
+
|
88 |
+
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    """Add the sinusoidal timing signal to ``x`` of size (b, channels, length)."""
    _, channels, length = x.size()
    timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return x + timing
|
92 |
+
|
93 |
+
|
94 |
+
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    """Concatenate the sinusoidal timing signal to ``x`` along ``axis``.

    ``x`` must have size (b, channels, length); the signal is built with the
    same channel count and length as ``x``.
    """
    _, channels, length = x.size()
    timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return torch.cat([x, timing], axis)
|
98 |
+
|
99 |
+
|
100 |
+
def subsequent_mask(length):
    """Return a [1, 1, length, length] lower-triangular mask of ones."""
    lower = torch.ones(length, length).tril()
    return lower[None, None]
|
103 |
+
|
104 |
+
|
105 |
+
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """Gated activation: tanh of the first channels times sigmoid of the rest.

    The two inputs are summed, the sum is split along dim 1 at
    ``n_channels[0]``, and the tanh of the first part is multiplied by the
    sigmoid of the second part.
    """
    split = n_channels[0]
    summed = input_a + input_b
    gate_tanh = torch.tanh(summed[:, :split, :])
    gate_sigmoid = torch.sigmoid(summed[:, split:, :])
    return gate_tanh * gate_sigmoid
|
113 |
+
|
114 |
+
|
115 |
+
def convert_pad_shape(pad_shape):
    """Flatten per-axis ``[before, after]`` pairs into a flat list, last axis first.

    NOTE: this module defines ``convert_pad_shape`` twice with the same
    logic; this later definition is the one bound once the module has been
    imported.
    """
    return [amount for pair in reversed(pad_shape) for amount in pair]
|
119 |
+
|
120 |
+
|
121 |
+
def shift_1d(x):
    """Shift a 3-D tensor one step along its last axis, zero-filling the front.

    One zero is padded at the start of the last axis and the final element
    is dropped, so the size is unchanged.
    """
    pad_spec = convert_pad_shape([[0, 0], [0, 0], [1, 0]])
    padded = F.pad(x, pad_spec)
    return padded[:, :, :-1]
|
124 |
+
|
125 |
+
|
126 |
+
def sequence_mask(length, max_length=None):
    """Build a boolean mask that is True for positions below each length.

    Args:
      length: 1-D tensor of sequence lengths.
      max_length: number of mask columns; defaults to ``length.max()``.
    Returns:
      Boolean tensor of shape [len(length), max_length].
    """
    width = length.max() if max_length is None else max_length
    steps = torch.arange(width, dtype=length.dtype, device=length.device)
    return steps[None, :] < length[:, None]
|
131 |
+
|
132 |
+
|
133 |
+
def generate_path(duration, mask):
    """Expand per-step durations into an alignment path.

    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]

    Returns a tensor of shape [b, 1, t_y, t_x] in ``mask``'s dtype. For each
    of the ``t_x`` steps it marks the ``t_y`` positions lying between the
    previous and the current cumulative duration, multiplied by ``mask``.
    """
    b, _, t_y, t_x = mask.shape
    cum_duration = torch.cumsum(duration, -1)

    # One row per (batch, t_x) pair: True up to the cumulative duration.
    cum_duration_flat = cum_duration.view(b * t_x)
    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
    path = path.view(b, t_x, t_y)
    # Subtract the previous step's row so each row keeps only its own span.
    path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    path = path.unsqueeze(1).transpose(2, 3) * mask
    return path
|
149 |
+
|
150 |
+
|
151 |
+
def clip_grad_value_(parameters, clip_value, norm_type=2):
    """Clamp gradients in place and return their total norm before clamping.

    Args:
      parameters: a tensor or an iterable of tensors; entries without a
        gradient are ignored.
      clip_value: gradients are clamped to ``[-clip_value, clip_value]``;
        ``None`` disables clamping.
      norm_type: order of the norm used for the returned total.
    Returns:
      The ``norm_type``-norm over all gradients, measured before clamping.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    with_grad = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    accumulated = 0
    for param in with_grad:
        # Measure each gradient before it is clamped.
        accumulated += param.grad.data.norm(norm_type).item() ** norm_type
        if clip_value is not None:
            param.grad.data.clamp_(min=-clip_value, max=clip_value)
    return accumulated ** (1.0 / norm_type)
|
lib/infer_pack/models.py
ADDED
@@ -0,0 +1,1142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math, pdb, os
|
2 |
+
from time import time as ttime
|
3 |
+
import torch
|
4 |
+
from torch import nn
|
5 |
+
from torch.nn import functional as F
|
6 |
+
from lib.infer_pack import modules
|
7 |
+
from lib.infer_pack import attentions
|
8 |
+
from lib.infer_pack import commons
|
9 |
+
from lib.infer_pack.commons import init_weights, get_padding
|
10 |
+
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
11 |
+
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
12 |
+
from lib.infer_pack.commons import init_weights
|
13 |
+
import numpy as np
|
14 |
+
from lib.infer_pack import commons
|
15 |
+
|
16 |
+
|
17 |
+
class TextEncoder256(nn.Module):
    """Encoder for 256-dimensional phone features with optional pitch.

    Phone features go through a linear layer, optionally summed with a pitch
    embedding, then through ``attentions.Encoder`` and a 1x1 convolution that
    yields ``2 * out_channels`` statistics.

    Args:
      out_channels: channels of each of the two returned statistics.
      hidden_channels: width of the embeddings and the encoder.
      filter_channels, n_heads, n_layers, kernel_size, p_dropout: forwarded
        to ``attentions.Encoder``.
      f0: when truthy a 256-entry pitch embedding is created.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Encode phone (and pitch) features.

        Args:
          phone: features whose last dimension is 256.
          pitch: indices for the pitch embedding, or ``None`` to skip it.
          lengths: valid length of each sequence, used to build the mask.
        Returns:
          ``(m, logs, x_mask)``: the two halves of the projected statistics
          and the mask of shape [b, 1, t].
        """
        # Identity test: `== None` on a tensor relies on comparison fallbacks.
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
|
62 |
+
|
63 |
+
|
64 |
+
class TextEncoder768(nn.Module):
    """Encoder for 768-dimensional phone features with optional pitch.

    Phone features go through a linear layer, optionally summed with a pitch
    embedding, then through ``attentions.Encoder`` and a 1x1 convolution that
    yields ``2 * out_channels`` statistics.

    Args:
      out_channels: channels of each of the two returned statistics.
      hidden_channels: width of the embeddings and the encoder.
      filter_channels, n_heads, n_layers, kernel_size, p_dropout: forwarded
        to ``attentions.Encoder``.
      f0: when truthy a 256-entry pitch embedding is created.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(768, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Encode phone (and pitch) features.

        Args:
          phone: features whose last dimension is 768.
          pitch: indices for the pitch embedding, or ``None`` to skip it.
          lengths: valid length of each sequence, used to build the mask.
        Returns:
          ``(m, logs, x_mask)``: the two halves of the projected statistics
          and the mask of shape [b, 1, t].
        """
        # Identity test: `== None` on a tensor relies on comparison fallbacks.
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
|
109 |
+
|
110 |
+
|
111 |
+
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` residual coupling layers, each followed by a flip.

    ``self.flows`` alternates ``modules.ResidualCouplingLayer`` (even
    indices) and ``modules.Flip`` (odd indices).
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,
    ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = modules.ResidualCouplingLayer(
                channels,
                hidden_channels,
                kernel_size,
                dilation_rate,
                n_layers,
                gin_channels=gin_channels,
                mean_only=True,
            )
            self.flows.append(coupling)
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Run the flows forward, or in reversed order when ``reverse`` is set."""
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x
        for flow in self.flows:
            # In the forward direction each flow returns a pair; only the
            # first element is carried on.
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x

    def remove_weight_norm(self):
        """Call ``remove_weight_norm`` on every coupling layer (even indices)."""
        for index in range(0, 2 * self.n_flows, 2):
            self.flows[index].remove_weight_norm()
|
158 |
+
|
159 |
+
|
160 |
+
class PosteriorEncoder(nn.Module):
    """Encode an input into statistics and a sampled latent.

    A 1x1 convolution feeds a ``modules.WN`` stack; a second 1x1 convolution
    yields ``2 * out_channels`` statistics that are split into ``m`` and
    ``logs``.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Return ``(z, m, logs, x_mask)`` for input ``x``.

        ``z`` is ``m`` plus Gaussian noise scaled by ``exp(logs)``, masked
        by ``x_mask``.
        """
        valid = commons.sequence_mask(x_lengths, x.size(2))
        x_mask = valid.unsqueeze(1).to(x.dtype)
        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        noise = torch.randn_like(m)
        z = (m + noise * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

    def remove_weight_norm(self):
        """Remove weight normalisation from the WN encoder."""
        self.enc.remove_weight_norm()
|
203 |
+
|
204 |
+
|
205 |
+
class Generator(torch.nn.Module):
    """Upsampling decoder built from transposed convolutions and resblocks.

    Each upsampling stage halves the channel count and is followed by
    ``num_kernels`` resblocks whose outputs are averaged. A final
    convolution and tanh produce a single-channel output.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
    ):
        super().__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        resblock_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        for level, (rate, kernel) in enumerate(
            zip(upsample_rates, upsample_kernel_sizes)
        ):
            transposed = ConvTranspose1d(
                upsample_initial_channel // (2**level),
                upsample_initial_channel // (2 ** (level + 1)),
                kernel,
                rate,
                padding=(kernel - rate) // 2,
            )
            self.ups.append(weight_norm(transposed))

        self.resblocks = nn.ModuleList()
        for level in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (level + 1))
            for kernel, dilation in zip(
                resblock_kernel_sizes, resblock_dilation_sizes
            ):
                self.resblocks.append(resblock_cls(ch, kernel, dilation))

        # `ch` still holds the channel count of the last upsampling stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """Decode ``x``, optionally adding the projected conditioning ``g``."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for stage in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[stage](x)
            base = stage * self.num_kernels
            summed = None
            for offset in range(self.num_kernels):
                branch = self.resblocks[base + offset](x)
                if summed is None:
                    summed = branch
                else:
                    summed += branch
            # Average the parallel resblock outputs of this stage.
            x = summed / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Remove weight normalisation from the upsamplers and resblocks."""
        for upsampler in self.ups:
            remove_weight_norm(upsampler)
        for block in self.resblocks:
            block.remove_weight_norm()
|
279 |
+
|
280 |
+
|
281 |
+
class SineGen(torch.nn.Module):
    """Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)
    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine-waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: accepted for interface compatibility; it is neither
        stored nor read anywhere in this class (default False)
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        # One channel for the fundamental plus one per overtone.
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        """Return a mask shaped like ``f0``: 1 where f0 > voiced_threshold, else 0."""
        # generate uv signal
        uv = torch.ones_like(f0)
        uv = uv * (f0 > self.voiced_threshold)
        return uv

    def forward(self, f0, upp):
        """sine_waves, uv, noise = forward(f0, upp)

        f0: F0 track; f0 for unvoiced steps should be 0. The indexing below
            turns a 2-D (batch, length) input into (batch, length, 1) --
            NOTE(review): confirm callers pass a 2-D tensor.
        upp: upsampling factor passed to ``F.interpolate`` as
            ``scale_factor`` along the length axis.
        Returns (sine_waves, uv, noise): the sine signal with noise added,
        the upsampled voiced/unvoiced mask, and the noise term alone.
        """
        with torch.no_grad():
            f0 = f0[:, None].transpose(1, 2)
            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
            # fundamental component
            f0_buf[:, :, 0] = f0[:, :, 0]
            for idx in np.arange(self.harmonic_num):
                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
                    idx + 2
                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
            # Taking % 1 here means the n_har products cannot be optimised in post-processing.
            rad_values = (f0_buf / self.sampling_rate) % 1
            # Random initial phase per channel; the fundamental (column 0) gets none.
            rand_ini = torch.rand(
                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
            )
            rand_ini[:, 0] = 0
            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
            # A "% 1" here would mean the cumsum below can no longer be optimised.
            tmp_over_one = torch.cumsum(rad_values, 1)
            tmp_over_one *= upp
            tmp_over_one = F.interpolate(
                tmp_over_one.transpose(2, 1),
                scale_factor=upp,
                mode="linear",
                align_corners=True,
            ).transpose(2, 1)
            rad_values = F.interpolate(
                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(
                2, 1
            )
            tmp_over_one %= 1
            # True where the wrapped cumulative value decreases between steps.
            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            # Subtract one full cycle at each of those steps.
            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
            sine_waves = torch.sin(
                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
            )
            sine_waves = sine_waves * self.sine_amp
            uv = self._f02uv(f0)
            uv = F.interpolate(
                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)
            # Noise amplitude is noise_std where uv is 1 and sine_amp / 3 where uv is 0.
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            # Where uv is 0 the sine is zeroed, leaving only the noise.
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
|
371 |
+
|
372 |
+
|
373 |
+
class SourceModuleHnNSF(torch.nn.Module):
    """SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling_rate in Hz
    harmonic_num: number of harmonic above F0 (default: 0)
    sine_amp: amplitude of sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003)
        note that amplitude of noise in unvoiced is decided
        by sine_amp
    voiced_threshod: threshold to set U/V given F0 (default: 0)
    is_half: stored for compatibility; the dtype fed to the merge layer now
        follows that layer's weights (default: True)

    forward() returns ``(sine_merge, None, None)``: only the merged sine
    source is produced, the noise and uv slots are always ``None``.
    """

    def __init__(
        self,
        sampling_rate,
        harmonic_num=0,
        sine_amp=0.1,
        add_noise_std=0.003,
        voiced_threshod=0,
        is_half=True,
    ):
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        self.is_half = is_half
        # to produce sine waveforms
        self.l_sin_gen = SineGen(
            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
        )

        # to merge source harmonics into a single excitation
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x, upp=None):
        """Generate the merged sine excitation for F0 track ``x``.

        Args:
          x: F0 input forwarded to the sine generator.
          upp: upsampling factor forwarded to the sine generator.
        Returns:
          ``(sine_merge, None, None)``.
        """
        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
        # Match the merge layer's parameter dtype. An unconditional `.half()`
        # fails when the module was not converted to fp16, and fp16 weights
        # with is_half=False fail the other way round.
        sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype)
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
        return sine_merge, None, None  # noise, uv
|
420 |
+
|
421 |
+
|
422 |
+
class GeneratorNSF(torch.nn.Module):
    """Upsampling decoder driven by an F0-derived harmonic source.

    Same layout as an upsample-plus-resblock generator, except that after
    every upsampling stage the harmonic source from ``SourceModuleHnNSF``
    is passed through a stage-specific convolution and added to the
    features.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels,
        sr,
        is_half=False,
    ):
        super().__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)

        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(
            sampling_rate=sr, harmonic_num=0, is_half=is_half
        )
        self.noise_convs = nn.ModuleList()
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        resblock_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        n_stages = len(upsample_rates)
        for level, (rate, kernel) in enumerate(
            zip(upsample_rates, upsample_kernel_sizes)
        ):
            c_cur = upsample_initial_channel // (2 ** (level + 1))
            transposed = ConvTranspose1d(
                upsample_initial_channel // (2**level),
                c_cur,
                kernel,
                rate,
                padding=(kernel - rate) // 2,
            )
            self.ups.append(weight_norm(transposed))
            if level + 1 < n_stages:
                # Stride the source by the product of the remaining rates.
                stride_f0 = np.prod(upsample_rates[level + 1 :])
                source_conv = Conv1d(
                    1,
                    c_cur,
                    kernel_size=stride_f0 * 2,
                    stride=stride_f0,
                    padding=stride_f0 // 2,
                )
            else:
                source_conv = Conv1d(1, c_cur, kernel_size=1)
            self.noise_convs.append(source_conv)

        self.resblocks = nn.ModuleList()
        for level in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (level + 1))
            for kernel, dilation in zip(
                resblock_kernel_sizes, resblock_dilation_sizes
            ):
                self.resblocks.append(resblock_cls(ch, kernel, dilation))

        # `ch` still holds the channel count of the last upsampling stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

        self.upp = np.prod(upsample_rates)

    def forward(self, x, f0, g=None):
        """Decode ``x`` using the harmonic source built from ``f0``.

        ``g`` is optional conditioning that is projected by ``self.cond``
        and added after the first convolution.
        """
        har_source, noi_source, uv = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for stage in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[stage](x)
            # Inject the harmonic source after this stage's upsampling.
            x = x + self.noise_convs[stage](har_source)
            base = stage * self.num_kernels
            summed = None
            for offset in range(self.num_kernels):
                branch = self.resblocks[base + offset](x)
                if summed is None:
                    summed = branch
                else:
                    summed += branch
            # Average the parallel resblock outputs of this stage.
            x = summed / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Remove weight normalisation from the upsamplers and resblocks."""
        for upsampler in self.ups:
            remove_weight_norm(upsampler)
        for block in self.resblocks:
            block.remove_weight_norm()
|
523 |
+
|
524 |
+
|
525 |
+
# Maps sample-rate labels ("32k", "40k", "48k") to integer values; the
# synthesizer constructor looks up `sr` here when it is given as a string.
sr2sr = {
    "32k": 32000,
    "40k": 40000,
    "48k": 48000,
}
|
530 |
+
|
531 |
+
|
532 |
+
class SynthesizerTrnMs256NSFsid(nn.Module):
    """Speaker-conditioned synthesizer with a pitch-driven ``GeneratorNSF`` decoder.

    Sub-modules built in ``__init__``:
        enc_p: ``TextEncoder256`` giving prior statistics from phone and pitch.
        dec: ``GeneratorNSF`` decoder, called with a latent and an F0 track.
        enc_q: ``PosteriorEncoder`` over the target input ``y`` (training).
        flow: ``ResidualCouplingBlock`` applied to the latent.
        emb_g: speaker-id embedding that yields the conditioning ``g``.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        **kwargs
    ):
        """Store the hyper-parameters and build the sub-modules.

        Args:
            sr: Sample rate, either a number or a string key of ``sr2sr``
                (for example ``"40k"``); strings are looked up in ``sr2sr``.
            **kwargs: Must contain ``is_half``; it is forwarded to
                ``GeneratorNSF``.

        All other arguments are stored on the instance and forwarded to the
        sub-modules.

        Raises:
            KeyError: If ``sr`` is a string missing from ``sr2sr`` or
                ``is_half`` is not supplied.
        """
        super().__init__()
        # isinstance also accepts str subclasses, unlike a type() comparison.
        if isinstance(sr, str):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder256(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        """Remove weight normalization from the decoder, flow and posterior encoder."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(
        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
    ):  # original note: ds is the speaker id, [bs, 1]
        """Training pass: encode, slice a random segment and decode it.

        Returns:
            ``(o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q))``
            where ``o`` is the decoder output for the sliced segment.
        """
        # original note: [b, 256, 1]; the trailing 1 is the time axis and broadcasts
        g = self.emb_g(ds).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        # Cut the F0 track at the same offsets as the latent slice.
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
        """Inference pass: sample from the prior, invert the flow and decode.

        Args:
            rate: Optional fraction of the sequence to keep, counted from the
                end; a falsy value keeps everything.

        Returns:
            ``(o, x_mask, (z, z_p, m_p, logs_p))``.
        """
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        if rate:
            # NOTE(review): if head evaluates to 0, the slice ``-0:`` keeps the
            # whole sequence rather than nothing.
            head = int(z_p.shape[2] * rate)
            z_p = z_p[:, :, -head:]
            x_mask = x_mask[:, :, -head:]
            nsff0 = nsff0[:, -head:]
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec(z * x_mask, nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
|
646 |
+
|
647 |
+
|
648 |
+
class SynthesizerTrnMs768NSFsid(nn.Module):
    """Speaker-conditioned synthesizer with a pitch-driven ``GeneratorNSF`` decoder.

    Same layout as the 256 variant but the prior encoder is ``TextEncoder768``.

    Sub-modules built in ``__init__``:
        enc_p: ``TextEncoder768`` giving prior statistics from phone and pitch.
        dec: ``GeneratorNSF`` decoder, called with a latent and an F0 track.
        enc_q: ``PosteriorEncoder`` over the target input ``y`` (training).
        flow: ``ResidualCouplingBlock`` applied to the latent.
        emb_g: speaker-id embedding that yields the conditioning ``g``.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        **kwargs
    ):
        """Store the hyper-parameters and build the sub-modules.

        Args:
            sr: Sample rate, either a number or a string key of ``sr2sr``
                (for example ``"40k"``); strings are looked up in ``sr2sr``.
            **kwargs: Must contain ``is_half``; it is forwarded to
                ``GeneratorNSF``.

        All other arguments are stored on the instance and forwarded to the
        sub-modules.

        Raises:
            KeyError: If ``sr`` is a string missing from ``sr2sr`` or
                ``is_half`` is not supplied.
        """
        super().__init__()
        # isinstance also accepts str subclasses, unlike a type() comparison.
        if isinstance(sr, str):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder768(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        """Remove weight normalization from the decoder, flow and posterior encoder."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(
        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
    ):  # original note: ds is the speaker id, [bs, 1]
        """Training pass: encode, slice a random segment and decode it.

        Returns:
            ``(o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q))``
            where ``o`` is the decoder output for the sliced segment.
        """
        # original note: [b, 256, 1]; the trailing 1 is the time axis and broadcasts
        g = self.emb_g(ds).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        # Cut the F0 track at the same offsets as the latent slice.
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
        """Inference pass: sample from the prior, invert the flow and decode.

        Args:
            rate: Optional fraction of the sequence to keep, counted from the
                end; a falsy value keeps everything.

        Returns:
            ``(o, x_mask, (z, z_p, m_p, logs_p))``.
        """
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        if rate:
            # NOTE(review): if head evaluates to 0, the slice ``-0:`` keeps the
            # whole sequence rather than nothing.
            head = int(z_p.shape[2] * rate)
            z_p = z_p[:, :, -head:]
            x_mask = x_mask[:, :, -head:]
            nsff0 = nsff0[:, -head:]
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec(z * x_mask, nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
|
762 |
+
|
763 |
+
|
764 |
+
class SynthesizerTrnMs256NSFsid_nono(nn.Module):
    """Speaker-conditioned synthesizer without pitch input.

    The prior encoder is ``TextEncoder256`` built with ``f0=False`` and the
    decoder is the plain ``Generator`` (no F0 argument).
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr=None,
        **kwargs
    ):
        """Record the hyper-parameters and build the sub-modules.

        ``sr`` and ``**kwargs`` are accepted but not used by this class.
        """
        super().__init__()

        # Spectrogram / latent sizing.
        self.spec_channels = spec_channels
        self.segment_size = segment_size
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels

        # Prior-encoder settings.
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout

        # Decoder settings.
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes

        # Speaker conditioning.
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim

        # Sub-modules are created in this fixed order: enc_p, dec, enc_q, flow, emb_g.
        self.enc_p = TextEncoder256(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            f0=False,
        )
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        """Remove weight normalization from ``dec``, ``flow`` and ``enc_q``."""
        for part in (self.dec, self.flow, self.enc_q):
            part.remove_weight_norm()

    def forward(self, phone, phone_lengths, y, y_lengths, ds):
        """Training pass; ``ds`` is the speaker id (original note: [bs, 1]).

        Returns:
            ``(o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q))``.
        """
        # original note: [b, 256, 1]; the trailing 1 is the time axis and broadcasts
        speaker = self.emb_g(ds).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=speaker)
        z_p = self.flow(z, y_mask, g=speaker)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        audio = self.dec(z_slice, g=speaker)
        latents = (z, z_p, m_p, logs_p, m_q, logs_q)
        return audio, ids_slice, x_mask, y_mask, latents

    def infer(self, phone, phone_lengths, sid, rate=None):
        """Inference pass: sample the prior, invert the flow and decode.

        A truthy ``rate`` keeps only that fraction of frames, counted from
        the end of the sequence.

        Returns:
            ``(o, x_mask, (z, z_p, m_p, logs_p))``.
        """
        speaker = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        spread = torch.exp(logs_p)
        noise = torch.randn_like(m_p)
        z_p = (m_p + spread * noise * 0.66666) * x_mask
        if rate:
            keep = int(z_p.shape[2] * rate)
            z_p = z_p[:, :, -keep:]
            x_mask = x_mask[:, :, -keep:]
        z = self.flow(z_p, x_mask, g=speaker, reverse=True)
        audio = self.dec(z * x_mask, g=speaker)
        return audio, x_mask, (z, z_p, m_p, logs_p)
|
868 |
+
|
869 |
+
|
870 |
+
class SynthesizerTrnMs768NSFsid_nono(nn.Module):
    """Speaker-conditioned synthesizer without pitch input.

    The prior encoder is ``TextEncoder768`` built with ``f0=False`` and the
    decoder is the plain ``Generator`` (no F0 argument).
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr=None,
        **kwargs
    ):
        """Record the hyper-parameters and build the sub-modules.

        ``sr`` and ``**kwargs`` are accepted but not used by this class.
        """
        super().__init__()

        # Spectrogram / latent sizing.
        self.spec_channels = spec_channels
        self.segment_size = segment_size
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels

        # Prior-encoder settings.
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout

        # Decoder settings.
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes

        # Speaker conditioning.
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim

        # Sub-modules are created in this fixed order: enc_p, dec, enc_q, flow, emb_g.
        self.enc_p = TextEncoder768(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            f0=False,
        )
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        """Remove weight normalization from ``dec``, ``flow`` and ``enc_q``."""
        for part in (self.dec, self.flow, self.enc_q):
            part.remove_weight_norm()

    def forward(self, phone, phone_lengths, y, y_lengths, ds):
        """Training pass; ``ds`` is the speaker id (original note: [bs, 1]).

        Returns:
            ``(o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q))``.
        """
        # original note: [b, 256, 1]; the trailing 1 is the time axis and broadcasts
        speaker = self.emb_g(ds).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=speaker)
        z_p = self.flow(z, y_mask, g=speaker)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        audio = self.dec(z_slice, g=speaker)
        latents = (z, z_p, m_p, logs_p, m_q, logs_q)
        return audio, ids_slice, x_mask, y_mask, latents

    def infer(self, phone, phone_lengths, sid, rate=None):
        """Inference pass: sample the prior, invert the flow and decode.

        A truthy ``rate`` keeps only that fraction of frames, counted from
        the end of the sequence.

        Returns:
            ``(o, x_mask, (z, z_p, m_p, logs_p))``.
        """
        speaker = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        spread = torch.exp(logs_p)
        noise = torch.randn_like(m_p)
        z_p = (m_p + spread * noise * 0.66666) * x_mask
        if rate:
            keep = int(z_p.shape[2] * rate)
            z_p = z_p[:, :, -keep:]
            x_mask = x_mask[:, :, -keep:]
        z = self.flow(z_p, x_mask, g=speaker, reverse=True)
        audio = self.dec(z * x_mask, g=speaker)
        return audio, x_mask, (z, z_p, m_p, logs_p)
|
974 |
+
|
975 |
+
|
976 |
+
class MultiPeriodDiscriminator(torch.nn.Module):
    """Bundle of one ``DiscriminatorS`` plus one ``DiscriminatorP`` per period."""

    def __init__(self, use_spectral_norm=False):
        """Create the sub-discriminators for periods 2, 3, 5, 7, 11 and 17."""
        super().__init__()
        periods = (2, 3, 5, 7, 11, 17)

        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        members.extend(
            DiscriminatorP(period, use_spectral_norm=use_spectral_norm)
            for period in periods
        )
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Run every sub-discriminator on ``y`` and then on ``y_hat``.

        Returns:
            Four lists, one entry per sub-discriminator: outputs for ``y``,
            outputs for ``y_hat``, feature maps for ``y`` and feature maps
            for ``y_hat``.
        """
        real_outs, fake_outs = [], []
        real_fmaps, fake_fmaps = [], []
        for disc in self.discriminators:
            real_out, real_fmap = disc(y)
            fake_out, fake_fmap = disc(y_hat)
            real_outs.append(real_out)
            fake_outs.append(fake_out)
            real_fmaps.append(real_fmap)
            fake_fmaps.append(fake_fmap)
        return real_outs, fake_outs, real_fmaps, fake_fmaps
|
1004 |
+
|
1005 |
+
|
1006 |
+
class MultiPeriodDiscriminatorV2(torch.nn.Module):
    """Bundle of one ``DiscriminatorS`` plus one ``DiscriminatorP`` per period.

    Uses the longer period list 2, 3, 5, 7, 11, 17, 23, 37.
    """

    def __init__(self, use_spectral_norm=False):
        """Create the sub-discriminators for the eight periods."""
        super().__init__()
        periods = (2, 3, 5, 7, 11, 17, 23, 37)

        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        members.extend(
            DiscriminatorP(period, use_spectral_norm=use_spectral_norm)
            for period in periods
        )
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Run every sub-discriminator on ``y`` and then on ``y_hat``.

        Returns:
            Four lists, one entry per sub-discriminator: outputs for ``y``,
            outputs for ``y_hat``, feature maps for ``y`` and feature maps
            for ``y_hat``.
        """
        real_outs, fake_outs = [], []
        real_fmaps, fake_fmaps = [], []
        for disc in self.discriminators:
            real_out, real_fmap = disc(y)
            fake_out, fake_fmap = disc(y_hat)
            real_outs.append(real_out)
            fake_outs.append(fake_out)
            real_fmaps.append(real_fmap)
            fake_fmaps.append(fake_fmap)
        return real_outs, fake_outs, real_fmaps, fake_fmaps
|
1034 |
+
|
1035 |
+
|
1036 |
+
class DiscriminatorS(torch.nn.Module):
    """Stack of normalized 1-D convolutions applied directly to the input signal."""

    def __init__(self, use_spectral_norm=False):
        """Build six normalized ``Conv1d`` layers and a final one-channel conv.

        Args:
            use_spectral_norm: When truthy every convolution is wrapped in
                ``spectral_norm``; otherwise ``weight_norm`` is used.
        """
        super().__init__()
        # Truthiness test instead of comparing against the False literal.
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        # (in_channels, out_channels, kernel_size, stride, groups, padding)
        conv_specs = [
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList(
            [
                norm_f(Conv1d(c_in, c_out, k, s, groups=g, padding=p))
                for c_in, c_out, k, s, g, p in conv_specs
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return the flattened score and the list of intermediate feature maps.

        Each conv output is passed through a leaky ReLU and recorded; the
        ``conv_post`` output is recorded as well and then flattened from
        dimension 1 onwards.
        """
        fmap = []

        for conv in self.convs:
            x = conv(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
|
1064 |
+
|
1065 |
+
|
1066 |
+
class DiscriminatorP(torch.nn.Module):
    """Discriminator that folds the signal by a fixed period and applies 2-D convs."""

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        """Build five normalized ``Conv2d`` layers and a final one-channel conv.

        Args:
            period: Width of the last axis after the input is folded in
                ``forward``.
            kernel_size: Kernel height of the five stacked convolutions.
            stride: Stride along the first spatial axis for the first four
                convolutions; the fifth uses stride 1.
            use_spectral_norm: When truthy every convolution is wrapped in
                ``spectral_norm``; otherwise ``weight_norm`` is used.
        """
        super().__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        # Truthiness test instead of comparing against the False literal.
        norm_f = spectral_norm if use_spectral_norm else weight_norm

        channels = [1, 32, 128, 512, 1024, 1024]
        last = len(channels) - 2
        layers = []
        for idx, (c_in, c_out) in enumerate(zip(channels, channels[1:])):
            # Only the final layer of the stack keeps the resolution (stride 1).
            conv_stride = 1 if idx == last else (stride, 1)
            layers.append(
                norm_f(
                    Conv2d(
                        c_in,
                        c_out,
                        (kernel_size, 1),
                        conv_stride,
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                )
            )
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return the flattened score and the list of intermediate feature maps.

        ``x`` must be three-dimensional; it is unpacked as ``(b, c, t)``.
        """
        fmap = []

        # 1d to 2d: right-pad with reflection so the length is a multiple of
        # the period, then fold the time axis into (t // period, period).
        b, c, t = x.shape
        if t % self.period != 0:
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = conv(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
|
lib/infer_pack/models_onnx.py
ADDED
@@ -0,0 +1,819 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math, pdb, os
|
2 |
+
from time import time as ttime
|
3 |
+
import torch
|
4 |
+
from torch import nn
|
5 |
+
from torch.nn import functional as F
|
6 |
+
from lib.infer_pack import modules
|
7 |
+
from lib.infer_pack import attentions
|
8 |
+
from lib.infer_pack import commons
|
9 |
+
from lib.infer_pack.commons import init_weights, get_padding
|
10 |
+
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
11 |
+
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
12 |
+
from lib.infer_pack.commons import init_weights
|
13 |
+
import numpy as np
|
14 |
+
from lib.infer_pack import commons
|
15 |
+
|
16 |
+
|
17 |
+
class TextEncoder256(nn.Module):
    """Encode 256-dimensional phone features, optionally with pitch ids, into prior statistics."""

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        """Build the phone/pitch embeddings, the attention encoder and the projection.

        Args:
            out_channels: Channel count of each of the two returned statistics.
            hidden_channels: Width of the embeddings and of the encoder.
            filter_channels: Forwarded to ``attentions.Encoder``.
            n_heads: Forwarded to ``attentions.Encoder``.
            n_layers: Forwarded to ``attentions.Encoder``.
            kernel_size: Forwarded to ``attentions.Encoder``.
            p_dropout: Forwarded to ``attentions.Encoder``.
            f0: When truthy, create ``emb_pitch`` (256 entries) so that
                ``forward`` can accept pitch ids.
        """
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Return ``(m, logs, x_mask)`` for the given phone features.

        Args:
            phone: Phone features fed to ``emb_phone``.
            pitch: Pitch ids fed to ``emb_pitch``, or ``None`` to skip the
                pitch embedding.
            lengths: Sequence lengths used to build the mask.
        """
        # Identity check: ``==`` on a tensor is not a reliable None test.
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        # The projection emits 2 * out_channels; split it into the two statistics.
        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
|
62 |
+
|
63 |
+
|
64 |
+
class TextEncoder768(nn.Module):
    """Encode 768-dimensional phone features, optionally with pitch ids, into prior statistics."""

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        """Build the phone/pitch embeddings, the attention encoder and the projection.

        Args:
            out_channels: Channel count of each of the two returned statistics.
            hidden_channels: Width of the embeddings and of the encoder.
            filter_channels: Forwarded to ``attentions.Encoder``.
            n_heads: Forwarded to ``attentions.Encoder``.
            n_layers: Forwarded to ``attentions.Encoder``.
            kernel_size: Forwarded to ``attentions.Encoder``.
            p_dropout: Forwarded to ``attentions.Encoder``.
            f0: When truthy, create ``emb_pitch`` (256 entries) so that
                ``forward`` can accept pitch ids.
        """
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(768, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Return ``(m, logs, x_mask)`` for the given phone features.

        Args:
            phone: Phone features fed to ``emb_phone``.
            pitch: Pitch ids fed to ``emb_pitch``, or ``None`` to skip the
                pitch embedding.
            lengths: Sequence lengths used to build the mask.
        """
        # Identity check: ``==`` on a tensor is not a reliable None test.
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        # The projection emits 2 * out_channels; split it into the two statistics.
        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
|
109 |
+
|
110 |
+
|
111 |
+
class ResidualCouplingBlock(nn.Module):
    """Chain of ``n_flows`` coupling layers, each followed by a ``Flip``."""

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,
    ):
        """Store the settings and build the alternating coupling/flip stack."""
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = modules.ResidualCouplingLayer(
                channels,
                hidden_channels,
                kernel_size,
                dilation_rate,
                n_layers,
                gin_channels=gin_channels,
                mean_only=True,
            )
            self.flows.append(coupling)
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply the flows in order, or in reversed order when ``reverse`` is truthy.

        In the forward direction each flow returns a pair whose second item
        is dropped; in the reverse direction each flow returns the value
        directly.
        """
        if reverse:
            for layer in reversed(self.flows):
                x = layer(x, x_mask, g=g, reverse=reverse)
            return x

        for layer in self.flows:
            x, _ = layer(x, x_mask, g=g, reverse=reverse)
        return x

    def remove_weight_norm(self):
        """Remove weight normalization from the coupling layers (even indices)."""
        for position in range(0, 2 * self.n_flows, 2):
            self.flows[position].remove_weight_norm()
|
158 |
+
|
159 |
+
|
160 |
+
class PosteriorEncoder(nn.Module):
    """Encode an input sequence into a sampled latent plus its mean and log-scale."""

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        """Store the settings and build the pre-conv, ``modules.WN`` stack and projection."""
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Return ``(z, m, logs, x_mask)`` for input ``x``.

        ``z`` is ``m`` plus Gaussian noise scaled by ``exp(logs)``, masked by
        ``x_mask``.
        """
        mask = commons.sequence_mask(x_lengths, x.size(2))
        x_mask = torch.unsqueeze(mask, 1).to(x.dtype)

        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask

        # The projection emits 2 * out_channels; split it into mean and log-scale.
        m, logs = torch.split(stats, self.out_channels, dim=1)
        noise = torch.randn_like(m)
        z = (m + noise * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

    def remove_weight_norm(self):
        """Remove weight normalization from the ``enc`` stack."""
        self.enc.remove_weight_norm()
|
203 |
+
|
204 |
+
|
205 |
+
class Generator(torch.nn.Module):
    """Upsampling decoder built from transposed convolutions and residual blocks.

    Each upsampling stage halves the channel count and is followed by
    ``len(resblock_kernel_sizes)`` residual blocks whose outputs are averaged.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
    ):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        # "1" selects ResBlock1; any other value selects ResBlock2.
        block_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        for stage, (rate, kernel) in enumerate(
            zip(upsample_rates, upsample_kernel_sizes)
        ):
            in_ch = upsample_initial_channel // (2**stage)
            out_ch = upsample_initial_channel // (2 ** (stage + 1))
            up = ConvTranspose1d(
                in_ch, out_ch, kernel, rate, padding=(kernel - rate) // 2
            )
            self.ups.append(weight_norm(up))

        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (stage + 1))
            for kernel, dilation in zip(
                resblock_kernel_sizes, resblock_dilation_sizes
            ):
                self.resblocks.append(block_cls(ch, kernel, dilation))

        # ``ch`` is the channel count left over from the last stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        # The conditioning projection only exists when gin_channels is non-zero.
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """Decode ``x``; ``g`` (if given) is projected by ``self.cond`` and added."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for stage in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[stage](x)

            base = stage * self.num_kernels
            acc = None
            for offset in range(self.num_kernels):
                branch = self.resblocks[base + offset](x)
                if acc is None:
                    acc = branch
                else:
                    acc += branch
            x = acc / self.num_kernels

        # Final activation uses leaky_relu's default slope, not LRELU_SLOPE.
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight normalisation from the upsamplers and residual blocks."""
        for up in self.ups:
            remove_weight_norm(up)
        for block in self.resblocks:
            block.remove_weight_norm()
|
279 |
+
|
280 |
+
|
281 |
+
class SineGen(torch.nn.Module):
    """Definition of sine generator

    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)

    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of the sine waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: accepted by the constructor but not read by this
        implementation
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        # One channel for the fundamental plus one per overtone.
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        """Return a mask shaped like ``f0``: 1 where f0 > threshold, else 0."""
        return torch.ones_like(f0) * (f0 > self.voiced_threshold)

    def forward(self, f0, upp):
        """sine_waves, uv, noise = forward(f0, upp)

        f0 is reshaped with ``f0[:, None].transpose(1, 2)``, so a
        (batch, length) input becomes (batch, length, 1); unvoiced steps
        should carry f0 == 0. ``upp`` is the upsampling factor applied along
        the length axis. Everything is computed under ``torch.no_grad()``.

        Returns the sine tensor (batch, length * upp, dim), the voiced mask
        (batch, length * upp, 1) and the noise that was added to the sines.
        """
        with torch.no_grad():
            f0 = f0[:, None].transpose(1, 2)
            n_batch, n_frames = f0.shape[0], f0.shape[1]
            f0_buf = torch.zeros(n_batch, n_frames, self.dim, device=f0.device)

            # Channel 0 is the fundamental; channel k holds its (k+1)-th harmonic.
            f0_buf[:, :, 0] = f0[:, :, 0]
            for overtone in range(1, self.harmonic_num + 1):
                f0_buf[:, :, overtone] = f0_buf[:, :, 0] * (overtone + 1)

            # Per-frame phase increment in cycles. The "% 1" here means the
            # harmonic products cannot be optimised in post-processing.
            rad_values = (f0_buf / self.sampling_rate) % 1

            # Random initial phase for every channel except the fundamental.
            rand_ini = torch.rand(
                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
            )
            rand_ini[:, 0] = 0
            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini

            # No "% 1" on this cumsum: applying it here would prevent the
            # later cumsum from being optimised.
            phase_acc = torch.cumsum(rad_values, 1)
            phase_acc *= upp
            phase_acc = F.interpolate(
                phase_acc.transpose(2, 1),
                scale_factor=upp,
                mode="linear",
                align_corners=True,
            ).transpose(2, 1)
            rad_values = F.interpolate(
                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)

            # Where the wrapped phase decreases, a full cycle was crossed;
            # subtract one cycle there before the final cumulative sum.
            phase_acc %= 1
            wrapped = (phase_acc[:, 1:, :] - phase_acc[:, :-1, :]) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = wrapped * -1.0

            total_phase = torch.cumsum(rad_values + cumsum_shift, dim=1)
            sine_waves = torch.sin(total_phase * 2 * np.pi)
            sine_waves = sine_waves * self.sine_amp

            uv = self._f02uv(f0)
            uv = F.interpolate(
                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)

            # Voiced steps get noise_std; unvoiced steps get sine_amp / 3.
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
|
371 |
+
|
372 |
+
|
373 |
+
class SourceModuleHnNSF(torch.nn.Module):
    """SourceModule for hn-nsf

    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0, is_half=True)

    sampling_rate: sampling rate in Hz
    harmonic_num: number of harmonics above F0 (default: 0)
    sine_amp: amplitude of the sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003);
        note that the noise amplitude in unvoiced regions is decided
        by sine_amp
    voiced_threshod: threshold to set U/V given F0 (default: 0)
    is_half: stored on the instance for callers that inspect it; the
        forward pass follows the dtype of ``l_linear`` instead

    sine_merge, None, None = SourceModuleHnNSF(f0, upp)
    """

    def __init__(
        self,
        sampling_rate,
        harmonic_num=0,
        sine_amp=0.1,
        add_noise_std=0.003,
        voiced_threshod=0,
        is_half=True,
    ):
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        self.is_half = is_half

        # to produce sine waveforms
        self.l_sin_gen = SineGen(
            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
        )

        # to merge source harmonics into a single excitation
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x, upp=None):
        """Return ``(sine_merge, None, None)`` for the F0 input ``x``.

        ``x`` and ``upp`` are forwarded unchanged to the sine generator. The
        generated harmonics are merged by ``l_linear`` and squashed by tanh.
        The two ``None`` placeholders keep the three-value return shape
        (noise and uv are not exposed).
        """
        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
        # Follow the precision the merge layer actually has. The previous
        # unconditional ``.half()`` driven by ``is_half`` raised a dtype
        # mismatch whenever the flag disagreed with the module's weights
        # (e.g. default ``is_half=True`` on a float32 model).
        sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype)
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
        return sine_merge, None, None  # noise, uv
|
420 |
+
|
421 |
+
|
422 |
+
class GeneratorNSF(torch.nn.Module):
    """Upsampling decoder that injects a harmonic source at every stage.

    Same layout as ``Generator`` (transposed-conv upsamplers followed by
    averaged residual blocks), plus a ``SourceModuleHnNSF`` excitation that is
    brought to each stage's channel count by ``noise_convs`` and added after
    the upsampler.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels,
        sr,
        is_half=False,
    ):
        super(GeneratorNSF, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)

        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(
            sampling_rate=sr, harmonic_num=0, is_half=is_half
        )
        self.noise_convs = nn.ModuleList()
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        # "1" selects ResBlock1; any other value selects ResBlock2.
        block_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        n_stages = len(upsample_rates)
        for stage, (rate, kernel) in enumerate(
            zip(upsample_rates, upsample_kernel_sizes)
        ):
            c_prev = upsample_initial_channel // (2**stage)
            c_cur = upsample_initial_channel // (2 ** (stage + 1))
            up = ConvTranspose1d(
                c_prev, c_cur, kernel, rate, padding=(kernel - rate) // 2
            )
            self.ups.append(weight_norm(up))

            if stage + 1 < n_stages:
                # Stride by the product of the remaining upsample rates.
                stride_f0 = np.prod(upsample_rates[stage + 1 :])
                source_conv = Conv1d(
                    1,
                    c_cur,
                    kernel_size=stride_f0 * 2,
                    stride=stride_f0,
                    padding=stride_f0 // 2,
                )
            else:
                # Last stage: only a 1x1 channel projection is applied.
                source_conv = Conv1d(1, c_cur, kernel_size=1)
            self.noise_convs.append(source_conv)

        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (stage + 1))
            for kernel, dilation in zip(
                resblock_kernel_sizes, resblock_dilation_sizes
            ):
                self.resblocks.append(block_cls(ch, kernel, dilation))

        # ``ch`` is the channel count left over from the last stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        # The conditioning projection only exists when gin_channels is non-zero.
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

        # Total upsampling factor, handed to the source module in forward().
        self.upp = np.prod(upsample_rates)

    def forward(self, x, f0, g=None):
        """Decode ``x`` using pitch ``f0``; ``g`` (if given) is added via ``self.cond``."""
        har_source, noi_source, uv = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)

        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for stage in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[stage](x)
            x = x + self.noise_convs[stage](har_source)

            base = stage * self.num_kernels
            acc = None
            for offset in range(self.num_kernels):
                branch = self.resblocks[base + offset](x)
                if acc is None:
                    acc = branch
                else:
                    acc += branch
            x = acc / self.num_kernels

        # Final activation uses leaky_relu's default slope, not LRELU_SLOPE.
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight normalisation from the upsamplers and residual blocks."""
        for up in self.ups:
            remove_weight_norm(up)
        for block in self.resblocks:
            block.remove_weight_norm()
|
523 |
+
|
524 |
+
|
525 |
+
# Maps a sample-rate label ("32k", "40k", "48k") to its value in Hz.
sr2sr = {f"{khz}k": khz * 1000 for khz in (32, 40, 48)}
|
530 |
+
|
531 |
+
|
532 |
+
class SynthesizerTrnMsNSFsidM(nn.Module):
    """Multi-speaker NSF synthesizer assembled from the encoders, flow and decoder.

    ``version == "v1"`` selects ``TextEncoder256``; any other value selects
    ``TextEncoder768``. ``sr`` may be an integer sample rate or one of the
    string labels in ``sr2sr``. The optional keyword ``is_half`` is forwarded
    to ``GeneratorNSF`` and defaults to ``False`` when omitted.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        version,
        **kwargs
    ):
        super().__init__()
        # Accept a label such as "40k"; an unknown label raises KeyError.
        if isinstance(sr, str):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim

        encoder_cls = TextEncoder256 if version == "v1" else TextEncoder768
        self.enc_p = encoder_cls(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            # Previously a hard kwargs["is_half"] lookup: omitting the key
            # failed with a bare KeyError. Fall back to GeneratorNSF's default.
            is_half=kwargs.get("is_half", False),
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        # Populated by construct_spkmixmap(); None means "look up a single speaker id".
        self.speaker_map = None
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        """Strip weight normalisation from the decoder, flow and posterior encoder."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def construct_spkmixmap(self, n_speaker):
        """Cache the embeddings of speakers ``0 .. n_speaker - 1`` in ``self.speaker_map``.

        After this call ``forward`` treats ``g`` as per-speaker mixing weights
        instead of a speaker id.
        """
        self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels))
        for i in range(n_speaker):
            self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]))
        self.speaker_map = self.speaker_map.unsqueeze(0)

    def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):
        """Synthesize audio; ``rnd`` is the externally supplied noise for the prior sample."""
        if self.speaker_map is not None:  # [N, S]  *  [S, B, 1, H]
            g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1))  # [N, S, B, 1, 1]
            g = g * self.speaker_map  # [N, S, B, 1, H]
            g = torch.sum(g, dim=1)  # [N, 1, B, 1, H]
            g = g.transpose(0, -1).transpose(0, -2).squeeze(0)  # [B, H, N]
        else:
            g = g.unsqueeze(0)
            g = self.emb_g(g).transpose(1, 2)

        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
        return o
|
651 |
+
|
652 |
+
|
653 |
+
class MultiPeriodDiscriminator(torch.nn.Module):
    """One ``DiscriminatorS`` followed by a ``DiscriminatorP`` for each of six periods."""

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        periods = [2, 3, 5, 7, 11, 17]

        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        members.extend(
            DiscriminatorP(period, use_spectral_norm=use_spectral_norm)
            for period in periods
        )
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Score ``y`` and ``y_hat`` with every sub-discriminator.

        Returns four lists, one entry per sub-discriminator: outputs for
        ``y``, outputs for ``y_hat``, feature maps for ``y`` and feature maps
        for ``y_hat``.
        """
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for disc in self.discriminators:
            out_r, feats_r = disc(y)
            out_g, feats_g = disc(y_hat)
            y_d_rs.append(out_r)
            y_d_gs.append(out_g)
            fmap_rs.append(feats_r)
            fmap_gs.append(feats_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
681 |
+
|
682 |
+
|
683 |
+
class MultiPeriodDiscriminatorV2(torch.nn.Module):
    """One ``DiscriminatorS`` followed by a ``DiscriminatorP`` for each of eight periods."""

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminatorV2, self).__init__()
        periods = [2, 3, 5, 7, 11, 17, 23, 37]

        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        members.extend(
            DiscriminatorP(period, use_spectral_norm=use_spectral_norm)
            for period in periods
        )
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Score ``y`` and ``y_hat`` with every sub-discriminator.

        Returns four lists, one entry per sub-discriminator: outputs for
        ``y``, outputs for ``y_hat``, feature maps for ``y`` and feature maps
        for ``y_hat``.
        """
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for disc in self.discriminators:
            out_r, feats_r = disc(y)
            out_g, feats_g = disc(y_hat)
            y_d_rs.append(out_r)
            y_d_gs.append(out_g)
            fmap_rs.append(feats_r)
            fmap_gs.append(feats_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
711 |
+
|
712 |
+
|
713 |
+
class DiscriminatorS(torch.nn.Module):
    """1-D convolutional discriminator made of six normalised convs plus a 1-channel head."""

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        # Equality with False (not truthiness) is kept on purpose so the
        # selection matches the original for every argument value.
        if use_spectral_norm == False:
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        # (in_channels, out_channels, kernel, stride, groups, padding)
        layer_specs = (
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        )
        self.convs = nn.ModuleList(
            [
                norm_f(Conv1d(c_in, c_out, kernel, stride, groups=groups, padding=pad))
                for c_in, c_out, kernel, stride, groups, pad in layer_specs
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened_output, fmap)``.

        ``fmap`` holds the activation after every conv in ``self.convs`` and,
        last, the raw output of ``conv_post``.
        """
        fmap = []

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
|
741 |
+
|
742 |
+
|
743 |
+
class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the time axis into ``(t // period, period)`` and applies 2-D convs."""

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        # Equality with False (not truthiness) is kept on purpose so the
        # selection matches the original for every argument value.
        if use_spectral_norm == False:
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        # The first four convs stride along the folded time axis only.
        strided_channels = ((1, 32), (32, 128), (128, 512), (512, 1024))
        layers = [
            norm_f(
                Conv2d(
                    c_in,
                    c_out,
                    (kernel_size, 1),
                    (stride, 1),
                    padding=(get_padding(kernel_size, 1), 0),
                )
            )
            for c_in, c_out in strided_channels
        ]
        # The fifth conv keeps the channel count and uses stride 1.
        layers.append(
            norm_f(
                Conv2d(
                    1024,
                    1024,
                    (kernel_size, 1),
                    1,
                    padding=(get_padding(kernel_size, 1), 0),
                )
            )
        )
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened_output, fmap)`` for a 3-D input ``(b, c, t)``.

        ``fmap`` holds the activation after every conv in ``self.convs`` and,
        last, the raw output of ``conv_post``.
        """
        fmap = []

        # 1d to 2d: right-pad (reflect) until t is a multiple of the period.
        b, c, t = x.shape
        leftover = t % self.period
        if leftover != 0:
            n_pad = self.period - leftover
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
|
lib/infer_pack/modules.py
ADDED
@@ -0,0 +1,522 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import copy
|
2 |
+
import math
|
3 |
+
import numpy as np
|
4 |
+
import scipy
|
5 |
+
import torch
|
6 |
+
from torch import nn
|
7 |
+
from torch.nn import functional as F
|
8 |
+
|
9 |
+
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
10 |
+
from torch.nn.utils import weight_norm, remove_weight_norm
|
11 |
+
|
12 |
+
from lib.infer_pack import commons
|
13 |
+
from lib.infer_pack.commons import init_weights, get_padding
|
14 |
+
from lib.infer_pack.transforms import piecewise_rational_quadratic_transform
|
15 |
+
|
16 |
+
|
17 |
+
# Negative slope passed to F.leaky_relu by the residual blocks in this module.
LRELU_SLOPE = 0.1
|
18 |
+
|
19 |
+
|
20 |
+
class LayerNorm(nn.Module):
    """Layer normalisation over axis 1 with a learnable per-channel scale and shift."""

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        """Normalise ``x`` along axis 1 and return a tensor of the same layout."""
        # F.layer_norm works on the trailing axis, so swap axis 1 to the end
        # and swap it back afterwards.
        channels_last = torch.transpose(x, 1, -1)
        normed = F.layer_norm(
            channels_last, (self.channels,), self.gamma, self.beta, self.eps
        )
        return torch.transpose(normed, 1, -1)
|
33 |
+
|
34 |
+
|
35 |
+
class ConvReluNorm(nn.Module):
    """Stack of Conv1d -> LayerNorm -> ReLU -> Dropout layers with a residual 1x1 projection.

    The projection is zero-initialised, so a freshly built module returns
    ``x * x_mask``. The residual sum in ``forward`` adds the projection
    (``out_channels``) to the unmodified input (``in_channels``).
    """

    def __init__(
        self,
        in_channels,
        hidden_channels,
        out_channels,
        kernel_size,
        n_layers,
        p_dropout,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout
        # The message used to say "larger than 0" while the check enforces
        # at least two layers; the text now matches the enforced condition.
        assert n_layers > 1, "Number of layers should be larger than 1."

        self.conv_layers = nn.ModuleList()
        self.norm_layers = nn.ModuleList()
        # First layer maps in_channels -> hidden_channels.
        self.conv_layers.append(
            nn.Conv1d(
                in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
            )
        )
        self.norm_layers.append(LayerNorm(hidden_channels))
        # One shared ReLU + Dropout is reused after every norm layer.
        self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
        # Remaining layers keep hidden_channels.
        for _ in range(n_layers - 1):
            self.conv_layers.append(
                nn.Conv1d(
                    hidden_channels,
                    hidden_channels,
                    kernel_size,
                    padding=kernel_size // 2,
                )
            )
            self.norm_layers.append(LayerNorm(hidden_channels))
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
        # Zero init makes the residual branch contribute nothing at start.
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask):
        """Return ``(x + proj(stack(x))) * x_mask``; the mask is applied before every conv."""
        x_org = x
        for i in range(self.n_layers):
            x = self.conv_layers[i](x * x_mask)
            x = self.norm_layers[i](x)
            x = self.relu_drop(x)
        x = x_org + self.proj(x)
        return x * x_mask
|
85 |
+
|
86 |
+
|
87 |
+
class DDSConv(nn.Module):
    """
    Dilated and Depth-Separable Convolution
    """

    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
        super().__init__()
        self.channels = channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.drop = nn.Dropout(p_dropout)
        self.convs_sep = nn.ModuleList()
        self.convs_1x1 = nn.ModuleList()
        self.norms_1 = nn.ModuleList()
        self.norms_2 = nn.ModuleList()
        for layer_idx in range(n_layers):
            # Dilation grows as kernel_size ** layer index; the padding is
            # chosen so the output length equals the input length.
            dilation = kernel_size**layer_idx
            padding = (kernel_size * dilation - dilation) // 2
            depthwise = nn.Conv1d(
                channels,
                channels,
                kernel_size,
                groups=channels,
                dilation=dilation,
                padding=padding,
            )
            self.convs_sep.append(depthwise)
            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
            self.norms_1.append(LayerNorm(channels))
            self.norms_2.append(LayerNorm(channels))

    def forward(self, x, x_mask, g=None):
        """Apply the residual layers to ``x``; ``g`` (if given) is added to ``x`` first."""
        if g is not None:
            x = x + g
        for layer_idx in range(self.n_layers):
            branch = self.convs_sep[layer_idx](x * x_mask)
            branch = F.gelu(self.norms_1[layer_idx](branch))
            branch = self.convs_1x1[layer_idx](branch)
            branch = F.gelu(self.norms_2[layer_idx](branch))
            x = x + self.drop(branch)
        return x * x_mask
|
134 |
+
|
135 |
+
|
136 |
+
class WN(torch.nn.Module):
    """Stack of weight-normalised dilated convolutions with gated activations.

    Every layer feeds ``commons.fused_add_tanh_sigmoid_multiply`` and a 1x1
    "res/skip" convolution. All layers but the last emit a residual half and a
    skip half; the last layer emits the skip part only.
    """

    def __init__(
        self,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
        p_dropout=0,
    ):
        super(WN, self).__init__()
        assert kernel_size % 2 == 1
        self.hidden_channels = hidden_channels
        # NOTE: stored as a 1-tuple, exactly as the original code does.
        self.kernel_size = (kernel_size,)
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        # A single conv produces the conditioning slices for every layer.
        if gin_channels != 0:
            cond_layer = torch.nn.Conv1d(
                gin_channels, 2 * hidden_channels * n_layers, 1
            )
            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")

        final_idx = n_layers - 1
        for layer_idx in range(n_layers):
            dilation = dilation_rate**layer_idx
            padding = int((kernel_size * dilation - dilation) / 2)
            in_layer = torch.nn.Conv1d(
                hidden_channels,
                2 * hidden_channels,
                kernel_size,
                dilation=dilation,
                padding=padding,
            )
            self.in_layers.append(torch.nn.utils.weight_norm(in_layer, name="weight"))

            # The last layer needs no residual half.
            if layer_idx < final_idx:
                res_skip_channels = 2 * hidden_channels
            else:
                res_skip_channels = hidden_channels

            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            self.res_skip_layers.append(
                torch.nn.utils.weight_norm(res_skip_layer, name="weight")
            )

    def forward(self, x, x_mask, g=None, **kwargs):
        """Return the masked sum of all skip outputs; extra keyword arguments are ignored."""
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])

        if g is not None:
            g = self.cond_layer(g)

        slice_width = 2 * self.hidden_channels
        for layer_idx in range(self.n_layers):
            x_in = self.in_layers[layer_idx](x)
            if g is None:
                g_l = torch.zeros_like(x_in)
            else:
                # Each layer reads its own 2 * hidden_channels slice of g.
                cond_offset = layer_idx * slice_width
                g_l = g[:, cond_offset : cond_offset + slice_width, :]

            acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
            acts = self.drop(acts)

            res_skip_acts = self.res_skip_layers[layer_idx](acts)
            if layer_idx < self.n_layers - 1:
                # First half updates the running input, second half is the skip.
                x = (x + res_skip_acts[:, : self.hidden_channels, :]) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels :, :]
            else:
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        """Strip weight normalisation from the conditioning, input and res/skip convs."""
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for layer in self.in_layers:
            torch.nn.utils.remove_weight_norm(layer)
        for layer in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(layer)
|
222 |
+
|
223 |
+
|
224 |
+
class ResBlock1(torch.nn.Module):
    """Residual block of three (dilated conv, plain conv) pairs.

    The first conv of each pair uses ``dilation[0]``, ``dilation[1]`` and
    ``dilation[2]`` respectively; the second conv always uses dilation 1.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()
        # Exactly the first three dilation entries are read, by index.
        self.convs1 = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[pair],
                        padding=get_padding(kernel_size, dilation[pair]),
                    )
                )
                for pair in range(3)
            ]
        )
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=1,
                        padding=get_padding(kernel_size, 1),
                    )
                )
                for _ in range(3)
            ]
        )
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply the three residual pairs; when ``x_mask`` is given it is applied before each conv and to the result."""
        masked = x_mask is not None
        for dilated_conv, plain_conv in zip(self.convs1, self.convs2):
            branch = F.leaky_relu(x, LRELU_SLOPE)
            if masked:
                branch = branch * x_mask
            branch = dilated_conv(branch)
            branch = F.leaky_relu(branch, LRELU_SLOPE)
            if masked:
                branch = branch * x_mask
            branch = plain_conv(branch)
            x = branch + x
        if masked:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from every conv in the block."""
        for conv in self.convs1:
            remove_weight_norm(conv)
        for conv in self.convs2:
            remove_weight_norm(conv)
|
319 |
+
|
320 |
+
|
321 |
+
class ResBlock2(torch.nn.Module):
    """Residual block built from two weight-normalised dilated convolutions.

    Each convolution keeps the channel count, uses stride 1 and takes its
    dilation from the first two entries of ``dilation``. Padding comes from
    ``get_padding`` (defined elsewhere in this file).
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super().__init__()

        def build_conv(rate):
            # One weight-normalised channels -> channels convolution.
            return weight_norm(
                Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    1,
                    dilation=rate,
                    padding=get_padding(kernel_size, rate),
                )
            )

        # Only the first two dilation rates are consumed.
        rates = (dilation[0], dilation[1])
        self.convs = nn.ModuleList([build_conv(rate) for rate in rates])
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply each convolution as a residual update; ``x_mask`` (if given)
        is applied before every convolution and on the final output."""
        use_mask = x_mask is not None
        for conv in self.convs:
            hidden = F.leaky_relu(x, LRELU_SLOPE)
            if use_mask:
                hidden = hidden * x_mask
            x = conv(hidden) + x
        if use_mask:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip the weight-norm reparameterisation from every convolution."""
        for layer in self.convs:
            remove_weight_norm(layer)
|
364 |
+
|
365 |
+
|
366 |
+
class Log(nn.Module):
    """Element-wise log flow step with its log-determinant."""

    def forward(self, x, x_mask, reverse=False, **kwargs):
        """Forward: return ``(log(clamp_min(x, 1e-5)) * x_mask, logdet)``.

        Reverse: return ``exp(x) * x_mask`` only (no log-determinant).
        ``logdet`` is the sum of ``-y`` over dimensions 1 and 2.
        """
        if reverse:
            return torch.exp(x) * x_mask

        # Clamp first so that non-positive inputs do not produce -inf / NaN.
        y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
        return y, torch.sum(-y, [1, 2])
|
375 |
+
|
376 |
+
|
377 |
+
class Flip(nn.Module):
    """Flow step that reverses the order of dimension 1."""

    def forward(self, x, *args, reverse=False, **kwargs):
        """Flip ``x`` along dimension 1.

        The flip is applied in both directions. The forward direction also
        returns a zero log-determinant with one entry per batch item, matching
        the dtype and device of ``x``; the reverse direction returns only the
        flipped tensor.
        """
        flipped = torch.flip(x, [1])
        if reverse:
            return flipped

        logdet = torch.zeros(flipped.size(0)).to(
            dtype=flipped.dtype, device=flipped.device
        )
        return flipped, logdet
|
385 |
+
|
386 |
+
|
387 |
+
class ElementwiseAffine(nn.Module):
    """Per-channel learnable affine flow step: ``y = m + exp(logs) * x``."""

    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        # Shift and log-scale, one value per channel, both initialised to zero
        # so the layer starts as the identity.
        self.m = nn.Parameter(torch.zeros(channels, 1))
        self.logs = nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False, **kwargs):
        """Forward: return ``(y, logdet)``; reverse: return the inverse only.

        ``logdet`` is ``logs * x_mask`` summed over dimensions 1 and 2.
        """
        if reverse:
            return (x - self.m) * torch.exp(-self.logs) * x_mask

        y = (self.m + torch.exp(self.logs) * x) * x_mask
        logdet = torch.sum(self.logs * x_mask, [1, 2])
        return y, logdet
|
403 |
+
|
404 |
+
|
405 |
+
class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer: the first half of the channels parameterises an
    affine transform that is applied to the second half.

    ``pre`` (1x1 conv) -> ``enc`` (``WN``, defined elsewhere in this file) ->
    ``post`` (1x1 conv) produces the shift and, unless ``mean_only`` is set,
    the log-scale. ``post`` is zero-initialised, so its output (and hence the
    shift and log-scale) starts at zero.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        p_dropout=0,
        gin_channels=0,
        mean_only=False,
    ):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()

        half = channels // 2
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = half
        self.mean_only = mean_only

        self.pre = nn.Conv1d(half, hidden_channels, 1)
        self.enc = WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            p_dropout=p_dropout,
            gin_channels=gin_channels,
        )
        # `2 - mean_only` is 2 output groups (shift + log-scale) normally and
        # 1 (shift only) when mean_only is True.
        self.post = nn.Conv1d(hidden_channels, half * (2 - mean_only), 1)
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Forward: return ``(x, logdet)``; reverse: return the inverse only.

        ``g`` is forwarded to ``enc`` as conditioning input.
        """
        halves = [self.half_channels] * 2
        x0, x1 = torch.split(x, halves, 1)

        hidden = self.pre(x0) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.post(hidden) * x_mask

        if self.mean_only:
            m = stats
            logs = torch.zeros_like(m)
        else:
            m, logs = torch.split(stats, halves, 1)

        if reverse:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            return torch.cat([x0, x1], 1)

        x1 = m + x1 * torch.exp(logs) * x_mask
        x = torch.cat([x0, x1], 1)
        logdet = torch.sum(logs, [1, 2])
        return x, logdet

    def remove_weight_norm(self):
        """Delegate weight-norm removal to the ``enc`` sub-module."""
        self.enc.remove_weight_norm()
|
463 |
+
|
464 |
+
|
465 |
+
class ConvFlow(nn.Module):
    """Coupling flow that transforms the second half of the channels with a
    piecewise rational-quadratic spline whose parameters are predicted from
    the first half.

    ``pre`` (1x1 conv) -> ``convs`` (``DDSConv``, defined elsewhere in this
    file) -> ``proj`` (1x1 conv, zero-initialised) yields ``num_bins * 3 - 1``
    spline parameters per transformed channel.
    """

    def __init__(
        self,
        in_channels,
        filter_channels,
        kernel_size,
        n_layers,
        num_bins=10,
        tail_bound=5.0,
    ):
        super().__init__()

        half = in_channels // 2
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.num_bins = num_bins
        self.tail_bound = tail_bound
        self.half_channels = half

        self.pre = nn.Conv1d(half, filter_channels, 1)
        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
        self.proj = nn.Conv1d(filter_channels, half * (num_bins * 3 - 1), 1)
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Forward: return ``(x, logdet)``; reverse: return the inverse only."""
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)

        params = self.pre(x0)
        params = self.convs(params, x_mask, g=g)
        params = self.proj(params) * x_mask

        # Move the per-channel spline parameters to the last axis:
        # [b, c * ?, t] -> [b, c, t, ?]
        batch, chans, steps = x0.shape
        params = params.reshape(batch, chans, -1, steps).permute(0, 1, 3, 2)

        bins = self.num_bins
        scale = math.sqrt(self.filter_channels)
        unnormalized_widths = params[..., :bins] / scale
        unnormalized_heights = params[..., bins : 2 * bins] / scale
        unnormalized_derivatives = params[..., 2 * bins :]

        x1, logabsdet = piecewise_rational_quadratic_transform(
            x1,
            unnormalized_widths,
            unnormalized_heights,
            unnormalized_derivatives,
            inverse=reverse,
            tails="linear",
            tail_bound=self.tail_bound,
        )

        x = torch.cat([x0, x1], 1) * x_mask
        logdet = torch.sum(logabsdet * x_mask, [1, 2])
        if reverse:
            return x
        return x, logdet
|
lib/infer_pack/modules/F0Predictor/DioF0Predictor.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
|
2 |
+
import pyworld
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
|
6 |
+
class DioF0Predictor(F0Predictor):
    """F0 predictor based on ``pyworld.dio`` refined with ``pyworld.stonemask``."""

    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate

    def interpolate_f0(self, f0):
        """Fill unvoiced (<= 0) frames of an F0 contour.

        Returns ``(filled_f0, voiced_flags)``. The flags (float32, 1.0 where
        the input was > 0) are computed before any filling. A gap between two
        voiced frames is filled linearly, a leading gap is filled with the
        next voiced value, and a trailing gap with the last voiced value. A
        voiced value found only in the very last frame is treated like a
        trailing gap. The filling writes into the reshaped input array.
        """
        frames = np.reshape(f0, (f0.size, 1))
        total = frames.size

        voiced = np.zeros((total, 1), dtype=np.float32)
        voiced[frames > 0.0] = 1.0
        voiced[frames <= 0.0] = 0.0

        # `filled` is the same array as `frames`: frames written here are seen
        # as voiced by later iterations and are therefore skipped.
        filled = frames
        previous = 0.0
        for start in range(total):
            if not frames[start] <= 0.0:
                filled[start] = frames[start]  # this copy may be unnecessary
                previous = frames[start]
                continue

            # Scan forward for the next voiced frame.
            stop = start + 1
            for stop in range(start + 1, total):
                if frames[stop] > 0.0:
                    break

            if stop >= total - 1:
                # Trailing gap: hold the last voiced value to the end.
                for pos in range(start, total):
                    filled[pos] = previous
            elif previous > 0.0:
                # Interior gap: linear ramp between the neighbouring values.
                step = (frames[stop] - frames[start - 1]) / float(stop - start)
                for pos in range(start, stop):
                    filled[pos] = frames[start - 1] + step * (pos - start + 1)
            else:
                # Leading gap: back-fill with the first voiced value.
                for pos in range(start, stop):
                    filled[pos] = frames[stop]

        return filled[:, 0], voiced[:, 0]

    def resize_f0(self, x, target_len):
        """Linearly resample contour ``x`` to ``target_len`` frames.

        Values below 0.001 are set to NaN before interpolation and NaNs in the
        result are converted back with ``np.nan_to_num``.
        """
        contour = np.array(x)
        contour[contour < 0.001] = np.nan
        n_src = len(contour)
        query = np.arange(0, n_src * target_len, n_src) / target_len
        resampled = np.interp(query, np.arange(0, n_src), contour)
        return np.nan_to_num(resampled)

    def compute_f0(self, wav, p_len=None):
        """Return only the interpolated F0 contour for ``wav``."""
        return self.compute_f0_uv(wav, p_len)[0]

    def compute_f0_uv(self, wav, p_len=None):
        """Return ``(f0, voiced_flags)`` for ``wav``.

        ``p_len`` is the number of output frames and defaults to
        ``wav.shape[0] // hop_length``.
        """
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length

        signal = wav.astype(np.double)
        f0, t = pyworld.dio(
            signal,
            fs=self.sampling_rate,
            f0_floor=self.f0_min,
            f0_ceil=self.f0_max,
            frame_period=1000 * self.hop_length / self.sampling_rate,
        )
        f0 = pyworld.stonemask(signal, f0, t, self.sampling_rate)

        # Round every pitch value to one decimal place.
        for index, pitch in enumerate(f0):
            f0[index] = round(pitch, 1)

        return self.interpolate_f0(self.resize_f0(f0, p_len))
|
lib/infer_pack/modules/F0Predictor/F0Predictor.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class F0Predictor(object):
    """Interface shared by the F0 predictors.

    The base methods do nothing and return None; subclasses override them.
    ``p_len`` defaults to None so the base signatures agree with the
    subclasses, which all declare ``p_len=None``.
    """

    def compute_f0(self, wav, p_len=None):
        """
        input: wav:[signal_length]
               p_len:int
        output: f0:[signal_length//hop_length]
        """
        pass

    def compute_f0_uv(self, wav, p_len=None):
        """
        input: wav:[signal_length]
               p_len:int
        output: f0:[signal_length//hop_length],uv:[signal_length//hop_length]
        """
        pass
|
lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
|
2 |
+
import pyworld
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
|
6 |
+
class HarvestF0Predictor(F0Predictor):
    """F0 predictor based on ``pyworld.harvest`` refined with ``pyworld.stonemask``."""

    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate

    def interpolate_f0(self, f0):
        """Fill unvoiced (<= 0) frames of an F0 contour.

        Returns ``(filled_f0, voiced_flags)``. The flags (float32, 1.0 where
        the input was > 0) are computed before any filling. A gap between two
        voiced frames is filled linearly, a leading gap is filled with the
        next voiced value, and a trailing gap with the last voiced value. A
        voiced value found only in the very last frame is treated like a
        trailing gap. The filling writes into the reshaped input array.
        """
        frames = np.reshape(f0, (f0.size, 1))
        total = frames.size

        voiced = np.zeros((total, 1), dtype=np.float32)
        voiced[frames > 0.0] = 1.0
        voiced[frames <= 0.0] = 0.0

        # `filled` is the same array as `frames`: frames written here are seen
        # as voiced by later iterations and are therefore skipped.
        filled = frames
        previous = 0.0
        for start in range(total):
            if not frames[start] <= 0.0:
                filled[start] = frames[start]  # this copy may be unnecessary
                previous = frames[start]
                continue

            # Scan forward for the next voiced frame.
            stop = start + 1
            for stop in range(start + 1, total):
                if frames[stop] > 0.0:
                    break

            if stop >= total - 1:
                # Trailing gap: hold the last voiced value to the end.
                for pos in range(start, total):
                    filled[pos] = previous
            elif previous > 0.0:
                # Interior gap: linear ramp between the neighbouring values.
                step = (frames[stop] - frames[start - 1]) / float(stop - start)
                for pos in range(start, stop):
                    filled[pos] = frames[start - 1] + step * (pos - start + 1)
            else:
                # Leading gap: back-fill with the first voiced value.
                for pos in range(start, stop):
                    filled[pos] = frames[stop]

        return filled[:, 0], voiced[:, 0]

    def resize_f0(self, x, target_len):
        """Linearly resample contour ``x`` to ``target_len`` frames.

        Values below 0.001 are set to NaN before interpolation and NaNs in the
        result are converted back with ``np.nan_to_num``.
        """
        contour = np.array(x)
        contour[contour < 0.001] = np.nan
        n_src = len(contour)
        query = np.arange(0, n_src * target_len, n_src) / target_len
        resampled = np.interp(query, np.arange(0, n_src), contour)
        return np.nan_to_num(resampled)

    def compute_f0(self, wav, p_len=None):
        """Return only the interpolated F0 contour for ``wav``.

        Delegates to ``compute_f0_uv`` so both entry points analyse the audio
        with ``self.sampling_rate``. The previous body passed
        ``self.hop_length`` as the sampling rate and read the non-existent
        attribute ``self.fs``.
        """
        return self.compute_f0_uv(wav, p_len)[0]

    def compute_f0_uv(self, wav, p_len=None):
        """Return ``(f0, voiced_flags)`` for ``wav``.

        ``p_len`` is the number of output frames and defaults to
        ``wav.shape[0] // hop_length``.
        """
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length

        signal = wav.astype(np.double)
        f0, t = pyworld.harvest(
            signal,
            fs=self.sampling_rate,
            f0_floor=self.f0_min,
            f0_ceil=self.f0_max,
            frame_period=1000 * self.hop_length / self.sampling_rate,
        )
        f0 = pyworld.stonemask(signal, f0, t, self.sampling_rate)
        return self.interpolate_f0(self.resize_f0(f0, p_len))
|
lib/infer_pack/modules/F0Predictor/PMF0Predictor.py
ADDED
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
|
2 |
+
import parselmouth
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
|
6 |
+
class PMF0Predictor(F0Predictor):
    """F0 predictor based on parselmouth's ``Sound.to_pitch_ac``."""

    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate

    def interpolate_f0(self, f0):
        """Fill unvoiced (<= 0) frames of an F0 contour.

        Returns ``(filled_f0, voiced_flags)``. The flags (float32, 1.0 where
        the input was > 0) are computed before any filling. A gap between two
        voiced frames is filled linearly, a leading gap is filled with the
        next voiced value, and a trailing gap with the last voiced value. A
        voiced value found only in the very last frame is treated like a
        trailing gap. The filling writes into the reshaped input array.
        """
        frames = np.reshape(f0, (f0.size, 1))
        total = frames.size

        voiced = np.zeros((total, 1), dtype=np.float32)
        voiced[frames > 0.0] = 1.0
        voiced[frames <= 0.0] = 0.0

        # `filled` is the same array as `frames`: frames written here are seen
        # as voiced by later iterations and are therefore skipped.
        filled = frames
        previous = 0.0
        for start in range(total):
            if not frames[start] <= 0.0:
                filled[start] = frames[start]  # this copy may be unnecessary
                previous = frames[start]
                continue

            # Scan forward for the next voiced frame.
            stop = start + 1
            for stop in range(start + 1, total):
                if frames[stop] > 0.0:
                    break

            if stop >= total - 1:
                # Trailing gap: hold the last voiced value to the end.
                for pos in range(start, total):
                    filled[pos] = previous
            elif previous > 0.0:
                # Interior gap: linear ramp between the neighbouring values.
                step = (frames[stop] - frames[start - 1]) / float(stop - start)
                for pos in range(start, stop):
                    filled[pos] = frames[start - 1] + step * (pos - start + 1)
            else:
                # Leading gap: back-fill with the first voiced value.
                for pos in range(start, stop):
                    filled[pos] = frames[stop]

        return filled[:, 0], voiced[:, 0]

    def compute_f0(self, wav, p_len=None):
        """Return only the interpolated F0 contour for ``wav``."""
        return self.compute_f0_uv(wav, p_len)[0]

    def compute_f0_uv(self, wav, p_len=None):
        """Return ``(f0, voiced_flags)`` for ``wav``.

        ``p_len`` is the number of output frames and defaults to
        ``wav.shape[0] // hop_length``. An explicit value must be within 4
        frames of that default. A contour shorter than ``p_len`` is
        zero-padded on both sides before interpolation.
        """
        expected = wav.shape[0] // self.hop_length
        if p_len is None:
            p_len = expected
        else:
            assert abs(p_len - expected) < 4, "pad length error"

        # Hop size in milliseconds; converted back to seconds for parselmouth.
        time_step = self.hop_length / self.sampling_rate * 1000
        pitch = parselmouth.Sound(wav, self.sampling_rate).to_pitch_ac(
            time_step=time_step / 1000,
            voicing_threshold=0.6,
            pitch_floor=self.f0_min,
            pitch_ceiling=self.f0_max,
        )
        f0 = pitch.selected_array["frequency"]

        # Split the missing frames between the front and the back, with the
        # extra frame (if any) going to the front.
        missing = p_len - len(f0)
        lead = (missing + 1) // 2
        trail = missing - lead
        if lead > 0 or trail > 0:
            f0 = np.pad(f0, [[lead, trail]], mode="constant")

        return self.interpolate_f0(f0)
|
lib/infer_pack/modules/F0Predictor/__init__.py
ADDED
File without changes
|
lib/infer_pack/transforms.py
ADDED
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch.nn import functional as F
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
|
7 |
+
# Default lower bounds used as the `min_bin_width`, `min_bin_height` and
# `min_derivative` keyword defaults of the spline functions in this module.
DEFAULT_MIN_BIN_WIDTH = 1e-3
DEFAULT_MIN_BIN_HEIGHT = 1e-3
DEFAULT_MIN_DERIVATIVE = 1e-3
|
10 |
+
|
11 |
+
|
12 |
+
def piecewise_rational_quadratic_transform(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    tails=None,
    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Dispatch to the bounded or the unconstrained rational-quadratic spline.

    With ``tails=None`` the call goes to ``rational_quadratic_spline``;
    otherwise it goes to ``unconstrained_rational_quadratic_spline`` with
    ``tails`` and ``tail_bound`` forwarded. Returns ``(outputs, logabsdet)``.
    """
    transform = rational_quadratic_spline
    tail_options = {}
    if tails is not None:
        transform = unconstrained_rational_quadratic_spline
        tail_options = {"tails": tails, "tail_bound": tail_bound}

    outputs, logabsdet = transform(
        inputs=inputs,
        unnormalized_widths=unnormalized_widths,
        unnormalized_heights=unnormalized_heights,
        unnormalized_derivatives=unnormalized_derivatives,
        inverse=inverse,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative,
        **tail_options
    )
    return outputs, logabsdet
|
43 |
+
|
44 |
+
|
45 |
+
def searchsorted(bin_locations, inputs, eps=1e-6):
    """Return, for each input, the index of the bin that contains it.

    ``bin_locations`` holds the bin edges along its last dimension. The last
    edge is nudged up by ``eps`` so that an input equal to it lands in the
    final bin rather than one past it. The index is the number of edges that
    are <= the input, minus one.

    The nudge is applied to a copy: the caller's ``bin_locations`` is left
    untouched, so repeated calls on the same tensor do not accumulate ``eps``.
    """
    edges = bin_locations.clone()
    edges[..., -1] += eps
    return torch.sum(inputs[..., None] >= edges, dim=-1) - 1
|
48 |
+
|
49 |
+
|
50 |
+
def unconstrained_rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    tails="linear",
    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Rational-quadratic spline on ``[-tail_bound, tail_bound]`` with
    identity ("linear") tails outside that interval.

    Inputs outside the interval are passed through with zero log-determinant.
    Inputs inside are transformed by ``rational_quadratic_spline``. Only
    ``tails="linear"`` is implemented; anything else raises RuntimeError.
    Returns ``(outputs, logabsdet)``.
    """
    inside = (inputs >= -tail_bound) & (inputs <= tail_bound)
    outside = ~inside

    outputs = torch.zeros_like(inputs)
    logabsdet = torch.zeros_like(inputs)

    if tails != "linear":
        raise RuntimeError("{} tails are not implemented.".format(tails))

    # Add one derivative at each end and pin both to a constant chosen so
    # that min_derivative + softplus(constant) == 1.
    unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
    boundary = np.log(np.exp(1 - min_derivative) - 1)
    unnormalized_derivatives[..., 0] = boundary
    unnormalized_derivatives[..., -1] = boundary

    outputs[outside] = inputs[outside]
    logabsdet[outside] = 0

    spline_outputs, spline_logabsdet = rational_quadratic_spline(
        inputs=inputs[inside],
        unnormalized_widths=unnormalized_widths[inside, :],
        unnormalized_heights=unnormalized_heights[inside, :],
        unnormalized_derivatives=unnormalized_derivatives[inside, :],
        inverse=inverse,
        left=-tail_bound,
        right=tail_bound,
        bottom=-tail_bound,
        top=tail_bound,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative,
    )
    outputs[inside] = spline_outputs
    logabsdet[inside] = spline_logabsdet

    return outputs, logabsdet
|
98 |
+
|
99 |
+
|
100 |
+
def rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    left=0.0,
    right=1.0,
    bottom=0.0,
    top=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Evaluate a monotonic rational-quadratic spline or its inverse.

    The spline maps ``[left, right]`` onto ``[bottom, top]``. Bin widths and
    heights come from a softmax over the unnormalised values (each floored at
    ``min_bin_width`` / ``min_bin_height``). Knot derivatives are
    ``min_derivative + softplus(unnormalized_derivatives)``.

    Returns ``(outputs, logabsdet)``. In the inverse direction the returned
    log-determinant is negated.

    Raises:
        ValueError: if any input lies outside ``[left, right]``, or if the
            minimum bin width/height times the number of bins exceeds 1.
    """
    if torch.min(inputs) < left or torch.max(inputs) > right:
        raise ValueError("Input to a transform is not within its domain")

    num_bins = unnormalized_widths.shape[-1]

    if min_bin_width * num_bins > 1.0:
        raise ValueError("Minimal bin width too large for the number of bins")
    if min_bin_height * num_bins > 1.0:
        raise ValueError("Minimal bin height too large for the number of bins")

    def knots_and_sizes(unnormalized, min_size, low, high):
        # Normalised bin sizes -> cumulative knot positions in [low, high],
        # with both end knots pinned exactly, and the resulting bin sizes.
        sizes = F.softmax(unnormalized, dim=-1)
        sizes = min_size + (1 - min_size * num_bins) * sizes
        knots = torch.cumsum(sizes, dim=-1)
        knots = F.pad(knots, pad=(1, 0), mode="constant", value=0.0)
        knots = (high - low) * knots + low
        knots[..., 0] = low
        knots[..., -1] = high
        sizes = knots[..., 1:] - knots[..., :-1]
        return knots, sizes

    cumwidths, widths = knots_and_sizes(
        unnormalized_widths, min_bin_width, left, right
    )
    derivatives = min_derivative + F.softplus(unnormalized_derivatives)
    cumheights, heights = knots_and_sizes(
        unnormalized_heights, min_bin_height, bottom, top
    )

    # The inverse looks inputs up on the height axis, the forward on the
    # width axis.
    if inverse:
        bin_idx = searchsorted(cumheights, inputs)[..., None]
    else:
        bin_idx = searchsorted(cumwidths, inputs)[..., None]

    def pick(values):
        # Select, for every input, the entry belonging to its bin.
        return values.gather(-1, bin_idx)[..., 0]

    input_cumwidths = pick(cumwidths)
    input_bin_widths = pick(widths)

    input_cumheights = pick(cumheights)
    input_delta = pick(heights / widths)

    input_derivatives = pick(derivatives)
    input_derivatives_plus_one = pick(derivatives[..., 1:])

    input_heights = pick(heights)

    # Term shared by the quadratic coefficients and both denominators.
    curvature = input_derivatives + input_derivatives_plus_one - 2 * input_delta

    if inverse:
        offset = inputs - input_cumheights
        a = offset * curvature + input_heights * (input_delta - input_derivatives)
        b = input_heights * input_derivatives - offset * curvature
        c = -input_delta * offset

        discriminant = b.pow(2) - 4 * a * c
        assert (discriminant >= 0).all()

        root = (2 * c) / (-b - torch.sqrt(discriminant))
        outputs = root * input_bin_widths + input_cumwidths

        theta_one_minus_theta = root * (1 - root)
        denominator = input_delta + (curvature * theta_one_minus_theta)
        derivative_numerator = input_delta.pow(2) * (
            input_derivatives_plus_one * root.pow(2)
            + 2 * input_delta * theta_one_minus_theta
            + input_derivatives * (1 - root).pow(2)
        )
        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

        return outputs, -logabsdet

    theta = (inputs - input_cumwidths) / input_bin_widths
    theta_one_minus_theta = theta * (1 - theta)

    numerator = input_heights * (
        input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
    )
    denominator = input_delta + (curvature * theta_one_minus_theta)
    outputs = input_cumheights + numerator / denominator

    derivative_numerator = input_delta.pow(2) * (
        input_derivatives_plus_one * theta.pow(2)
        + 2 * input_delta * theta_one_minus_theta
        + input_derivatives * (1 - theta).pow(2)
    )
    logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

    return outputs, logabsdet
|
packages.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
build-essential
|
2 |
+
ffmpeg
|
3 |
+
aria2
|
requirements.txt
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gTTS
|
2 |
+
elevenlabs
|
3 |
+
stftpitchshift==1.5.1
|
4 |
+
torchcrepe
|
5 |
+
setuptools
|
6 |
+
wheel
|
7 |
+
httpx==0.23.0
|
8 |
+
faiss-gpu
|
9 |
+
fairseq
|
10 |
+
gradio==3.34.0
|
11 |
+
ffmpeg-python
|
12 |
+
praat-parselmouth
|
13 |
+
pyworld
|
14 |
+
numpy==1.23.5
|
15 |
+
i18n
|
16 |
+
numba==0.56.4
|
17 |
+
librosa==0.9.2
|
18 |
+
mega.py
|
19 |
+
gdown
|
20 |
+
onnxruntime
|
21 |
+
pyngrok==4.1.12
|
22 |
+
torch
|
utils.py
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import ffmpeg
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
# import praatio
|
5 |
+
# import praatio.praat_scripts
|
6 |
+
import os
|
7 |
+
import sys
|
8 |
+
|
9 |
+
import random
|
10 |
+
|
11 |
+
import csv
|
12 |
+
|
13 |
+
# Name of the stftpitchshift executable for each supported `sys.platform`.
platform_stft_mapping = dict(
    linux="stftpitchshift",
    darwin="stftpitchshift",
    win32="stftpitchshift.exe",
)

# Executable name for the running platform; None if the platform is not listed.
stft = platform_stft_mapping.get(sys.platform)
|
20 |
+
# praatEXE = join('.',os.path.abspath(os.getcwd()) + r"\Praat.exe")
|
21 |
+
|
22 |
+
|
23 |
+
def CSVutil(file, rw, type, *args):
    """Read or write one of the small single-row CSV state files.

    Args:
        file: path of the CSV file.
        rw: ``"r"`` to read (only for ``type == "formanting"``); any other
            value is used as the ``open`` mode for writing.
        type: ``"formanting"`` or ``"stop"``; other values do nothing.
        *args: values to write. For ``"formanting"``: ``(DoFormant,
            Quefrency, Timbre)``, defaulting to ``(False, 1.0, 1.0)``. For
            ``"stop"``: ``(stop,)``, defaulting to ``False``.

    Returns:
        For a ``"formanting"`` read, a tuple with the first three fields of
        the first row (as strings). Otherwise None.

    Raises:
        ValueError: on a ``"formanting"`` read when the file has no rows or
            its first row has fewer than three fields.
    """
    if type == "formanting":
        if rw == "r":
            with open(file) as fileCSVread:
                rows = list(csv.reader(fileCSVread))
            # The previous `is not None` guard was always true, so an empty
            # file surfaced as IndexError instead of the intended "No data".
            if not rows or len(rows[0]) < 3:
                raise ValueError("No data")
            first_row = rows[0]
            return (first_row[0], first_row[1], first_row[2])

        doformnt = args[0] if args else False
        qfr = args[1] if len(args) > 1 else 1.0
        tmb = args[2] if len(args) > 2 else 1.0
        with open(file, rw, newline="") as fileCSVwrite:
            csv_writer = csv.writer(fileCSVwrite, delimiter=",")
            csv_writer.writerow([doformnt, qfr, tmb])
    elif type == "stop":
        stop = args[0] if args else False
        with open(file, rw, newline="") as fileCSVwrite:
            csv_writer = csv.writer(fileCSVwrite, delimiter=",")
            csv_writer.writerow([stop])
|
48 |
+
|
49 |
+
|
50 |
+
def load_audio(file, sr, DoFormant, Quefrency, Timbre):
    """Decode an audio file into a mono float32 waveform sampled at ``sr``.

    Decoding is delegated to the ffmpeg CLI through ``ffmpeg-python``
    (approach borrowed from
    https://github.com/openai/whisper/blob/main/whisper/audio.py#L26).
    When formant shifting is enabled the file is first run through the
    ``stftpitchshift`` executable and the shifted result is decoded instead.

    Args:
        file: Path of the audio file. Surrounding spaces, double quotes and
            newlines (typical copy/paste residue) are stripped.
        sr: Target sample rate passed to ffmpeg.
        DoFormant, Quefrency, Timbre: Kept for interface compatibility. As
            in the original implementation they are overridden by the values
            stored in ``csvdb/formanting.csv``.

    Returns:
        1-D ``numpy.float32`` array with the decoded samples.

    Raises:
        RuntimeError: If conversion, formant shifting or decoding fails.
        ValueError: If ``csvdb/formanting.csv`` holds no usable row
            (propagated from ``CSVutil``).
    """
    # Local import so this block does not depend on the file-level imports.
    import subprocess

    # The persisted settings win over the call arguments.
    DoFormant, Quefrency, Timbre = CSVutil("csvdb/formanting.csv", "r", "formanting")

    # Guard against pasted paths with leading/trailing spaces, quotes, newlines.
    file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")

    # The CSV stores the flag as text; anything other than "true"/"false"
    # falls back to plain truthiness, as before.
    flag = str(DoFormant).lower()
    if flag == "true":
        do_formant = True
    elif flag == "false":
        do_formant = False
    else:
        do_formant = bool(DoFormant)

    converted_path = None  # intermediate .wav created for non-wav inputs
    formanted_path = None  # output of stftpitchshift
    try:
        source = file
        if do_formant:
            if stft is None:
                raise RuntimeError(
                    f"stftpitchshift is not available on platform {sys.platform!r}"
                )

            wav_path = file if file.endswith(".wav") else f"{file}.wav"
            if wav_path != file and not os.path.isfile(wav_path):
                # Recorded before running ffmpeg so a partial file is
                # still cleaned up if the conversion fails.
                converted_path = wav_path
                (
                    ffmpeg.input(file, threads=0)
                    .output(wav_path)
                    .run(
                        cmd=["ffmpeg", "-nostdin"],
                        capture_stdout=True,
                        capture_stderr=True,
                    )
                )

            suffix = round(random.uniform(1, 4), 4)
            formanted_path = "%sFORMANTED_%s.wav" % (wav_path, str(suffix))

            print(f" · Formanting {wav_path}...\n")
            # Argument list instead of a shell string: paths containing
            # quotes or shell metacharacters can neither break nor inject
            # into the command.
            subprocess.run(
                [
                    stft,
                    "-i",
                    wav_path,
                    "-q",
                    str(Quefrency),
                    "-t",
                    str(Timbre),
                    "-o",
                    formanted_path,
                ],
                check=True,
            )
            print(f" · Formanted {wav_path}!\n")
            source = formanted_path

        out, _ = (
            ffmpeg.input(source, threads=0)
            .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
        )
    except Exception as e:
        raise RuntimeError(f"Failed to load audio: {e}") from e
    finally:
        # Best-effort cleanup of temporaries, also on failure.
        for label, path in (
            ("formanted", formanted_path),
            ("converted", converted_path),
        ):
            if path is None or not os.path.exists(path):
                continue
            try:
                os.remove(path)
            except OSError:
                print(f"couldn't remove {label} type of file")

    return np.frombuffer(out, np.float32).flatten()
|
vc_infer_pipeline.py
ADDED
@@ -0,0 +1,646 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np, parselmouth, torch, pdb, sys, os
|
2 |
+
from time import time as ttime
|
3 |
+
import torch.nn.functional as F
|
4 |
+
import torchcrepe # Fork feature. Use the crepe f0 algorithm. New dependency (pip install torchcrepe)
|
5 |
+
from torch import Tensor
|
6 |
+
import scipy.signal as signal
|
7 |
+
import pyworld, os, traceback, faiss, librosa, torchcrepe
|
8 |
+
from scipy import signal
|
9 |
+
from functools import lru_cache
|
10 |
+
|
11 |
+
# Make modules located in the current working directory importable.
now_dir = os.getcwd()
sys.path.append(now_dir)

# Coefficients of a 5th-order Butterworth high-pass filter with a 48 Hz
# cutoff, designed for 16 kHz audio.
bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)

# Maps an input audio path to its waveform so that the cached harvest f0
# function can be keyed on the (hashable) path instead of the array.
input_audio_path2wav = {}
|
17 |
+
|
18 |
+
|
19 |
+
@lru_cache
def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
    """Return the harvest f0 contour for a registered waveform, memoised.

    The waveform is looked up in ``input_audio_path2wav`` under
    ``input_audio_path``; the caller must have stored it there first.

    NOTE(review): the cache key is the argument tuple, not the audio
    content, so a different waveform registered later under the same path
    and parameters gets the previously cached contour back.

    Args:
        input_audio_path: Key into ``input_audio_path2wav``.
        fs: Sample rate handed to pyworld.
        f0max: Upper f0 bound (``f0_ceil``).
        f0min: Lower f0 bound (``f0_floor``).
        frame_period: Frame period handed to ``pyworld.harvest``.

    Returns:
        The f0 array produced by ``pyworld.harvest`` and refined by
        ``pyworld.stonemask``.
    """
    waveform = input_audio_path2wav[input_audio_path]
    harvest_options = dict(
        fs=fs,
        f0_ceil=f0max,
        f0_floor=f0min,
        frame_period=frame_period,
    )
    raw_f0, time_axis = pyworld.harvest(waveform, **harvest_options)
    return pyworld.stonemask(waveform, raw_f0, time_axis, fs)
|
31 |
+
|
32 |
+
|
33 |
+
def change_rms(data1, sr1, data2, sr2, rate):
    """Blend the loudness envelope of ``data1`` into ``data2``.

    ``data1`` is the input audio and ``data2`` the output audio; ``rate``
    is the share of ``data2``'s own envelope that is kept. ``data2`` is
    scaled in place and also returned.

    Args:
        data1: Input waveform.
        sr1: Sample rate of ``data1``.
        data2: Output waveform (modified in place).
        sr2: Sample rate of ``data2``.
        rate: Mixing ratio; the applied gain is
            ``rms1 ** (1 - rate) * rms2 ** (rate - 1)``.

    Returns:
        ``data2`` after scaling.
    """

    def envelope(wave, sample_rate):
        # One RMS value every half second, stretched linearly to one value
        # per sample of data2.
        frames = librosa.feature.rms(
            y=wave, frame_length=sample_rate // 2 * 2, hop_length=sample_rate // 2
        )
        stretched = F.interpolate(
            torch.from_numpy(frames).unsqueeze(0), size=data2.shape[0], mode="linear"
        )
        return stretched.squeeze()

    source_env = envelope(data1, sr1)
    output_env = envelope(data2, sr2)
    # Floor the output envelope so the negative power below stays finite.
    output_env = torch.max(output_env, torch.zeros_like(output_env) + 1e-6)

    gain = torch.pow(source_env, torch.tensor(1 - rate)) * torch.pow(
        output_env, torch.tensor(rate - 1)
    )
    data2 *= gain.numpy()
    return data2
|
53 |
+
|
54 |
+
|
55 |
+
class VC(object):
    """Voice-conversion inference pipeline.

    Combines f0 (pitch) estimation, feature extraction with a content model,
    optional faiss index retrieval and synthesis with a generator network.
    """

    def __init__(self, tgt_sr, config):
        """Store padding/segmentation settings derived from ``config``.

        Args:
            tgt_sr: Sample rate of the synthesised audio.
            config: Object exposing ``x_pad``, ``x_query``, ``x_center``,
                ``x_max``, ``is_half`` and ``device``.
        """
        self.x_pad = config.x_pad
        self.x_query = config.x_query
        self.x_center = config.x_center
        self.x_max = config.x_max
        self.is_half = config.is_half
        self.sr = 16000  # HuBERT input sample rate
        self.window = 160  # samples per frame
        self.t_pad = self.sr * self.x_pad  # padding before/after each segment
        self.t_pad_tgt = tgt_sr * self.x_pad  # same padding at the target rate
        self.t_pad2 = self.t_pad * 2
        self.t_query = self.sr * self.x_query  # search range around a cut point
        self.t_center = self.sr * self.x_center  # spacing of candidate cut points
        self.t_max = self.sr * self.x_max  # below this length no cutting is done
        self.device = config.device

    def get_optimal_torch_device(self, index: int = 0) -> torch.device:
        """Return the best available torch device for torch-based f0 methods.

        Prefers CUDA (``index`` wraps around the device count), then MPS,
        then CPU.
        """
        if torch.cuda.is_available():
            return torch.device(f"cuda:{index % torch.cuda.device_count()}")
        # ``torch.backends.mps`` does not exist on older torch builds.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            return torch.device("mps")
        return torch.device("cpu")

    @staticmethod
    def _normalize_peak(x):
        """Return a float32 copy of ``x`` scaled by its 99.9 % abs quantile."""
        x = x.astype(np.float32)  # float32 avoids the F.conv2d dtype error
        peak = np.quantile(np.abs(x), 0.999)
        # Silent input would otherwise turn into NaN through 0/0.
        if peak > 0:
            x /= peak
        return x

    @staticmethod
    def _quantize_f0(f0, f0_min, f0_max):
        """Map f0 in Hz to integer mel-scale bins in ``[1, 255]``.

        Values whose mel value is not positive (unvoiced frames) map to 1.
        """
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
        f0_mel = 1127 * np.log(1 + f0 / 700)
        voiced = f0_mel > 0
        f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * 254 / (
            f0_mel_max - f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        # ``np.int`` was removed in NumPy 1.24; builtin int is what it aliased.
        return np.rint(f0_mel).astype(int)

    def _f0_pm(self, x, p_len, f0_min, f0_max, time_step):
        """Estimate f0 with Praat autocorrelation, zero-padded to ``p_len``."""
        f0 = (
            parselmouth.Sound(x, self.sr)
            .to_pitch_ac(
                time_step=time_step / 1000,
                voicing_threshold=0.6,
                pitch_floor=f0_min,
                pitch_ceiling=f0_max,
            )
            .selected_array["frequency"]
        )
        pad_size = (p_len - len(f0) + 1) // 2
        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
            f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
        return f0

    def _f0_harvest(self, input_audio_path, f0_min, f0_max, filter_radius):
        """Estimate f0 with (cached) pyworld harvest, optionally median-filtered."""
        f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
        if filter_radius > 2:
            f0 = signal.medfilt(f0, 3)
        return f0

    def _f0_dio(self, x, f0_min, f0_max):
        """Estimate f0 with pyworld dio + stonemask, median-filtered."""
        wav = x.astype(np.double)
        f0, t = pyworld.dio(
            wav,
            fs=self.sr,
            f0_ceil=f0_max,
            f0_floor=f0_min,
            frame_period=10,
        )
        f0 = pyworld.stonemask(wav, f0, t, self.sr)
        return signal.medfilt(f0, 3)

    def get_f0_crepe_computation(
        self,
        x,
        f0_min,
        f0_max,
        p_len,
        hop_length=160,
        model="full",
    ):
        """Estimate f0 with torchcrepe and resample it to ``p_len`` frames.

        Args:
            x: Waveform (numpy array).
            f0_min, f0_max: Pitch search range handed to torchcrepe.
            p_len: Desired number of output frames; when falsy,
                ``len(x) // hop_length`` is used.
            hop_length: torchcrepe hop length (was 512 originally). Lower
                values mean more pitch accuracy but longer inference time.
            model: ``"full"`` or ``"tiny"``.

        Returns:
            numpy array of ``p_len`` f0 values, with 0 where no pitch was found.
        """
        x = self._normalize_peak(x)
        torch_device = self.get_optimal_torch_device()
        audio = torch.from_numpy(x).to(torch_device, copy=True)
        audio = torch.unsqueeze(audio, dim=0)
        if audio.ndim == 2 and audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True).detach()
        audio = audio.detach()
        print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
        pitch: Tensor = torchcrepe.predict(
            audio,
            self.sr,
            hop_length,
            f0_min,
            f0_max,
            model,
            batch_size=hop_length * 2,
            device=torch_device,
            pad=True,
        )
        p_len = p_len or x.shape[0] // hop_length
        # Resize the pitch track to the requested number of frames.
        source = np.array(pitch.squeeze(0).cpu().float().numpy())
        source[source < 0.001] = np.nan
        target = np.interp(
            np.arange(0, len(source) * p_len, len(source)) / p_len,
            np.arange(0, len(source)),
            source,
        )
        return np.nan_to_num(target)

    def get_f0_official_crepe_computation(
        self,
        x,
        f0_min,
        f0_max,
        model="full",
    ):
        """Estimate f0 with torchcrepe at ``self.window`` hop, periodicity-gated.

        Frames whose median-filtered periodicity is below 0.1 are set to 0.
        """
        # Pick a batch size that doesn't cause memory errors on your gpu.
        batch_size = 512
        audio = torch.tensor(np.copy(x))[None].float()
        f0, periodicity = torchcrepe.predict(
            audio,
            self.sr,
            self.window,
            f0_min,
            f0_max,
            model,
            batch_size=batch_size,
            device=self.device,
            return_periodicity=True,
        )
        periodicity = torchcrepe.filter.median(periodicity, 3)
        f0 = torchcrepe.filter.mean(f0, 3)
        f0[periodicity < 0.1] = 0
        return f0[0].cpu().numpy()

    def get_f0_pyin_computation(self, x, f0_min, f0_max):
        """Estimate f0 of ``x`` with librosa's pYIN.

        The previous implementation ignored ``x`` and analysed a hard-coded
        debug file instead.

        NOTE(review): nothing in this file calls this method (the hybrid
        branch is commented out as "not working yet"); its frame rate is
        librosa's default and has not been aligned with ``self.window``.
        """
        f0, _, _ = librosa.pyin(x, sr=self.sr, fmin=f0_min, fmax=f0_max)
        return f0[1:]  # get rid of the extra first frame

    def get_f0_hybrid_computation(
        self,
        methods_str,
        input_audio_path,
        x,
        f0_min,
        f0_max,
        p_len,
        filter_radius,
        crepe_hop_length,
        time_step,
    ):
        """Combine several f0 estimators by taking their per-frame median.

        Args:
            methods_str: Spec such as ``"hybrid[pm+harvest]"``; the part
                after ``"hybrid"`` lists the methods, separated by ``+``.
            input_audio_path: Key under which the waveform was registered
                in ``input_audio_path2wav`` (used by ``harvest``).
            x: Waveform (numpy array).
            f0_min, f0_max: Pitch search range.
            p_len: Target frame count (used by ``pm`` and the mangio-crepe
                variants).
            filter_radius: Median filtering of ``harvest`` when > 2.
            crepe_hop_length: Hop length for the mangio-crepe variants.
            time_step: Frame step in milliseconds for ``pm``.

        Returns:
            The single contour when one method is listed, otherwise the
            NaN-aware median across all contours.

        Raises:
            ValueError: If the spec names an unsupported method.
        """
        spec = methods_str.split("hybrid")[1]
        spec = spec.replace("[", "").replace("]", "")
        methods = spec.split("+")

        print("Calculating f0 pitch estimations for methods: %s" % str(methods))
        x = self._normalize_peak(x)

        f0_computation_stack = []
        for method in methods:
            if method == "pm":
                f0 = self._f0_pm(x, p_len, f0_min, f0_max, time_step)
            elif method == "crepe":
                f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
                f0 = f0[1:]  # get rid of the extra first frame
            elif method == "crepe-tiny":
                f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
                f0 = f0[1:]  # get rid of the extra first frame
            elif method == "mangio-crepe":
                f0 = self.get_f0_crepe_computation(
                    x, f0_min, f0_max, p_len, crepe_hop_length
                )
            elif method == "mangio-crepe-tiny":
                f0 = self.get_f0_crepe_computation(
                    x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
                )
            elif method == "harvest":
                f0 = self._f0_harvest(input_audio_path, f0_min, f0_max, filter_radius)
                f0 = f0[1:]  # get rid of the first frame
            elif method == "dio":  # potentially buggy?
                f0 = self._f0_dio(x, f0_min, f0_max)
                f0 = f0[1:]
            else:
                # Previously ``None`` was pushed and crashed further down.
                raise ValueError(
                    f"Unsupported f0 method {method!r} in hybrid spec {methods_str!r}"
                )
            f0_computation_stack.append(f0)

        for fc in f0_computation_stack:
            print(len(fc))

        print("Calculating hybrid median f0 from the stack of: %s" % str(methods))
        if len(f0_computation_stack) == 1:
            return f0_computation_stack[0]
        return np.nanmedian(f0_computation_stack, axis=0)

    def get_f0(
        self,
        input_audio_path,
        x,
        p_len,
        f0_up_key,
        f0_method,
        filter_radius,
        crepe_hop_length,
        inp_f0=None,
    ):
        """Estimate, transpose and quantise the pitch contour of ``x``.

        Args:
            input_audio_path: Key used to register ``x`` for the cached
                harvest estimator.
            x: Waveform (numpy array).
            p_len: Target number of frames.
            f0_up_key: Transposition in semitones.
            f0_method: ``pm``, ``harvest``, ``dio``, ``crepe``,
                ``crepe-tiny``, ``mangio-crepe``, ``mangio-crepe-tiny``,
                ``rmvpe`` or a ``hybrid[...]`` spec.
            filter_radius: Median filtering of ``harvest`` when > 2.
            crepe_hop_length: Hop length for the mangio-crepe variants.
            inp_f0: Optional ``(n, 2)`` array of ``(time_seconds, f0)`` rows
                that overrides the estimate after the leading padding.

        Returns:
            ``(f0_coarse, f0)``: integer bins in ``[1, 255]`` and the
            transposed contour in Hz.

        Raises:
            ValueError: If ``f0_method`` is not supported.
        """
        time_step = self.window / self.sr * 1000
        f0_min = 50
        f0_max = 1100

        if f0_method == "pm":
            f0 = self._f0_pm(x, p_len, f0_min, f0_max, time_step)
        elif f0_method == "harvest":
            input_audio_path2wav[input_audio_path] = x.astype(np.double)
            f0 = self._f0_harvest(input_audio_path, f0_min, f0_max, filter_radius)
        elif f0_method == "dio":  # potentially buggy?
            f0 = self._f0_dio(x, f0_min, f0_max)
        elif f0_method == "crepe":
            f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
        elif f0_method == "crepe-tiny":
            f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
        elif f0_method == "mangio-crepe":
            f0 = self.get_f0_crepe_computation(
                x, f0_min, f0_max, p_len, crepe_hop_length
            )
        elif f0_method == "mangio-crepe-tiny":
            f0 = self.get_f0_crepe_computation(
                x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
            )
        elif f0_method == "rmvpe":
            if not hasattr(self, "model_rmvpe"):
                from rmvpe import RMVPE

                print("loading rmvpe model")
                self.model_rmvpe = RMVPE(
                    "rmvpe.pt", is_half=self.is_half, device=self.device
                )
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
        elif "hybrid" in f0_method:
            # Hybrid median pitch estimation.
            input_audio_path2wav[input_audio_path] = x.astype(np.double)
            f0 = self.get_f0_hybrid_computation(
                f0_method,
                input_audio_path,
                x,
                f0_min,
                f0_max,
                p_len,
                filter_radius,
                crepe_hop_length,
                time_step,
            )
        else:
            raise ValueError(f"Unsupported f0 method: {f0_method!r}")

        # Out-of-place on purpose: the harvest contour may be the very array
        # stored in the lru_cache, and scaling it in place would compound
        # the transposition on every later call.
        f0 = f0 * pow(2, f0_up_key / 12)

        tf0 = self.sr // self.window  # f0 frames per second
        if inp_f0 is not None:
            # Plain int: int16 overflowed for curves longer than ~327 s.
            delta_t = int(
                np.round((inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1)
            )
            replace_f0 = np.interp(
                list(range(delta_t)), inp_f0[:, 0] * tf0, inp_f0[:, 1]
            )
            start = self.x_pad * tf0
            shape = f0[start : start + len(replace_f0)].shape[0]
            f0[start : start + len(replace_f0)] = replace_f0[:shape]

        f0bak = f0.copy()
        f0_coarse = self._quantize_f0(f0, f0_min, f0_max)
        return f0_coarse, f0bak

    def vc(
        self,
        model,
        net_g,
        sid,
        audio0,
        pitch,
        pitchf,
        times,
        index,
        big_npy,
        index_rate,
        version,
        protect,
    ):
        """Convert one audio segment.

        Args:
            model: Feature extractor exposing ``extract_features`` (and
                ``final_proj``, used for ``version == "v1"``).
            net_g: Generator exposing ``infer``.
            sid: Speaker-id tensor passed through to ``net_g.infer``.
            audio0: Segment waveform (numpy array; 2-D input is averaged
                over the last axis).
            pitch, pitchf: Coarse and continuous pitch tensors, or ``None``
                for models without f0.
            times: Mutable list; feature-extraction time is added to
                ``times[0]`` and synthesis time to ``times[2]``.
            index, big_npy: faiss index and its reconstructed vectors, or
                ``None`` to skip retrieval.
            index_rate: Blend weight of the retrieved features.
            version: ``"v1"`` selects output layer 9 + ``final_proj``;
                anything else uses layer 12.
            protect: When < 0.5 and pitch is given, unvoiced frames keep
                ``1 - protect`` of the un-retrieved features.

        Returns:
            Synthesised waveform as a float32 numpy array.
        """
        feats = torch.from_numpy(audio0)
        feats = feats.half() if self.is_half else feats.float()
        if feats.dim() == 2:  # two channels
            feats = feats.mean(-1)
        assert feats.dim() == 1, feats.dim()
        feats = feats.view(1, -1)
        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)

        inputs = {
            "source": feats.to(self.device),
            "padding_mask": padding_mask,
            "output_layer": 9 if version == "v1" else 12,
        }
        t0 = ttime()
        with torch.no_grad():
            logits = model.extract_features(**inputs)
            feats = model.final_proj(logits[0]) if version == "v1" else logits[0]

        has_pitch = pitch is not None and pitchf is not None
        use_protect = protect < 0.5 and has_pitch
        if use_protect:
            feats0 = feats.clone()

        if index is not None and big_npy is not None and index_rate != 0:
            npy = feats[0].cpu().numpy()
            if self.is_half:
                npy = npy.astype("float32")

            score, ix = index.search(npy, k=8)
            # Inverse-square-distance weights; the floor keeps an exact
            # match (distance 0) from producing inf/inf = NaN.
            weight = np.square(1 / np.maximum(score, 1e-9))
            weight /= weight.sum(axis=1, keepdims=True)
            npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)

            if self.is_half:
                npy = npy.astype("float16")
            feats = (
                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                + (1 - index_rate) * feats
            )

        # Double the number of frames along the time axis.
        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
        if use_protect:
            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
                0, 2, 1
            )
        t1 = ttime()

        p_len = audio0.shape[0] // self.window
        if feats.shape[1] < p_len:
            p_len = feats.shape[1]
            if has_pitch:
                pitch = pitch[:, :p_len]
                pitchf = pitchf[:, :p_len]

        if use_protect:
            pitchff = pitchf.clone()
            pitchff[pitchf > 0] = 1
            pitchff[pitchf < 1] = protect
            pitchff = pitchff.unsqueeze(-1)
            feats = feats * pitchff + feats0 * (1 - pitchff)
            feats = feats.to(feats0.dtype)

        p_len = torch.tensor([p_len], device=self.device).long()
        with torch.no_grad():
            if has_pitch:
                generated = net_g.infer(feats, p_len, pitch, pitchf, sid)
            else:
                generated = net_g.infer(feats, p_len, sid)
            audio1 = generated[0][0, 0].data.cpu().float().numpy()

        del feats, p_len, padding_mask
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        t2 = ttime()
        times[0] += t1 - t0
        times[2] += t2 - t1
        return audio1

    def pipeline(
        self,
        model,
        net_g,
        sid,
        audio,
        input_audio_path,
        times,
        f0_up_key,
        f0_method,
        file_index,
        index_rate,
        if_f0,
        filter_radius,
        tgt_sr,
        resample_sr,
        rms_mix_rate,
        version,
        protect,
        crepe_hop_length,
        f0_file=None,
    ):
        """Run the whole conversion for one waveform.

        The audio is high-pass filtered, split at low-energy points when it
        is longer than ``self.t_max``, converted segment by segment with
        :meth:`vc`, and finally loudness-matched, resampled and converted
        to int16.

        Args:
            model, net_g, version, protect, index_rate: Forwarded to
                :meth:`vc`.
            sid: Speaker id; converted to a long tensor here.
            audio: 1-D input waveform (numpy array).
            input_audio_path, f0_up_key, f0_method, filter_radius,
                crepe_hop_length: Forwarded to :meth:`get_f0`.
            times: Mutable list; f0 time is added to ``times[1]``
                (indices 0 and 2 are updated by :meth:`vc`).
            file_index: Path of a faiss index; retrieval is skipped when it
                is empty, missing, unreadable or ``index_rate`` is 0.
            if_f0: ``1`` when the model uses pitch.
            tgt_sr: Sample rate of the generator output.
            resample_sr: Final sample rate; applied when >= 16000 and
                different from ``tgt_sr``.
            rms_mix_rate: Passed to ``change_rms`` unless it equals 1.
            f0_file: Optional object with a ``name`` attribute pointing to a
                CSV of ``time,f0`` rows that overrides the estimated pitch.

        Returns:
            int16 numpy array with the converted audio.
        """
        index = big_npy = None
        if file_index and os.path.exists(file_index) and index_rate != 0:
            try:
                index = faiss.read_index(file_index)
                big_npy = index.reconstruct_n(0, index.ntotal)
            except Exception:
                traceback.print_exc()
                index = big_npy = None

        audio = signal.filtfilt(bh, ah, audio)

        # Find quiet cut points so long inputs can be converted in chunks.
        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
        opt_ts = []
        if audio_pad.shape[0] > self.t_max:
            audio_sum = np.zeros_like(audio)
            for i in range(self.window):
                audio_sum += audio_pad[i : i - self.window]
            for t in range(self.t_center, audio.shape[0], self.t_center):
                search = np.abs(audio_sum[t - self.t_query : t + self.t_query])
                # argmin returns the first minimum, like where(...)[0][0].
                opt_ts.append(t - self.t_query + int(np.argmin(search)))

        s = 0
        audio_opt = []
        t = None
        t1 = ttime()
        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // self.window

        inp_f0 = None
        if hasattr(f0_file, "name"):
            try:
                with open(f0_file.name, "r") as f:
                    lines = f.read().strip("\n").split("\n")
                # Assigned only after a full parse so a malformed file
                # leaves ``inp_f0`` as None instead of a partial list.
                inp_f0 = np.array(
                    [[float(i) for i in line.split(",")] for line in lines],
                    dtype="float32",
                )
            except Exception:
                traceback.print_exc()

        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        use_f0 = if_f0 == 1
        pitch, pitchf = None, None
        if use_f0:
            pitch, pitchf = self.get_f0(
                input_audio_path,
                audio_pad,
                p_len,
                f0_up_key,
                f0_method,
                filter_radius,
                crepe_hop_length,
                inp_f0,
            )
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            # Covers the plain string, "mps:0" and torch.device objects.
            if "mps" in str(self.device):
                pitchf = pitchf.astype(np.float32)
            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
        t2 = ttime()
        times[1] += t2 - t1

        def convert(segment, pitch_seg, pitchf_seg):
            """Convert one padded segment and trim the padding off again."""
            converted = self.vc(
                model,
                net_g,
                sid,
                segment,
                pitch_seg,
                pitchf_seg,
                times,
                index,
                big_npy,
                index_rate,
                version,
                protect,
            )
            return converted[self.t_pad_tgt : -self.t_pad_tgt]

        for t in opt_ts:
            t = t // self.window * self.window  # align the cut to a frame
            end = t + self.t_pad2
            segment = audio_pad[s : end + self.window]
            if use_f0:
                frames = slice(s // self.window, end // self.window)
                audio_opt.append(convert(segment, pitch[:, frames], pitchf[:, frames]))
            else:
                audio_opt.append(convert(segment, None, None))
            s = t

        # Remainder after the last cut (or the whole audio when uncut).
        if use_f0:
            tail_pitch = pitch[:, t // self.window :] if t is not None else pitch
            tail_pitchf = pitchf[:, t // self.window :] if t is not None else pitchf
            audio_opt.append(convert(audio_pad[t:], tail_pitch, tail_pitchf))
        else:
            audio_opt.append(convert(audio_pad[t:], None, None))

        audio_opt = np.concatenate(audio_opt)
        if rms_mix_rate != 1:
            audio_opt = change_rms(audio, self.sr, audio_opt, tgt_sr, rms_mix_rate)
        if resample_sr >= 16000 and tgt_sr != resample_sr:
            audio_opt = librosa.resample(
                audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
            )

        # Scale to int16, attenuating only when the peak would clip.
        audio_max = np.abs(audio_opt).max() / 0.99
        max_int16 = 32768
        if audio_max > 1:
            max_int16 /= audio_max
        audio_opt = (audio_opt * max_int16).astype(np.int16)

        del pitch, pitchf, sid
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return audio_opt
|