# Configuration for the "ar+nar" model: model definition, training
# hyperparameters, evaluation, trainer, inference, and dataset settings.
# Global audio settings.
# The rate is written with plain digits rather than `24_000`: the underscore
# digit separator is only read as an integer by YAML 1.1 loaders, while
# YAML 1.2 core-schema loaders yield the string "24_000".
sample_rate: 24000
audio_backend: vocos
# Model definitions; this file declares a single entry named "ar+nar".
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped -- confirm it against the consuming loader.
models:
  - name: "ar+nar"
    size: "full"
    resp_levels: 8
    prom_levels: 8
    tasks: 8
    langs: 2
    tones: 1
    arch_type: retnet
    # Per-model flag; distinct from the `training` list under `dataset`.
    training: false
    version: 2
    dropout: 0.1
    capabilities: ["ar", "nar"]
    experimental:
      audio_embedding_sums: true
# Batch, optimizer, and scheduler settings.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped -- confirm it against the consuming loader.
hyperparameters:
  autotune: false
  # Only the three profiling/tuning keys are assumed to belong to
  # `autotune_params`; the keys after them sit directly under
  # `hyperparameters`.
  autotune_params:
    start_profile_step: 1
    end_profile_step: 50
    num_tuning_micro_batch_sizes: 8

  batch_size: 16
  gradient_accumulation_steps: 8
  gradient_clipping: 1.0
  warmup_steps: 250

  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: true

  # Empty string: no scheduler name is set.
  scheduler: ""  # ScheduleFree
  torch_scheduler: true
# Evaluation settings. These keys are nested so that `batch_size` and
# `load_disabled_engines` do not collide with the same-named keys in other
# sections.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped -- confirm it against the consuming loader.
evaluation:
  batch_size: 16
  frequency: 1000
  size: 16

  steps: 500
  ar_temperature: 0.95
  nar_temperature: 0.25
  load_disabled_engines: true
# Trainer settings: checkpointing, precision, and backend selection.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped -- confirm it against the consuming loader.
trainer:
  #no_logger: True
  ddp: false
  check_for_oom: false
  # Plain digits rather than `1_000_000`: the underscore separator is only
  # read as an integer by YAML 1.1 loaders.
  iterations: 1000000

  save_tag: step
  save_on_oom: true
  save_on_quit: true
  save_frequency: 500
  export_on_save: true

  keep_last_checkpoints: 8

  aggressive_optimizations: false
  load_disabled_engines: false
  gradient_checkpointing: true

  #load_state_dict: True
  strict_loading: false
  #load_tag: "9500"
  #load_states: False
  #restart_step_count: True

  # Written as `null`: a plain `None` is not a YAML null and would be loaded
  # as the string "None".
  # NOTE(review): presumably "unset" was intended -- confirm with the consumer.
  gc_mode: null  # "global_step"

  weight_dtype: bfloat16
  amp: true

  backend: deepspeed
  # `amp` appears a second time after `trainer.amp`, so it must live inside
  # this sub-map; the other three keys are assumed to belong here too.
  deepspeed:
    inferencing: true
    zero_optimization_level: 0
    use_compression_training: false

    amp: false

  # NOTE(review): assumed to be a trainer-level key rather than part of
  # `deepspeed` -- confirm.
  load_webui: false
# Inference settings. `backend`, `weight_dtype`, and `amp` repeat key names
# used under `trainer`, so they are nested here to stay distinct.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped -- confirm it against the consuming loader.
inference:
  backend: deepspeed
  audio_backend: "vocos"
  normalize: false

  weight_dtype: bfloat16
  amp: true
# Optimization toggles; every entry is a boolean switch.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped -- confirm it against the consuming loader.
optimizations:
  injects: false
  replace: true

  linear: false
  embedding: false
  optimizers: true

  bitsandbytes: false
  dadaptation: false
  bitnet: false
  fp8: false
# Dataset settings: speaker naming, storage, sampling limits, and data lists.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped -- confirm it against the consuming loader.
dataset:
  # NOTE(review): these two values look like Python lambda source over path
  # parts; presumably the loader evaluates them, so only load this file from
  # a trusted source -- confirm.
  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
  speaker_group_getter: "lambda p: f'{p.parts[-3]}'"
  speaker_languages:
    ja: []

  use_hdf5: true
  use_metadata: true
  hdf5_flag: r
  validate: true

  workers: 6
  cache: true

  duration_range: [3.0, 16.0]

  random_utterance: 1.0
  max_prompts: 1
  prompt_duration_range: [3.0, 9.0]

  max_resps: 1
  p_resp_append: 0.25

  sample_type: path  # path # speaker

  tasks_list: ["tts"]  # , [ "tts", "tts-c", "ns", "sr", "tse", "cse", "nse", "tts"]

  # Explicit empty lists: a bare `key:` would load as null instead.
  training: []
  validation: []
  noise: []