File size: 2,492 Bytes
2de4670 03504d1 ea6e94f 2de4670 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
# Top-level audio settings.
# The sample rate is written without a digit separator: `24_000` is an integer
# only under YAML 1.1 resolution rules, while YAML 1.2 core-schema loaders read
# it as the string "24_000". `24000` is an integer under both.
sample_rate: 24000
audio_backend: "vocos"
# List of model definitions; this file declares a single entry, "ar+nar".
# Nesting is restored explicitly so that the per-model keys (`size`,
# `training`, ...) live inside the list item instead of at the document root.
models:
  - name: "ar+nar"
    size: "full"
    resp_levels: 8
    prom_levels: 8
    tasks: 8
    langs: 2
    tones: 1
    arch_type: llama
    training: false
    version: 5
    attention: auto
    dropout: 0.1
    # Per-category factors for text / prom / resp.
    # NOTE(review): presumably loss weights, going by the key name - confirm
    # against the consumer.
    loss_factors:
      text: 0.01
      prom: 0.5
      resp: 1.0
    capabilities: ["ar", "nar"]
    # Feature toggles grouped under `experimental`.
    experimental:
      audio_embedding_sums: false
      unified_position_ids: false
      split_classifiers: true
# Optimisation hyperparameters.
hyperparameters:
  # Autotuning is switched off; the parameters below are kept for when it is
  # enabled.
  autotune: false
  autotune_params:
    start_profile_step: 1
    end_profile_step: 50
    num_tuning_micro_batch_sizes: 8

  batch_size: 16
  gradient_accumulation_steps: 8
  gradient_clipping: 1.0
  warmup_steps: 250

  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: true

  # Scheduler name is intentionally the empty string; "ScheduleFree" is the
  # alternative noted by the original author.
  scheduler: ""  # ScheduleFree
  torch_scheduler: true
# Evaluation settings. All keys are children of `evaluation`; left at column 0
# they would sit at the document root next to every other section's keys.
evaluation:
  batch_size: 16
  frequency: 1000
  size: 16

  steps: 500
  ar_temperature: 0.95
  nar_temperature: 0.25
  load_disabled_engines: true
# Trainer settings, including the nested `deepspeed` backend options.
trainer:
  # no_logger: true
  ddp: false
  check_for_oom: false
  # Plain digits instead of `1_000_000`: the underscore form is an integer only
  # under YAML 1.1 rules and is read as a string by YAML 1.2 core-schema
  # loaders.
  iterations: 1000000

  save_tag: step
  save_on_oom: true
  save_on_quit: true
  save_frequency: 500
  export_on_save: true

  keep_last_checkpoints: 8

  aggressive_optimizations: false
  load_disabled_engines: false
  gradient_checkpointing: true

  # load_state_dict: true
  strict_loading: false
  # load_tag: "9500"
  # load_states: false
  # restart_step_count: true

  # NOTE(review): a plain `None` is loaded by YAML as the string "None", not as
  # null. Left unchanged to preserve behaviour; write `null` if a real null is
  # what the consumer expects - confirm.
  gc_mode: None  # "global_step"

  weight_dtype: bfloat16
  amp: true

  backend: deepspeed
  deepspeed:
    inferencing: true
    zero_optimization_level: 0
    use_compression_training: false
    # Kept inside `deepspeed`: `trainer` already defines `amp` above, so a
    # second `amp` at the trainer level would be a duplicate key.
    amp: false

  # NOTE(review): nesting level inferred (the source had lost its indentation);
  # confirm this key belongs to `trainer` rather than to `deepspeed`.
  load_webui: false
# Inference settings. Keys are nested under `inference` so they do not share a
# level with identically named keys in other sections of the file.
inference:
  backend: deepspeed
  audio_backend: "vocos"
  normalize: false

  weight_dtype: bfloat16
  amp: true
# Optimisation toggles; every value is a boolean flag.
optimizations:
  injects: false
  replace: true

  linear: false
  embedding: false
  optimizers: true

  bitsandbytes: false
  dadaptation: false
  bitnet: false
  fp8: false
# Dataset settings.
dataset:
  # Python lambda source strings, stored as text; each takes a path-like `p`
  # and builds a name from its trailing path components.
  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
  speaker_group_getter: "lambda p: f'{p.parts[-3]}'"
  # Mapping of language code to a list; `ja` is present with an empty list.
  speaker_languages:
    ja: []

  use_hdf5: true
  use_metadata: true
  # NOTE(review): presumably the open mode for the HDF5 file - confirm.
  hdf5_flag: r
  validate: true

  workers: 6
  cache: true

  duration_range: [24.0, 32.0]

  random_utterance: 1.0
  max_prompts: 1
  prompt_duration_range: [3.0, 9.0]

  max_resps: 1
  p_resp_append: 0.25

  sample_type: path  # values noted by the original author: path, speaker

  # Alternative list left by the original author:
  # ["tts", "tts-c", "ns", "sr", "tse", "cse", "nse", "tts"]
  tasks_list: ["tts"]

  # Explicit empty lists (a bare `key:` would load as null instead).
  training: []
  validation: []
  noise: []
|