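# Spaces: Sleeping / Sleeping
# (Page-banner text that appears to have been captured when this file was
# copied from a web view; it is not a configuration key.)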
# Data-loading settings.
# The key names match keyword arguments of torch.utils.data.DataLoader;
# presumably they are forwarded to it -- confirm against the loading code.
dataloader:
  batch_size: 16
  shuffle: true
  # NOTE(review): 64 worker processes is a large value; confirm it suits the
  # target machine.
  num_workers: 64
  drop_last: true
  pin_memory: true
# Model definition: each section pairs a `target` (dotted class path) with
# the `params` given to it.
# NOTE(review): the nesting below was reconstructed from the key names
# (target/params pairs and *_config sub-sections) because the original
# indentation was lost; confirm it against the code that loads this file.
model:
  target: cldm.cldm.ControlLDM
  params:
    # linear_start: 0.00085
    # linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    control_key: "hint"
    image_size: 64
    channels: 100
    cond_stage_trainable: true
    # conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: false
    only_mid_control: false

    # Disabled ControlNet section, kept for reference.
    # control_stage_config:
    #   target: cldm.cldm.ControlNet
    #   params:
    #     image_size: 32  # unused
    #     in_channels: 100
    #     hint_channels: 768
    #     model_channels: 128
    #     attention_resolutions: [4, 2, 1]
    #     num_res_blocks: 2
    #     channel_mult: [1, 2, 4, 4]
    #     num_heads: 8
    #     use_spatial_transformer: True
    #     transformer_depth: 1
    #     context_dim: 768
    #     use_checkpoint: True
    #     legacy: False

    # Reference network section.
    refer_config:
      target: cldm.cldm.ReferenceNet
      params:
        image_size: 32  # unused
        hint_in_channels: 1024
        hint_out_channels: 128
        in_channels: 100
        out_channels: 100
        model_channels: 1024
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 1
        channel_mult: [1, 1]
        num_heads: 8
        use_spatial_transformer: true
        transformer_depth: 1
        # Same value as cond_stage_config.params.embed_dim (512);
        # presumably the two must agree -- confirm.
        context_dim: 512
        use_checkpoint: true
        dims: 1
        legacy: false

    # Denoising network section. The active target is DiffusionTts; the
    # commented-out ControlledUnetModel settings below it are an alternative
    # kept for reference.
    unet_config:
      target: tortoise_model.DiffusionTts
      params:
        model_channels: 512
        num_layers: 8
        in_channels: 100
        in_latent_channels: 1024
        out_channels: 100
        dropout: 0
        use_fp16: false
        num_heads: 16
        layer_drop: 0.1
        unconditioned_percentage: 0.1
      # target: cldm.cldm.ControlledUnetModel
      # params:
      #   image_size: 32  # unused
      #   hint_in_channels: 1024
      #   hint_out_channels: 128
      #   in_channels: 100
      #   out_channels: 100
      #   model_channels: 1024
      #   attention_resolutions: [4, 2, 1]
      #   num_res_blocks: 1
      #   resblock_updown: True
      #   channel_mult: [1, 1]
      #   num_heads: 8
      #   use_spatial_transformer: True
      #   transformer_depth: 1
      #   context_dim: 512
      #   use_checkpoint: True
      #   dims: 1
      #   legacy: False

    # Conditioning encoder section.
    # NOTE(review): every key from `layers` to `final_ln_after_pool` is
    # placed under `vision_cfg`; the original indentation was lost, so
    # confirm that none of them belongs one level up.
    cond_stage_config:
      target: cldm.cond_emb.CLIP
      params:
        embed_dim: 512
        vision_cfg:
          layers: 6
          width: 512
          head_width: 64
          mlp_ratio: 4.0
          patch_dropout: 0.4
          attentional_pool: false
          patch_size: 64
          image_size: 1000
          in_channels: 100
          pool_type: 'tok'
          pos_embed_type: 'learnable'
          final_ln_after_pool: false
# Training-run settings (judging by the key names: optimizer, step counts,
# EMA, checkpointing and logging).
# NOTE(review): `train_batch_size` (32) and `num_workers` (32) differ from
# `dataloader.batch_size` (16) and `dataloader.num_workers` (64) in this same
# file; which set is used cannot be told from here -- confirm against the
# trainer code.
train:
  train_batch_size: 32
  gradient_accumulate_every: 1
  train_lr: 0.0001
  train_num_steps: 1000000
  ema_update_every: 10
  ema_decay: 0.995
  adam_betas: [0.9, 0.99]
  save_and_sample_every: 1000
  # Same value as model.params.timesteps (1000).
  timesteps: 1000
  sampling_timesteps: 1000
  results_folder: "results"
  logs_folder: "ttts/AA_diffusion/logs"
  num_workers: 32
  eps: 0.000000001  # 1e-9
  keep_ckpts: 3
  all_in_mem: false
# Input locations for training.
# Both values are absolute paths under /home/hyc, so they are specific to one
# machine; change them for any other environment.
# NOTE(review): `gpt_path` is placed under `dataset` as a reconstruction (the
# original indentation was lost); confirm it is not a top-level key.
dataset:
  path: "/home/hyc/tortoise_plus_zh/ttts/datasets/databaker_data.jsonl"
  gpt_path: "/home/hyc/tortoise_plus_zh/ttts/gpt/logs/2023-12-24-14-22-14/model-70.pt"