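# Training configuration for the AA_diffusion module of ttts (tortoise_plus_zh):
# a ControlLDM-style latent diffusion decoder for text-to-speech, wrapping a
# Tortoise-style DiffusionTts denoiser. Top-level sections: dataloader, model,
# train, dataset.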
dataloader:
  batch_size: 16
  shuffle: true
  num_workers: 64
  drop_last: true
  pin_memory: true
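# ControlLDM wrapper (from the ControlNet "cldm" codebase). The keys below mirror
# LatentDiffusion hyperparameters; first_stage_key / cond_stage_key / control_key
# are the batch-dict keys the model reads ("jpg"/"txt"/"hint" are names carried
# over from the original image configs, not file formats). channels: 100
# presumably matches the 100-bin mel/latent representation used elsewhere here.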
model:
  target: cldm.cldm.ControlLDM
  params:
    # linear_start: 0.00085
    # linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    control_key: "hint"
    image_size: 64
    channels: 100
    cond_stage_trainable: true
    # conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False
    only_mid_control: False
    # control_stage_config:
    #   target: cldm.cldm.ControlNet
    #   params:
    #     image_size: 32 # unused
    #     in_channels: 100
    #     hint_channels: 768
    #     model_channels: 128
    #     attention_resolutions: [ 4, 2, 1 ]
    #     num_res_blocks: 2
    #     channel_mult: [ 1, 2, 4, 4 ]
    #     num_heads: 8
    #     use_spatial_transformer: True
    #     transformer_depth: 1
    #     context_dim: 768
    #     use_checkpoint: True
    #     legacy: False
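    # The ControlNet control branch above is disabled; conditioning on the
    # reference signal appears to be handled by the ReferenceNet below instead.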
    refer_config:
      target: cldm.cldm.ReferenceNet
      params:
        image_size: 32 # unused
        hint_in_channels: 1024
        hint_out_channels: 128
        in_channels: 100
        out_channels: 100
        model_channels: 1024
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 1
        channel_mult: [ 1, 1 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 512
        use_checkpoint: True
        dims: 1
        legacy: False
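    # Denoiser: Tortoise-style DiffusionTts U-Net over 100-channel (mel-bin)
    # sequences; in_latent_channels: 1024 presumably matches the GPT latent
    # width, and unconditioned_percentage: 0.1 drops conditioning part of the
    # time for classifier-free guidance (interpretation follows the upstream
    # Tortoise implementation and is an assumption here).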
    unet_config:
      target: tortoise_model.DiffusionTts
      params:
        model_channels: 512
        num_layers: 8
        in_channels: 100
        in_latent_channels: 1024
        out_channels: 100
        dropout: 0
        use_fp16: False
        num_heads: 16
        layer_drop: 0.1
        unconditioned_percentage: 0.1
      # target: cldm.cldm.ControlledUnetModel
      # params:
      #   image_size: 32 # unused
      #   hint_in_channels: 1024
      #   hint_out_channels: 128
      #   in_channels: 100
      #   out_channels: 100
      #   model_channels: 1024
      #   attention_resolutions: [ 4, 2, 1 ]
      #   num_res_blocks: 1
      #   resblock_updown: True
      #   channel_mult: [ 1, 1 ]
      #   num_heads: 8
      #   use_spatial_transformer: True
      #   transformer_depth: 1
      #   context_dim: 512
      #   use_checkpoint: True
      #   dims: 1
      #   legacy: False
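    # Conditioning encoder: a CLIP-style vision tower applied to the 100-channel
    # input, producing 512-dim embeddings for cross-attention (context_dim: 512
    # above). Field meanings in vision_cfg are assumed to follow open_clip's
    # CLIPVisionCfg.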
    cond_stage_config:
      target: cldm.cond_emb.CLIP
      params:
        embed_dim: 512
        vision_cfg:
          layers: 6
          width: 512
          head_width: 64
          mlp_ratio: 4.0
          patch_dropout: 0.4
          attentional_pool: False
          patch_size: 64
          image_size: 1000
          in_channels: 100
          pool_type: 'tok'
          pos_embed_type: 'learnable'
          final_ln_after_pool: false
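# Trainer hyperparameters (learning rate, EMA, Adam betas, checkpointing).
# Note that train_batch_size here (32) and dataloader.batch_size above (16)
# differ; which one the training script actually consumes depends on the
# trainer code.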
train:
  train_batch_size: 32
  gradient_accumulate_every: 1
  train_lr: 0.0001
  train_num_steps: 1000000
  ema_update_every: 10
  ema_decay: 0.995
  adam_betas: [0.9, 0.99]
  save_and_sample_every: 1000
  timesteps: 1000
  sampling_timesteps: 1000
  results_folder: "results"
  logs_folder: "ttts/AA_diffusion/logs"
  num_workers: 32
  eps: 0.000000001
  keep_ckpts: 3
  all_in_mem: false
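# Dataset paths: a jsonl manifest (DataBaker Chinese TTS data, judging by the
# filename) and a pretrained GPT checkpoint, presumably used to produce the
# 1024-dim conditioning latents. Both are absolute paths on the original
# author's machine and need to be adjusted for other setups.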
dataset:
  path: "/home/hyc/tortoise_plus_zh/ttts/datasets/databaker_data.jsonl"
  gpt_path: "/home/hyc/tortoise_plus_zh/ttts/gpt/logs/2023-12-24-14-22-14/model-70.pt"
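# The target/params pairs above follow the latent-diffusion/ControlNet
# instantiation convention. A minimal loading sketch, assuming the ldm helpers
# from the ControlNet codebase are available (how this repo's training loop
# actually loads the file may differ):
#
#   from omegaconf import OmegaConf
#   from ldm.util import instantiate_from_config
#
#   cfg = OmegaConf.load("config.yaml")          # this file
#   model = instantiate_from_config(cfg.model)   # builds cldm.cldm.ControlLDM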