# Model configuration.
# Each `target` is a dotted class path and `params` holds the values supplied
# for it -- presumably consumed by an instantiate-from-config style loader;
# confirm against the code that reads this file.
model:
  target: cldm.cldm.ControlLDM
  params:
    # Noise schedule / timestep settings.
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000

    # Keys used to look up the image, text and control ("hint") entries --
    # presumably keys of the batch dict; verify against the data loader.
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    control_key: "hint"

    image_size: 64
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False
    only_mid_control: False

    # Control branch. Every key it shares with `unet_config` below carries the
    # same value in this file; NOTE(review): presumably they must stay in sync.
    control_stage_config:
      target: cldm.cldm.ControlNet
      params:
        image_size: 32
        in_channels: 4
        hint_channels: 3
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    # UNet settings; differs from the control branch only by having
    # `out_channels` instead of `hint_channels`.
    unet_config:
      target: cldm.cldm.ControlledUnetModel
      params:
        image_size: 32
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    # First-stage autoencoder settings.
    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        # The loss target is torch.nn.Identity.
        lossconfig:
          target: torch.nn.Identity

    # Conditioning (text) encoder; no `params` are given for it in this file.
    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder