Spaces:

John6666
/

qwen2vl-flux-zero

Runtime error

App Files Files Community

qwen2vl-flux-zero / model.py

John6666

Upload 46 files

0eea822 verified 29 days ago

raw

history blame contribute delete

27.9 kB

	import torch
	from torch import nn
	from PIL import Image
	from transformers import CLIPTokenizer, CLIPTextModel, AutoProcessor, T5EncoderModel, T5TokenizerFast
	from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler
	from flux.transformer_flux import FluxTransformer2DModel

	from flux.pipeline_flux_chameleon import FluxPipeline
	from flux.pipeline_flux_img2img import FluxImg2ImgPipeline
	from flux.pipeline_flux_inpaint import FluxInpaintPipeline
	from flux.pipeline_flux_controlnet import FluxControlNetPipeline, FluxControlNetModel
	from flux.pipeline_flux_controlnet_img2img import FluxControlNetImg2ImgPipeline
	from flux.controlnet_flux import FluxMultiControlNetModel
	from flux.pipeline_flux_controlnet_inpainting import FluxControlNetInpaintPipeline

	from qwen2_vl.modeling_qwen2_vl import Qwen2VLSimplifiedModel
	import os
	import cv2
	import numpy as np
	import math

	def get_model_path(model_name: str) -> str:
	    """Return the location of ``model_name`` inside the checkpoints directory.

	    The checkpoints directory is read from the ``CHECKPOINT_DIR`` environment
	    variable on every call and falls back to the relative path
	    ``checkpoints`` when the variable is not set.

	    Args:
	        model_name: Sub-directory (or file) name of the model.

	    Returns:
	        ``<checkpoint dir>/<model_name>`` joined with ``os.path.join``.
	    """
	    # The environment variable lets deployments relocate the weights
	    # without touching the code.
	    base_dir = os.getenv('CHECKPOINT_DIR', 'checkpoints')
	    return os.path.join(base_dir, model_name)

	# Locations of every checkpoint used by this module, resolved once at
	# import time through get_model_path().  Plain string entries are model
	# directories; nested entries additionally name the weight file (and, for
	# SAM2, the config file) stored under 'path'.
	MODEL_PATHS = {
	    'flux': get_model_path('flux'),
	    'qwen2vl': get_model_path('qwen2-vl'),
	    'controlnet': get_model_path('controlnet'),
	    'depth_anything': dict(
	        path=get_model_path('depth-anything-v2'),
	        weights='depth_anything_v2_vitl.pth',
	    ),
	    'anyline': dict(
	        path=get_model_path('anyline'),
	        weights='MTEED.pth',
	    ),
	    'sam2': dict(
	        path=get_model_path('segment-anything-2'),
	        weights='sam2_hiera_large.pt',
	        config='sam2_hiera_l.yaml',
	    ),
	}


	# Output resolution for each supported aspect-ratio label, stored as
	# (width, height) in pixels.  Every dimension is a multiple of 64.
	ASPECT_RATIOS = {
	    "1:1": (1024, 1024),    # square
	    "16:9": (1344, 768),    # landscape
	    "9:16": (768, 1344),    # portrait
	    "2.4:1": (1536, 640),   # wide landscape
	    "3:4": (896, 1152),     # portrait
	    "4:3": (1152, 896),     # landscape
	}

	class Qwen2Connector(nn.Module):
	    """Single linear layer that projects features from one width to another.

	    The defaults (3584 -> 4096) are the sizes used when this module is
	    instantiated for the Qwen2-VL hidden states elsewhere in this file.
	    """

	    def __init__(self, input_dim: int = 3584, output_dim: int = 4096):
	        """Create the projection.

	        Args:
	            input_dim: Size of the last dimension of the incoming tensor.
	            output_dim: Size of the last dimension of the projected tensor.
	        """
	        super().__init__()
	        self.linear = nn.Linear(input_dim, output_dim)

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Project ``x`` (``[..., input_dim]``) to ``[..., output_dim]``."""
	        return self.linear(x)

	class FluxModel:
	    """Image generator that conditions FLUX pipelines on Qwen2-VL image features.

	    The class always loads the Qwen2-VL encoder, the connector, the CLIP/T5
	    text encoders, the VAE and the FLUX transformer.  ControlNet, the depth
	    estimator, the line detector and SAM2 are loaded only when requested via
	    ``required_features``.
	    """

	    # Pixel budgets handed to the Qwen2-VL processor as both min_pixels and
	    # max_pixels.  A smaller budget is used when a text prompt is supplied.
	    # NOTE(review): written as N * 28 * 28 following the Qwen2-VL processor
	    # convention for expressing pixel budgets -- confirm against upstream.
	    _PIXELS_WITH_PROMPT = 256 * 28 * 28
	    _PIXELS_WITHOUT_PROMPT = 512 * 28 * 28

	    _VALID_MODES = ("variation", "img2img", "inpaint", "controlnet", "controlnet-inpaint")
	    _MASKED_MODES = ("inpaint", "controlnet-inpaint")
	    _CONTROLNET_MODES = ("controlnet", "controlnet-inpaint")

	    def __init__(self, is_turbo=False, device="cuda", required_features=None):
	        """Load the base models and any optional feature models.

	        Args:
	            is_turbo: Quantize the transformer weights for faster inference.
	            device: Device to run the models on.
	            required_features: Optional list drawn from
	                ``['controlnet', 'depth', 'line', 'sam']``.  Requesting
	                ``'depth'`` or ``'line'`` also loads ControlNet.
	        """
	        self.device = torch.device(device)
	        self.dtype = torch.bfloat16
	        if required_features is None:
	            required_features = []

	        # Kept for backward compatibility: record which optional packages
	        # have been imported by the _init_* helpers.
	        self._line_detector_imported = False
	        self._depth_model_imported = False
	        self._sam_imported = False
	        self._turbo_imported = False

	        # Qwen2-VL processors keyed by pixel budget (see _get_processor).
	        self._processor_cache = {}

	        # Base models are always required.
	        self._init_base_models()

	        # Depth and line conditioning are consumed through ControlNet.
	        if 'controlnet' in required_features or any(f in required_features for f in ['depth', 'line']):
	            self._init_controlnet()

	        if 'depth' in required_features:
	            self._init_depth_model()

	        if 'line' in required_features:
	            self._init_line_detector()

	        if 'sam' in required_features:
	            self._init_sam()

	        if is_turbo:
	            self._enable_turbo()

	    def _init_base_models(self):
	        """Initialize the core models that are always needed."""
	        # Qwen2-VL encoder and the connector that projects its features.
	        self.qwen2vl = Qwen2VLSimplifiedModel.from_pretrained(
	            MODEL_PATHS['qwen2vl'],
	            torch_dtype=self.dtype
	        )
	        self.qwen2vl.requires_grad_(False).to(self.device)

	        self.connector = Qwen2Connector(input_dim=3584, output_dim=4096)
	        connector_path = os.path.join(MODEL_PATHS['qwen2vl'], "connector.pt")
	        if os.path.exists(connector_path):
	            connector_state_dict = torch.load(connector_path, map_location=self.device, weights_only=True)
	            # Drop the 'module.' prefix from parameter names.
	            # NOTE(review): presumably left by a DataParallel/DDP wrapper
	            # at save time -- confirm.
	            connector_state_dict = {k.replace('module.', ''): v for k, v in connector_state_dict.items()}
	            self.connector.load_state_dict(connector_state_dict)
	        else:
	            # Loading stays best-effort, but a randomly initialised
	            # connector should not go unnoticed.
	            import warnings
	            warnings.warn(
	                f"Connector weights not found at {connector_path}; "
	                "the connector keeps its random initialisation."
	            )
	        self.connector.to(self.dtype).to(self.device)

	        # Text encoders.
	        self.tokenizer = CLIPTokenizer.from_pretrained(MODEL_PATHS['flux'], subfolder="tokenizer")
	        self.text_encoder = CLIPTextModel.from_pretrained(MODEL_PATHS['flux'], subfolder="text_encoder")
	        self.text_encoder_two = T5EncoderModel.from_pretrained(MODEL_PATHS['flux'], subfolder="text_encoder_2")
	        self.tokenizer_two = T5TokenizerFast.from_pretrained(MODEL_PATHS['flux'], subfolder="tokenizer_2")

	        self.text_encoder.requires_grad_(False).to(self.dtype).to(self.device)
	        self.text_encoder_two.requires_grad_(False).to(self.dtype).to(self.device)

	        # Linear layer applied to the T5 embeddings (4096 -> 3072); unlike
	        # the connector, its weights are mandatory.
	        self.t5_context_embedder = nn.Linear(4096, 3072)
	        t5_embedder_path = os.path.join(MODEL_PATHS['qwen2vl'], "t5_embedder.pt")
	        t5_embedder_state_dict = torch.load(t5_embedder_path, map_location=self.device, weights_only=True)
	        self.t5_context_embedder.load_state_dict(t5_embedder_state_dict)
	        self.t5_context_embedder.to(self.dtype).to(self.device)

	        # Scheduler, VAE and transformer.
	        self.noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(MODEL_PATHS['flux'], subfolder="scheduler", shift=1)
	        self.vae = AutoencoderKL.from_pretrained(MODEL_PATHS['flux'], subfolder="vae")
	        self.transformer = FluxTransformer2DModel.from_pretrained(MODEL_PATHS['flux'], subfolder="transformer")

	        self.vae.requires_grad_(False).to(self.dtype).to(self.device)
	        self.transformer.requires_grad_(False).to(self.dtype).to(self.device)

	    def _init_controlnet(self):
	        """Initialize the ControlNet model and wrap it in a multi-ControlNet."""
	        self.controlnet_union = FluxControlNetModel.from_pretrained(
	            MODEL_PATHS['controlnet'],
	            torch_dtype=self.dtype
	        )
	        self.controlnet_union.requires_grad_(False).to(self.device)
	        self.controlnet = FluxMultiControlNetModel([self.controlnet_union])

	    def _init_depth_model(self):
	        """Initialize the Depth Anything V2 model."""
	        # Import unconditionally: Python caches modules, and guarding the
	        # import behind the flag left the name unbound on a second call.
	        from depth_anything_v2.dpt import DepthAnythingV2
	        self._depth_model_imported = True

	        self.depth_model = DepthAnythingV2(
	            encoder='vitl',
	            features=256,
	            out_channels=[256, 512, 1024, 1024]
	        )
	        depth_weights = os.path.join(MODEL_PATHS['depth_anything']['path'],
	                                     MODEL_PATHS['depth_anything']['weights'])
	        # weights_only=True matches the other checkpoint loads in this class
	        # and avoids unpickling arbitrary objects.
	        self.depth_model.load_state_dict(
	            torch.load(depth_weights, map_location=self.device, weights_only=True)
	        )
	        self.depth_model.requires_grad_(False).to(self.device)

	    def _init_line_detector(self):
	        """Initialize the line detection model."""
	        from controlnet_aux import AnylineDetector
	        self._line_detector_imported = True

	        self.anyline = AnylineDetector.from_pretrained(
	            MODEL_PATHS['anyline']['path'],
	            filename=MODEL_PATHS['anyline']['weights']
	        )
	        self.anyline.to(self.device)

	    def _init_sam(self):
	        """Initialize the SAM2 model and its image predictor."""
	        from sam2.build_sam import build_sam2
	        from sam2.sam2_image_predictor import SAM2ImagePredictor
	        self._sam_imported = True

	        sam2_checkpoint = os.path.join(MODEL_PATHS['sam2']['path'],
	                                       MODEL_PATHS['sam2']['weights'])
	        model_cfg = os.path.join(MODEL_PATHS['sam2']['path'],
	                                 MODEL_PATHS['sam2']['config'])
	        self.sam2_model = build_sam2(model_cfg, sam2_checkpoint, device=self.device)
	        self.sam2_predictor = SAM2ImagePredictor(self.sam2_model)

	    def _enable_turbo(self):
	        """Enable turbo mode by quantizing the transformer weights to qfloat8."""
	        from optimum.quanto import freeze, qfloat8, quantize
	        self._turbo_imported = True

	        quantize(
	            self.transformer,
	            weights=qfloat8,
	            # Module name patterns that are left unquantized.
	            exclude=[
	                ".norm", ".norm1", ".norm2", ".norm2_context",
	                "proj_out", "x_embedder", "norm_out", "context_embedder",
	            ],
	        )
	        freeze(self.transformer)

	    def _get_processor(self, pixel_budget):
	        """Return the Qwen2-VL processor for ``pixel_budget``, loading it once.

	        The budget is used as both ``min_pixels`` and ``max_pixels``.
	        """
	        # setdefault keeps this usable on instances built without __init__.
	        cache = self.__dict__.setdefault("_processor_cache", {})
	        if pixel_budget not in cache:
	            cache[pixel_budget] = AutoProcessor.from_pretrained(
	                MODEL_PATHS['qwen2vl'], min_pixels=pixel_budget, max_pixels=pixel_budget
	            )
	        return cache[pixel_budget]

	    def generate_mask(self, image, input_points, input_labels):
	        """Generate a segmentation mask with SAM2.

	        Args:
	            image: PIL Image or numpy array.
	            input_points: numpy array of shape (N, 2) with point coordinates.
	            input_labels: numpy array of shape (N,); 1 marks a foreground
	                point, 0 a background point.

	        Returns:
	            PIL Image: the mask with the highest score.

	        Raises:
	            Exception: any failure is printed and then re-raised unchanged.
	        """
	        try:
	            # The predictor is fed a numpy array.
	            if isinstance(image, Image.Image):
	                image_array = np.array(image)
	            else:
	                image_array = image

	            self.sam2_predictor.set_image(image_array)

	            with torch.inference_mode():
	                masks, scores, logits = self.sam2_predictor.predict(
	                    point_coords=input_points,
	                    point_labels=input_labels,
	                    multimask_output=True,
	                )

	            # Keep only the highest-scoring candidate.
	            best_mask_idx = scores.argmax()
	            mask = masks[best_mask_idx]
	            mask_image = Image.fromarray((mask * 255).astype(np.uint8))
	            return mask_image

	        except Exception as e:
	            print(f"Mask generation failed: {str(e)}")
	            raise

	    def recover_2d_shape(self, image_hidden_state, grid_thw):
	        """Reshape a flat token sequence into a 2D grid.

	        Args:
	            image_hidden_state: Tensor of shape (batch, tokens, hidden_dim).
	            grid_thw: ``(t, h, w)`` triple; only ``h`` and ``w`` are used.

	        Returns:
	            Tensor of shape (batch, h // 2, w // 2, hidden_dim).
	        """
	        batch_size, num_tokens, hidden_dim = image_hidden_state.shape
	        _, h, w = grid_thw
	        # NOTE(review): the halving presumably mirrors Qwen2-VL's 2x2 patch
	        # merging -- confirm.
	        h_out = h // 2
	        w_out = w // 2
	        return image_hidden_state.view(batch_size, h_out, w_out, hidden_dim)

	    def generate_attention_matrix(self, center_x, center_y, radius, image_shape):
	        """Build a radial weight map that is 1 at the centre and decays linearly.

	        Args:
	            center_x: Horizontal centre as a fraction of the width.
	            center_y: Vertical centre as a fraction of the height.
	            radius: Fall-off radius as a fraction of ``min(height, width)``.
	            image_shape: ``(height, width)`` of the map to produce.

	        Returns:
	            numpy array of shape (height, width) with values in [0, 1].
	        """
	        height, width = image_shape
	        y, x = np.ogrid[:height, :width]
	        center_y, center_x = center_y * height, center_x * width
	        # Euclidean distance of every cell to the centre.
	        distances = np.sqrt((x - center_x) ** 2 + (y - center_y) ** 2)
	        attention = np.clip(1 - distances / (radius * min(height, width)), 0, 1)
	        return attention

	    def apply_attention(self, image_hidden_state, image_grid_thw, center_x, center_y, radius):
	        """Weight image tokens by a radial map and return them flattened.

	        Args:
	            image_hidden_state: Tensor of shape (1, tokens, hidden_dim).
	            image_grid_thw: Tensor whose first row is the ``(t, h, w)`` grid.
	            center_x, center_y, radius: See ``generate_attention_matrix``.

	        Returns:
	            Tensor of shape (1, tokens, hidden_dim).
	        """
	        qwen2_2d_image_embedding = self.recover_2d_shape(image_hidden_state, tuple(image_grid_thw.tolist()[0]))
	        attention_matrix = self.generate_attention_matrix(
	            center_x, center_y, radius,
	            (qwen2_2d_image_embedding.size(1), qwen2_2d_image_embedding.size(2))
	        )
	        # Add batch and channel axes so the map broadcasts over both.
	        attention_tensor = torch.from_numpy(attention_matrix).to(self.dtype).unsqueeze(0).unsqueeze(-1)
	        qwen2_2d_image_embedding = qwen2_2d_image_embedding * attention_tensor.to(self.device)
	        return qwen2_2d_image_embedding.view(1, -1, qwen2_2d_image_embedding.size(3))

	    def compute_text_embeddings(self, prompt):
	        """Return the pooled CLIP text embedding of ``prompt`` in ``self.dtype``."""
	        with torch.no_grad():
	            text_inputs = self.tokenizer(prompt, padding="max_length", max_length=77, truncation=True, return_tensors="pt")
	            text_input_ids = text_inputs.input_ids.to(self.device)
	            prompt_embeds = self.text_encoder(text_input_ids, output_hidden_states=False)
	            pooled_prompt_embeds = prompt_embeds.pooler_output
	        return pooled_prompt_embeds.to(self.dtype)

	    def compute_t5_text_embeddings(
	        self,
	        max_sequence_length=256,
	        prompt=None,
	        num_images_per_prompt=1,
	        device=None,
	    ):
	        """Encode ``prompt`` with the T5 text encoder.

	        Args:
	            max_sequence_length: Token length the prompt is padded/truncated to.
	            prompt: A string or a list of strings.
	            num_images_per_prompt: Number of copies of each embedding.
	            device: Target device; defaults to ``self.device`` when ``None``.

	        Returns:
	            Tensor of shape (batch * num_images_per_prompt, seq_len, dim).
	        """
	        if device is None:
	            # Without a device the token ids would stay on the CPU while the
	            # encoder lives on self.device.
	            device = self.device

	        prompt = [prompt] if isinstance(prompt, str) else prompt
	        batch_size = len(prompt)

	        text_inputs = self.tokenizer_two(
	            prompt,
	            padding="max_length",
	            max_length=max_sequence_length,
	            truncation=True,
	            return_length=False,
	            return_overflowing_tokens=False,
	            return_tensors="pt",
	        )
	        text_input_ids = text_inputs.input_ids
	        prompt_embeds = self.text_encoder_two(text_input_ids.to(device))[0]

	        dtype = self.text_encoder_two.dtype
	        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)

	        _, seq_len, _ = prompt_embeds.shape

	        # Duplicate text embeddings for each generation per prompt, using an
	        # mps friendly method.
	        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
	        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

	        return prompt_embeds

	    def process_image(self, image):
	        """Run ``image`` through Qwen2-VL and return its image-token states.

	        Returns:
	            Tuple ``(image_hidden_state, image_grid_thw)`` where the hidden
	            state has shape (1, image_tokens, hidden_dim).
	        """
	        if getattr(self, "qwen2vl_processor", None) is None:
	            # generate() normally selects the processor; fall back to the
	            # no-prompt budget when this method is called on its own.
	            self.qwen2vl_processor = self._get_processor(self._PIXELS_WITHOUT_PROMPT)

	        message = [
	            {
	                "role": "user",
	                "content": [
	                    {"type": "image", "image": image},
	                    {"type": "text", "text": "Describe this image."},
	                ]
	            }
	        ]
	        text = self.qwen2vl_processor.apply_chat_template(message, tokenize=False, add_generation_prompt=True)

	        with torch.no_grad():
	            inputs = self.qwen2vl_processor(text=[text], images=[image], padding=True, return_tensors="pt").to(self.device)
	            output_hidden_state, image_token_mask, image_grid_thw = self.qwen2vl(**inputs)
	            # Keep only the positions flagged as image tokens.
	            image_hidden_state = output_hidden_state[image_token_mask].view(1, -1, output_hidden_state.size(-1))

	        return image_hidden_state, image_grid_thw

	    def resize_image(self, img, max_pixels=1050000):
	        """Shrink ``img`` to at most ``max_pixels`` and make both sides divisible by 8.

	        Args:
	            img: PIL Image or array accepted by ``Image.fromarray``.
	            max_pixels: Upper bound on ``width * height``.

	        Returns:
	            PIL Image whose width and height are multiples of 8 (minimum 8).
	        """
	        # Make sure we work on a PIL Image.
	        if not isinstance(img, Image.Image):
	            img = Image.fromarray(img)

	        width, height = img.size
	        num_pixels = width * height

	        if num_pixels > max_pixels:
	            # Scale both sides equally so the area lands on the budget.
	            scale = math.sqrt(max_pixels / num_pixels)
	            target_width = int(width * scale)
	            target_height = int(height * scale)
	        else:
	            target_width, target_height = width, height

	        # Round down to a multiple of 8; clamp so tiny inputs never produce
	        # a zero-sized dimension.
	        target_width = max(8, target_width - (target_width % 8))
	        target_height = max(8, target_height - (target_height % 8))

	        if (target_width, target_height) != (width, height):
	            img = img.resize((target_width, target_height), Image.LANCZOS)

	        return img

	    def generate_depth_map(self, image):
	        """Generate a depth map with Depth Anything V2.

	        Args:
	            image: PIL Image (converted to RGB) or an RGB array.

	        Returns:
	            PIL Image: three-channel depth map scaled to 0-255.
	        """
	        # cv2.cvtColor below expects exactly three channels.
	        if isinstance(image, Image.Image):
	            image = image.convert("RGB")
	        image_np = np.array(image)

	        # Convert RGB to BGR for cv2.
	        image_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)

	        with torch.no_grad():
	            depth = self.depth_model.infer_image(image_bgr)

	        # Normalize depth to the 0-1 range; a constant map would otherwise
	        # divide by zero.
	        depth_min = depth.min()
	        depth_span = depth.max() - depth_min
	        if depth_span > 0:
	            depth_norm = (depth - depth_min) / depth_span
	        else:
	            depth_norm = np.zeros_like(depth)

	        # Convert to a three-channel image.
	        depth_rgb = (depth_norm * 255).astype(np.uint8)
	        depth_rgb = cv2.cvtColor(depth_rgb, cv2.COLOR_GRAY2RGB)

	        return Image.fromarray(depth_rgb)

	    def _build_control_inputs(self, image, line_mode, depth_mode, line_strength, depth_strength):
	        """Return ``(control_images, control_modes, conditioning_scales)`` for ControlNet.

	        Control mode 2 is depth and 0 is line.  When neither mode is enabled
	        both are used with the default strengths 0.2 (depth) and 0.4 (line).
	        """
	        if not line_mode and not depth_mode:
	            return (
	                [self.generate_depth_map(image), self.anyline(image, detect_resolution=1280)],
	                [2, 0],
	                [0.2, 0.4],
	            )

	        control_images, control_modes, conditioning_scales = [], [], []
	        if depth_mode:
	            control_images.append(self.generate_depth_map(image))
	            control_modes.append(2)  # depth mode
	            conditioning_scales.append(depth_strength)
	        if line_mode:
	            control_images.append(self.anyline(image, detect_resolution=1280))
	            control_modes.append(0)  # line mode
	            conditioning_scales.append(line_strength)
	        return control_images, control_modes, conditioning_scales

	    def generate(self, input_image_a, input_image_b=None, prompt="", guidance_scale=3.5, num_inference_steps=28,
	                 aspect_ratio="1:1", center_x=None, center_y=None, radius=None, mode="variation",
	                 denoise_strength=0.8, mask_image=None, imageCount=2,
	                 line_mode=True, depth_mode=True, line_strength=0.4, depth_strength=0.2):
	        """Generate ``imageCount`` images in the requested mode.

	        Args:
	            input_image_a: Primary image.  In ``variation`` mode it supplies
	                the conditioning; in every other mode it is also the image
	                that is resized and passed to the pipeline.
	            input_image_b: Optional second image whose features replace those
	                of ``input_image_a`` as prompt embeddings (non-variation modes).
	            prompt: Optional text; when non-empty, T5 embeddings are added.
	            guidance_scale: Passed through to the pipeline.
	            num_inference_steps: Passed through to the pipeline.
	            aspect_ratio: Key of ``ASPECT_RATIOS``; sets the output size in
	                ``variation`` mode (other modes use the resized input size).
	            center_x, center_y, radius: Radial weighting of the image tokens,
	                applied only when all three are given.
	            mode: One of ``variation``, ``img2img``, ``inpaint``,
	                ``controlnet``, ``controlnet-inpaint``.
	            denoise_strength: ``strength`` for all non-variation pipelines.
	            mask_image: Required for ``inpaint`` and ``controlnet-inpaint``.
	            imageCount: Number of images to generate.
	            line_mode, depth_mode: Enable line / depth ControlNet conditioning.
	            line_strength, depth_strength: Corresponding conditioning scales.

	        Returns:
	            The ``images`` attribute of the pipeline output.

	        Raises:
	            ValueError: Unknown aspect ratio or mode, or a missing mask in an
	                inpainting mode.
	        """
	        batch_size = imageCount

	        # Validate everything up front, before any expensive encoding.
	        if aspect_ratio not in ASPECT_RATIOS:
	            raise ValueError(f"Invalid aspect ratio. Choose from {list(ASPECT_RATIOS.keys())}")
	        if mode not in self._VALID_MODES:
	            raise ValueError(f"Invalid mode: {mode}")
	        if mode in self._MASKED_MODES and mask_image is None:
	            raise ValueError("Mask image is required for inpainting mode")

	        uses_controlnet = mode in self._CONTROLNET_MODES
	        width, height = ASPECT_RATIOS[aspect_ratio]

	        # The pooled CLIP embedding is always computed from an empty prompt.
	        pooled_prompt_embeds = self.compute_text_embeddings(prompt="")
	        t5_prompt_embeds = None
	        if prompt != "":
	            self.qwen2vl_processor = self._get_processor(self._PIXELS_WITH_PROMPT)
	            t5_prompt_embeds = self.compute_t5_text_embeddings(prompt=prompt, device=self.device)
	            t5_prompt_embeds = self.t5_context_embedder(t5_prompt_embeds)
	        else:
	            self.qwen2vl_processor = self._get_processor(self._PIXELS_WITHOUT_PROMPT)

	        #############################
	        # IMAGE CONDITIONING
	        #############################
	        # The radial weighting is applied only when all attention
	        # parameters are provided.
	        use_attention = center_x is not None and center_y is not None and radius is not None

	        qwen2_hidden_state_a, image_grid_thw_a = self.process_image(input_image_a)
	        qwen2_hidden_state_b = None

	        if mode == "variation":
	            if use_attention:
	                qwen2_hidden_state_a = self.apply_attention(
	                    qwen2_hidden_state_a, image_grid_thw_a, center_x, center_y, radius)
	            qwen2_hidden_state_a = self.connector(qwen2_hidden_state_a)
	        else:
	            if input_image_b is not None:
	                qwen2_hidden_state_b, image_grid_thw_b = self.process_image(input_image_b)
	                if use_attention:
	                    qwen2_hidden_state_b = self.apply_attention(
	                        qwen2_hidden_state_b, image_grid_thw_b, center_x, center_y, radius)
	                qwen2_hidden_state_b = self.connector(qwen2_hidden_state_b)
	            # Image A's features are needed as the prompt when there is no
	            # image B, and always as the ControlNet prompt.
	            if qwen2_hidden_state_b is None or uses_controlnet:
	                qwen2_hidden_state_a = self.connector(qwen2_hidden_state_a)

	        #############################
	        # PIPELINE ARGUMENTS
	        #############################
	        components = dict(
	            transformer=self.transformer,
	            scheduler=self.noise_scheduler,
	            vae=self.vae,
	            text_encoder=self.text_encoder,
	            tokenizer=self.tokenizer,
	        )
	        call_kwargs = dict(
	            pooled_prompt_embeds=pooled_prompt_embeds,
	            num_inference_steps=num_inference_steps,
	            guidance_scale=guidance_scale,
	        )

	        if mode != "variation":
	            # Every image-driven mode works at the (resized) input resolution.
	            input_image_a = self.resize_image(input_image_a)
	            width, height = input_image_a.size
	            call_kwargs.update(image=input_image_a, strength=denoise_strength)

	        if mode in self._MASKED_MODES:
	            call_kwargs["mask_image"] = self.resize_image(mask_image)

	        if uses_controlnet:
	            components["controlnet"] = self.controlnet
	            control_images, control_modes, conditioning_scales = self._build_control_inputs(
	                input_image_a, line_mode, depth_mode, line_strength, depth_strength)
	            call_kwargs.update(
	                control_image=control_images,
	                control_mode=control_modes,
	                controlnet_conditioning_scale=conditioning_scales,
	            )
	            if qwen2_hidden_state_b is not None:
	                # Trim both sequences to their common length.
	                qwen2_hidden_state_b = qwen2_hidden_state_b[:, :qwen2_hidden_state_a.shape[1], :]
	                qwen2_hidden_state_a = qwen2_hidden_state_a[:, :qwen2_hidden_state_b.shape[1], :]
	            call_kwargs["prompt_embeds_control"] = qwen2_hidden_state_a.repeat(batch_size, 1, 1)

	        # Image B's features take precedence as the prompt when available.
	        prompt_source = qwen2_hidden_state_b if qwen2_hidden_state_b is not None else qwen2_hidden_state_a
	        call_kwargs.update(
	            prompt_embeds=prompt_source.repeat(batch_size, 1, 1),
	            t5_prompt_embeds=t5_prompt_embeds.repeat(batch_size, 1, 1) if t5_prompt_embeds is not None else None,
	            height=height,
	            width=width,
	        )

	        #############################
	        # IMAGE GENERATION
	        #############################
	        pipeline_classes = {
	            "variation": FluxPipeline,
	            "img2img": FluxImg2ImgPipeline,
	            "inpaint": FluxInpaintPipeline,
	            "controlnet": FluxControlNetImg2ImgPipeline,
	            "controlnet-inpaint": FluxControlNetInpaintPipeline,
	        }
	        pipeline = pipeline_classes[mode](**components)
	        gen_images = pipeline(**call_kwargs).images

	        return gen_images