import math
import os
from typing import List, Optional, Tuple, Union

import numpy as np
import torch
import torch.distributed as dist
from diffusers.utils import logging
from transformers import (
    CLIPTextModel,
    CLIPTextModelWithProjection,
    CLIPTokenizer,
    T5EncoderModel,
    T5TokenizerFast,
)

logger = logging.get_logger(__name__)


def get_t5_prompt_embeds(
    tokenizer: T5TokenizerFast,
    text_encoder: T5EncoderModel,
    prompt: Union[str, List[str]] = None,
    num_images_per_prompt: int = 1,
    max_sequence_length: int = 128,
    device: Optional[torch.device] = None,
):
    device = device or text_encoder.device

    prompt = [prompt] if isinstance(prompt, str) else prompt
    batch_size = len(prompt)

    text_inputs = tokenizer(
        prompt,
        max_length=max_sequence_length,
        truncation=True,
        add_special_tokens=True,
        return_tensors="pt",
    )
    text_input_ids = text_inputs.input_ids
    untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

    if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
        removed_text = tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
        logger.warning(
            "The following part of your input was truncated because `max_sequence_length` is set to "
            f"{max_sequence_length} tokens: {removed_text}"
        )

    prompt_embeds = text_encoder(text_input_ids.to(device))[0]

    # Zero-pad the sequence dimension so every prompt embedding spans exactly `max_sequence_length` tokens.
    b, seq_len, dim = prompt_embeds.shape
    if seq_len < max_sequence_length:
        padding = torch.zeros(
            (b, max_sequence_length - seq_len, dim), dtype=prompt_embeds.dtype, device=prompt_embeds.device
        )
        prompt_embeds = torch.cat([prompt_embeds, padding], dim=1)

    prompt_embeds = prompt_embeds.to(device=device)

    _, seq_len, _ = prompt_embeds.shape

    # Duplicate the text embeddings for each generation requested per prompt.
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

    return prompt_embeds


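# Usage sketch (hypothetical, already-loaded T5 tokenizer/encoder; not executed on import):
#
#     prompt_embeds = get_t5_prompt_embeds(
#         tokenizer=t5_tokenizer,
#         text_encoder=t5_encoder,
#         prompt="a photo of a cat",
#         num_images_per_prompt=2,
#     )
#     # shape: [batch_size * num_images_per_prompt, max_sequence_length, hidden_dim]

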
def get_original_sigmas(num_train_timesteps=1000, num_inference_steps=1000):
    """Return `num_inference_steps` sigmas sampled evenly from the original training schedule."""
    timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32)[::-1].copy()
    sigmas = timesteps / num_train_timesteps

    inds = [int(ind) for ind in np.linspace(0, num_train_timesteps - 1, num_inference_steps)]
    new_sigmas = sigmas[inds]
    return new_sigmas


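# Worked example: with 4 inference steps,
# get_original_sigmas(num_train_timesteps=1000, num_inference_steps=4)
# selects indices [0, 333, 666, 999] and returns approximately
# array([1.0, 0.667, 0.334, 0.001], dtype=float32).

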
def is_ng_none(negative_prompt):
    return (
        negative_prompt is None
        or negative_prompt == ""
        or (isinstance(negative_prompt, list) and negative_prompt[0] is None)
        or (isinstance(negative_prompt, list) and negative_prompt[0] == "")
    )


class CudaTimerContext:
    """Context manager that measures the elapsed GPU time (in seconds) of the wrapped block."""

    def __init__(self, times_arr):
        self.times_arr = times_arr

    def __enter__(self):
        self.before_event = torch.cuda.Event(enable_timing=True)
        self.after_event = torch.cuda.Event(enable_timing=True)
        self.before_event.record()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.after_event.record()
        torch.cuda.synchronize()
        # elapsed_time() returns milliseconds; convert to seconds before appending.
        elapsed_time = self.before_event.elapsed_time(self.after_event) / 1000
        self.times_arr.append(elapsed_time)


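# Usage sketch (requires a CUDA device; `model` and `batch` are hypothetical placeholders):
#
#     times = []
#     with CudaTimerContext(times):
#         _ = model(batch)
#     print(f"forward pass took {times[-1]:.3f}s")

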
def get_env_prefix():
    env = os.environ.get("CLOUD_PROVIDER", "AWS").upper()
    if env == "AWS":
        return "SM_CHANNEL"
    elif env == "AZURE":
        return "AZUREML_DATAREFERENCE"

    raise ValueError(f"Env {env} not supported")


def compute_density_for_timestep_sampling(
    weighting_scheme: str,
    batch_size: int,
    logit_mean: Optional[float] = None,
    logit_std: Optional[float] = None,
    mode_scale: Optional[float] = None,
):
    """Compute the density for sampling the timesteps when doing SD3 training.

    Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.

    SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
    """
    if weighting_scheme == "logit_normal":
        # Logit-normal sampling: the sigmoid maps the normal sample into (0, 1).
        u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu")
        u = torch.nn.functional.sigmoid(u)
    elif weighting_scheme == "mode":
        u = torch.rand(size=(batch_size,), device="cpu")
        u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u)
    else:
        # Uniform sampling.
        u = torch.rand(size=(batch_size,), device="cpu")
    return u


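# Usage sketch: `u` lies in [0, 1] and is typically mapped onto discrete training timesteps.
# `num_train_timesteps` and `noise_scheduler` below are hypothetical stand-ins, not part of this module.
#
#     u = compute_density_for_timestep_sampling(
#         "logit_normal", batch_size=4, logit_mean=0.0, logit_std=1.0
#     )
#     indices = (u * num_train_timesteps).long()  # e.g. num_train_timesteps = 1000
#     timesteps = noise_scheduler.timesteps[indices]

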
def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None):
    """Computes loss weighting scheme for SD3 training.

    Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.

    SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
    """
    if weighting_scheme == "sigma_sqrt":
        weighting = (sigmas**-2.0).float()
    elif weighting_scheme == "cosmap":
        bot = 1 - 2 * sigmas + 2 * sigmas**2
        weighting = 2 / (math.pi * bot)
    else:
        weighting = torch.ones_like(sigmas)
    return weighting


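# Usage sketch: weight a per-element flow-matching loss before reducing it.
# `model_pred`, `target`, and `sigmas` are hypothetical tensors; the reshape simply
# broadcasts a per-sample weight over the image dimensions.
#
#     weighting = compute_loss_weighting_for_sd3("sigma_sqrt", sigmas=sigmas)
#     loss = (weighting.view(-1, 1, 1, 1) * (model_pred - target) ** 2).mean()

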
def initialize_distributed():
    # Expects the standard torch.distributed environment variables (RANK, WORLD_SIZE,
    # MASTER_ADDR, MASTER_PORT), e.g. as set by torchrun.
    dist.init_process_group("nccl")

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    print(f"Initialized distributed training: Rank {rank}/{world_size}")


def get_clip_prompt_embeds(
    text_encoder: CLIPTextModel,
    text_encoder_2: CLIPTextModelWithProjection,
    tokenizer: CLIPTokenizer,
    tokenizer_2: CLIPTokenizer,
    prompt: Union[str, List[str]] = None,
    num_images_per_prompt: int = 1,
    max_sequence_length: int = 77,
    device: Optional[torch.device] = None,
):
    device = device or text_encoder.device
    assert max_sequence_length == tokenizer.model_max_length
    prompt = [prompt] if isinstance(prompt, str) else prompt

    tokenizers = [tokenizer, tokenizer_2]
    text_encoders = [text_encoder, text_encoder_2]

    prompt_embeds_list = []
    prompts = [prompt, prompt]
    for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
        text_inputs = tokenizer(
            prompt,
            padding="max_length",
            max_length=tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )

        text_input_ids = text_inputs.input_ids
        prompt_embeds = text_encoder(text_input_ids.to(text_encoder.device), output_hidden_states=True)

        # Only the pooled output of the last (projection) text encoder survives the loop.
        pooled_prompt_embeds = prompt_embeds[0]
        # Use the penultimate hidden state as the per-token embedding.
        prompt_embeds = prompt_embeds.hidden_states[-2]

        prompt_embeds_list.append(prompt_embeds)

    prompt_embeds = torch.cat(prompt_embeds_list, dim=-1)

    bs_embed, seq_len, _ = prompt_embeds.shape

    # Duplicate the embeddings for each generation requested per prompt.
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
    pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
        bs_embed * num_images_per_prompt, -1
    )

    return prompt_embeds, pooled_prompt_embeds


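# Usage sketch (hypothetical, already-loaded dual CLIP encoders/tokenizers; not executed on import):
#
#     prompt_embeds, pooled_embeds = get_clip_prompt_embeds(
#         text_encoder=clip_l, text_encoder_2=clip_g,
#         tokenizer=tok_l, tokenizer_2=tok_g,
#         prompt="a photo of a cat",
#     )
#     # prompt_embeds: [batch, 77, dim_l + dim_g]; pooled_embeds: [batch, projection_dim]

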
def get_1d_rotary_pos_embed(
    dim: int,
    pos: Union[np.ndarray, torch.Tensor, int],
    theta: float = 10000.0,
    use_real=False,
    linear_factor=1.0,
    ntk_factor=1.0,
    repeat_interleave_real=True,
    freqs_dtype=torch.float32,
):
    """
    Precompute the frequency tensor for complex exponentials (cis) with given dimensions.

    This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' and the
    position indices 'pos'. The 'theta' parameter scales the frequencies. The returned tensor contains complex values
    in complex64 data type.

    Args:
        dim (`int`): Dimension of the frequency tensor.
        pos (`np.ndarray`, `torch.Tensor` or `int`): Position indices for the frequency tensor. [S] or scalar
        theta (`float`, *optional*, defaults to 10000.0):
            Scaling factor for frequency computation. Defaults to 10000.0.
        use_real (`bool`, *optional*):
            If True, return real part and imaginary part separately. Otherwise, return complex numbers.
        linear_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for the context extrapolation. Defaults to 1.0.
        ntk_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for the NTK-Aware RoPE. Defaults to 1.0.
        repeat_interleave_real (`bool`, *optional*, defaults to `True`):
            If `True` and `use_real`, real part and imaginary part are each interleaved with themselves to reach `dim`.
            Otherwise, they are concatenated with themselves.
        freqs_dtype (`torch.float32` or `torch.float64`, *optional*, defaults to `torch.float32`):
            the dtype of the frequency tensor.
    Returns:
        `torch.Tensor`: Precomputed frequency tensor with complex exponentials of shape [S, D/2] if `use_real` is
        False, otherwise a `(cos, sin)` tuple of real tensors, each of shape [S, D].
    """
    assert dim % 2 == 0

    if isinstance(pos, int):
        pos = torch.arange(pos)
    if isinstance(pos, np.ndarray):
        pos = torch.from_numpy(pos)

    theta = theta * ntk_factor
    freqs = (
        1.0
        / (theta ** (torch.arange(0, dim, 2, dtype=freqs_dtype, device=pos.device)[: (dim // 2)] / dim))
        / linear_factor
    )  # [D/2]
    freqs = torch.outer(pos, freqs)  # [S, D/2]
    if use_real and repeat_interleave_real:
        # cos/sin values interleaved along the feature dimension: [S, D]
        freqs_cos = freqs.cos().repeat_interleave(2, dim=1).float()
        freqs_sin = freqs.sin().repeat_interleave(2, dim=1).float()
        return freqs_cos, freqs_sin
    elif use_real:
        # cos/sin values concatenated with themselves: [S, D]
        freqs_cos = torch.cat([freqs.cos(), freqs.cos()], dim=-1).float()
        freqs_sin = torch.cat([freqs.sin(), freqs.sin()], dim=-1).float()
        return freqs_cos, freqs_sin
    else:
        # complex64 tensor of unit-magnitude rotations: [S, D/2]
        freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
        return freqs_cis


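# Shape sketch: get_1d_rotary_pos_embed(16, 8, use_real=True) returns a (cos, sin)
# pair of float tensors, each of shape [8, 16]; with use_real=False it returns a
# single complex64 tensor of shape [8, 8].

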
class FluxPosEmbed(torch.nn.Module):
    def __init__(self, theta: int, axes_dim: List[int]):
        super().__init__()
        self.theta = theta
        self.axes_dim = axes_dim

    def forward(self, ids: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        n_axes = ids.shape[-1]
        cos_out = []
        sin_out = []
        pos = ids.float()
        # The mps backend does not support float64 frequencies.
        is_mps = ids.device.type == "mps"
        freqs_dtype = torch.float32 if is_mps else torch.float64
        for i in range(n_axes):
            cos, sin = get_1d_rotary_pos_embed(
                self.axes_dim[i],
                pos[:, i],
                theta=self.theta,
                repeat_interleave_real=True,
                use_real=True,
                freqs_dtype=freqs_dtype,
            )
            cos_out.append(cos)
            sin_out.append(sin)
        freqs_cos = torch.cat(cos_out, dim=-1).to(ids.device)
        freqs_sin = torch.cat(sin_out, dim=-1).to(ids.device)
        return freqs_cos, freqs_sin
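

# Usage sketch: with hypothetical per-axis dims [16, 56, 56], an `ids` tensor of
# shape [seq_len, 3] yields cos/sin tensors of shape [seq_len, 128] (the sum of axes_dim).
#
#     pos_embed = FluxPosEmbed(theta=10000, axes_dim=[16, 56, 56])
#     freqs_cos, freqs_sin = pos_embed(ids)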