import spaces
import gradio as gr
import argparse
import sys
import os
import random
import subprocess
from PIL import Image
import numpy as np
# Removed environment-specific lines
from diffusers.utils import export_to_video
from diffusers.utils import load_image
import torch
import logging
from collections import OrderedDict
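
# Precision settings: disable TF32 and reduced-precision matmul/reduction paths and
# keep float32 matmul at the highest precision.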
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False
torch.set_float32_matmul_precision("highest")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
logger = logging.getLogger(__name__)

# --- Dummy Classes (Keep for standalone execution) ---
class OffloadConfig:
    def __init__(
        self,
        high_cpu_memory: bool = False,
        parameters_level: bool = False,
        compiler_transformer: bool = False,
        compiler_cache: str = "",
    ):
        self.high_cpu_memory = high_cpu_memory
        self.parameters_level = parameters_level
        self.compiler_transformer = compiler_transformer
        self.compiler_cache = compiler_cache

class TaskType:  # Keep here for infer
    T2V = 0
    I2V = 1

class LlamaModel:
    @staticmethod
    def from_pretrained(*args, **kwargs):
        return LlamaModel()

    def to(self, device):
        return self

class HunyuanVideoTransformer3DModel:
    @staticmethod
    def from_pretrained(*args, **kwargs):
        return HunyuanVideoTransformer3DModel()

    def to(self, device):
        return self

class SkyreelsVideoPipeline:
    @staticmethod
    def from_pretrained(*args, **kwargs):
        return SkyreelsVideoPipeline()

    def to(self, device):
        return self

    def __call__(self, *args, **kwargs):
        num_frames = kwargs.get("num_frames", 16)  # Default to 16 frames
        height = kwargs.get("height", 512)
        width = kwargs.get("width", 512)
        if "image" in kwargs:  # I2V
            image = kwargs["image"]
            # Convert PIL Image to PyTorch tensor (and normalize to [0, 1])
            image_tensor = torch.from_numpy(np.array(image)).float() / 255.0
            image_tensor = image_tensor.permute(2, 0, 1).unsqueeze(0)  # (H, W, C) -> (1, C, H, W)
            # Create video by repeating the image along a new time dimension
            frames = image_tensor.unsqueeze(2).repeat(1, 1, num_frames, 1, 1)  # (1, C, T, H, W)
            frames = frames + torch.randn_like(frames) * 0.05  # Add a little noise
            # Correct shape: (1, C, T, H, W) - NO PERMUTE HERE
        else:  # T2V
            frames = torch.randn(1, 3, num_frames, height, width)  # (1, C, T, H, W) - Correct!
        return type("obj", (object,), {"frames": frames})()  # No longer a list!

    def __init__(self):
        super().__init__()
        self._modules = OrderedDict()
        self.vae = self.VAE()
        self._modules["vae"] = self.vae

    def named_children(self):
        return self._modules.items()

    class VAE:
        def enable_tiling(self):
            pass

# No-op stand-ins for the quantization helpers (e.g. torchao's quantize_ / float8_weight_only)
def quantize_(*args, **kwargs):
    return

def float8_weight_only():
    return

# --- End Dummy Classes ---
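
# Single-GPU inference wrapper: lazily loads the text encoder, transformer and pipeline,
# optionally float8-quantizes them, and optionally compiles the transformer before use.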
class SkyReelsVideoSingleGpuInfer:
    def _load_model(
        self, model_id: str, base_model_id: str = "hunyuanvideo-community/HunyuanVideo", quant_model: bool = True
    ):
        logger.info(f"load model model_id:{model_id} quant_model:{quant_model}")
        text_encoder = LlamaModel.from_pretrained(
            base_model_id, subfolder="text_encoder", torch_dtype=torch.bfloat16
        ).to("cpu")
        transformer = HunyuanVideoTransformer3DModel.from_pretrained(
            model_id, torch_dtype=torch.bfloat16, device="cpu"
        ).to("cpu")
        if quant_model:
            quantize_(text_encoder, float8_weight_only())
            text_encoder.to("cpu")
            torch.cuda.empty_cache()
            quantize_(transformer, float8_weight_only())
            transformer.to("cpu")
            torch.cuda.empty_cache()
        pipe = SkyreelsVideoPipeline.from_pretrained(
            base_model_id, transformer=transformer, text_encoder=text_encoder, torch_dtype=torch.bfloat16
        ).to("cpu")
        pipe.vae.enable_tiling()
        torch.cuda.empty_cache()
        return pipe

    def __init__(
        self,
        task_type: TaskType,
        model_id: str,
        quant_model: bool = True,
        is_offload: bool = True,
        offload_config: OffloadConfig = OffloadConfig(),
        enable_cfg_parallel: bool = True,
    ):
        self.task_type = task_type
        self.model_id = model_id
        self.quant_model = quant_model
        self.is_offload = is_offload
        self.offload_config = offload_config
        self.enable_cfg_parallel = enable_cfg_parallel
        self.pipe = None
        self.is_initialized = False
        self.gpu_device = None

    def initialize(self):
        """Initializes the model and moves it to the GPU."""
        if self.is_initialized:
            return
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA is not available. Cannot initialize model.")
        self.gpu_device = "cuda:0"
        self.pipe = self._load_model(model_id=self.model_id, quant_model=self.quant_model)
        if not self.is_offload:
            self.pipe.to(self.gpu_device)
        if self.offload_config.compiler_transformer:
            torch._dynamo.config.suppress_errors = True
            os.environ["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"
            os.environ["TORCHINDUCTOR_CACHE_DIR"] = f"{self.offload_config.compiler_cache}"
            self.pipe.transformer = torch.compile(
                self.pipe.transformer, mode="max-autotune-no-cudagraphs", dynamic=True
            )
        self.is_initialized = True
        # Warm up only after the initialized flag is set, since warm_up() checks it.
        if self.offload_config.compiler_transformer:
            self.warm_up()

    def warm_up(self):
        if not self.is_initialized:
            raise RuntimeError("Model must be initialized before warm-up.")
        init_kwargs = {
            "prompt": "A woman is dancing in a room",
            "height": 544,
            "width": 960,
            "guidance_scale": 6,
            "num_inference_steps": 1,
            "negative_prompt": "bad quality",
            "num_frames": 16,
            "generator": torch.Generator(self.gpu_device).manual_seed(42),
            "embedded_guidance_scale": 1.0,
        }
        if self.task_type == TaskType.I2V:
            # PIL takes (width, height); 960x544 matches the kwargs above
            init_kwargs["image"] = Image.new("RGB", (960, 544), color="black")
        self.pipe(**init_kwargs)
        logger.info("Warm-up complete.")

    def infer(self, **kwargs):
        """Handles inference requests."""
        if not self.is_initialized:
            self.initialize()
        if "seed" in kwargs:
            kwargs["generator"] = torch.Generator(self.gpu_device).manual_seed(kwargs["seed"])
            del kwargs["seed"]
        assert (self.task_type == TaskType.I2V and "image" in kwargs) or self.task_type == TaskType.T2V
        result = self.pipe(**kwargs).frames  # Return the tensor directly
        return result
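
# Module-level predictor: created lazily on the first request and reused for later ones.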
_predictor = None

def generate_video(prompt: str, seed: int, image: str | None = None) -> tuple[str, dict]:
    """Generates a video based on the given prompt and seed.

    Args:
        prompt: The text prompt to guide video generation.
        seed: The random seed for reproducibility (-1 picks a random seed).
        image: Optional path to an image for Image-to-Video.

    Returns:
        A tuple containing the path to the generated video and the parameters used.
    """
    global _predictor
    if seed == -1:
        random.seed()
        seed = int(random.randrange(4294967294))
    if image is None:
        task_type = TaskType.T2V
        model_id = "Skywork/SkyReels-V1-Hunyuan-T2V"
        kwargs = {
            "prompt": prompt,
            "height": 512,
            "width": 512,
            "num_frames": 16,
            "num_inference_steps": 30,
            "seed": seed,
            "guidance_scale": 7.5,
            "negative_prompt": "bad quality, worst quality",
        }
    else:
        task_type = TaskType.I2V
        model_id = "Skywork/SkyReels-V1-Hunyuan-I2V"
        kwargs = {
            "prompt": prompt,
            "image": load_image(image),
            "height": 512,
            "width": 512,
            "num_frames": 97,
            "num_inference_steps": 30,
            "seed": seed,
            "guidance_scale": 6.0,
            "embedded_guidance_scale": 1.0,
            "negative_prompt": "Aerial view, low quality, bad hands",
            "cfg_for": False,
        }
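
    # NOTE: the predictor is created only once; the task type and model chosen by the
    # first request are reused for every later request in this process.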
    if _predictor is None:
        _predictor = SkyReelsVideoSingleGpuInfer(
            task_type=task_type,
            model_id=model_id,
            quant_model=True,
            is_offload=True,
            offload_config=OffloadConfig(
                high_cpu_memory=True,
                parameters_level=True,
                compiler_transformer=False,
            ),
        )
        _predictor.initialize()
        logger.info("Predictor initialized")
    with torch.no_grad():
        output = _predictor.infer(**kwargs)
    '''
    output = (output.numpy() * 255).astype(np.uint8)
    # Correct Transpose: (1, C, T, H, W) -> (1, T, H, W, C)
    output = output.transpose(0, 2, 3, 4, 1)
    output = output[0]  # Remove batch dimension: (T, H, W, C)
    '''
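    # The conversion block above is left disabled in the source. With the dummy pipeline,
    # `output` is a (1, C, T, H, W) tensor, while diffusers' export_to_video expects
    # per-frame (T, H, W, C) data; presumably the real SkyreelsVideoPipeline already
    # returns frames in a layout export_to_video accepts.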
    save_dir = "./result"
    os.makedirs(save_dir, exist_ok=True)
    video_out_file = f"{save_dir}/{seed}.mp4"
    print(f"generate video, local path: {video_out_file}")
    export_to_video(output, video_out_file, fps=24)
    return video_out_file, kwargs
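
# Gradio UI: optional image upload (switches to image-to-video), prompt and seed inputs;
# outputs the generated video and the parameters that were used.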
def create_gradio_interface():
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                image = gr.Image(label="Upload Image", type="filepath")
                prompt = gr.Textbox(label="Input Prompt")
                seed = gr.Number(label="Random Seed", value=-1)
            with gr.Column():
                submit_button = gr.Button("Generate Video")
                output_video = gr.Video(label="Generated Video")
                output_params = gr.Textbox(label="Output Parameters")
        submit_button.click(
            fn=generate_video,
            inputs=[prompt, seed, image],
            outputs=[output_video, output_params],
        )
    return demo

if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.queue().launch()