Spaces:

Kkonjeong
/

virtual_character

Runtime error

App Files Files Community

zejunyang committed on Apr 3, 2024

Commit

2de857a

1 Parent(s): bf4c058

init

Browse files

Files changed (4) hide show

app.py +50 -46
src/audio2vid.py +73 -67
src/utils/crop_face_single.py +45 -0
src/vid2vid.py +69 -67

app.py CHANGED Viewed

@@ -17,68 +17,72 @@ with gr.Blocks() as demo:
     gr.Markdown(description)
     with gr.Tab("Audio2video"):
-        with gr.Column():
-            with gr.Row():
-                a2v_input_audio = gr.Audio(sources=["upload", "microphone"], type="filepath", editable=True, label="Input audio", interactive=True)
-                # with gr.Column():
-                #     a2v_ref_img = gr.Image(label="Upload reference image", sources="upload")
-                #     a2v_img_trans_real_botton = gr.Button("Translate to realistic style")
-                a2v_ref_img = gr.Image(label="Upload reference image", sources="upload")
-                a2v_headpose_video = gr.Video(label="Option: upload head pose reference video", sources="upload")
-            with gr.Row():
-                a2v_size_slider = gr.Slider(minimum=256, maximum=1024, step=8, value=512, label="Video size (-W & -H)")
-                a2v_step_slider = gr.Slider(minimum=5, maximum=50, value=25, label="Steps (--steps)")
-            with gr.Row():
-                a2v_length = gr.Number(value=150, label="Length (-L) (Set 0 to automatically calculate video length.)")
-                a2v_seed = gr.Number(value=42, label="Seed (--seed)")
-            a2v_botton = gr.Button("Generate", variant="primary")
             a2v_output_video = gr.PlayableVideo(label="Result", interactive=False)
     with gr.Tab("Video2video"):
-         with gr.Column():
-            with gr.Row():
-                # with gr.Column():
-                #     v2v_ref_img = gr.Image(label="Upload reference image", sources="upload")
-                #     v2v_img_trans_real_botton = gr.Button("Translate to realistic style")
-                v2v_ref_img = gr.Image(label="Upload reference image", sources="upload")
-                v2v_source_video = gr.Video(label="Upload source video", sources="upload")
-            with gr.Row():
-                v2v_size_slider = gr.Slider(minimum=256, maximum=1024, step=8, value=512, label="Video size (-W & -H)")
-                v2v_step_slider = gr.Slider(minimum=5, maximum=50, value=25, label="Steps (--steps)")
-            with gr.Row():
-                v2v_length = gr.Number(value=150, label="Length (-L) (Set 0 to automatically calculate video length.)")
-                v2v_seed = gr.Number(value=42, label="Seed (--seed)")
-            v2v_botton = gr.Button("Generate", variant="primary")
             v2v_output_video = gr.PlayableVideo(label="Result", interactive=False)
     a2v_botton.click(
         fn=audio2video,
         inputs=[a2v_input_audio, a2v_ref_img, a2v_headpose_video,
                 a2v_size_slider, a2v_step_slider, a2v_length, a2v_seed],
-        outputs=[a2v_output_video]
     )
-    # a2v_img_trans_real_botton.click(
-    #     fn=sd_img2real,
-    #     inputs=[a2v_ref_img],
-    #     outputs=[a2v_ref_img]
-    # )
     v2v_botton.click(
         fn=video2video,
         inputs=[v2v_ref_img, v2v_source_video,
                 v2v_size_slider, v2v_step_slider, v2v_length, v2v_seed],
-        outputs=[v2v_output_video]
     )
-    # v2v_img_trans_real_botton.click(
-    #     fn=sd_img2real,
-    #     inputs=[v2v_ref_img],
-    #     outputs=[v2v_ref_img]
-    # )
 demo.launch()

     gr.Markdown(description)
     with gr.Tab("Audio2video"):
+        with gr.Row():
+            with gr.Column():
+                with gr.Row():
+                    a2v_input_audio = gr.Audio(sources=["upload", "microphone"], type="filepath", editable=True, label="Input audio", interactive=True)
+                    a2v_ref_img = gr.Image(label="Upload reference image", sources="upload")
+                    a2v_headpose_video = gr.Video(label="Option: upload head pose reference video", sources="upload")
+                with gr.Row():
+                    a2v_size_slider = gr.Slider(minimum=256, maximum=1024, step=8, value=512, label="Video size (-W & -H)")
+                    a2v_step_slider = gr.Slider(minimum=5, maximum=50, step=1, value=20, label="Steps (--steps)")
+                with gr.Row():
+                    a2v_length = gr.Slider(minimum=0, maximum=300, step=1, value=150, label="Length (-L) (Set 0 to automatically calculate video length.)")
+                    a2v_seed = gr.Number(value=42, label="Seed (--seed)")
+                a2v_botton = gr.Button("Generate", variant="primary")
             a2v_output_video = gr.PlayableVideo(label="Result", interactive=False)
+        gr.Examples(
+            examples=[
+                ["configs/inference/audio/lyl.wav", "configs/inference/ref_images/Aragaki.png", None],
+                ["configs/inference/audio/lyl.wav", "configs/inference/ref_images/solo.png", None],
+                ["configs/inference/audio/lyl.wav", "configs/inference/ref_images/lyl.png", "configs/inference/head_pose_temp/pose_ref_video.mp4"],
+                ],
+            inputs=[a2v_input_audio, a2v_ref_img, a2v_headpose_video],
+        )
     with gr.Tab("Video2video"):
+        with gr.Row():
+            with gr.Column():
+                with gr.Row():
+                    v2v_ref_img = gr.Image(label="Upload reference image", sources="upload")
+                    v2v_source_video = gr.Video(label="Upload source video", sources="upload")
+                with gr.Row():
+                    v2v_size_slider = gr.Slider(minimum=256, maximum=1024, step=8, value=512, label="Video size (-W & -H)")
+                    v2v_step_slider = gr.Slider(minimum=5, maximum=50, step=1, value=20, label="Steps (--steps)")
+                with gr.Row():
+                    v2v_length = gr.Slider(minimum=0, maximum=300, step=1, value=150, label="Length (-L) (Set 0 to automatically calculate video length.)")
+                    v2v_seed = gr.Number(value=42, label="Seed (--seed)")
+                v2v_botton = gr.Button("Generate", variant="primary")
             v2v_output_video = gr.PlayableVideo(label="Result", interactive=False)
+        gr.Examples(
+            examples=[
+                ["configs/inference/ref_images/Aragaki.png", "configs/inference/video/Aragaki_song.mp4"],
+                ["configs/inference/ref_images/solo.png", "configs/inference/video/Aragaki_song.mp4"],
+                ["configs/inference/ref_images/lyl.png", "configs/inference/head_pose_temp/pose_ref_video.mp4"],
+                ],
+            inputs=[v2v_ref_img, v2v_source_video, a2v_headpose_video],
+        )
     a2v_botton.click(
         fn=audio2video,
         inputs=[a2v_input_audio, a2v_ref_img, a2v_headpose_video,
                 a2v_size_slider, a2v_step_slider, a2v_length, a2v_seed],
+        outputs=[a2v_output_video, a2v_ref_img]
     )
     v2v_botton.click(
         fn=video2video,
         inputs=[v2v_ref_img, v2v_source_video,
                 v2v_size_slider, v2v_step_slider, v2v_length, v2v_seed],
+        outputs=[v2v_output_video, v2v_ref_img]
     )
 demo.launch()

src/audio2vid.py CHANGED Viewed

@@ -9,25 +9,27 @@ import spaces
 from scipy.spatial.transform import Rotation as R
 from scipy.interpolate import interp1d
-from diffusers import AutoencoderKL, DDIMScheduler
-from einops import repeat
 from omegaconf import OmegaConf
 from PIL import Image
 from torchvision import transforms
-from transformers import CLIPVisionModelWithProjection
-from src.models.pose_guider import PoseGuider
-from src.models.unet_2d_condition import UNet2DConditionModel
-from src.models.unet_3d import UNet3DConditionModel
-from src.pipelines.pipeline_pose2vid_long import Pose2VideoPipeline
 from src.utils.util import save_videos_grid
-from src.audio_models.model import Audio2MeshModel
 from src.utils.audio_util import prepare_audio_feature
-from src.utils.mp_utils  import LMKExtractor
-from src.utils.draw_util import FaceMeshVisualizer
 from src.utils.pose_util import project_points
 def matrix_to_euler_and_translation(matrix):
@@ -49,7 +51,7 @@ def smooth_pose_seq(pose_seq, window_size=5):
     return smoothed_pose_seq
 def get_headpose_temp(input_video):
-    lmk_extractor = LMKExtractor()
     cap = cv2.VideoCapture(input_video)
     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
@@ -98,70 +100,70 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     config = OmegaConf.load('./configs/prompts/animation_audio.yaml')
-    if config.weight_dtype == "fp16":
-        weight_dtype = torch.float16
-    else:
-        weight_dtype = torch.float32
     audio_infer_config = OmegaConf.load(config.audio_inference_config)
-    # prepare model
-    a2m_model = Audio2MeshModel(audio_infer_config['a2m_model'])
-    a2m_model.load_state_dict(torch.load(audio_infer_config['pretrained_model']['a2m_ckpt']), strict=False)
-    a2m_model.cuda().eval()
-    vae = AutoencoderKL.from_pretrained(
-        config.pretrained_vae_path,
-    ).to("cuda", dtype=weight_dtype)
-    reference_unet = UNet2DConditionModel.from_pretrained(
-        config.pretrained_base_model_path,
-        subfolder="unet",
-    ).to(dtype=weight_dtype, device="cuda")
-    inference_config_path = config.inference_config
-    infer_config = OmegaConf.load(inference_config_path)
-    denoising_unet = UNet3DConditionModel.from_pretrained_2d(
-        config.pretrained_base_model_path,
-        config.motion_module_path,
-        subfolder="unet",
-        unet_additional_kwargs=infer_config.unet_additional_kwargs,
-    ).to(dtype=weight_dtype, device="cuda")
-    pose_guider = PoseGuider(noise_latent_channels=320, use_ca=True).to(device="cuda", dtype=weight_dtype) # not use cross attention
-    image_enc = CLIPVisionModelWithProjection.from_pretrained(
-        config.image_encoder_path
-    ).to(dtype=weight_dtype, device="cuda")
-    sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)
-    scheduler = DDIMScheduler(**sched_kwargs)
     generator = torch.manual_seed(seed)
     width, height = size, size
-    # load pretrained weights
-    denoising_unet.load_state_dict(
-        torch.load(config.denoising_unet_path, map_location="cpu"),
-        strict=False,
-    )
-    reference_unet.load_state_dict(
-        torch.load(config.reference_unet_path, map_location="cpu"),
-    )
-    pose_guider.load_state_dict(
-        torch.load(config.pose_guider_path, map_location="cpu"),
-    )
-    pipe = Pose2VideoPipeline(
-        vae=vae,
-        image_encoder=image_enc,
-        reference_unet=reference_unet,
-        denoising_unet=denoising_unet,
-        pose_guider=pose_guider,
-        scheduler=scheduler,
-    )
-    pipe = pipe.to("cuda", dtype=weight_dtype)
     date_str = datetime.now().strftime("%Y%m%d")
     time_str = datetime.now().strftime("%H%M")
@@ -170,17 +172,20 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     save_dir = Path(f"output/{date_str}/{save_dir_name}")
     save_dir.mkdir(exist_ok=True, parents=True)
-    lmk_extractor = LMKExtractor()
-    vis = FaceMeshVisualizer(forehead_edge=False)
     ref_image_np = cv2.cvtColor(ref_img, cv2.COLOR_RGB2BGR)
-    # TODO: 人脸检测+裁剪
     ref_image_np = cv2.resize(ref_image_np, (size, size))
     ref_image_pil = Image.fromarray(cv2.cvtColor(ref_image_np, cv2.COLOR_BGR2RGB))
     face_result = lmk_extractor(ref_image_np)
     if face_result is None:
-        return None
     lmks = face_result['lmks'].astype(np.float32)
     ref_pose = vis.draw_landmarks((ref_image_np.shape[1], ref_image_np.shape[0]), lmks, normed=True)
@@ -217,6 +222,7 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
         [transforms.Resize((height, width)), transforms.ToTensor()]
     )
     args_L = len(pose_images) if length==0 or length > len(pose_images) else length
     for pose_image_np in pose_images[: args_L]:
         pose_image_pil = Image.fromarray(cv2.cvtColor(pose_image_np, cv2.COLOR_BGR2RGB))
         pose_tensor_list.append(pose_transform(pose_image_pil))
@@ -249,7 +255,7 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     stream = ffmpeg.input(save_path)
     audio = ffmpeg.input(input_audio)
-    ffmpeg.output(stream.video, audio.audio, save_path.replace('_noaudio.mp4', '.mp4'), vcodec='copy', acodec='aac').run()
     os.remove(save_path)
-    return save_path.replace('_noaudio.mp4', '.mp4')

 from scipy.spatial.transform import Rotation as R
 from scipy.interpolate import interp1d
+# from diffusers import AutoencoderKL, DDIMScheduler
+# from einops import repeat
 from omegaconf import OmegaConf
 from PIL import Image
 from torchvision import transforms
+# from transformers import CLIPVisionModelWithProjection
+# from src.models.pose_guider import PoseGuider
+# from src.models.unet_2d_condition import UNet2DConditionModel
+# from src.models.unet_3d import UNet3DConditionModel
+# from src.pipelines.pipeline_pose2vid_long import Pose2VideoPipeline
 from src.utils.util import save_videos_grid
+# from src.audio_models.model import Audio2MeshModel
 from src.utils.audio_util import prepare_audio_feature
+# from src.utils.mp_utils  import LMKExtractor
+# from src.utils.draw_util import FaceMeshVisualizer
 from src.utils.pose_util import project_points
+from src.utils.crop_face_single import crop_face
+from src.create_modules import lmk_extractor, vis, a2m_model, pipe
 def matrix_to_euler_and_translation(matrix):
     return smoothed_pose_seq
 def get_headpose_temp(input_video):
+    # lmk_extractor = LMKExtractor()
     cap = cv2.VideoCapture(input_video)
     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
     config = OmegaConf.load('./configs/prompts/animation_audio.yaml')
+    # if config.weight_dtype == "fp16":
+    #     weight_dtype = torch.float16
+    # else:
+    #     weight_dtype = torch.float32
     audio_infer_config = OmegaConf.load(config.audio_inference_config)
+    # # prepare model
+    # a2m_model = Audio2MeshModel(audio_infer_config['a2m_model'])
+    # a2m_model.load_state_dict(torch.load(audio_infer_config['pretrained_model']['a2m_ckpt']), strict=False)
+    # a2m_model.cuda().eval()
+    # vae = AutoencoderKL.from_pretrained(
+    #     config.pretrained_vae_path,
+    # ).to("cuda", dtype=weight_dtype)
+    # reference_unet = UNet2DConditionModel.from_pretrained(
+    #     config.pretrained_base_model_path,
+    #     subfolder="unet",
+    # ).to(dtype=weight_dtype, device="cuda")
+    # inference_config_path = config.inference_config
+    # infer_config = OmegaConf.load(inference_config_path)
+    # denoising_unet = UNet3DConditionModel.from_pretrained_2d(
+    #     config.pretrained_base_model_path,
+    #     config.motion_module_path,
+    #     subfolder="unet",
+    #     unet_additional_kwargs=infer_config.unet_additional_kwargs,
+    # ).to(dtype=weight_dtype, device="cuda")
+    # pose_guider = PoseGuider(noise_latent_channels=320, use_ca=True).to(device="cuda", dtype=weight_dtype) # not use cross attention
+    # image_enc = CLIPVisionModelWithProjection.from_pretrained(
+    #     config.image_encoder_path
+    # ).to(dtype=weight_dtype, device="cuda")
+    # sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)
+    # scheduler = DDIMScheduler(**sched_kwargs)
     generator = torch.manual_seed(seed)
     width, height = size, size
+    # # load pretrained weights
+    # denoising_unet.load_state_dict(
+    #     torch.load(config.denoising_unet_path, map_location="cpu"),
+    #     strict=False,
+    # )
+    # reference_unet.load_state_dict(
+    #     torch.load(config.reference_unet_path, map_location="cpu"),
+    # )
+    # pose_guider.load_state_dict(
+    #     torch.load(config.pose_guider_path, map_location="cpu"),
+    # )
+    # pipe = Pose2VideoPipeline(
+    #     vae=vae,
+    #     image_encoder=image_enc,
+    #     reference_unet=reference_unet,
+    #     denoising_unet=denoising_unet,
+    #     pose_guider=pose_guider,
+    #     scheduler=scheduler,
+    # )
+    # pipe = pipe.to("cuda", dtype=weight_dtype)
     date_str = datetime.now().strftime("%Y%m%d")
     time_str = datetime.now().strftime("%H%M")
     save_dir = Path(f"output/{date_str}/{save_dir_name}")
     save_dir.mkdir(exist_ok=True, parents=True)
+    # lmk_extractor = LMKExtractor()
+    # vis = FaceMeshVisualizer(forehead_edge=False)
     ref_image_np = cv2.cvtColor(ref_img, cv2.COLOR_RGB2BGR)
+    ref_image_np = crop_face(ref_image_np, lmk_extractor)
+    if ref_image_np is None:
+        return None, Image.fromarray(ref_img)
     ref_image_np = cv2.resize(ref_image_np, (size, size))
     ref_image_pil = Image.fromarray(cv2.cvtColor(ref_image_np, cv2.COLOR_BGR2RGB))
     face_result = lmk_extractor(ref_image_np)
     if face_result is None:
+        return None, ref_image_pil
     lmks = face_result['lmks'].astype(np.float32)
     ref_pose = vis.draw_landmarks((ref_image_np.shape[1], ref_image_np.shape[0]), lmks, normed=True)
         [transforms.Resize((height, width)), transforms.ToTensor()]
     )
     args_L = len(pose_images) if length==0 or length > len(pose_images) else length
+    args_L = min(args_L, 300)
     for pose_image_np in pose_images[: args_L]:
         pose_image_pil = Image.fromarray(cv2.cvtColor(pose_image_np, cv2.COLOR_BGR2RGB))
         pose_tensor_list.append(pose_transform(pose_image_pil))
     stream = ffmpeg.input(save_path)
     audio = ffmpeg.input(input_audio)
+    ffmpeg.output(stream.video, audio.audio, save_path.replace('_noaudio.mp4', '.mp4'), vcodec='copy', acodec='aac', shortest=None).run()
     os.remove(save_path)
+    return save_path.replace('_noaudio.mp4', '.mp4'), ref_image_pil

src/utils/crop_face_single.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import numpy as np
+import cv2
+def crop_face(img, lmk_extractor, expand=1.5):  # Crop a square region around the landmarks that lmk_extractor finds in img; returns None when the extractor returns None.
+    result = lmk_extractor(img)  # cv2 BGR image goes in; the result is indexed with 'lmks' below
+    if result is None:  # nothing detected -> signal failure to the caller
+        return None
+    H, W, _ = img.shape  # img.shape must have exactly three entries; the third is ignored
+    lmks = result['lmks']
+    lmks[:, 0] *= W  # scales column 0 by the image width in place (this mutates result['lmks']); presumably normalized x -> pixels, TODO confirm
+    lmks[:, 1] *= H  # scales column 1 by the image height in place; presumably normalized y -> pixels, TODO confirm
+    x_min = np.min(lmks[:, 0])  # axis-aligned bounding box over all landmark rows
+    x_max = np.max(lmks[:, 0])
+    y_min = np.min(lmks[:, 1])
+    y_max = np.max(lmks[:, 1])
+    width = x_max - x_min
+    height = y_max - y_min
+    center_x = x_min + width / 2  # box centre, kept fixed while the box is enlarged
+    center_y = y_min + height / 2
+    width *= expand  # enlarge the box by the expand factor (default 1.5)
+    height *= expand
+    size = max(width, height)  # use the longer side so the crop window is square
+    x_min = int(center_x - size / 2)  # int() truncates toward zero; values may be negative or exceed the image bounds
+    x_max = int(center_x + size / 2)
+    y_min = int(center_y - size / 2)
+    y_max = int(center_y + size / 2)
+    top = max(0, -y_min)  # amount by which the window sticks out past each image edge (0 if inside)
+    bottom = max(0, y_max - img.shape[0])
+    left = max(0, -x_min)
+    right = max(0, x_max - img.shape[1])
+    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=0)  # pad with constant 0 so the whole window lies inside the array
+    cropped_img = img[y_min + top:y_max + top, x_min + left:x_max + left]  # shift the window by the top/left padding to index the padded image
+    return cropped_img

src/vid2vid.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import argparse
 import os
 import shutil
 import ffmpeg
@@ -8,88 +7,89 @@ import numpy as np
 import cv2
 import torch
 import spaces
-from diffusers import AutoencoderKL, DDIMScheduler
-from einops import repeat
-from omegaconf import OmegaConf
 from PIL import Image
 from torchvision import transforms
-from transformers import CLIPVisionModelWithProjection
-from src.models.pose_guider import PoseGuider
-from src.models.unet_2d_condition import UNet2DConditionModel
-from src.models.unet_3d import UNet3DConditionModel
-from src.pipelines.pipeline_pose2vid_long import Pose2VideoPipeline
 from src.utils.util import get_fps, read_frames, save_videos_grid
-from src.utils.mp_utils  import LMKExtractor
-from src.utils.draw_util import FaceMeshVisualizer
 from src.utils.pose_util import project_points_with_trans, matrix_to_euler_and_translation, euler_and_translation_to_matrix
 from src.audio2vid import smooth_pose_seq
 @spaces.GPU
 def video2video(ref_img, source_video, size=512, steps=25, length=150, seed=42):
     cfg = 3.5
-    config = OmegaConf.load('./configs/prompts/animation_facereenac.yaml')
-    if config.weight_dtype == "fp16":
-        weight_dtype = torch.float16
-    else:
-        weight_dtype = torch.float32
-    vae = AutoencoderKL.from_pretrained(
-        config.pretrained_vae_path,
-    ).to("cuda", dtype=weight_dtype)
-    reference_unet = UNet2DConditionModel.from_pretrained(
-        config.pretrained_base_model_path,
-        subfolder="unet",
-    ).to(dtype=weight_dtype, device="cuda")
-    inference_config_path = config.inference_config
-    infer_config = OmegaConf.load(inference_config_path)
-    denoising_unet = UNet3DConditionModel.from_pretrained_2d(
-        config.pretrained_base_model_path,
-        config.motion_module_path,
-        subfolder="unet",
-        unet_additional_kwargs=infer_config.unet_additional_kwargs,
-    ).to(dtype=weight_dtype, device="cuda")
-    pose_guider = PoseGuider(noise_latent_channels=320, use_ca=True).to(device="cuda", dtype=weight_dtype) # not use cross attention
-    image_enc = CLIPVisionModelWithProjection.from_pretrained(
-        config.image_encoder_path
-    ).to(dtype=weight_dtype, device="cuda")
-    sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)
-    scheduler = DDIMScheduler(**sched_kwargs)
     generator = torch.manual_seed(seed)
     width, height = size, size
-    # load pretrained weights
-    denoising_unet.load_state_dict(
-        torch.load(config.denoising_unet_path, map_location="cpu"),
-        strict=False,
-    )
-    reference_unet.load_state_dict(
-        torch.load(config.reference_unet_path, map_location="cpu"),
-    )
-    pose_guider.load_state_dict(
-        torch.load(config.pose_guider_path, map_location="cpu"),
-    )
-    pipe = Pose2VideoPipeline(
-        vae=vae,
-        image_encoder=image_enc,
-        reference_unet=reference_unet,
-        denoising_unet=denoising_unet,
-        pose_guider=pose_guider,
-        scheduler=scheduler,
-    )
-    pipe = pipe.to("cuda", dtype=weight_dtype)
     date_str = datetime.now().strftime("%Y%m%d")
     time_str = datetime.now().strftime("%H%M")
@@ -99,24 +99,25 @@ def video2video(ref_img, source_video, size=512, steps=25, length=150, seed=42):
     save_dir.mkdir(exist_ok=True, parents=True)
-    lmk_extractor = LMKExtractor()
-    vis = FaceMeshVisualizer(forehead_edge=False)
     ref_image_np = cv2.cvtColor(ref_img, cv2.COLOR_RGB2BGR)
-    # TODO: 人脸检测+裁剪
     ref_image_np = cv2.resize(ref_image_np, (size, size))
     ref_image_pil = Image.fromarray(cv2.cvtColor(ref_image_np, cv2.COLOR_BGR2RGB))
     face_result = lmk_extractor(ref_image_np)
     if face_result is None:
-        return None
     lmks = face_result['lmks'].astype(np.float32)
     ref_pose = vis.draw_landmarks((ref_image_np.shape[1], ref_image_np.shape[0]), lmks, normed=True)
     source_images = read_frames(source_video)
     src_fps = get_fps(source_video)
@@ -134,6 +135,7 @@ def video2video(ref_img, source_video, size=512, steps=25, length=150, seed=42):
     bs_list = []
     src_tensor_list = []
     args_L = len(source_images) if length==0 or length*step > len(source_images) else length*step
     for src_image_pil in source_images[: args_L: step]:
         src_tensor_list.append(pose_transform(src_image_pil))
         src_img_np = cv2.cvtColor(np.array(src_image_pil), cv2.COLOR_RGB2BGR)
@@ -209,7 +211,7 @@ def video2video(ref_img, source_video, size=512, steps=25, length=150, seed=42):
         # merge audio and video
         stream = ffmpeg.input(save_path)
         audio = ffmpeg.input(audio_output)
-        ffmpeg.output(stream.video, audio.audio, save_path.replace('_noaudio.mp4', '.mp4'), vcodec='copy', acodec='aac').run()
         os.remove(save_path)
         os.remove(audio_output)
@@ -219,4 +221,4 @@ def video2video(ref_img, source_video, size=512, steps=25, length=150, seed=42):
             save_path.replace('_noaudio.mp4', '.mp4')
         )
-    return save_path.replace('_noaudio.mp4', '.mp4')

 import os
 import shutil
 import ffmpeg
 import cv2
 import torch
 import spaces
+# from diffusers import AutoencoderKL, DDIMScheduler
+# from einops import repeat
+# from omegaconf import OmegaConf
 from PIL import Image
 from torchvision import transforms
+# from transformers import CLIPVisionModelWithProjection
+# from src.models.pose_guider import PoseGuider
+# from src.models.unet_2d_condition import UNet2DConditionModel
+# from src.models.unet_3d import UNet3DConditionModel
+# from src.pipelines.pipeline_pose2vid_long import Pose2VideoPipeline
 from src.utils.util import get_fps, read_frames, save_videos_grid
+# from src.utils.mp_utils  import LMKExtractor
+# from src.utils.draw_util import FaceMeshVisualizer
 from src.utils.pose_util import project_points_with_trans, matrix_to_euler_and_translation, euler_and_translation_to_matrix
 from src.audio2vid import smooth_pose_seq
+from src.utils.crop_face_single import crop_face
+from src.create_modules import lmk_extractor, vis, pipe
 @spaces.GPU
 def video2video(ref_img, source_video, size=512, steps=25, length=150, seed=42):
     cfg = 3.5
+    # config = OmegaConf.load('./configs/prompts/animation_facereenac.yaml')
+    # if config.weight_dtype == "fp16":
+    #     weight_dtype = torch.float16
+    # else:
+    #     weight_dtype = torch.float32
+    # vae = AutoencoderKL.from_pretrained(
+    #     config.pretrained_vae_path,
+    # ).to("cuda", dtype=weight_dtype)
+    # reference_unet = UNet2DConditionModel.from_pretrained(
+    #     config.pretrained_base_model_path,
+    #     subfolder="unet",
+    # ).to(dtype=weight_dtype, device="cuda")
+    # inference_config_path = config.inference_config
+    # infer_config = OmegaConf.load(inference_config_path)
+    # denoising_unet = UNet3DConditionModel.from_pretrained_2d(
+    #     config.pretrained_base_model_path,
+    #     config.motion_module_path,
+    #     subfolder="unet",
+    #     unet_additional_kwargs=infer_config.unet_additional_kwargs,
+    # ).to(dtype=weight_dtype, device="cuda")
+    # pose_guider = PoseGuider(noise_latent_channels=320, use_ca=True).to(device="cuda", dtype=weight_dtype) # not use cross attention
+    # image_enc = CLIPVisionModelWithProjection.from_pretrained(
+    #     config.image_encoder_path
+    # ).to(dtype=weight_dtype, device="cuda")
+    # sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)
+    # scheduler = DDIMScheduler(**sched_kwargs)
     generator = torch.manual_seed(seed)
     width, height = size, size
+    # # load pretrained weights
+    # denoising_unet.load_state_dict(
+    #     torch.load(config.denoising_unet_path, map_location="cpu"),
+    #     strict=False,
+    # )
+    # reference_unet.load_state_dict(
+    #     torch.load(config.reference_unet_path, map_location="cpu"),
+    # )
+    # pose_guider.load_state_dict(
+    #     torch.load(config.pose_guider_path, map_location="cpu"),
+    # )
+    # pipe = Pose2VideoPipeline(
+    #     vae=vae,
+    #     image_encoder=image_enc,
+    #     reference_unet=reference_unet,
+    #     denoising_unet=denoising_unet,
+    #     pose_guider=pose_guider,
+    #     scheduler=scheduler,
+    # )
+    # pipe = pipe.to("cuda", dtype=weight_dtype)
     date_str = datetime.now().strftime("%Y%m%d")
     time_str = datetime.now().strftime("%H%M")
     save_dir.mkdir(exist_ok=True, parents=True)
+    # lmk_extractor = LMKExtractor()
+    # vis = FaceMeshVisualizer(forehead_edge=False)
     ref_image_np = cv2.cvtColor(ref_img, cv2.COLOR_RGB2BGR)
+    ref_image_np = crop_face(ref_image_np, lmk_extractor)
+    if ref_image_np is None:
+        return None, Image.fromarray(ref_img)
     ref_image_np = cv2.resize(ref_image_np, (size, size))
     ref_image_pil = Image.fromarray(cv2.cvtColor(ref_image_np, cv2.COLOR_BGR2RGB))
     face_result = lmk_extractor(ref_image_np)
     if face_result is None:
+        return None, ref_image_pil
     lmks = face_result['lmks'].astype(np.float32)
     ref_pose = vis.draw_landmarks((ref_image_np.shape[1], ref_image_np.shape[0]), lmks, normed=True)
     source_images = read_frames(source_video)
     src_fps = get_fps(source_video)
     bs_list = []
     src_tensor_list = []
     args_L = len(source_images) if length==0 or length*step > len(source_images) else length*step
+    args_L = min(args_L, 300*step)
     for src_image_pil in source_images[: args_L: step]:
         src_tensor_list.append(pose_transform(src_image_pil))
         src_img_np = cv2.cvtColor(np.array(src_image_pil), cv2.COLOR_RGB2BGR)
         # merge audio and video
         stream = ffmpeg.input(save_path)
         audio = ffmpeg.input(audio_output)
+        ffmpeg.output(stream.video, audio.audio, save_path.replace('_noaudio.mp4', '.mp4'), vcodec='copy', acodec='aac', shortest=None).run()
         os.remove(save_path)
         os.remove(audio_output)
             save_path.replace('_noaudio.mp4', '.mp4')
         )
+    return save_path.replace('_noaudio.mp4', '.mp4'), ref_image_pil