import logging

import cv2
import numpy as np
import torch
from insightface.app import FaceAnalysis
from moviepy.editor import AudioFileClip, VideoClip
from PIL import Image
from torchvision import transforms

logger = logging.getLogger(__name__)


def tensor_to_video(tensor, output_video_path, input_audio_path, fps=30):
    """
    Converts a Tensor with shape [c, f, h, w] into a video and adds an audio
    track from the specified audio file.

    The output is trimmed to the shorter of the video duration
    (``f / fps``) and the audio duration.

    Args:
        tensor (Tensor): The Tensor to be converted, shaped [c, f, h, w].
            Values are scaled by 255 and clipped to [0, 255], so they are
            expected to lie in [0, 1].
        output_video_path (str): The file path where the output video will be saved.
        input_audio_path (str): The path to the audio file (WAV file) that
            contains the audio track to be added.
        fps (int): The frame rate of the output video. Default is 30 fps.
    """
    # [c, f, h, w] -> [f, h, w, c]; detach so tensors that still carry
    # autograd history can be converted to NumPy.
    frames = tensor.detach().permute(1, 2, 3, 0).cpu().numpy()
    frames = np.clip(frames * 255, 0, 255).astype(np.uint8)  # to [0, 255]
    last_index = frames.shape[0] - 1

    def make_frame(t):
        # Clamp so a timestamp at (or just past) the end maps to the last frame.
        return frames[min(int(t * fps), last_index)]

    video_duration = frames.shape[0] / fps

    audio_clip = AudioFileClip(input_audio_path)
    try:
        final_duration = min(video_duration, audio_clip.duration)
        trimmed_audio = audio_clip.subclip(0, final_duration)
        video_clip = VideoClip(make_frame, duration=final_duration)
        video_clip = video_clip.set_audio(trimmed_audio)
        video_clip.write_videofile(
            output_video_path,
            fps=fps,
            codec="libx264",
            audio_codec="aac",
        )
    finally:
        # Release the audio reader even when encoding fails.
        audio_clip.close()


def _bbox_area(face):
    """Return the area of a detected face's bounding box (x1, y1, x2, y2)."""
    x1, y1, x2, y2 = face["bbox"][:4]
    return (x2 - x1) * (y2 - y1)


@torch.no_grad()
def preprocess_image(face_analysis_model: str, image_path: str, image_size: int = 512):
    """
    Preprocess the image and extract face embedding.

    The image is resized to ``image_size x image_size``, converted to a
    tensor and normalized with mean 0.5 / std 0.5. Face detection runs on the
    original (un-resized) image; when several faces are found, the one with
    the largest bounding box is used. When no face is found, a warning is
    logged and a zero vector of length 512 is returned as the embedding.

    Args:
        face_analysis_model (str): Path to the FaceAnalysis model directory.
        image_path (str): Path to the image file.
        image_size (int, optional): Target size for resizing the image. Default is 512.

    Returns:
        tuple: A tuple containing:
            - pixel_values (torch.Tensor): Tensor of the preprocessed image,
              with a leading batch dimension of 1.
            - face_emb (torch.Tensor): Tensor of the face embedding, shaped [1, d].
    """
    # Define the image transformation
    transform = transforms.Compose(
        [
            transforms.Resize((image_size, image_size)),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5]),
        ]
    )

    # Initialize the FaceAnalysis model
    face_analysis = FaceAnalysis(
        root=face_analysis_model,
        providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
    )
    face_analysis.prepare(ctx_id=0, det_size=(640, 640))

    # Load the image; the context manager closes the underlying file handle
    # once the RGB copy has been made.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    pixel_values = transform(image).unsqueeze(0)

    # Detect faces and extract the face embedding (the detector is fed BGR)
    image_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    faces = face_analysis.get(image_bgr)

    if not faces:
        logger.warning("No faces detected in the image. Using a zero vector as the face embedding.")
        # float32 keeps the fallback dtype in line with detected embeddings
        # (presumably float32 from the recognition model — confirm), instead
        # of NumPy's float64 default.
        face_emb = np.zeros(512, dtype=np.float32)
    else:
        # Select the largest face by bounding-box area
        face_emb = max(faces, key=_bbox_area)["embedding"]

    # Convert face embedding to a PyTorch tensor
    face_emb = torch.tensor(face_emb.reshape(1, -1))

    del face_analysis

    return pixel_values, face_emb