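"""Gradio web UI for the V-Express image-to-talking-video pipeline.

Takes a reference face image plus a target video or audio file, optionally
auto-crops the face with RetinaFace, and shells out to inference.py to
produce the final talking-head video in the outputs folder.
"""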
import os
import re
import sys
import argparse
import platform
import subprocess
from datetime import datetime

import filetype
import gradio as gr
import torch
from PIL import Image
from retinaface import RetinaFace


def open_folder():
    """Open the outputs folder in the platform's file browser."""
    open_folder_path = os.path.abspath("outputs")
    if platform.system() == "Windows":
        os.startfile(open_folder_path)
    elif platform.system() == "Linux":
        os.system(f'xdg-open "{open_folder_path}"')
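    elif platform.system() == "Darwin":
        # Presumably macOS wants the same behavior; `open` is the standard
        # macOS counterpart of xdg-open.
        subprocess.call(["open", open_folder_path])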

python_executable = sys.executable


def display_media(file):
    """Preview the selected target file in the matching widget (video or audio)."""
    if file is None:
        return gr.update(visible=False), gr.update(visible=False)

    # gr.File(type="filepath") hands the event a plain path string.
    kind = filetype.guess(file)

    if kind is None:
        return gr.update(visible=False), gr.update(visible=False)

    if kind.mime.startswith('video'):
        return gr.update(value=file, visible=True), gr.update(visible=False)
    elif kind.mime.startswith('audio'):
        return gr.update(visible=False), gr.update(value=file, visible=True)
    else:
        return gr.update(visible=False), gr.update(visible=False)


parser = argparse.ArgumentParser()
parser.add_argument("--share", action="store_true", help="Pass to share the app publicly.")
args = parser.parse_args()
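
# Example launch (the script filename here is illustrative):
#   python v_express_app.py          # serves the UI locally
#   python v_express_app.py --share  # also creates a public Gradio link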


def extract_audio(video_path, audio_path):
    """Extract a video's audio track to MP3 (assumes ffmpeg is on PATH)."""
    command = ["ffmpeg", "-i", video_path, "-vn", "-acodec", "libmp3lame", "-q:a", "2", audio_path]
    subprocess.call(command)


def convert_audio_to_mp3(audio_path, mp3_path):
    """Re-encode an audio file to MP3 so downstream steps see one format."""
    command = ["ffmpeg", "-i", audio_path, "-acodec", "libmp3lame", "-q:a", "2", mp3_path]
    subprocess.call(command)


def crop_and_save_image(image_path, auto_crop, crop_width, crop_height, crop_expansion):
    """Crop image_path around the detected face and save a numbered copy under outputs/cropped_images."""
    # auto_crop is accepted to mirror the UI inputs but is not consulted here.
    cropped_image = auto_crop_image(image_path, crop_expansion, crop_size=(crop_width, crop_height))
    if cropped_image is not None:
        cropped_folder = os.path.join("outputs", "cropped_images")
        os.makedirs(cropped_folder, exist_ok=True)

        base_name, extension = os.path.splitext(os.path.basename(image_path))

        # Find the first free _0001-style suffix so nothing is overwritten.
        counter = 1
        new_image_name = f"{base_name}_{counter:04d}{extension}"
        cropped_image_path = os.path.join(cropped_folder, new_image_name)
        while os.path.exists(cropped_image_path):
            counter += 1
            new_image_name = f"{base_name}_{counter:04d}{extension}"
            cropped_image_path = os.path.join(cropped_folder, new_image_name)

        cropped_image.save(cropped_image_path, format='PNG')
        return cropped_image_path
    return None


def generate_kps_sequence_and_audio(video_path, kps_sequence_save_path, audio_save_path):
    command = [
        python_executable, "scripts/extract_kps_sequence_and_audio.py",
        "--video_path", video_path,
        "--kps_sequence_save_path", kps_sequence_save_path,
        "--audio_save_path", audio_save_path,
    ]
    subprocess.call(command)


def auto_crop_image(image_path, expand_percent, crop_size=(512, 512)):
    # Informational only: `device` is never used afterwards; this check just
    # reports whether torch can see a CUDA GPU.
    if torch.cuda.is_available():
        device = 'cuda'
        print("Using GPU for RetinaFace detection.")
    else:
        device = 'cpu'
        print("Using CPU for RetinaFace detection.")

    img = Image.open(image_path)

    faces = RetinaFace.detect_faces(image_path)

    if not faces:
        print("No faces detected.")
        return None

    # Use the first detected face and its five-point landmarks.
    face = list(faces.values())[0]
    landmarks = face['landmarks']

    right_eye = landmarks['right_eye']
    left_eye = landmarks['left_eye']
    right_mouth = landmarks['mouth_right']
    left_mouth = landmarks['mouth_left']
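
    # Head-box heuristic used below: assume the head spans roughly 4.5
    # interocular distances in width and 6.5 in height, centered between
    # the eyes.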
    eye_distance = abs(right_eye[0] - left_eye[0])

    head_width = eye_distance * 4.5
    head_height = eye_distance * 6.5

    eye_center_x = (right_eye[0] + left_eye[0]) // 2
    eye_center_y = (right_eye[1] + left_eye[1]) // 2

    head_left = max(0, int(eye_center_x - head_width // 2))
    head_top = max(0, int(eye_center_y - head_height // 2))
    head_right = min(img.width, int(eye_center_x + head_width // 2))
    head_bottom = min(img.height, int(eye_center_y + head_height // 2))

    # Debug artifact written to the working directory.
    assumed_head_img = img.crop((head_left, head_top, head_right, head_bottom))
    assumed_head_img.save("assumed_head.png", format='PNG')

    # Expand the head box by expand_percent around its center.
    expanded_w = int(head_width * (1 + expand_percent))
    expanded_h = int(head_height * (1 + expand_percent))

    center_x, center_y = head_left + head_width // 2, head_top + head_height // 2
    left = max(0, center_x - expanded_w // 2)
    right = min(img.width, center_x + expanded_w // 2)
    top = max(0, center_y - expanded_h // 2)
    bottom = min(img.height, center_y + expanded_h // 2)

    cropped_img = img.crop((left, top, right, bottom))
    cropped_img.save("expanded_face.png", format='PNG')
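
    # Center-crop the expanded box to the requested aspect ratio before
    # resizing, so the final resize does not stretch the face.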
    cropped_width, cropped_height = cropped_img.size
    aspect_ratio = cropped_width / cropped_height

    target_width = crop_size[0]
    target_height = crop_size[1]

    if aspect_ratio > target_width / target_height:
        # Too wide: trim the sides.
        new_width = int(cropped_height * target_width / target_height)
        left_crop = (cropped_width - new_width) // 2
        right_crop = left_crop + new_width
        top_crop = 0
        bottom_crop = cropped_height
    else:
        # Too tall: trim the top and bottom.
        new_height = int(cropped_width * target_height / target_width)
        top_crop = (cropped_height - new_height) // 2
        bottom_crop = top_crop + new_height
        left_crop = 0
        right_crop = cropped_width

    final_cropped_img = cropped_img.crop((left_crop, top_crop, right_crop, bottom_crop))
    final_cropped_img.save("final_cropped_img.png", format='PNG')

    resized_img = final_cropped_img.resize(crop_size, resample=Image.LANCZOS)

    # Overwrite the input image in place; callers reuse image_path afterwards.
    resized_img.save(image_path, format='PNG')
    return resized_img


def generate_output_video(reference_image_path, audio_path, kps_path, output_path, retarget_strategy, num_inference_steps, reference_attention_weight, audio_attention_weight, auto_crop, crop_width, crop_height, crop_expansion, image_width, image_height, low_vram):
    if auto_crop:
        print("auto cropping...")
        auto_crop_image(reference_image_path, crop_expansion, crop_size=(crop_width, crop_height))

    print("starting inference...")
    command = [
        python_executable, "inference.py",
        "--reference_image_path", reference_image_path,
        "--audio_path", audio_path,
        "--kps_path", kps_path,
        "--output_path", output_path,
        "--retarget_strategy", retarget_strategy,
        "--num_inference_steps", str(num_inference_steps),
        "--reference_attention_weight", str(reference_attention_weight),
        "--audio_attention_weight", str(audio_attention_weight),
        "--image_width", str(image_width),
        "--image_height", str(image_height)
    ]

    if low_vram:
        command.append("--save_gpu_memory")

    # Log the exact command for debugging and reproducibility.
    with open("executed_command.txt", "w") as file:
        file.write(" ".join(command))

    subprocess.call(command)
    return output_path, reference_image_path


def sanitize_folder_name(name):
    # Replace characters that are invalid in Windows file and folder names.
    invalid_chars = r'[<>:"/\\|?*\x00-\x1F]'
    sanitized_name = re.sub(invalid_chars, '_', name)
    return sanitized_name
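
# For example, sanitize_folder_name('clip:01?final') returns 'clip_01_final'.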


def process_input(reference_image, target_input, retarget_strategy, num_inference_steps, reference_attention_weight, audio_attention_weight, auto_crop, crop_width, crop_height, crop_expansion, image_width, image_height, low_vram):
    temp_process_dir = "temp_process"
    os.makedirs(temp_process_dir, exist_ok=True)

    input_file_name = os.path.splitext(os.path.basename(reference_image))[0]
    input_file_name = sanitize_folder_name(input_file_name)
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    temp_dir = os.path.join(temp_process_dir, f"{input_file_name}_{timestamp}")
    os.makedirs(temp_dir, exist_ok=True)

    kind = filetype.guess(target_input)
    if not kind:
        raise ValueError("Cannot determine file type. Please provide a valid video or audio file.")

    mime_type = kind.mime

    if mime_type.startswith("video/"):
        # Video target: extract both the keypoint sequence and the audio track.
        audio_path = os.path.join(temp_dir, "target_audio.mp3")
        kps_path = os.path.join(temp_dir, "kps.pth")
        print("generating kps sequence and audio...")
        generate_kps_sequence_and_audio(target_input, kps_path, audio_path)
    elif mime_type.startswith("audio/"):
        # Audio target: no keypoint sequence; convert to MP3 if necessary.
        audio_path = target_input
        if mime_type != "audio/mpeg":
            mp3_path = os.path.join(temp_dir, "target_audio_converted.mp3")
            convert_audio_to_mp3(target_input, mp3_path)
            audio_path = mp3_path
        kps_path = ""
    else:
        raise ValueError("Unsupported file type. Please provide a video or audio file.")

    output_dir = "outputs"
    os.makedirs(output_dir, exist_ok=True)
    output_file_name = sanitize_folder_name(f"{input_file_name}_result_")
    output_file_ext = ".mp4"

    # Pick the first free outputs/<name>_result_0001.mp4-style path.
    output_file_count = 1
    while os.path.exists(os.path.join(output_dir, f"{output_file_name}{output_file_count:04d}{output_file_ext}")):
        output_file_count += 1
    output_path = os.path.join(output_dir, f"{output_file_name}{output_file_count:04d}{output_file_ext}")

    output_video_path, cropped_image_path = generate_output_video(reference_image, audio_path, kps_path, output_path, retarget_strategy, num_inference_steps, reference_attention_weight, audio_attention_weight, auto_crop, crop_width, crop_height, crop_expansion, image_width, image_height, low_vram)

    return output_video_path, cropped_image_path


def launch_interface():
    retarget_strategies = ["fix_face", "no_retarget", "offset_retarget", "naive_retarget"]

    with gr.Blocks() as demo:
        gr.Markdown("# Tencent AI Lab - V-Express Image to Animation V4 : https://www.patreon.com/posts/105251204")
        with gr.Row():
            with gr.Column():
                input_image = gr.Image(label="Reference Image", format="png", type="filepath", height=512)
                generate_button = gr.Button("Generate Talking Video")
                low_vram = gr.Checkbox(label="Low VRAM - Greatly reduces VRAM usage but takes longer", value=False, visible=False)
                crop_button = gr.Button("Crop Image")
                with gr.Row():
                    with gr.Column(min_width=0):
                        image_width = gr.Number(label="Target Video Width", value=512)
                    with gr.Column(min_width=0):
                        image_height = gr.Number(label="Target Video Height", value=512)

                with gr.Row():
                    with gr.Column(min_width=0):
                        retarget_strategy = gr.Dropdown(retarget_strategies, label="Retarget Strategy", value="fix_face")
                    with gr.Column(min_width=0):
                        inference_steps = gr.Slider(10, 90, step=1, label="Number of Inference Steps", value=30)

                with gr.Row():
                    with gr.Column(min_width=0):
                        reference_attention = gr.Slider(0.80, 1.1, step=0.01, label="Reference Attention Weight", value=0.95)
                    with gr.Column(min_width=0):
                        audio_attention = gr.Slider(1.0, 5.0, step=0.1, label="Audio Attention Weight", value=3.0)

                with gr.Row(visible=True) as crop_size_row:
                    with gr.Column(min_width=0):
                        auto_crop = gr.Checkbox(label="Auto Crop Image", value=True)
                    with gr.Column(min_width=0):
                        crop_expansion = gr.Slider(0.0, 1.0, step=0.01, label="Face Focus Expansion Percent", value=0.15)
                with gr.Row():
                    with gr.Column(min_width=0):
                        crop_width = gr.Number(label="Crop Width", value=512)
                    with gr.Column(min_width=0):
                        crop_height = gr.Number(label="Crop Height", value=512)
            with gr.Column():
                input_video = gr.File(
                    label="Target Input (Video or Audio)",
                    type="filepath",
                    file_count="single",
                    file_types=[
                        ".mp4", ".avi", ".mov", ".wmv", ".flv", ".mkv", ".webm",
                        ".3gp", ".m4v", ".mpg", ".mpeg", ".m2v", ".mts",
                        ".mp3", ".wav", ".aac", ".flac", ".m4a", ".wma", ".ogg"
                    ],
                    height=512)
                video_output = gr.Video(visible=False)
                audio_output = gr.Audio(visible=False)

                input_video.change(display_media, inputs=input_video, outputs=[video_output, audio_output])
                btn_open_outputs = gr.Button("Open Outputs Folder")
                btn_open_outputs.click(fn=open_folder)
                gr.Markdown("""
Retarget Strategies

Only target audio: fix_face

Input picture and target video of the same person (best practice): no_retarget

Input picture and target video of a different person: offset_retarget or naive_retarget

Please look at the examples in the Tests folder to see which settings you like most. I feel like offset_retarget is best.

You can turn up reference_attention_weight to make the model maintain higher character consistency, and turn down audio_attention_weight to reduce mouth artifacts, e.g. by setting both values to 1.0.
""")

            with gr.Column():
                output_video = gr.Video(label="Generated Video", height=512)
                output_image = gr.Image(label="Cropped Image")

        generate_button.click(
            fn=process_input,
            inputs=[
                input_image,
                input_video,
                retarget_strategy,
                inference_steps,
                reference_attention,
                audio_attention,
                auto_crop,
                crop_width,
                crop_height,
                crop_expansion,
                image_width,
                image_height,
                low_vram
            ],
            outputs=[output_video, output_image]
        )

        crop_button.click(
            fn=crop_and_save_image,
            inputs=[
                input_image,
                auto_crop,
                crop_width,
                crop_height,
                crop_expansion
            ],
            outputs=output_image
        )

    demo.queue()
    demo.launch(inbrowser=True, share=args.share)


if __name__ == "__main__":
    launch_interface()