"""Gradio demo for V-Express: generate talking-face videos from a reference image and audio."""
import os
import shutil
import subprocess

import gradio as gr

from inference import InferenceEngine
from sequence_utils import extract_kps_sequence_from_video

output_dir = "output"
temp_audio_path = "temp.mp3"

# Make sure the output directory exists before anything is written into it.
os.makedirs(output_dir, exist_ok=True)

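# Static model and checkpoint configuration handed to the InferenceEngine once at
# startup. The commented-out keys are per-request options (retargeting strategy,
# input/output paths, attention weights) that the demo supplies at inference time.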
DEFAULT_MODEL_ARGS = {
    'unet_config_path': './model_ckpts/stable-diffusion-v1-5/unet/config.json',
    'vae_path': './model_ckpts/sd-vae-ft-mse/',
    'audio_encoder_path': './model_ckpts/wav2vec2-base-960h/',
    'insightface_model_path': './model_ckpts/insightface_models/',
    'denoising_unet_path': './model_ckpts/v-express/denoising_unet.pth',
    'reference_net_path': './model_ckpts/v-express/reference_net.pth',
    'v_kps_guider_path': './model_ckpts/v-express/v_kps_guider.pth',
    'audio_projection_path': './model_ckpts/v-express/audio_projection.pth',
    'motion_module_path': './model_ckpts/v-express/motion_module.pth',
    #'retarget_strategy': 'fix_face',  # fix_face, no_retarget, offset_retarget, naive_retarget
    'device': 'cuda',
    'gpu_id': 0,
    'dtype': 'fp16',
    'num_pad_audio_frames': 2,
    'standard_audio_sampling_rate': 16000,
    #'reference_image_path': './test_samples/emo/talk_emotion/ref.jpg',
    #'audio_path': './test_samples/emo/talk_emotion/aud.mp3',
    #'kps_path': './test_samples/emo/talk_emotion/kps.pth',
    #'output_path': './output/emo/talk_emotion.mp4',
    'image_width': 512,
    'image_height': 512,
    'fps': 30.0,
    'seed': 42,
    'num_inference_steps': 25,
    'guidance_scale': 3.5,
    'context_frames': 12,
    'context_stride': 1,
    'context_overlap': 4,
    #'reference_attention_weight': 0.95,
    #'audio_attention_weight': 3.0
}

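# Build the inference engine once at module load so the model weights are loaded
# a single time and reused for every demo request.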
INFERENCE_ENGINE = InferenceEngine(DEFAULT_MODEL_ARGS)

def infer(reference_image, audio_path, kps_sequence_save_path,
        output_path,
        retarget_strategy,
        reference_attention_weight, audio_attention_weight):
    # Thin wrapper around the preloaded engine; returns the generated video path and
    # the keypoint sequence file so both can be exposed as Gradio outputs.
    INFERENCE_ENGINE.infer(
        reference_image, audio_path, kps_sequence_save_path,
        output_path,
        retarget_strategy,
        reference_attention_weight, audio_attention_weight
    )
    return output_path, kps_sequence_save_path

# Run the full V-Express demo: optionally extract keypoints and audio from a driving
# video, then generate the talking-face video from the reference image and audio.
def run_demo(
        reference_image, audio, video,
        kps_path, output_path, retarget_strategy,
        reference_attention_weight=0.95,
        audio_attention_weight=3.0,
        progress=gr.Progress()):
    # Step 1: Extract Keypoints from Video
    progress((0,100), desc="Starting...")

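    # The extracted (or user-supplied) keypoint sequence is staged at this fixed path.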
    kps_sequence_save_path = f"{output_dir}/kps.pth"

    if video is not None:
        # Extract keypoints and the audio track from the driving video.
        progress((25,100), desc="Extracting keypoints and audio...")
        audio_path = video.replace(".mp4", ".mp3")

        extract_kps_sequence_from_video(
            INFERENCE_ENGINE.app,
            video,
            audio_path,
            kps_sequence_save_path
        )

        progress((50,100), desc="Keypoints and audio extracted successfully.")
        rem_progress = (75,100)
    else:
        # No driving video supplied: use the uploaded audio and the provided KPS file directly.
        rem_progress = (50,100)
        audio_path = audio
        shutil.copy(kps_path.name, kps_sequence_save_path)

    subprocess.run(["ffmpeg", "-i", audio_path, "-c:v", "libx264", "-crf", "18", "-preset", "slow", temp_audio_path])
    shutil.move(temp_audio_path, audio_path)
    
    # Step 2: Run Inference with Reference Image and Audio
    # The selected retargeting strategy and attention weights are forwarded to the engine.
    progress(rem_progress, desc="Inference...")

    output_path, kps_sequence_save_path = infer(
        reference_image, audio_path, kps_sequence_save_path,
        output_path,
        retarget_strategy,
        reference_attention_weight, audio_attention_weight
    )

    status = f"Video generated successfully. Saved at: {output_path}"
    progress((100,100), desc=status)
    return output_path, kps_sequence_save_path

# Create Gradio interface
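# The order of the input components below must match the positional parameters of run_demo.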
inputs = [
    gr.Image(label="Reference Image", type="filepath"),
    gr.Audio(label="Audio", type="filepath"),
    gr.Video(label="Video"),
    gr.File(label="KPS sequences", value=f"test_samples/short_case/10/kps.pth"),
    gr.Textbox(label="Output Path for generated video", value=f"{output_dir}/output_video.mp4"),
    gr.Dropdown(label="Retargeting Strategy", choices=["no_retarget", "fix_face", "offset_retarget", "naive_retarget"], value="no_retarget"),
    gr.Slider(label="Reference Attention Weight", minimum=0.0, maximum=1.0, step=0.01, value=0.95),
    gr.Slider(label="Audio Attention Weight", minimum=1.0, maximum=5.0, step=0.1, value=3.0)
]

output = [
    gr.Video(label="Generated Video"),
    gr.File(label="Generated KPS Sequences File (kps.pth)")
]

# Title and description for the interface
title = "V-Express Gradio Interface"
description = "An interactive interface for generating talking face videos using V-Express."

# Launch Gradio app
demo = gr.Interface(run_demo, inputs, output, title=title, description=description)
demo.queue().launch()