"""Gradio demo for JoyHallo: audio-driven talking-head generation.

At start-up the script downloads the pretrained weights from the Hugging
Face Hub into ``pretrained_models/`` next to this file, then serves a small
UI that takes a face image plus a driving audio clip and shows the video
returned by ``scripts.inference.predict``.
"""

# NOTE(review): `spaces` is deliberately kept as the very first import, as in
# the original file; the ZeroGPU runtime is understood to require being
# imported before anything initialises CUDA. Confirm before reordering.
import spaces
import os
import shutil

from huggingface_hub import snapshot_download
import gradio as gr

# Resolve every relative path below (model folders, the `scripts` package)
# against the directory that contains this file, whatever the launch cwd was.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Fetch the model repositories into fixed local folders. `snapshot_download`
# returns the path of the folder it populated.
hallo_dir = snapshot_download(
    repo_id="fudan-generative-ai/hallo",
    local_dir="pretrained_models",
)
joyhallo_dir = snapshot_download(
    repo_id="jdh-algo/JoyHallo-v1",
    local_dir="pretrained_models/joyhallo",
)
wav_dir = snapshot_download(
    repo_id="TencentGameMate/chinese-wav2vec2-base",
    local_dir="pretrained_models/chinese-wav2vec2-base",
)

# Start-up diagnostics: show where the weights landed and what was fetched.
print(hallo_dir, joyhallo_dir)
print(os.listdir(hallo_dir))

# Imported only after the downloads above, as in the original ordering.
# NOTE(review): presumably the inference module expects the weights to be on
# disk when it is imported -- confirm before moving this to the top.
from scripts.inference import predict


@spaces.GPU(duration=120)
def run_inference(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
    """Run the talking-head pipeline for one image/audio pair.

    Args:
        source_image: Value of the "Face" image input; the component is
            configured with ``type="filepath"``.
        driving_audio: Value of the "Driving audio" input; the component is
            configured with ``type="filepath"``.
        progress: Gradio progress tracker, created with ``track_tqdm=True``.
            It is not used directly in the body; Gradio injects it.

    Returns:
        Whatever ``predict`` returns; the UI passes it to the video output.
    """
    # The four numeric arguments are fixed for the demo.
    # NOTE(review): presumably weighting / scale settings of the pipeline --
    # confirm their meaning against scripts.inference.predict.
    return predict(source_image, driving_audio, 1.0, 1.0, 1.0, 1.2)


css = '''
div#warning-ready {
    background-color: #ecfdf5;
    padding: 0 16px 16px;
    margin: 20px 0;
    color: #030303!important;
}
div#warning-ready > .gr-prose > h2, div#warning-ready > .gr-prose > p {
    color: #057857!important;
}
div#warning-duplicate {
    background-color: #ebf5ff;
    padding: 0 16px 16px;
    margin: 20px 0;
    color: #030303!important;
}
div#warning-duplicate > .gr-prose > h2, div#warning-duplicate > .gr-prose > p {
    color: #0f4592!important;
}
div#warning-duplicate strong {
    color: #0f4592;
}
p.actions {
    display: flex;
    align-items: center;
    margin: 20px 0;
}
div#warning-duplicate .actions a {
    display: inline-block;
    margin-right: 10px;
}
.dark #warning-duplicate {
    background-color: #0c0c0c !important;
    border: 1px solid white !important;
}
'''

# Input guidelines shown above the form. Kept at column 0 so that no line is
# indented far enough for Markdown to treat it as a code block, and with one
# list item per line so the numbered lists render as lists.
DATA_REQUIREMENTS_MD = """
Image:
1. Cropped to square shape.
2. Face should be facing forward and occupy 50%-70% of the image.

Audio:
1. Use wav format.
2. Mandarin, English or mixed, with clear audio and suitable background music.

!
Important: Too long audio will cause a very long processing time, please keep the audio length within 5s.
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown("# JoyHallo: Digital human model for Mandarin")
    gr.Markdown("Generate talking head avatars driven by Mandarin speech. Data requirements:")
    gr.Markdown(DATA_REQUIREMENTS_MD)
    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column():
            avatar_face = gr.Image(type="filepath", label="Face")
            driving_audio = gr.Audio(type="filepath", label="Driving audio")
            generate = gr.Button("Generate")
        # Right column: the generated result.
        with gr.Column():
            output_video = gr.Video(label="Your talking head")

    generate.click(
        fn=run_inference,
        inputs=[avatar_face, driving_audio],
        outputs=output_video,
    )

demo.launch()