# NOTE: stray page-header text, kept as a comment so the module parses: "Spaces: Sleeping Sleeping"
import gc
import os
import platform
import shutil
import subprocess
import sys
import uuid

import torch

# from src.facerender.pirender_animate import AnimateFromCoeff_PIRender
from src.utils.preprocess import CropAndExtract
from src.test_audio2coeff import Audio2Coeff
from src.facerender.animate import AnimateFromCoeff
from src.generate_batch import get_data
from src.generate_facerender_batch import get_facerender_data
from src.utils.init_path import init_path
# from pydub import AudioSegment | |
# def mp3_to_wav(mp3_filename,wav_filename,frame_rate): | |
# mp3_file = AudioSegment.from_file(file=mp3_filename) | |
# mp3_file.set_frame_rate(frame_rate).export(wav_filename,format="wav") | |
class SadTalker():
    """Drive the SadTalker models to turn a face image plus audio into a video.

    The instance owns the audio-to-coefficient model and the face renderer.
    ``test`` renders from already-extracted face coefficients using the models
    built in ``__init__``; ``test2`` runs the whole pipeline (copying inputs,
    face preprocessing, optional reference video, rendering) and rebuilds the
    models for the requested ``size`` / ``preprocess`` on every call.
    """

    # Values of ``ref_info`` that ``test2`` understands when ``use_ref_video`` is set.
    _REF_INFO_MODES = ('pose', 'blink', 'pose+blink', 'all')

    def __init__(self, checkpoint_path='checkpoints', config_path='src/config', lazy_load=False):
        """Pick a torch device and build the default (size 256, 'crop') models.

        Args:
            checkpoint_path: Directory handed to ``init_path``; also exported
                as the ``TORCH_HOME`` environment variable.
            config_path: Configuration directory handed to ``init_path``.
            lazy_load: Accepted for interface compatibility; not used here.
        """
        if torch.cuda.is_available():
            device = "cuda"
        elif platform.system() == 'Darwin' and self._mps_available():  # macos
            device = "mps"
        else:
            # Also reached on macOS hosts whose torch build has no usable MPS
            # backend, instead of selecting a device that cannot be used.
            device = "cpu"
        self.device = device

        os.environ['TORCH_HOME'] = checkpoint_path
        self.checkpoint_path = checkpoint_path
        self.config_path = config_path

        self.sadtalker_paths = init_path(checkpoint_path, self.config_path, 256, False, 'crop')
        self.animate_from_coeff = AnimateFromCoeff(self.sadtalker_paths, self.device)
        self.audio_to_coeff = Audio2Coeff(self.sadtalker_paths, self.device)

    @staticmethod
    def _mps_available():
        """Return True when this torch build exposes a usable MPS backend."""
        mps_backend = getattr(torch.backends, 'mps', None)
        return mps_backend is not None and mps_backend.is_available()

    @staticmethod
    def _release_memory():
        """Empty the CUDA cache (when CUDA is available) and run the garbage collector."""
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
        gc.collect()

    def test(self,
             pic_path,
             crop_pic_path,
             first_coeff_path,
             crop_info,
             source_image, driven_audio, preprocess='crop',
             still_mode=False, use_enhancer=False, batch_size=1, size=256,
             pose_style=0,
             facerender='facevid2vid',
             exp_scale=1.0,
             use_ref_video=False,
             ref_video=None,
             ref_info=None,
             use_idle_mode=False,
             length_of_audio=0, use_blink=True, fps=20,
             result_dir='./results/'):
        """Render a video from face data the caller has already extracted.

        No face preprocessing or input copying happens here: the caller passes
        ``pic_path``, ``crop_pic_path``, ``first_coeff_path`` and ``crop_info``
        directly, and the models created in ``__init__`` are reused. No
        reference video is applied (both reference coefficient paths are
        ``None``).

        Args:
            pic_path: Source picture path forwarded to the renderer.
            crop_pic_path: Cropped picture path forwarded to ``get_facerender_data``.
            first_coeff_path: Coefficient file of the source face.
            crop_info: Crop information forwarded to the renderer.
            source_image, use_ref_video, ref_video, ref_info: Accepted for
                signature compatibility; not used by this method.
            driven_audio: Audio path, used as-is.
            preprocess, still_mode, batch_size, size, pose_style, facerender,
                exp_scale, use_idle_mode, length_of_audio, use_blink, fps:
                Forwarded to the batch builders / models.
            use_enhancer: When true, the renderer is asked to use ``'gfpgan'``.
            result_dir: Output directory; created if missing and written to
                directly (no per-call sub-directory).

        Returns:
            Whatever ``self.animate_from_coeff.generate`` returns (the path of
            the generated video).
        """
        save_dir = result_dir
        os.makedirs(save_dir, exist_ok=True)
        audio_path = driven_audio

        try:
            # audio2coeff
            batch = get_data(first_coeff_path, audio_path, self.device,
                             ref_eyeblink_coeff_path=None, still=still_mode,
                             idlemode=use_idle_mode, length_of_audio=length_of_audio,
                             use_blink=use_blink, fps=fps)
            coeff = self.audio_to_coeff.generate(batch, save_dir, pose_style, None)

            # coeff2video
            data = get_facerender_data(coeff, crop_pic_path, first_coeff_path, audio_path,
                                       batch_size, still_mode=still_mode,
                                       preprocess=preprocess, size=size,
                                       expression_scale=exp_scale, facemodel=facerender)
            return self.animate_from_coeff.generate(
                data, save_dir, pic_path, crop_info,
                enhancer='gfpgan' if use_enhancer else None,
                preprocess=preprocess, img_size=size, fps=fps)
        finally:
            # Runs on success and on failure so a failed render does not keep
            # cached GPU memory around.
            self._release_memory()

    def test2(self, source_image, driven_audio, preprocess='crop',
              still_mode=False, use_enhancer=False, batch_size=1, size=256,
              pose_style=0,
              facerender='facevid2vid',
              exp_scale=1.0,
              use_ref_video=False,
              ref_video=None,
              ref_info=None,
              use_idle_mode=False,
              length_of_audio=0, use_blink=True, fps=20,
              result_dir='./results/'):
        """Run the full pipeline: preprocess the face, then render the video.

        Each call rebuilds the models from ``init_path`` evaluated with the
        requested ``size`` and ``preprocess`` and works inside a fresh
        ``<result_dir>/<uuid4>/`` directory, into which the source image and
        audio are copied.

        Audio is chosen in this order: ``driven_audio`` when it is an existing
        file; otherwise a silent clip of ``length_of_audio`` seconds when
        ``use_idle_mode`` is set (needs ``pydub``); otherwise the call is only
        valid in full reference mode. In full reference mode
        (``use_ref_video`` with ``ref_info == 'all'``) the audio is extracted
        from ``ref_video`` with ffmpeg and the reference video's coefficients
        replace the audio-derived ones.

        Args:
            source_image: Path of the face image.
            driven_audio: Path of the driving audio, or ``None``.
            use_ref_video: Whether ``ref_video`` should be used.
            ref_video: Path of the reference video.
            ref_info: One of ``'pose'``, ``'blink'``, ``'pose+blink'``,
                ``'all'``; only consulted when ``use_ref_video`` is set.
            use_enhancer: When true, the renderer is asked to use ``'gfpgan'``.
            result_dir: Parent directory for the per-call output directory.
            preprocess, still_mode, batch_size, size, pose_style, facerender,
                exp_scale, use_idle_mode, length_of_audio, use_blink, fps:
                Forwarded to the preprocessing / batch builders / models.

        Returns:
            Whatever ``self.animate_from_coeff.generate`` returns (the path of
            the generated video).

        Raises:
            ValueError: ``use_ref_video`` is set and ``ref_info`` is not a
                known mode. Raised before any model is loaded.
            AssertionError: No usable audio source was given.
            AttributeError: Preprocessing returned no coefficients for the
                source image ("No face is detected").
        """
        # Validate first: the original only rejected a bad ref_info after all
        # models were loaded and preprocessing had run, and did so with
        # ``raise('...')``, which itself fails with a TypeError.
        if use_ref_video and ref_info not in self._REF_INFO_MODES:
            raise ValueError(
                f"Unknown ref_info {ref_info!r}; expected one of {self._REF_INFO_MODES}")
        full_ref_mode = bool(use_ref_video) and ref_info == 'all'

        os.makedirs(result_dir, exist_ok=True)

        self.sadtalker_paths = init_path(self.checkpoint_path, self.config_path, size, False, preprocess)
        print(self.sadtalker_paths)

        self.audio_to_coeff = Audio2Coeff(self.sadtalker_paths, self.device)
        self.preprocess_model = CropAndExtract(self.sadtalker_paths, self.device)
        self.animate_from_coeff = AnimateFromCoeff(self.sadtalker_paths, self.device)

        try:
            time_tag = str(uuid.uuid4())
            save_dir = os.path.join(result_dir, time_tag)
            os.makedirs(save_dir, exist_ok=True)

            input_dir = os.path.join(save_dir, 'input')
            os.makedirs(input_dir, exist_ok=True)

            print(source_image)
            pic_path = os.path.join(input_dir, os.path.basename(source_image))
            shutil.copy(source_image, input_dir)

            if driven_audio is not None and os.path.isfile(driven_audio):
                audio_path = os.path.join(input_dir, os.path.basename(driven_audio))
                shutil.copy(driven_audio, input_dir)
            elif use_idle_mode:
                # generate audio from this new audio_path
                audio_path = os.path.join(input_dir, 'idlemode_' + str(length_of_audio) + '.wav')
                from pydub import AudioSegment
                silent_segment = AudioSegment.silent(duration=1000 * length_of_audio)  # duration in milliseconds
                silent_segment.export(audio_path, format="wav")
            else:
                # Explicit raises instead of ``assert`` so the checks survive
                # ``python -O``; the exception type is kept for callers.
                if driven_audio is None:
                    raise AssertionError("No audio is given")
                print(use_ref_video, ref_info)
                if not full_ref_mode:
                    raise AssertionError(
                        "driven_audio is not an existing file; this is only allowed "
                        "with use_ref_video=True and ref_info='all'")

            if full_ref_mode:  # full ref mode
                ref_video_videoname = os.path.basename(ref_video)
                audio_path = os.path.join(save_dir, ref_video_videoname + '.wav')
                print('new audiopath:', audio_path)
                # if ref_video contains audio, set the audio from ref_video.
                # Argument list (no shell) so paths with spaces or shell
                # metacharacters are passed through literally. Still
                # best-effort: the exit status is ignored, as before.
                ffmpeg_cmd = ['ffmpeg', '-y', '-hide_banner', '-loglevel', 'error',
                              '-i', ref_video, audio_path]
                try:
                    subprocess.run(ffmpeg_cmd, check=False)
                except OSError as err:
                    print('ffmpeg could not be started:', err)

            # crop image and extract 3dmm from image
            first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
            os.makedirs(first_frame_dir, exist_ok=True)
            first_coeff_path, crop_pic_path, crop_info = self.preprocess_model.generate(
                pic_path, first_frame_dir, preprocess, True, size)
            print(first_coeff_path, crop_info)

            if first_coeff_path is None:
                raise AttributeError("No face is detected")

            if use_ref_video:
                print('using ref video for genreation')
                ref_video_videoname = os.path.splitext(os.path.split(ref_video)[-1])[0]
                ref_video_frame_dir = os.path.join(save_dir, ref_video_videoname)
                os.makedirs(ref_video_frame_dir, exist_ok=True)
                print('3DMM Extraction for the reference video providing pose')
                ref_video_coeff_path, _, _ = self.preprocess_model.generate(
                    ref_video, ref_video_frame_dir, preprocess, source_image_flag=False)
            else:
                ref_video_coeff_path = None

            # 'all' (and no reference video) leave both paths as None.
            wants_pose = bool(use_ref_video) and ref_info in ('pose', 'pose+blink')
            wants_blink = bool(use_ref_video) and ref_info in ('blink', 'pose+blink')
            ref_pose_coeff_path = ref_video_coeff_path if wants_pose else None
            ref_eyeblink_coeff_path = ref_video_coeff_path if wants_blink else None

            # audio2coeff
            if full_ref_mode:
                # The reference video's coefficients are used directly.
                coeff_path = ref_video_coeff_path
            else:
                batch = get_data(first_coeff_path, audio_path, self.device,
                                 ref_eyeblink_coeff_path=ref_eyeblink_coeff_path,
                                 still=still_mode, idlemode=use_idle_mode,
                                 length_of_audio=length_of_audio,
                                 use_blink=use_blink, fps=fps)  # longer audio?
                coeff_path = self.audio_to_coeff.generate(batch, save_dir, pose_style, ref_pose_coeff_path)

            # coeff2video
            data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path,
                                       batch_size, still_mode=still_mode,
                                       preprocess=preprocess, size=size,
                                       expression_scale=exp_scale, facemodel=facerender)
            return_path = self.animate_from_coeff.generate(
                data, save_dir, pic_path, crop_info,
                enhancer='gfpgan' if use_enhancer else None,
                preprocess=preprocess, img_size=size, fps=fps)
            print(f'The generated video is saved in {return_path}')
            return return_path
        finally:
            # Drop the preprocessing model and free memory even when the
            # pipeline failed part-way (e.g. no face detected).
            del self.preprocess_model
            self._release_memory()