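# Gradio app: "One Shot Talking Face from Text".
# Takes a face image and input text, synthesizes speech (gTTS for the female voice,
# a fairseq FastSpeech2 model for the male voice), and drives the one-shot-talking-face
# pipeline to produce a talking-head video.
# The script assumes a Colab-style layout under /content/ with the one-shot-talking-face,
# GFPGAN and PyVideoFramesExtractor repos checked out, plus the pocketsphinx and jq
# command-line tools available on PATH.
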
import os
import subprocess
import sys
import tempfile
from io import BytesIO

import cv2
import ffmpeg
import gradio as gr
import imageio
import mediapipe as mp
import requests
import soundfile
import torch
import torchaudio
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
from gtts import gTTS
from PIL import Image
from pydub import AudioSegment
from pydub.generators import Sine

# Interpreter used to launch the helper scripts (GFPGAN, frame extractor, ...)
python_path = sys.executable
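
# Detect the face in the source image with MediaPipe, crop to the face bounding box
# (falling back to the full image when no face is found), resize to 256x256 and save
# the result to /content/image_pre.png.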
def crop_src_image(src_img):
    mp_face_detection = mp.solutions.face_detection
    save_img = '/content/image_pre.png'
    img = cv2.imread(src_img)
    img_h, img_w, _ = img.shape
    with mp_face_detection.FaceDetection(model_selection=1, min_detection_confidence=0.5) as face_detection:
        results = face_detection.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        if results.detections:
            # Use the first detected face
            detection = results.detections[0]
            bboxC = detection.location_data.relative_bounding_box
            x = int(bboxC.xmin * img_w)
            y = int(bboxC.ymin * img_h)
            w = int(bboxC.width * img_w)
            h = int(bboxC.height * img_h)
            # Clamp the bounding box to the image boundaries
            x, y = max(0, x), max(0, y)
            w, h = min(img_w - x, w), min(img_h - y, h)
            img = img[y:y + h, x:x + w]
            img = cv2.resize(img, (256, 256))
            cv2.imwrite(save_img, img)
        else:
            # If no face is detected, resize the original image
            img = cv2.resize(img, (256, 256))
            cv2.imwrite(save_img, img)
    return save_img
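
# Pad a PIL image with black borders so it becomes square, keeping the original
# content centered along the shorter side.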
def pad_image(image):
    w, h = image.size
    if w == h:
        return image
    elif w > h:
        new_image = Image.new(image.mode, (w, w), (0, 0, 0))
        new_image.paste(image, (0, (w - h) // 2))
        return new_image
    else:
        new_image = Image.new(image.mode, (h, h), (0, 0, 0))
        new_image.paste(image, ((h - w) // 2, 0))
        return new_image
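
# Convert the input audio to 16-bit mono WAV, run pocketsphinx to get a phoneme
# alignment, reshape it into the JSON format expected by the one-shot-talking-face
# test script (via jq), and render the raw talking-face video to
# /content/train/image_audio.mp4.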
def calculate(image_in, audio_in):
    waveform, sample_rate = torchaudio.load(audio_in)
    waveform = torch.mean(waveform, dim=0, keepdim=True)
    torchaudio.save("/content/audio.wav", waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
    image_in = image_in.replace("results/", "")
    print("****" * 100)
    print(f" *#*#*# original image => {image_in} *#*#*# ")
    if os.path.exists(image_in):
        print(f"image exists => {image_in}")
        image = Image.open(image_in)
    else:
        print("image does not exist, reading web image")
        image_url = "http://labelme.csail.mit.edu/Release3.0/Images/users/DNguyen91/face/m_unsexy_gr.jpg"
        response = requests.get(image_url)
        image = Image.open(BytesIO(response.content))
    print("****" * 100)
    image = pad_image(image)
    image.save("image.png")
    pocketsphinx_run = subprocess.run(['pocketsphinx', '-phone_align', 'yes', 'single', '/content/audio.wav'], check=True, capture_output=True)
    jq_run = subprocess.run(['jq', '[.w[]|{word: (.t | ascii_upcase | sub("<S>"; "sil") | sub("<SIL>"; "sil") | sub("\\\(2\\\)"; "") | sub("\\\(3\\\)"; "") | sub("\\\(4\\\)"; "") | sub("\\\[SPEECH\\\]"; "SIL") | sub("\\\[NOISE\\\]"; "SIL")), phones: [.w[]|{ph: .t | sub("\\\+SPN\\\+"; "SIL") | sub("\\\+NSN\\\+"; "SIL"), bg: (.b*100)|floor, ed: (.b*100+.d*100)|floor}]}]'], input=pocketsphinx_run.stdout, capture_output=True)
    with open("test.json", "w") as f:
        f.write(jq_run.stdout.decode('utf-8').strip())
    os.system(f"cd /content/one-shot-talking-face && {python_path} -B test_script.py --img_path /content/image.png --audio_path /content/audio.wav --phoneme_path /content/test.json --save_dir /content/train")
    return "/content/train/image_audio.mp4"
def merge_frames():
    path = '/content/video_results/restored_imgs'
    if not os.path.exists(path):
        os.makedirs(path)
    image_folder = os.fsencode(path)
    filenames = []
    for file in os.listdir(image_folder):
        filename = os.fsdecode(file)
        if filename.endswith(('.jpg', '.png', '.gif')):
            filenames.append(filename)
    filenames.sort()
    images = [imageio.imread(os.path.join(path, filename)) for filename in filenames]
    imageio.mimsave('/content/video_output.mp4', images, fps=25.0)
    return "/content/video_output.mp4"
def audio_video():
    input_video = ffmpeg.input('/content/video_output.mp4')
    input_audio = ffmpeg.input('/content/audio.wav')
    os.system("rm -rf /content/final_output.mp4")
    ffmpeg.concat(input_video, input_audio, v=1, a=1).output('/content/final_output.mp4').run()
    return "/content/final_output.mp4"
def one_shot_talking(image_in, audio_in):
    crop_img = crop_src_image(image_in)
    if os.path.exists("/content/results/restored_imgs/image_pre.png"):
        os.system("rm -rf /content/results/restored_imgs/image_pre.png")
    if not os.path.exists("/content/results"):
        os.makedirs("/content/results")
    os.system(f"{python_path} /content/GFPGAN/inference_gfpgan.py --upscale 2 -i /content/image_pre.png -o /content/results --bg_upsampler realesrgan")
    image_in_one_shot = '/content/results/image_pre.png'
    calculate(image_in_one_shot, audio_in)
    os.system(f"{python_path} /content/PyVideoFramesExtractor/extract.py --video=/content/train/image_audio.mp4")
    os.system("rm -rf /content/video_results/")
    os.system(f"{python_path} /content/GFPGAN/inference_gfpgan.py --upscale 2 -i /content/extracted_frames/image_audio_frames -o /content/video_results --bg_upsampler realesrgan")
    merge_frames()
    return audio_video()
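
# Entry point for the Gradio button: synthesize speech for the input text
# (gTTS for "Female", fairseq FastSpeech2 slowed to 0.7x tempo for "Male"),
# write it to /content/audio.wav and run the talking-face pipeline.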
def one_shot(image_in, input_text, gender):
    if gender == "Female":
        tts = gTTS(input_text)
        with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as f:
            tts.write_to_fp(f)
            f.seek(0)
            sound = AudioSegment.from_file(f.name, format="mp3")
            os.system("rm -rf /content/audio.wav")
            sound.export("/content/audio.wav", format="wav")
        audio_in = "/content/audio.wav"
        return one_shot_talking(image_in, audio_in)
    elif gender == 'Male':
        models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
            "Voicemod/fastspeech2-en-male1",
            arg_overrides={"vocoder": "hifigan", "fp16": False}
        )
        model = models[0]
        TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
        generator = task.build_generator([model], cfg)
        sample = TTSHubInterface.get_model_input(task, input_text)
        wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)
        os.system("rm -rf /content/audio_before.wav")
        soundfile.write("/content/audio_before.wav", wav.cpu().clone().numpy(), rate)
        os.system("rm -rf /content/audio.wav")
        # Slow the synthesized speech to 0.7x tempo before feeding it to the pipeline
        cmd = 'ffmpeg -i /content/audio_before.wav -filter:a "atempo=0.7" -vn /content/audio.wav'
        os.system(cmd)
        audio_in = "/content/audio.wav"
        return one_shot_talking(image_in, audio_in)
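
# Build and launch the Gradio UI.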
def run():
    with gr.Blocks(css=".gradio-container {background-color: lightgray} #radio_div {background-color: #FFD8B4; font-size: 40px;}") as demo:
        gr.Markdown("<h1 style='text-align: center;'>One Shot Talking Face from Text</h1><br/><br/>")
        with gr.Group():
            with gr.Row():
                image_in = gr.Image(show_label=True, type="filepath", label="Input Image")
                input_text = gr.Textbox(show_label=True, label="Input Text")
                gender = gr.Radio(["Female", "Male"], value="Female", label="Gender")
                video_out = gr.Video(show_label=True, label="Output")
            with gr.Row():
                btn = gr.Button("Generate")
        btn.click(one_shot, inputs=[image_in, input_text, gender], outputs=[video_out])
    demo.queue()
    demo.launch(server_name="0.0.0.0", server_port=7860)


if __name__ == "__main__":
    run()