Spaces:

Vedits
/

Fvds

Runtime error

App Files Files Community

Fvds / app.py

pragnakalp

Update app.py

2c79431 over 2 years ago

raw

history blame

3.17 kB

	import gradio as gr
	import os, subprocess, torchaudio
	import torch
	from PIL import Image
	import gradio as gr
	import os, subprocess, torchaudio
	import torch
	from PIL import Image
	import soundfile
	from gtts import gTTS
	import tempfile
	from pydub import AudioSegment
	from pydub.generators import Sine


	# Module-level Gradio Blocks container; run() builds the UI inside it and launches it.
	block = gr.Blocks()

	def pad_image(image):
	    """Return *image* centred on a black square canvas.

	    A square image is returned unchanged (the same object). Otherwise a new
	    image of the same mode is created whose side equals the longer edge,
	    and the original is pasted in the middle of it.

	    Args:
	        image: A PIL image (anything exposing ``size``, ``mode``,
	            ``getbands()`` and usable with ``Image.paste``).

	    Returns:
	        The original image if it is already square, else a new square image.
	    """
	    width, height = image.size
	    if width == height:
	        return image

	    side = max(width, height)
	    # Pillow only accepts a 3-tuple colour for palette images and for modes
	    # with three or more bands; single/dual-band modes such as "L", "1" or
	    # "LA" raise TypeError on (0, 0, 0), so fall back to the integer 0 there.
	    if image.mode == "P" or len(image.getbands()) >= 3:
	        fill = (0, 0, 0)
	    else:
	        fill = 0

	    canvas = Image.new(image.mode, (side, side), fill)
	    # One of the two offsets is always zero: only the shorter axis is padded.
	    canvas.paste(image, ((side - width) // 2, (side - height) // 2))
	    return canvas

	def calculate(image_in, audio_in):
	    """Render a talking-face video from an image file and an audio file.

	    Pipeline:
	      1. Load the audio, average its channels to mono and save it as
	         16-bit signed PCM WAV.
	      2. Open the image, pad it to a square with ``pad_image`` and save it
	         as PNG.
	      3. Run ``pocketsphinx`` phone alignment on the WAV and reshape its
	         JSON output with ``jq`` into the phoneme file.
	      4. Run ``test_script.py`` from ``/content/one-shot-talking-face`` on
	         those three files.

	    Args:
	        image_in: Path of the input image.
	        audio_in: Path of the input audio (anything ``torchaudio.load``
	            accepts). It may be the same path as the intermediate WAV,
	            because the audio is fully loaded before being rewritten.

	    Returns:
	        ``"/content/train/image_audio.mp4"``, the file the generation
	        script is expected to produce for these input names
	        (NOTE(review): naming scheme assumed from the returned path).

	    Raises:
	        subprocess.CalledProcessError: If pocketsphinx, jq or the
	            generation script exits with a non-zero status.
	    """
	    work_dir = "/content"
	    audio_path = f"{work_dir}/audio.wav"
	    image_path = f"{work_dir}/image.png"
	    phoneme_path = f"{work_dir}/test.json"
	    save_dir = f"{work_dir}/train"

	    # Raw string: jq must receive the doubled backslashes verbatim so that
	    # its own string parser turns them into regex escapes (a single
	    # backslash before "(" would start jq string interpolation instead).
	    # The filter upper-cases each word, maps silence/noise markers to
	    # "sil"/"SIL", strips "(2)"/"(3)"/"(4)" suffixes, and converts begin
	    # and duration values to integer hundredths.
	    jq_filter = (
	        r'[.w[]|{word: (.t | ascii_upcase'
	        r' | sub("<S>"; "sil") | sub("<SIL>"; "sil")'
	        r' | sub("\\(2\\)"; "") | sub("\\(3\\)"; "") | sub("\\(4\\)"; "")'
	        r' | sub("\\[SPEECH\\]"; "SIL") | sub("\\[NOISE\\]"; "SIL")),'
	        r' phones: [.w[]|{ph: .t | sub("\\+SPN\\+"; "SIL") | sub("\\+NSN\\+"; "SIL"),'
	        r' bg: (.b*100)|floor, ed: (.b*100+.d*100)|floor}]}]'
	    )

	    waveform, sample_rate = torchaudio.load(audio_in)
	    waveform = torch.mean(waveform, dim=0, keepdim=True)  # downmix to mono
	    torchaudio.save(audio_path, waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)

	    # Save next to the audio so the absolute paths handed to the script
	    # below are valid regardless of the current working directory.
	    with Image.open(image_in) as source_image:
	        pad_image(source_image).save(image_path)

	    pocketsphinx_run = subprocess.run(
	        ["pocketsphinx", "-phone_align", "yes", "single", audio_path],
	        check=True,
	        capture_output=True,
	    )
	    # check=True: a jq failure must not silently produce an empty phoneme file.
	    jq_run = subprocess.run(
	        ["jq", jq_filter],
	        input=pocketsphinx_run.stdout,
	        check=True,
	        capture_output=True,
	    )
	    with open(phoneme_path, "w", encoding="utf-8") as phoneme_file:
	        phoneme_file.write(jq_run.stdout.decode("utf-8").strip())

	    # Argument list plus cwd instead of a shell string: no shell parsing,
	    # and a failed run raises rather than returning a path that does not exist.
	    subprocess.run(
	        [
	            "python3", "-B", "test_script.py",
	            "--img_path", image_path,
	            "--audio_path", audio_path,
	            "--phoneme_path", phoneme_path,
	            "--save_dir", save_dir,
	        ],
	        cwd=f"{work_dir}/one-shot-talking-face",
	        check=True,
	    )
	    return f"{save_dir}/image_audio.mp4"

	def one_shot(image_in, input_text, gender):
	    """Synthesise speech for *input_text* and animate *image_in* with it.

	    The text is spoken with gTTS into a temporary MP3, converted to
	    ``/content/audio.wav`` with pydub, and handed to ``calculate``.

	    Args:
	        image_in: Path of the input image.
	        input_text: Text to be spoken.
	        gender: Voice selection from the UI. Only ``"Female"`` has a
	            synthesis path in this file.

	    Returns:
	        The video path returned by ``calculate``.

	    Raises:
	        gr.Error: If *gender* is anything other than ``"Female"``;
	            previously such a request produced no video and no message.
	    """
	    if gender != "Female":
	        raise gr.Error('Only the "Female" voice is currently supported.')

	    audio_in = "/content/audio.wav"
	    tts = gTTS(input_text)

	    # delete=False so the file can be reopened by name after it is closed;
	    # it is removed explicitly below so temp files do not pile up.
	    with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as mp3_file:
	        tts.write_to_fp(mp3_file)
	        mp3_path = mp3_file.name
	    try:
	        sound = AudioSegment.from_file(mp3_path, format="mp3")
	        sound.export(audio_in, format="wav")
	    finally:
	        os.remove(mp3_path)

	    return calculate(image_in, audio_in)

	def run():
	    """Build the UI inside the module-level ``block`` and serve it.

	    Layout: one row holding the image input, the text box, the gender
	    selector and the video output, followed by a row with the "Generate"
	    button. Clicking the button calls ``one_shot``. The app is queued and
	    launched on 0.0.0.0:7860.
	    """
	    with block:
	        with gr.Group(), gr.Box():
	            # Components are created in display order; do not reorder.
	            with gr.Row().style(equal_height=True):
	                portrait = gr.Image(show_label=False, type="filepath")
	                script_text = gr.Textbox(show_label=False)
	                voice = gr.Radio(["Female","Male"],value="Female",label="Gender")
	                result_video = gr.Video(show_label=False)
	            with gr.Row().style(equal_height=True):
	                generate_button = gr.Button("Generate")

	        # Event wiring must happen while the Blocks context is still open.
	        generate_button.click(
	            one_shot,
	            inputs=[portrait, script_text, voice],
	            outputs=[result_video],
	        )
	        block.queue()
	        block.launch(server_name="0.0.0.0", server_port=7860)

	# Build and launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	    run()