pragnakalp committed
Commit 9695387 · 1 Parent(s): 1625f2c

Update app.py

Files changed (1)
  1. app.py +4 -64
app.py CHANGED
@@ -23,73 +23,13 @@ import ffmpeg
 
 block = gr.Blocks()
 
-def pad_image(image):
-    w, h = image.size
-    if w == h:
-        return image
-    elif w > h:
-        new_image = Image.new(image.mode, (w, w), (0, 0, 0))
-        new_image.paste(image, (0, (w - h) // 2))
-        return new_image
-    else:
-        new_image = Image.new(image.mode, (h, h), (0, 0, 0))
-        new_image.paste(image, ((h - w) // 2, 0))
-        return new_image
-
-def calculate(image_in, audio_in):
-    waveform, sample_rate = torchaudio.load(audio_in)
-    waveform = torch.mean(waveform, dim=0, keepdim=True)
-    torchaudio.save("/content/audio.wav", waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
-    image = Image.open(image_in)
-    image = pad_image(image)
-    image.save("image.png")
-    print("Inside calculate")
-    return audio_in
-    pocketsphinx_run = subprocess.run(['pocketsphinx', '-phone_align', 'yes', 'single', '/content/audio.wav'], check=True, capture_output=True)
-    jq_run = subprocess.run(['jq', '[.w[]|{word: (.t | ascii_upcase | sub("<S>"; "sil") | sub("<SIL>"; "sil") | sub("\\\(2\\\)"; "") | sub("\\\(3\\\)"; "") | sub("\\\(4\\\)"; "") | sub("\\\[SPEECH\\\]"; "SIL") | sub("\\\[NOISE\\\]"; "SIL")), phones: [.w[]|{ph: .t | sub("\\\+SPN\\\+"; "SIL") | sub("\\\+NSN\\\+"; "SIL"), bg: (.b*100)|floor, ed: (.b*100+.d*100)|floor}]}]'], input=pocketsphinx_run.stdout, capture_output=True)
-    with open("test.json", "w") as f:
-        f.write(jq_run.stdout.decode('utf-8').strip())
-
-    os.system(f"cd /content/one-shot-talking-face && python3 -B test_script.py --img_path /content/image.png --audio_path /content/audio.wav --phoneme_path /content/test.json --save_dir /content/train")
-    return "/content/train/image_audio.mp4"
+def cal(gender,input_text):
+    return gender+input_text
 
 def one_shot(image,input_text,gender):
     if gender == 'Female' or gender == 'female':
-        print(gender,input_text)
-        tts = gTTS(input_text)
-        with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as f:
-            tts.write_to_fp(f)
-            f.seek(0)
-            sound = AudioSegment.from_file(f.name, format="mp3")
-            sound.export("/content/audio.wav", format="wav")
-        audio_in="/content/audio.wav"
-        calculate(image,audio_in)
+        cal(gender,input_text)
 
-
-
-    elif gender == 'Male' or gender == 'male':
-        print(gender)
-        models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
-            "Voicemod/fastspeech2-en-male1",
-            arg_overrides={"vocoder": "hifigan", "fp16": False}
-        )
-
-        model = models[0].cuda()
-        TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
-        generator = task.build_generator([model], cfg)
-
-        sample = TTSHubInterface.get_model_input(task, input_text)
-        sample["net_input"]["src_tokens"] = sample["net_input"]["src_tokens"].cuda()
-        sample["net_input"]["src_lengths"] = sample["net_input"]["src_lengths"].cuda()
-        sample["speaker"] = sample["speaker"].cuda()
-
-        wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)
-        # soundfile.write("/content/audio_before.wav", wav, rate)
-        soundfile.write("/content/audio_before.wav", wav.cpu().clone().numpy(), rate)
-        cmd='ffmpeg -i /content/audio_before.wav -filter:a "atempo=0.7" -vn /content/audio.wav'
-        os.system(cmd)
-        one_shot_talking(image,'audio.wav')
-
 
 
 
@@ -106,7 +46,7 @@ def run():
         # audio_in = gr.Audio(show_label=False, type='filepath')
         input_text=gr.Textbox(lines=3, value="Hello How are you?", label="Input Text")
         gender = gr.Radio(["Female","Male"],value="Female",label="Gender")
-        video_out = gr.Audio(label="output")
+        video_out = gr.Textbox(label="output")
        # video_out = gr.Video(show_label=False)
        with gr.Row().style(equal_height=True):
            btn = gr.Button("Generate")
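Note on the removed code: calculate() contains an early `return audio_in`, so the pocketsphinx/jq alignment and the test_script.py call below it were unreachable, and it saved the padded image and test.json to relative paths while passing /content/... paths to test_script.py. Below is a minimal sketch of the presumably intended control flow, with the early return dropped and the paths made consistent; the jq program is copied verbatim from the removed code, and the pocketsphinx/jq binaries plus the /content/one-shot-talking-face checkout are external dependencies the original code assumes.

    import os
    import subprocess

    import torch
    import torchaudio
    from PIL import Image

    # Verbatim jq program from the removed code: reshapes pocketsphinx's
    # phone-alignment JSON into [{word, phones: [{ph, bg, ed}]}] records,
    # with begin/end times converted to centiseconds.
    JQ_FILTER = '[.w[]|{word: (.t | ascii_upcase | sub("<S>"; "sil") | sub("<SIL>"; "sil") | sub("\\\(2\\\)"; "") | sub("\\\(3\\\)"; "") | sub("\\\(4\\\)"; "") | sub("\\\[SPEECH\\\]"; "SIL") | sub("\\\[NOISE\\\]"; "SIL")), phones: [.w[]|{ph: .t | sub("\\\+SPN\\\+"; "SIL") | sub("\\\+NSN\\\+"; "SIL"), bg: (.b*100)|floor, ed: (.b*100+.d*100)|floor}]}]'

    def pad_image(image):
        # Letterbox onto a square black canvas (condensed from the removed code).
        w, h = image.size
        if w == h:
            return image
        side = max(w, h)
        new_image = Image.new(image.mode, (side, side), (0, 0, 0))
        new_image.paste(image, ((side - w) // 2, (side - h) // 2))
        return new_image

    def calculate(image_in, audio_in):
        # Downmix the input audio to mono 16-bit PCM for pocketsphinx.
        waveform, sample_rate = torchaudio.load(audio_in)
        waveform = torch.mean(waveform, dim=0, keepdim=True)
        torchaudio.save("/content/audio.wav", waveform, sample_rate,
                        encoding="PCM_S", bits_per_sample=16)

        # Pad the face image to a square and save it where test_script.py
        # looks (the removed code saved to a relative "image.png" but passed
        # --img_path /content/image.png).
        image = pad_image(Image.open(image_in))
        image.save("/content/image.png")

        # Phone-level alignment, reshaped by jq into the phoneme JSON.
        pocketsphinx_run = subprocess.run(
            ["pocketsphinx", "-phone_align", "yes", "single", "/content/audio.wav"],
            check=True, capture_output=True)
        jq_run = subprocess.run(["jq", JQ_FILTER],
                                input=pocketsphinx_run.stdout, capture_output=True)
        with open("/content/test.json", "w") as f:
            f.write(jq_run.stdout.decode("utf-8").strip())

        # Render the talking-head video.
        os.system("cd /content/one-shot-talking-face && python3 -B test_script.py"
                  " --img_path /content/image.png --audio_path /content/audio.wav"
                  " --phoneme_path /content/test.json --save_dir /content/train")
        return "/content/train/image_audio.mp4"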
 
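After this commit the Space is effectively stubbed out: cal() just concatenates the gender string with the input text, one_shot() calls it (discarding the result), and the output widget becomes a Textbox. A minimal self-contained sketch of how the stubbed pieces fit together; the image input and the btn.click wiring are not visible in these hunks, so they are assumptions, and one_shot here returns cal's value so the Textbox has something to display.

    import gradio as gr

    block = gr.Blocks()

    def cal(gender, input_text):
        # Plain string concatenation, e.g. "Female" + "Hello How are you?".
        return gender + input_text

    def one_shot(image, input_text, gender):
        if gender == 'Female' or gender == 'female':
            # The committed code drops cal's return value; returning it lets
            # the output Textbox show the concatenated string.
            return cal(gender, input_text)

    def run():
        with block:
            # Assumed image input; only the text/radio/output widgets appear
            # in the changed hunks.
            image_in = gr.Image(show_label=False, type="filepath")
            input_text = gr.Textbox(lines=3, value="Hello How are you?", label="Input Text")
            gender = gr.Radio(["Female", "Male"], value="Female", label="Gender")
            video_out = gr.Textbox(label="output")
            btn = gr.Button("Generate")
            # Assumed wiring: the click handler sits outside the shown context lines.
            btn.click(one_shot, inputs=[image_in, input_text, gender], outputs=video_out)
        block.launch()

    if __name__ == "__main__":
        run()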