kimjy0411 committed · verified
Commit 1ef2055 · 1 Parent(s): 9600e7d

Update app.py

Files changed (1):
  1. app.py +15 -180

app.py CHANGED
@@ -31,7 +31,6 @@ from src.utils.crop_face_single import crop_face
 from src.audio2vid import get_headpose_temp, smooth_pose_seq
 from src.utils.frame_interpolation import init_frame_interpolation_model, batch_images_interpolation_tool
 
-
 config = OmegaConf.load('./configs/prompts/animation_audio.yaml')
 if config.weight_dtype == "fp16":
     weight_dtype = torch.float16
@@ -99,7 +98,7 @@ pipe = pipe.to("cuda", dtype=weight_dtype)
 frame_inter_model = init_frame_interpolation_model()
 
 @spaces.GPU
-def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=60, seed=42):
+def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=60, seed=42):
     fps = 30
     cfg = 3.5
     fi_step = 3
@@ -161,16 +160,9 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
         pose_images.append(lmk_img)
 
     pose_list = []
-    # pose_tensor_list = []
-
-    # pose_transform = transforms.Compose(
-    #     [transforms.Resize((height, width)), transforms.ToTensor()]
-    # )
     args_L = len(pose_images) if length==0 or length > len(pose_images) else length
     args_L = min(args_L, 90)
     for pose_image_np in pose_images[: args_L : fi_step]:
-        # pose_image_pil = Image.fromarray(cv2.cvtColor(pose_image_np, cv2.COLOR_BGR2RGB))
-        # pose_tensor_list.append(pose_transform(pose_image_pil))
         pose_image_np = cv2.resize(pose_image_np, (width, height))
         pose_list.append(pose_image_np)
 
@@ -200,11 +192,6 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
         fps=fps,
     )
 
-    # save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio"
-    # save_pil_imgs(video, save_path)
-
-    # save_path = batch_images_interpolation_tool(save_path, frame_inter_model, int(fps))
-
     stream = ffmpeg.input(save_path)
     audio = ffmpeg.input(input_audio)
     ffmpeg.output(stream.video, audio.audio, save_path.replace('_noaudio.mp4', '.mp4'), vcodec='copy', acodec='aac', shortest=None).run()
@@ -212,149 +199,6 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
 
     return save_path.replace('_noaudio.mp4', '.mp4'), ref_image_pil
 
-@spaces.GPU
-def video2video(ref_img, source_video, size=512, steps=25, length=60, seed=42):
-    cfg = 3.5
-    fi_step = 3
-
-    generator = torch.manual_seed(seed)
-
-    lmk_extractor = LMKExtractor()
-    vis = FaceMeshVisualizer()
-
-    width, height = size, size
-
-    date_str = datetime.now().strftime("%Y%m%d")
-    time_str = datetime.now().strftime("%H%M")
-    save_dir_name = f"{time_str}--seed_{seed}-{size}x{size}"
-
-    save_dir = Path(f"v2v_output/{date_str}/{save_dir_name}")
-    while os.path.exists(save_dir):
-        save_dir = Path(f"v2v_output/{date_str}/{save_dir_name}_{np.random.randint(10000):04d}")
-    save_dir.mkdir(exist_ok=True, parents=True)
-
-    ref_image_np = cv2.cvtColor(ref_img, cv2.COLOR_RGB2BGR)
-    ref_image_np = crop_face(ref_image_np, lmk_extractor)
-    if ref_image_np is None:
-        return None, Image.fromarray(ref_img)
-
-    ref_image_np = cv2.resize(ref_image_np, (size, size))
-    ref_image_pil = Image.fromarray(cv2.cvtColor(ref_image_np, cv2.COLOR_BGR2RGB))
-
-    face_result = lmk_extractor(ref_image_np)
-    if face_result is None:
-        return None, ref_image_pil
-
-    lmks = face_result['lmks'].astype(np.float32)
-    ref_pose = vis.draw_landmarks((ref_image_np.shape[1], ref_image_np.shape[0]), lmks, normed=True)
-
-    source_images = read_frames(source_video)
-    src_fps = get_fps(source_video)
-    pose_transform = transforms.Compose(
-        [transforms.Resize((height, width)), transforms.ToTensor()]
-    )
-
-    step = 1
-    if src_fps == 60:
-        src_fps = 30
-        step = 2
-
-    pose_trans_list = []
-    verts_list = []
-    bs_list = []
-    args_L = len(source_images) if length==0 or length*step > len(source_images) else length*step
-    args_L = min(args_L, 90*step)
-    for src_image_pil in source_images[: args_L : step*fi_step]:
-        src_img_np = cv2.cvtColor(np.array(src_image_pil), cv2.COLOR_RGB2BGR)
-        frame_height, frame_width, _ = src_img_np.shape
-        src_img_result = lmk_extractor(src_img_np)
-        if src_img_result is None:
-            break
-        pose_trans_list.append(src_img_result['trans_mat'])
-        verts_list.append(src_img_result['lmks3d'])
-        bs_list.append(src_img_result['bs'])
-
-    trans_mat_arr = np.array(pose_trans_list)
-    verts_arr = np.array(verts_list)
-    bs_arr = np.array(bs_list)
-    min_bs_idx = np.argmin(bs_arr.sum(1))
-
-    # compute delta pose
-    pose_arr = np.zeros([trans_mat_arr.shape[0], 6])
-
-    for i in range(pose_arr.shape[0]):
-        euler_angles, translation_vector = matrix_to_euler_and_translation(trans_mat_arr[i]) # real pose of source
-        pose_arr[i, :3] = euler_angles
-        pose_arr[i, 3:6] = translation_vector
-
-    init_tran_vec = face_result['trans_mat'][:3, 3] # init translation of tgt
-    pose_arr[:, 3:6] = pose_arr[:, 3:6] - pose_arr[0, 3:6] + init_tran_vec # (relative translation of source) + (init translation of tgt)
-
-    pose_arr_smooth = smooth_pose_seq(pose_arr, window_size=3)
-    pose_mat_smooth = [euler_and_translation_to_matrix(pose_arr_smooth[i][:3], pose_arr_smooth[i][3:6]) for i in range(pose_arr_smooth.shape[0])]
-    pose_mat_smooth = np.array(pose_mat_smooth)
-
-    # face retarget
-    verts_arr = verts_arr - verts_arr[min_bs_idx] + face_result['lmks3d']
-    # project 3D mesh to 2D landmark
-    projected_vertices = project_points_with_trans(verts_arr, pose_mat_smooth, [frame_height, frame_width])
-
-    pose_list = []
-    for i, verts in enumerate(projected_vertices):
-        lmk_img = vis.draw_landmarks((frame_width, frame_height), verts, normed=False)
-        pose_image_np = cv2.resize(lmk_img, (width, height))
-        pose_list.append(pose_image_np)
-
-    pose_list = np.array(pose_list)
-
-    video_length = len(pose_list)
-
-    video = pipe(
-        ref_image_pil,
-        pose_list,
-        ref_pose,
-        width,
-        height,
-        video_length,
-        steps,
-        cfg,
-        generator=generator,
-    ).videos
-
-    video = batch_images_interpolation_tool(video, frame_inter_model, inter_frames=fi_step-1)
-
-    save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio.mp4"
-    save_videos_grid(
-        video,
-        save_path,
-        n_rows=1,
-        fps=src_fps,
-    )
-
-    # save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio"
-    # save_pil_imgs(video, save_path)
-
-    # save_path = batch_images_interpolation_tool(save_path, frame_inter_model, int(src_fps))
-
-    audio_output = f'{save_dir}/audio_from_video.aac'
-    # extract audio
-    try:
-        ffmpeg.input(source_video).output(audio_output, acodec='copy').run()
-        # merge audio and video
-        stream = ffmpeg.input(save_path)
-        audio = ffmpeg.input(audio_output)
-        ffmpeg.output(stream.video, audio.audio, save_path.replace('_noaudio.mp4', '.mp4'), vcodec='copy', acodec='aac', shortest=None).run()
-
-        os.remove(save_path)
-        os.remove(audio_output)
-    except:
-        shutil.move(
-            save_path,
-            save_path.replace('_noaudio.mp4', '.mp4')
-        )
-
-    return save_path.replace('_noaudio.mp4', '.mp4'), ref_image_pil
-
 
 ################# GUI ################
 
@@ -405,32 +249,23 @@ with gr.Blocks() as demo:
         )
 
 
-    with gr.Tab("Video2video"):
+    with gr.Tab("TTS"):
         with gr.Row():
             with gr.Column():
                 with gr.Row():
-                    v2v_ref_img = gr.Image(label="Upload reference image", sources="upload")
-                    v2v_source_video = gr.Video(label="Upload source video", sources="upload")
+                    tts_text_input = gr.Textbox(lines=5, label="Input text", placeholder="Enter text to synthesize...")
+                    tts_ref_img = gr.Image(label="Upload reference image", sources="upload")
 
                 with gr.Row():
-                    v2v_size_slider = gr.Slider(minimum=256, maximum=512, step=8, value=384, label="Video size (-W & -H)")
-                    v2v_step_slider = gr.Slider(minimum=5, maximum=20, step=1, value=15, label="Steps (--steps)")
+                    tts_size_slider = gr.Slider(minimum=256, maximum=512, step=8, value=384, label="Video size (-W & -H)")
+                    tts_step_slider = gr.Slider(minimum=5, maximum=20, step=1, value=15, label="Steps (--steps)")
 
                 with gr.Row():
-                    v2v_length = gr.Slider(minimum=0, maximum=90, step=1, value=30, label="Length (-L)")
-                    v2v_seed = gr.Number(value=42, label="Seed (--seed)")
+                    tts_length = gr.Slider(minimum=0, maximum=90, step=1, value=30, label="Length (-L)")
+                    tts_seed = gr.Number(value=42, label="Seed (--seed)")
 
-            v2v_botton = gr.Button("Generate", variant="primary")
-            v2v_output_video = gr.PlayableVideo(label="Result", interactive=False)
-
-            gr.Examples(
-                examples=[
-                    ["configs/inference/ref_images/Aragaki.png", "configs/inference/video/Aragaki_song.mp4"],
-                    ["configs/inference/ref_images/solo.png", "configs/inference/video/Aragaki_song.mp4"],
-                    ["configs/inference/ref_images/lyl.png", "configs/inference/head_pose_temp/pose_ref_video.mp4"],
-                ],
-                inputs=[v2v_ref_img, v2v_source_video, a2v_headpose_video],
-            )
+            tts_button = gr.Button("Generate", variant="primary")
+            tts_output_video = gr.PlayableVideo(label="Result", interactive=False)
 
     a2v_botton.click(
         fn=audio2video,
@@ -438,11 +273,11 @@ with gr.Blocks() as demo:
                 a2v_size_slider, a2v_step_slider, a2v_length, a2v_seed],
         outputs=[a2v_output_video, a2v_ref_img]
     )
-    v2v_botton.click(
-        fn=video2video,
-        inputs=[v2v_ref_img, v2v_source_video,
-                v2v_size_slider, v2v_step_slider, v2v_length, v2v_seed],
-        outputs=[v2v_output_video, v2v_ref_img]
+    tts_button.click(
+        fn=audio2video,  # to be replaced with a TTS-specific function later
+        inputs=[tts_text_input, tts_ref_img, None,
+                tts_size_slider, tts_step_slider, tts_length, tts_seed],
+        outputs=[tts_output_video, tts_ref_img]
     )
 
     demo.launch()
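
Note: as committed, tts_button.click still points at audio2video and passes the text box (plus a None placeholder) where the function expects an audio file path, so the TTS tab will not produce output until the TTS-specific function mentioned in the inline comment exists. Below is a minimal sketch of what that wrapper could look like; gTTS is only an assumed speech backend (not a dependency of this Space), and the name tts2video and the temporary-file handling are illustrative, not part of this commit.

    # Sketch only: synthesize speech from the text box, then reuse the existing
    # audio-driven pipeline unchanged.
    import tempfile

    from gtts import gTTS  # assumed TTS backend; would need to be added to requirements


    def tts2video(input_text, ref_img, size=512, steps=25, length=60, seed=42):
        # Write the synthesized speech to a temporary mp3 file.
        tts_audio_path = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
        gTTS(text=input_text).save(tts_audio_path)
        # audio2video already handles face cropping, pose generation, rendering and muxing.
        return audio2video(tts_audio_path, ref_img, headpose_video=None,
                           size=size, steps=steps, length=length, seed=seed)

With a wrapper like this, the click handler would use fn=tts2video and inputs=[tts_text_input, tts_ref_img, tts_size_slider, tts_step_slider, tts_length, tts_seed]; the None placeholder would be dropped, since Gradio expects every entry in inputs to be a component.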