Spaces:

Chaerin5
/

FoundHand

Running on Zero

App Files Community

Chaerin5 committed on Dec 29, 2024

Commit

6097648

1 Parent(s): 32fa016

fix vae nan bug

Browse files

Files changed (1) hide show

app.py +19 -136

app.py CHANGED Viewed

@@ -217,21 +217,7 @@ if NEW_MODEL:
     model.eval()
     print(missing_keys, extra_keys)
     assert len(missing_keys) == 0
-    vae_state_dict = torch.load(vae_path, map_location='cpu')['state_dict']
-    print(f"vae_state_dict encoder dtype: {vae_state_dict['encoder.conv_in.weight'].dtype}")
-    autoencoder = vqvae.create_model(3, 3, opts.latent_dim).eval().requires_grad_(False)
-    print(f"autoencoder encoder dtype: {next(autoencoder.encoder.parameters()).dtype}")
-    print(f"encoder before load_state_dict parameters min: {min([p.min() for p in autoencoder.encoder.parameters()])}")
-    print(f"encoder before load_state_dict parameters max: {max([p.max() for p in autoencoder.encoder.parameters()])}")
-    missing_keys, extra_keys = autoencoder.load_state_dict(vae_state_dict, strict=False)
-    print(f"encoder after load_state_dict parameters min: {min([p.min() for p in autoencoder.encoder.parameters()])}")
-    print(f"encoder after load_state_dict parameters max: {max([p.max() for p in autoencoder.encoder.parameters()])}")
-    autoencoder = autoencoder.to(device)
-    autoencoder.eval()
-    print(f"encoder after eval() min: {min([p.min() for p in autoencoder.encoder.parameters()])}")
-    print(f"encoder after eval() max: {max([p.max() for p in autoencoder.encoder.parameters()])}")
-    print(f"autoencoder encoder after eval() dtype: {next(autoencoder.encoder.parameters()).dtype}")
-    assert len(missing_keys) == 0
 # else:
 #     opts = HandDiffOpts()
 #     model_path = './finetune_epoch=5-step=130000.ckpt'
@@ -266,127 +252,6 @@ hands = mp_hands.Hands(
     min_detection_confidence=0.1,
 )
-# def make_ref_cond(
-#     image
-# ):
-#     print("ready to run autoencoder")
-#     # print(f"image.device: {image.device}, type(image): {type(image)}")
-#     # image = image.to("cuda")
-#     print(f"autoencoder device: {next(autoencoder.parameters()).device}")
-#     latent = opts.latent_scaling_factor * autoencoder.encode(image[None, ...]).sample()
-#     return image[None, ...], latent
-# def get_ref_anno(ref):
-#     print("inside get_ref_anno")
-#     if ref is None:
-#         return (
-#             None,
-#             None,
-#             None,
-#             None,
-#             None,
-#         )
-#     img = ref["composite"][..., :3]
-#     img = cv2.resize(img, opts.image_size, interpolation=cv2.INTER_AREA)
-#     keypts = np.zeros((42, 2))
-#     print("ready to run mediapipe")
-#     if REF_POSE_MASK:
-#         print(f"type(img): {type(img)}, img.shape: {img.shape}, img.dtype: {img.dtype}")
-#         mp_pose = hands.process(img)
-#         print("processed mediapipe")
-#         detected = np.array([0, 0])
-#         start_idx = 0
-#         if mp_pose.multi_hand_landmarks:
-#             # handedness is flipped assuming the input image is mirrored in MediaPipe
-#             for hand_landmarks, handedness in zip(
-#                 mp_pose.multi_hand_landmarks, mp_pose.multi_handedness
-#             ):
-#                 # actually right hand
-#                 if handedness.classification[0].label == "Left":
-#                     start_idx = 0
-#                     detected[0] = 1
-#                 # actually left hand
-#                 elif handedness.classification[0].label == "Right":
-#                     start_idx = 21
-#                     detected[1] = 1
-#                 for i, landmark in enumerate(hand_landmarks.landmark):
-#                     keypts[start_idx + i] = [
-#                         landmark.x * opts.image_size[1],
-#                         landmark.y * opts.image_size[0],
-#                     ]
-#             sam_predictor.set_image(img)
-#             l = keypts[:21].shape[0]
-#             if keypts[0].sum() != 0 and keypts[21].sum() != 0:
-#                 input_point = np.array([keypts[0], keypts[21]])
-#                 input_label = np.array([1, 1])
-#             elif keypts[0].sum() != 0:
-#                 input_point = np.array(keypts[:1])
-#                 input_label = np.array([1])
-#             elif keypts[21].sum() != 0:
-#                 input_point = np.array(keypts[21:22])
-#                 input_label = np.array([1])
-#             print("ready to run SAM")
-#             masks, _, _ = sam_predictor.predict(
-#                 point_coords=input_point,
-#                 point_labels=input_label,
-#                 multimask_output=False,
-#             )
-#             print("finished SAM")
-#             hand_mask = masks[0]
-#             masked_img = img * hand_mask[..., None] + 255 * (1 - hand_mask[..., None])
-#             ref_pose = visualize_hand(keypts, masked_img)
-#         else:
-#             raise gr.Error("No hands detected in the reference image.")
-#     else:
-#         hand_mask = np.zeros_like(img[:,:, 0])
-#         ref_pose = np.zeros_like(img)
-#     image_transform = Compose(
-#         [
-#             ToTensor(),
-#             Resize(opts.image_size),
-#             Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
-#         ]
-#     )
-#     image = image_transform(img)
-#     kpts_valid = check_keypoints_validity(keypts, opts.image_size)
-#     heatmaps = torch.tensor(
-#         keypoint_heatmap(
-#             scale_keypoint(keypts, opts.image_size, opts.latent_size), opts.latent_size, var=1.0
-#         )
-#         * kpts_valid[:, None, None],
-#         dtype=torch.float,
-#         # device=device,
-#     )[None, ...]
-#     mask = torch.tensor(
-#         cv2.resize(
-#             hand_mask.astype(int),
-#             dsize=opts.latent_size,
-#             interpolation=cv2.INTER_NEAREST,
-#         ),
-#         dtype=torch.float,
-#         # device=device,
-#     ).unsqueeze(0)[None, ...]
-#     image, latent = make_ref_cond(
-#         image,
-#         # keypts,
-#         # hand_mask,
-#         # device=device,
-#         # target_size=opts.image_size,
-#         # latent_size=opts.latent_size,
-#     )
-#     print("finished autoencoder")
-#     if not REF_POSE_MASK:
-#         heatmaps = torch.zeros_like(heatmaps)
-#         mask = torch.zeros_like(mask)
-#     ref_cond = torch.cat([latent, heatmaps, mask], 1)
-#     return img, ref_pose, ref_cond
 def get_ref_anno(ref):
     if ref is None:
         return (
@@ -396,6 +261,24 @@ def get_ref_anno(ref):
             None,
             None,
         )
     img = ref["composite"][..., :3]
     img = cv2.resize(img, opts.image_size, interpolation=cv2.INTER_AREA)
     keypts = np.zeros((42, 2))

     model.eval()
     print(missing_keys, extra_keys)
     assert len(missing_keys) == 0
 # else:
 #     opts = HandDiffOpts()
 #     model_path = './finetune_epoch=5-step=130000.ckpt'
     min_detection_confidence=0.1,
 )
 def get_ref_anno(ref):
     if ref is None:
         return (
             None,
             None,
         )
+    vae_state_dict = torch.load(vae_path, map_location='cpu')['state_dict']
+    print(f"vae_state_dict encoder dtype: {vae_state_dict['encoder.conv_in.weight'].dtype}")
+    autoencoder = vqvae.create_model(3, 3, opts.latent_dim).eval().requires_grad_(False)
+    print(f"autoencoder encoder dtype: {next(autoencoder.encoder.parameters()).dtype}")
+    print(f"encoder before load_state_dict parameters min: {min([p.min() for p in autoencoder.encoder.parameters()])}")
+    print(f"encoder before load_state_dict parameters max: {max([p.max() for p in autoencoder.encoder.parameters()])}")
+    missing_keys, extra_keys = autoencoder.load_state_dict(vae_state_dict, strict=False)
+    print(f"encoder after load_state_dict parameters min: {min([p.min() for p in autoencoder.encoder.parameters()])}")
+    print(f"encoder after load_state_dict parameters max: {max([p.max() for p in autoencoder.encoder.parameters()])}")
+    autoencoder = autoencoder.to(device)
+    autoencoder.eval()
+    print(f"encoder after eval() min: {min([p.min() for p in autoencoder.encoder.parameters()])}")
+    print(f"encoder after eval() max: {max([p.max() for p in autoencoder.encoder.parameters()])}")
+    print(f"autoencoder encoder after eval() dtype: {next(autoencoder.encoder.parameters()).dtype}")
+    assert len(missing_keys) == 0
     img = ref["composite"][..., :3]
     img = cv2.resize(img, opts.image_size, interpolation=cv2.INTER_AREA)
     keypts = np.zeros((42, 2))