# HaMeR / mmpose / core / evaluation / top_down_eval.py
# (hosting-page residue kept for provenance: "geopavlakos's picture",
# "Initial commit", "history blame", "24.9 kB")
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
import cv2
import numpy as np
from mmpose.core.post_processing import transform_preds
def _calc_distances(preds, targets, mask, normalize):
    """Calculate the normalized distances between preds and target.

    Note:
        batch_size: N
        num_keypoints: K
        dimension of keypoints: D (normally, D=2 or D=3)

    Args:
        preds (np.ndarray[N, K, D]): Predicted keypoint location.
        targets (np.ndarray[N, K, D]): Groundtruth keypoint location.
        mask (np.ndarray[N, K]): Visibility of the target. False for invisible
            joints, and True for visible. Invisible joints will be ignored for
            accuracy calculation.
        normalize (np.ndarray[N, D]): Typical value is heatmap_size.
            Samples with any zero entry are treated as fully invisible.
            The array is not modified.

    Returns:
        np.ndarray[K, N]: The normalized distances.
            If target keypoints are missing, the distance is -1.
    """
    N, K, _ = preds.shape
    # set mask=0 when normalize==0 (on a copy, the caller's mask is untouched)
    _mask = mask.copy()
    _mask[np.where((normalize == 0).sum(1))[0], :] = False
    distances = np.full((N, K), -1, dtype=np.float32)
    # handle invalid values on a private copy: writing 1e6 into the caller's
    # array would hide the zero entries from any later call that reuses it.
    normalize = normalize.copy()
    normalize[np.where(normalize <= 0)] = 1e6
    distances[_mask] = np.linalg.norm(
        ((preds - targets) / normalize[:, None, :])[_mask], axis=-1)
    return distances.T
def _distance_acc(distances, thr=0.5):
    """Return the percentage below the distance threshold, while ignoring
    distances values with -1.

    Note:
        batch_size: N

    Args:
        distances (np.ndarray[N, ]): The normalized distances.
        thr (float): Threshold of the distances.

    Returns:
        float: Percentage of distances below the threshold.
            If all target keypoints are missing, return -1.
    """
    # -1 is the sentinel for "no valid target" and must not be counted.
    distance_valid = distances != -1
    num_distance_valid = distance_valid.sum()
    if num_distance_valid > 0:
        return (distances[distance_valid] < thr).sum() / num_distance_valid
    return -1
def _get_max_preds(heatmaps):
    """Get keypoint predictions from score maps.

    Note:
        batch_size: N
        num_keypoints: K
        heatmap height: H
        heatmap width: W

    Args:
        heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps.

    Returns:
        tuple: A tuple containing aggregated results.

        - preds (np.ndarray[N, K, 2]): Predicted keypoint location as
          (x, y); set to -1 where the maximum score is not positive.
        - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints.
    """
    assert isinstance(heatmaps,
                      np.ndarray), ('heatmaps should be numpy.ndarray')
    assert heatmaps.ndim == 4, 'batch_images should be 4-ndim'

    N, K, _, W = heatmaps.shape
    heatmaps_reshaped = heatmaps.reshape((N, K, -1))
    idx = np.argmax(heatmaps_reshaped, 2).reshape((N, K, 1))
    maxvals = np.amax(heatmaps_reshaped, 2).reshape((N, K, 1))

    # Decode the flat argmax index into column (x) and row (y).
    preds = np.tile(idx, (1, 1, 2)).astype(np.float32)
    preds[:, :, 0] = preds[:, :, 0] % W
    preds[:, :, 1] = preds[:, :, 1] // W

    preds = np.where(np.tile(maxvals, (1, 1, 2)) > 0.0, preds, -1)
    return preds, maxvals
def _get_max_preds_3d(heatmaps):
    """Get keypoint predictions from 3D score maps.

    Note:
        batch size: N
        num keypoints: K
        heatmap depth size: D
        heatmap height: H
        heatmap width: W

    Args:
        heatmaps (np.ndarray[N, K, D, H, W]): model predicted heatmaps.

    Returns:
        tuple: A tuple containing aggregated results.

        - preds (np.ndarray[N, K, 3]): Predicted keypoint location as
          (x, y, z); set to -1 where the maximum score is not positive.
        - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints.
    """
    assert isinstance(heatmaps, np.ndarray), \
        ('heatmaps should be numpy.ndarray')
    assert heatmaps.ndim == 5, 'heatmaps should be 5-ndim'

    N, K, D, H, W = heatmaps.shape
    heatmaps_reshaped = heatmaps.reshape((N, K, -1))
    idx = np.argmax(heatmaps_reshaped, 2).reshape((N, K, 1))
    maxvals = np.amax(heatmaps_reshaped, 2).reshape((N, K, 1))

    # Decode the flat argmax index into depth (z), row (y) and column (x).
    preds = np.zeros((N, K, 3), dtype=np.float32)
    _idx = idx[..., 0]
    preds[..., 2] = _idx // (H * W)
    preds[..., 1] = (_idx // W) % H
    preds[..., 0] = _idx % W

    preds = np.where(maxvals > 0.0, preds, -1)
    return preds, maxvals
def pose_pck_accuracy(output, target, mask, thr=0.05, normalize=None):
    """Calculate the pose accuracy of PCK for each individual keypoint and the
    averaged accuracy across all keypoints from heatmaps.

    Note:
        PCK metric measures accuracy of the localization of the body joints.
        The distances between predicted positions and the ground-truth ones
        are typically normalized by the bounding box size.
        The threshold (thr) of the normalized distance is commonly set
        as 0.05, 0.1 or 0.2 etc.

        - batch_size: N
        - num_keypoints: K
        - heatmap height: H
        - heatmap width: W

    Args:
        output (np.ndarray[N, K, H, W]): Model output heatmaps.
        target (np.ndarray[N, K, H, W]): Groundtruth heatmaps.
        mask (np.ndarray[N, K]): Visibility of the target. False for invisible
            joints, and True for visible. Invisible joints will be ignored for
            accuracy calculation.
        thr (float): Threshold of PCK calculation. Default 0.05.
        normalize (np.ndarray[N, 2]): Normalization factor for H&W.
            When None, ``[H, W]`` of the heatmap is used for every sample.

    Returns:
        tuple: A tuple containing keypoint accuracy.

        - np.ndarray[K]: Accuracy of each keypoint (None when K == 0).
        - float: Averaged accuracy across all keypoints.
        - int: Number of valid keypoints.
    """
    N, K, H, W = output.shape
    if K == 0:
        return None, 0, 0
    if normalize is None:
        normalize = np.tile(np.array([[H, W]]), (N, 1))

    # Both prediction and ground truth are decoded with a plain argmax.
    pred, _ = _get_max_preds(output)
    gt, _ = _get_max_preds(target)
    return keypoint_pck_accuracy(pred, gt, mask, thr, normalize)
def keypoint_pck_accuracy(pred, gt, mask, thr, normalize):
    """Calculate the pose accuracy of PCK for each individual keypoint and the
    averaged accuracy across all keypoints for coordinates.

    Note:
        PCK metric measures accuracy of the localization of the body joints.
        The distances between predicted positions and the ground-truth ones
        are typically normalized by the bounding box size.
        The threshold (thr) of the normalized distance is commonly set
        as 0.05, 0.1 or 0.2 etc.

        - batch_size: N
        - num_keypoints: K

    Args:
        pred (np.ndarray[N, K, 2]): Predicted keypoint location.
        gt (np.ndarray[N, K, 2]): Groundtruth keypoint location.
        mask (np.ndarray[N, K]): Visibility of the target. False for invisible
            joints, and True for visible. Invisible joints will be ignored for
            accuracy calculation.
        thr (float): Threshold of PCK calculation.
        normalize (np.ndarray[N, 2]): Normalization factor for H&W.

    Returns:
        tuple: A tuple containing keypoint accuracy.

        - acc (np.ndarray[K]): Accuracy of each keypoint.
        - avg_acc (float): Averaged accuracy across all keypoints.
        - cnt (int): Number of valid keypoints.
    """
    distances = _calc_distances(pred, gt, mask, normalize)

    # One accuracy per keypoint; -1 marks keypoints with no valid sample and
    # is excluded from the average.
    acc = np.array([_distance_acc(d, thr) for d in distances])
    valid_acc = acc[acc >= 0]
    cnt = len(valid_acc)
    avg_acc = valid_acc.mean() if cnt > 0 else 0
    return acc, avg_acc, cnt
def keypoint_auc(pred, gt, mask, normalize, num_step=20):
    """Calculate the area under the PCK curve (AUC) for keypoint coordinates.

    The averaged PCK accuracy is evaluated at ``num_step`` thresholds
    ``0, 1/num_step, ..., (num_step - 1)/num_step`` and each value
    contributes ``1/num_step`` to the result.

    Note:
        - batch_size: N
        - num_keypoints: K

    Args:
        pred (np.ndarray[N, K, 2]): Predicted keypoint location.
        gt (np.ndarray[N, K, 2]): Groundtruth keypoint location.
        mask (np.ndarray[N, K]): Visibility of the target. False for invisible
            joints, and True for visible. Invisible joints will be ignored for
            accuracy calculation.
        normalize (float): Normalization factor.
        num_step (int): Number of thresholds sampled in [0, 1). Default 20.

    Returns:
        float: Area under curve.
    """
    nor = np.tile(np.array([[normalize, normalize]]), (pred.shape[0], 1))
    x = [1.0 * i / num_step for i in range(num_step)]
    y = []
    for thr in x:
        # Pass a fresh copy each time so that no threshold can be affected
        # by an in-place change the callee makes to the normalization array.
        _, avg_acc, _ = keypoint_pck_accuracy(pred, gt, mask, thr, nor.copy())
        y.append(avg_acc)

    auc = 0
    for avg_acc in y:
        auc += 1.0 / num_step * avg_acc
    return auc
def keypoint_nme(pred, gt, mask, normalize_factor):
    """Calculate the normalized mean error (NME).

    Note:
        - batch_size: N
        - num_keypoints: K

    Args:
        pred (np.ndarray[N, K, 2]): Predicted keypoint location.
        gt (np.ndarray[N, K, 2]): Groundtruth keypoint location.
        mask (np.ndarray[N, K]): Visibility of the target. False for invisible
            joints, and True for visible. Invisible joints will be ignored for
            accuracy calculation.
        normalize_factor (np.ndarray[N, 2]): Normalization factor.

    Returns:
        float: normalized mean error
    """
    distances = _calc_distances(pred, gt, mask, normalize_factor)
    # Drop the -1 sentinels; max(1, ...) keeps the all-invalid case at 0
    # instead of dividing by zero.
    distance_valid = distances[distances != -1]
    return distance_valid.sum() / max(1, len(distance_valid))
def keypoint_epe(pred, gt, mask):
    """Calculate the end-point error.

    Note:
        - batch_size: N
        - num_keypoints: K

    Args:
        pred (np.ndarray[N, K, 2]): Predicted keypoint location.
        gt (np.ndarray[N, K, 2]): Groundtruth keypoint location.
        mask (np.ndarray[N, K]): Visibility of the target. False for invisible
            joints, and True for visible. Invisible joints will be ignored for
            accuracy calculation.

    Returns:
        float: Average end-point error.
    """
    # A normalization factor of one leaves the distances unscaled.
    distances = _calc_distances(
        pred, gt, mask,
        np.ones((pred.shape[0], pred.shape[2]), dtype=np.float32))
    # Drop the -1 sentinels; max(1, ...) keeps the all-invalid case at 0.
    distance_valid = distances[distances != -1]
    return distance_valid.sum() / max(1, len(distance_valid))
def _taylor(heatmap, coord):
    """Distribution aware coordinate decoding method.

    Note:
        - heatmap height: H
        - heatmap width: W

    Args:
        heatmap (np.ndarray[H, W]): Heatmap of a particular joint type.
        coord (np.ndarray[2,]): Coordinates of the predicted keypoints.
            Updated in place when a refinement is applied.

    Returns:
        np.ndarray[2,]: Updated coordinates.
    """
    H, W = heatmap.shape[:2]
    px, py = int(coord[0]), int(coord[1])
    # The stencil reaches two pixels in every direction, so only refine
    # points that are far enough from the border.
    if 1 < px < W - 2 and 1 < py < H - 2:
        dx = 0.5 * (heatmap[py][px + 1] - heatmap[py][px - 1])
        dy = 0.5 * (heatmap[py + 1][px] - heatmap[py - 1][px])
        dxx = 0.25 * (
            heatmap[py][px + 2] - 2 * heatmap[py][px] + heatmap[py][px - 2])
        dxy = 0.25 * (
            heatmap[py + 1][px + 1] - heatmap[py - 1][px + 1] -
            heatmap[py + 1][px - 1] + heatmap[py - 1][px - 1])
        dyy = 0.25 * (
            heatmap[py + 2 * 1][px] - 2 * heatmap[py][px] +
            heatmap[py - 2 * 1][px])
        derivative = np.array([[dx], [dy]])
        hessian = np.array([[dxx, dxy], [dxy, dyy]])
        # Skip the update when the Hessian is singular.
        if dxx * dyy - dxy**2 != 0:
            hessianinv = np.linalg.inv(hessian)
            offset = -hessianinv @ derivative
            offset = np.squeeze(np.array(offset.T), axis=0)
            coord += offset
    return coord
def post_dark_udp(coords, batch_heatmaps, kernel=3):
    """DARK post-processing. Implemented by udp. Paper ref: Huang et al. The
    Devil is in the Details: Delving into Unbiased Data Processing for Human
    Pose Estimation (CVPR 2020). Zhang et al. Distribution-Aware Coordinate
    Representation for Human Pose Estimation (CVPR 2020).

    Note:
        - batch size: B
        - num keypoints: K
        - num persons: N
        - height of heatmaps: H
        - width of heatmaps: W

        B=1 for bottom_up paradigm where all persons share the same heatmap.
        B=N for top_down paradigm where each person has its own heatmaps.

    Args:
        coords (np.ndarray[N, K, 2]): Initial coordinates of human pose.
            Refined in place.
        batch_heatmaps (np.ndarray[B, K, H, W]): batch_heatmaps. When given
            as an ndarray it is blurred, clipped and log-transformed in
            place.
        kernel (int): Gaussian kernel size (K) for modulation.

    Returns:
        np.ndarray([N, K, 2]): Refined coordinates.
    """
    if not isinstance(batch_heatmaps, np.ndarray):
        batch_heatmaps = batch_heatmaps.cpu().numpy()
    B, K, H, W = batch_heatmaps.shape
    N = coords.shape[0]
    assert (B == 1 or B == N)
    for heatmaps in batch_heatmaps:
        for heatmap in heatmaps:
            cv2.GaussianBlur(heatmap, (kernel, kernel), 0, heatmap)
    np.clip(batch_heatmaps, 0.001, 50, batch_heatmaps)
    np.log(batch_heatmaps, batch_heatmaps)

    # Pad by one pixel so the +/-1 neighbours below never leave the plane,
    # then flatten so every neighbour is a single integer offset away.
    # NOTE(review): the tail of this call was missing from the reviewed copy;
    # mode='edge' + flatten() follows upstream mmpose - confirm.
    batch_heatmaps_pad = np.pad(
        batch_heatmaps, ((0, 0), (0, 0), (1, 1), (1, 1)),
        mode='edge').flatten()

    # Flat index of every keypoint inside its padded (H + 2, W + 2) plane.
    index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (W + 2)
    index += (W + 2) * (H + 2) * np.arange(0, B * K).reshape(-1, K)
    index = index.astype(int).reshape(-1, 1)
    i_ = batch_heatmaps_pad[index]
    ix1 = batch_heatmaps_pad[index + 1]
    iy1 = batch_heatmaps_pad[index + W + 2]
    ix1y1 = batch_heatmaps_pad[index + W + 3]
    ix1_y1_ = batch_heatmaps_pad[index - W - 3]
    ix1_ = batch_heatmaps_pad[index - 1]
    iy1_ = batch_heatmaps_pad[index - 2 - W]

    dx = 0.5 * (ix1 - ix1_)
    dy = 0.5 * (iy1 - iy1_)
    derivative = np.concatenate([dx, dy], axis=1)
    derivative = derivative.reshape(N, K, 2, 1)
    dxx = ix1 - 2 * i_ + ix1_
    dyy = iy1 - 2 * i_ + iy1_
    dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_)
    hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1)
    hessian = hessian.reshape(N, K, 2, 2)
    # A tiny multiple of the identity keeps the inversion defined.
    hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2))
    # Drop only the trailing singleton axis: a bare squeeze() would also
    # remove the N or K axis when it has length 1 and break the in-place
    # subtraction (e.g. K == 1 with N > 1).
    coords -= np.einsum('ijmn,ijnk->ijmk', hessian, derivative).squeeze(-1)
    return coords
def _gaussian_blur(heatmaps, kernel=11):
    """Modulate heatmap distribution with Gaussian.
     sigma = 0.3*((kernel_size-1)*0.5-1)+0.8
     sigma~=3 if k=17
     sigma=2 if k=11;
     sigma~=1.5 if k=7;
     sigma~=1 if k=3;

    Note:
        - batch_size: N
        - num_keypoints: K
        - heatmap height: H
        - heatmap width: W

    Args:
        heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps.
            Modified in place.
        kernel (int): Gaussian kernel size (K) for modulation, which should
            match the heatmap gaussian sigma when training.
            K=17 for sigma=3 and k=11 for sigma=2.

    Returns:
        np.ndarray ([N, K, H, W]): Modulated heatmap distribution.
    """
    assert kernel % 2 == 1

    border = (kernel - 1) // 2
    batch_size = heatmaps.shape[0]
    num_joints = heatmaps.shape[1]
    height = heatmaps.shape[2]
    width = heatmaps.shape[3]
    for i in range(batch_size):
        for j in range(num_joints):
            origin_max = np.max(heatmaps[i, j])
            # Blur on a zero-padded canvas, then crop back to H x W.
            # NOTE(review): the dtype argument was missing from the reviewed
            # copy; float32 follows upstream mmpose - confirm.
            dr = np.zeros((height + 2 * border, width + 2 * border),
                          dtype=np.float32)
            # Explicit end indices: `border:-border` is an empty slice when
            # border == 0 (kernel == 1).
            dr[border:border + height,
               border:border + width] = heatmaps[i, j].copy()
            dr = cv2.GaussianBlur(dr, (kernel, kernel), 0)
            heatmaps[i, j] = dr[border:border + height,
                                border:border + width].copy()
            # Rescale so the peak value is preserved; skip when the blurred
            # peak is zero to avoid filling the map with NaN/inf.
            blurred_max = np.max(heatmaps[i, j])
            if blurred_max != 0:
                heatmaps[i, j] *= origin_max / blurred_max
    return heatmaps
def keypoints_from_regression(regression_preds, center, scale, img_size):
    """Get final keypoint predictions from regression vectors and transform
    them back to the image.

    Note:
        - batch_size: N
        - num_keypoints: K

    Args:
        regression_preds (np.ndarray[N, K, 2]): model prediction.
        center (np.ndarray[N, 2]): Center of the bounding box (x, y).
        scale (np.ndarray[N, 2]): Scale of the bounding box
            wrt height/width.
        img_size (list(img_width, img_height)): model input image size.

    Returns:
        tuple:

        - preds (np.ndarray[N, K, 2]): Predicted keypoint location in images.
        - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints.
          Always one, since regression provides no score.
    """
    N, K, _ = regression_preds.shape
    preds, maxvals = regression_preds, np.ones((N, K, 1), dtype=np.float32)

    # The multiplication allocates a new array, so the input is left intact.
    preds = preds * img_size

    # Transform back to the image
    for i in range(N):
        preds[i] = transform_preds(preds[i], center[i], scale[i], img_size)

    return preds, maxvals
# NOTE(review): everything after `heatmaps,` in the signature was missing
# from the reviewed copy. Parameter names come from the body and docstring;
# their order and default values follow upstream mmpose - confirm before
# merging.
def keypoints_from_heatmaps(heatmaps,
                            center,
                            scale,
                            unbiased=False,
                            post_process='default',
                            kernel=11,
                            valid_radius_factor=0.0546875,
                            use_udp=False,
                            target_type='GaussianHeatmap'):
    """Get final keypoint predictions from heatmaps and transform them back to
    the image.

    Note:
        - batch size: N
        - num keypoints: K
        - heatmap height: H
        - heatmap width: W

    Args:
        heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps.
            A copy is processed; the input is not modified.
        center (np.ndarray[N, 2]): Center of the bounding box (x, y).
        scale (np.ndarray[N, 2]): Scale of the bounding box
            wrt height/width.
        post_process (str/None): Choice of methods to post-process
            heatmaps. Currently supported: None, 'default', 'unbiased',
            'megvii'.
        unbiased (bool): Option to use unbiased decoding. Mutually
            exclusive with megvii.
            Note: this arg is deprecated and unbiased=True can be replaced
            by post_process='unbiased'
            Paper ref: Zhang et al. Distribution-Aware Coordinate
            Representation for Human Pose Estimation (CVPR 2020).
        kernel (int): Gaussian kernel size (K) for modulation, which should
            match the heatmap gaussian sigma when training.
            K=17 for sigma=3 and k=11 for sigma=2.
        valid_radius_factor (float): The radius factor of the positive area
            in classification heatmap for UDP.
        use_udp (bool): Use unbiased data processing.
        target_type (str): 'GaussianHeatmap' or 'CombinedTarget'.
            GaussianHeatmap: Classification target with gaussian distribution.
            CombinedTarget: The combination of classification target
            (response map) and regression target (offset map).
            Paper ref: Huang et al. The Devil is in the Details: Delving into
            Unbiased Data Processing for Human Pose Estimation (CVPR 2020).

    Returns:
        tuple: A tuple containing keypoint predictions and scores.

        - preds (np.ndarray[N, K, 2]): Predicted keypoint location in images.
        - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints.

    Raises:
        ValueError: If ``use_udp`` is set and ``target_type`` is neither
            'GaussianHeatmap' nor 'CombinedTarget' (case-insensitive).
    """
    # Avoid being affected
    heatmaps = heatmaps.copy()

    # detect conflicts
    if unbiased:
        assert post_process not in [False, None, 'megvii']
    if post_process in ['megvii', 'unbiased']:
        assert kernel > 0
    if use_udp:
        assert not post_process == 'megvii'

    # normalize configs
    if post_process is False:
        warnings.warn(
            'post_process=False is deprecated, '
            'please use post_process=None instead', DeprecationWarning)
        post_process = None
    elif post_process is True:
        if unbiased is True:
            warnings.warn(
                'post_process=True, unbiased=True is deprecated,'
                " please use post_process='unbiased' instead",
                DeprecationWarning)
            post_process = 'unbiased'
        else:
            warnings.warn(
                'post_process=True, unbiased=False is deprecated, '
                "please use post_process='default' instead",
                DeprecationWarning)
            post_process = 'default'
    elif post_process == 'default':
        if unbiased is True:
            warnings.warn(
                'unbiased=True is deprecated, please use '
                "post_process='unbiased' instead", DeprecationWarning)
            post_process = 'unbiased'

    # start processing
    if post_process == 'megvii':
        heatmaps = _gaussian_blur(heatmaps, kernel=kernel)

    N, K, H, W = heatmaps.shape
    if use_udp:
        if target_type.lower() == 'GaussianHeatMap'.lower():
            preds, maxvals = _get_max_preds(heatmaps)
            preds = post_dark_udp(preds, heatmaps, kernel=kernel)
        elif target_type.lower() == 'CombinedTarget'.lower():
            # Channels come in triples: response map, x-offset, y-offset.
            for person_heatmaps in heatmaps:
                for i, heatmap in enumerate(person_heatmaps):
                    kt = 2 * kernel + 1 if i % 3 == 0 else kernel
                    cv2.GaussianBlur(heatmap, (kt, kt), 0, heatmap)
            # valid radius is in direct proportion to the height of heatmap.
            valid_radius = valid_radius_factor * H
            offset_x = heatmaps[:, 1::3, :].flatten() * valid_radius
            offset_y = heatmaps[:, 2::3, :].flatten() * valid_radius
            heatmaps = heatmaps[:, ::3, :]
            preds, maxvals = _get_max_preds(heatmaps)
            # Flat position of every peak inside the flattened offset maps.
            # Work in integers so large batches do not lose precision.
            index = (preds[..., 0] + preds[..., 1] * W).astype(int)
            # One H*W plane per (sample, keypoint). The plane offsets must be
            # shaped (N, K // 3) to line up with `index`; a flat arange only
            # broadcasts when N == 1.
            index += W * H * np.arange(N * (K // 3)).reshape(N, K // 3)
            index = index.reshape(N, K // 3, 1)
            preds += np.concatenate((offset_x[index], offset_y[index]), axis=2)
        else:
            raise ValueError('target_type should be either '
                             "'GaussianHeatmap' or 'CombinedTarget'")
    else:
        preds, maxvals = _get_max_preds(heatmaps)
        if post_process == 'unbiased':  # alleviate biased coordinate
            # apply Gaussian distribution modulation.
            heatmaps = np.log(
                np.maximum(_gaussian_blur(heatmaps, kernel), 1e-10))
            for n in range(N):
                for k in range(K):
                    preds[n][k] = _taylor(heatmaps[n][k], preds[n][k])
        elif post_process is not None:
            # add +/-0.25 shift to the predicted locations for higher acc.
            for n in range(N):
                for k in range(K):
                    heatmap = heatmaps[n][k]
                    px = int(preds[n][k][0])
                    py = int(preds[n][k][1])
                    if 1 < px < W - 1 and 1 < py < H - 1:
                        diff = np.array([
                            heatmap[py][px + 1] - heatmap[py][px - 1],
                            heatmap[py + 1][px] - heatmap[py - 1][px]
                        ])
                        preds[n][k] += np.sign(diff) * .25
                        if post_process == 'megvii':
                            preds[n][k] += 0.5

    # Transform back to the image
    for i in range(N):
        preds[i] = transform_preds(
            preds[i], center[i], scale[i], [W, H], use_udp=use_udp)

    if post_process == 'megvii':
        maxvals = maxvals / 255.0 + 0.5

    return preds, maxvals
def keypoints_from_heatmaps3d(heatmaps, center, scale):
    """Get final keypoint predictions from 3d heatmaps and transform them back
    to the image.

    Note:
        - batch size: N
        - num keypoints: K
        - heatmap depth size: D
        - heatmap height: H
        - heatmap width: W

    Args:
        heatmaps (np.ndarray[N, K, D, H, W]): model predicted heatmaps.
        center (np.ndarray[N, 2]): Center of the bounding box (x, y).
        scale (np.ndarray[N, 2]): Scale of the bounding box
            wrt height/width.

    Returns:
        tuple: A tuple containing keypoint predictions and scores.

        - preds (np.ndarray[N, K, 3]): Predicted 3d keypoint location
          in images.
        - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints.
    """
    N, K, D, H, W = heatmaps.shape
    preds, maxvals = _get_max_preds_3d(heatmaps)
    # Transform back to the image; only x and y are mapped, the depth
    # component is returned as decoded from the heatmap.
    for i in range(N):
        preds[i, :, :2] = transform_preds(preds[i, :, :2], center[i], scale[i],
                                          [W, H])
    return preds, maxvals
def multilabel_classification_accuracy(pred, gt, mask, thr=0.5):
    """Get multi-label classification accuracy.

    Note:
        - batch size: N
        - label number: L

    Args:
        pred (np.ndarray[N, L, 2]): model predicted labels.
        gt (np.ndarray[N, L, 2]): ground-truth labels.
        mask (np.ndarray[N, 1] or np.ndarray[N, L] ): reliability of
            ground-truth labels.
        thr (float): Decision threshold. A prediction is correct when it
            lies on the same side of ``thr`` as the ground truth.
            Default 0.5.

    Returns:
        float: multi-label classification accuracy.
    """
    # we only compute accuracy on the samples with ground-truth of all labels.
    valid = (mask > 0).min(axis=1) if mask.ndim == 2 else (mask > 0)
    pred, gt = pred[valid], gt[valid]

    if pred.shape[0] == 0:
        acc = 0.0  # when no sample is with gt labels, set acc to 0.
    else:
        # The classification of a sample is regarded as correct
        # only if it's correct for all labels.
        acc = (((pred - thr) * (gt - thr)) > 0).all(axis=1).mean()
    return acc