MohamedIFQ committed on
Commit
1103598
1 Parent(s): f9a80d8

Upload 118 files

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. src/audio2exp_models/audio2exp.py +41 -0
  2. src/audio2exp_models/networks.py +74 -0
  3. src/audio2pose_models/audio2pose.py +94 -0
  4. src/audio2pose_models/audio_encoder.py +64 -0
  5. src/audio2pose_models/cvae.py +149 -0
  6. src/audio2pose_models/discriminator.py +76 -0
  7. src/audio2pose_models/networks.py +140 -0
  8. src/audio2pose_models/res_unet.py +65 -0
  9. src/config/auido2exp.yaml +58 -0
  10. src/config/auido2pose.yaml +49 -0
  11. src/config/facerender.yaml +45 -0
  12. src/config/facerender_still.yaml +45 -0
  13. src/config/similarity_Lm3D_all.mat +0 -0
  14. src/face3d/data/__init__.py +116 -0
  15. src/face3d/data/base_dataset.py +125 -0
  16. src/face3d/data/flist_dataset.py +125 -0
  17. src/face3d/data/image_folder.py +66 -0
  18. src/face3d/data/template_dataset.py +75 -0
  19. src/face3d/extract_kp_videos.py +108 -0
  20. src/face3d/extract_kp_videos_safe.py +151 -0
  21. src/face3d/models/__init__.py +67 -0
  22. src/face3d/models/arcface_torch/README.md +164 -0
  23. src/face3d/models/arcface_torch/backbones/__init__.py +25 -0
  24. src/face3d/models/arcface_torch/backbones/iresnet.py +187 -0
  25. src/face3d/models/arcface_torch/backbones/iresnet2060.py +176 -0
  26. src/face3d/models/arcface_torch/backbones/mobilefacenet.py +130 -0
  27. src/face3d/models/arcface_torch/configs/3millions.py +23 -0
  28. src/face3d/models/arcface_torch/configs/3millions_pfc.py +23 -0
  29. src/face3d/models/arcface_torch/configs/__init__.py +0 -0
  30. src/face3d/models/arcface_torch/configs/base.py +56 -0
  31. src/face3d/models/arcface_torch/configs/glint360k_mbf.py +26 -0
  32. src/face3d/models/arcface_torch/configs/glint360k_r100.py +26 -0
  33. src/face3d/models/arcface_torch/configs/glint360k_r18.py +26 -0
  34. src/face3d/models/arcface_torch/configs/glint360k_r34.py +26 -0
  35. src/face3d/models/arcface_torch/configs/glint360k_r50.py +26 -0
  36. src/face3d/models/arcface_torch/configs/ms1mv3_mbf.py +26 -0
  37. src/face3d/models/arcface_torch/configs/ms1mv3_r18.py +26 -0
  38. src/face3d/models/arcface_torch/configs/ms1mv3_r2060.py +26 -0
  39. src/face3d/models/arcface_torch/configs/ms1mv3_r34.py +26 -0
  40. src/face3d/models/arcface_torch/configs/ms1mv3_r50.py +26 -0
  41. src/face3d/models/arcface_torch/configs/speed.py +23 -0
  42. src/face3d/models/arcface_torch/dataset.py +124 -0
  43. src/face3d/models/arcface_torch/docs/eval.md +31 -0
  44. src/face3d/models/arcface_torch/docs/install.md +51 -0
  45. src/face3d/models/arcface_torch/docs/modelzoo.md +0 -0
  46. src/face3d/models/arcface_torch/docs/speed_benchmark.md +93 -0
  47. src/face3d/models/arcface_torch/eval/__init__.py +0 -0
  48. src/face3d/models/arcface_torch/eval/verification.py +407 -0
  49. src/face3d/models/arcface_torch/eval_ijbc.py +483 -0
  50. src/face3d/models/arcface_torch/inference.py +35 -0
src/audio2exp_models/audio2exp.py ADDED
@@ -0,0 +1,41 @@
+from tqdm import tqdm
+import torch
+from torch import nn
+
+
+class Audio2Exp(nn.Module):
+    def __init__(self, netG, cfg, device, prepare_training_loss=False):
+        super(Audio2Exp, self).__init__()
+        self.cfg = cfg
+        self.device = device
+        self.netG = netG.to(device)
+
+    def test(self, batch):
+
+        mel_input = batch['indiv_mels']  # bs T 1 80 16
+        bs = mel_input.shape[0]
+        T = mel_input.shape[1]
+
+        exp_coeff_pred = []
+
+        for i in tqdm(range(0, T, 10), 'audio2exp:'):  # every 10 frames
+
+            current_mel_input = mel_input[:, i:i+10]
+
+            # ref = batch['ref'][:, :, :64].repeat((1, current_mel_input.shape[1], 1))  # bs T 64
+            ref = batch['ref'][:, :, :64][:, i:i+10]
+            ratio = batch['ratio_gt'][:, i:i+10]  # bs T
+
+            audiox = current_mel_input.view(-1, 1, 80, 16)  # bs*T 1 80 16
+
+            curr_exp_coeff_pred = self.netG(audiox, ref, ratio)  # bs T 64
+
+            exp_coeff_pred += [curr_exp_coeff_pred]
+
+        # BS x T x 64
+        results_dict = {
+            'exp_coeff_pred': torch.cat(exp_coeff_pred, axis=1)
+        }
+        return results_dict
+
+
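A minimal smoke test for the module above (not part of this commit); it assumes SimpleWrapperV2 from src/audio2exp_models/networks.py (the next file in this diff) as netG, and the tensor shapes follow the comments in test():

import torch
from src.audio2exp_models.networks import SimpleWrapperV2
from src.audio2exp_models.audio2exp import Audio2Exp

# hypothetical smoke test; cfg is stored but not used by test()
netG = SimpleWrapperV2()
model = Audio2Exp(netG, cfg=None, device='cpu')

bs, T = 2, 20
batch = {
    'indiv_mels': torch.randn(bs, T, 1, 80, 16),  # bs T 1 80 16 mel chunks
    'ref':        torch.randn(bs, T, 70),         # first 64 channels are expression coeffs
    'ratio_gt':   torch.rand(bs, T),              # per-frame blink ratio
}
with torch.no_grad():
    out = model.test(batch)
print(out['exp_coeff_pred'].shape)                # torch.Size([2, 20, 64])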
src/audio2exp_models/networks.py ADDED
@@ -0,0 +1,74 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+class Conv2d(nn.Module):
+    def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, use_act=True, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.conv_block = nn.Sequential(
+            nn.Conv2d(cin, cout, kernel_size, stride, padding),
+            nn.BatchNorm2d(cout)
+        )
+        self.act = nn.ReLU()
+        self.residual = residual
+        self.use_act = use_act
+
+    def forward(self, x):
+        out = self.conv_block(x)
+        if self.residual:
+            out += x
+
+        if self.use_act:
+            return self.act(out)
+        else:
+            return out
+
+class SimpleWrapperV2(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.audio_encoder = nn.Sequential(
+            Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
+            Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True),
+            Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True),
+
+            Conv2d(32, 64, kernel_size=3, stride=(3, 1), padding=1),
+            Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),
+            Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),
+
+            Conv2d(64, 128, kernel_size=3, stride=3, padding=1),
+            Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),
+            Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),
+
+            Conv2d(128, 256, kernel_size=3, stride=(3, 2), padding=1),
+            Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True),
+
+            Conv2d(256, 512, kernel_size=3, stride=1, padding=0),
+            Conv2d(512, 512, kernel_size=1, stride=1, padding=0),
+        )
+
+        #### load the pre-trained audio_encoder
+        #self.audio_encoder = self.audio_encoder.to(device)
+        '''
+        wav2lip_state_dict = torch.load('/apdcephfs_cq2/share_1290939/wenxuazhang/checkpoints/wav2lip.pth')['state_dict']
+        state_dict = self.audio_encoder.state_dict()
+
+        for k,v in wav2lip_state_dict.items():
+            if 'audio_encoder' in k:
+                print('init:', k)
+                state_dict[k.replace('module.audio_encoder.', '')] = v
+        self.audio_encoder.load_state_dict(state_dict)
+        '''
+
+        self.mapping1 = nn.Linear(512+64+1, 64)
+        #self.mapping2 = nn.Linear(30, 64)
+        #nn.init.constant_(self.mapping1.weight, 0.)
+        nn.init.constant_(self.mapping1.bias, 0.)
+
+    def forward(self, x, ref, ratio):
+        x = self.audio_encoder(x).view(x.size(0), -1)
+        ref_reshape = ref.reshape(x.size(0), -1)
+        ratio = ratio.reshape(x.size(0), -1)
+
+        y = self.mapping1(torch.cat([x, ref_reshape, ratio], dim=1))
+        out = y.reshape(ref.shape[0], ref.shape[1], -1)  # + ref  # residual
+        return out
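A quick shape check (not part of the commit): the encoder stack above reduces each 1x80x16 mel chunk to a single 512-dimensional embedding before the 512+64+1 -> 64 linear mapping.

import torch
from src.audio2exp_models.networks import SimpleWrapperV2

# hypothetical shape check for the wav2lip-style audio encoder
net = SimpleWrapperV2()
mel = torch.randn(8, 1, 80, 16)  # 8 mel chunks, i.e. (bs*T, 1, 80, 16)
emb = net.audio_encoder(mel)
print(emb.shape)                  # torch.Size([8, 512, 1, 1])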
src/audio2pose_models/audio2pose.py ADDED
@@ -0,0 +1,94 @@
+import torch
+from torch import nn
+from src.audio2pose_models.cvae import CVAE
+from src.audio2pose_models.discriminator import PoseSequenceDiscriminator
+from src.audio2pose_models.audio_encoder import AudioEncoder
+
+class Audio2Pose(nn.Module):
+    def __init__(self, cfg, wav2lip_checkpoint, device='cuda'):
+        super().__init__()
+        self.cfg = cfg
+        self.seq_len = cfg.MODEL.CVAE.SEQ_LEN
+        self.latent_dim = cfg.MODEL.CVAE.LATENT_SIZE
+        self.device = device
+
+        self.audio_encoder = AudioEncoder(wav2lip_checkpoint, device)
+        self.audio_encoder.eval()
+        for param in self.audio_encoder.parameters():
+            param.requires_grad = False
+
+        self.netG = CVAE(cfg)
+        self.netD_motion = PoseSequenceDiscriminator(cfg)
+
+
+    def forward(self, x):
+
+        batch = {}
+        coeff_gt = x['gt'].cuda().squeeze(0)  # bs frame_len+1 73
+        batch['pose_motion_gt'] = coeff_gt[:, 1:, 64:70] - coeff_gt[:, :1, 64:70]  # bs frame_len 6
+        batch['ref'] = coeff_gt[:, 0, 64:70]  # bs 6
+        batch['class'] = x['class'].squeeze(0).cuda()  # bs
+        indiv_mels = x['indiv_mels'].cuda().squeeze(0)  # bs seq_len+1 80 16
+
+        # forward
+        audio_emb_list = []
+        audio_emb = self.audio_encoder(indiv_mels[:, 1:, :, :].unsqueeze(2))  # bs seq_len 512
+        batch['audio_emb'] = audio_emb
+        batch = self.netG(batch)
+
+        pose_motion_pred = batch['pose_motion_pred']  # bs frame_len 6
+        pose_gt = coeff_gt[:, 1:, 64:70].clone()  # bs frame_len 6
+        pose_pred = coeff_gt[:, :1, 64:70] + pose_motion_pred  # bs frame_len 6
+
+        batch['pose_pred'] = pose_pred
+        batch['pose_gt'] = pose_gt
+
+        return batch
+
+    def test(self, x):
+
+        batch = {}
+        ref = x['ref']  # bs 1 70
+        batch['ref'] = x['ref'][:, 0, -6:]
+        batch['class'] = x['class']
+        bs = ref.shape[0]
+
+        indiv_mels = x['indiv_mels']  # bs T 1 80 16
+        indiv_mels_use = indiv_mels[:, 1:]  # we regard the ref as the first frame
+        num_frames = x['num_frames']
+        num_frames = int(num_frames) - 1
+
+        #
+        div = num_frames // self.seq_len
+        re = num_frames % self.seq_len
+        audio_emb_list = []
+        pose_motion_pred_list = [torch.zeros(batch['ref'].unsqueeze(1).shape, dtype=batch['ref'].dtype,
+                                             device=batch['ref'].device)]
+
+        for i in range(div):
+            z = torch.randn(bs, self.latent_dim).to(ref.device)
+            batch['z'] = z
+            audio_emb = self.audio_encoder(indiv_mels_use[:, i*self.seq_len:(i+1)*self.seq_len, :, :, :])  # bs seq_len 512
+            batch['audio_emb'] = audio_emb
+            batch = self.netG.test(batch)
+            pose_motion_pred_list.append(batch['pose_motion_pred'])  # list of bs seq_len 6
+
+        if re != 0:
+            z = torch.randn(bs, self.latent_dim).to(ref.device)
+            batch['z'] = z
+            audio_emb = self.audio_encoder(indiv_mels_use[:, -1*self.seq_len:, :, :, :])  # bs seq_len 512
+            if audio_emb.shape[1] != self.seq_len:
+                pad_dim = self.seq_len - audio_emb.shape[1]
+                pad_audio_emb = audio_emb[:, :1].repeat(1, pad_dim, 1)
+                audio_emb = torch.cat([pad_audio_emb, audio_emb], 1)
+            batch['audio_emb'] = audio_emb
+            batch = self.netG.test(batch)
+            pose_motion_pred_list.append(batch['pose_motion_pred'][:, -1*re:, :])
+
+        pose_motion_pred = torch.cat(pose_motion_pred_list, dim=1)
+        batch['pose_motion_pred'] = pose_motion_pred
+
+        pose_pred = ref[:, :1, -6:] + pose_motion_pred  # bs T 6
+
+        batch['pose_pred'] = pose_pred
+        return batch
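For reference (not part of the commit), the windowing arithmetic used by test() above: the reference frame is dropped, the remaining frames are split into full SEQ_LEN windows, and any remainder is predicted from the last SEQ_LEN window, keeping only its tail.

# hypothetical illustration of the chunking in Audio2Pose.test()
seq_len = 32                        # cfg.MODEL.CVAE.SEQ_LEN
num_frames = 200                    # x['num_frames'], includes the reference frame
usable = num_frames - 1             # 199 frames to predict
div, re = divmod(usable, seq_len)   # 6 full windows, remainder 7
print(div, re)                      # 6 7 -> 6*32 + 7 = 199 pose-motion rows (+1 zero row for the ref)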
src/audio2pose_models/audio_encoder.py ADDED
@@ -0,0 +1,64 @@
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+class Conv2d(nn.Module):
+    def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.conv_block = nn.Sequential(
+            nn.Conv2d(cin, cout, kernel_size, stride, padding),
+            nn.BatchNorm2d(cout)
+        )
+        self.act = nn.ReLU()
+        self.residual = residual
+
+    def forward(self, x):
+        out = self.conv_block(x)
+        if self.residual:
+            out += x
+        return self.act(out)
+
+class AudioEncoder(nn.Module):
+    def __init__(self, wav2lip_checkpoint, device):
+        super(AudioEncoder, self).__init__()
+
+        self.audio_encoder = nn.Sequential(
+            Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
+            Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True),
+            Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True),
+
+            Conv2d(32, 64, kernel_size=3, stride=(3, 1), padding=1),
+            Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),
+            Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),
+
+            Conv2d(64, 128, kernel_size=3, stride=3, padding=1),
+            Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),
+            Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),
+
+            Conv2d(128, 256, kernel_size=3, stride=(3, 2), padding=1),
+            Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True),
+
+            Conv2d(256, 512, kernel_size=3, stride=1, padding=0),
+            Conv2d(512, 512, kernel_size=1, stride=1, padding=0),)
+
+        #### load the pre-trained audio_encoder, we do not need to load wav2lip model here.
+        # wav2lip_state_dict = torch.load(wav2lip_checkpoint, map_location=torch.device(device))['state_dict']
+        # state_dict = self.audio_encoder.state_dict()
+
+        # for k,v in wav2lip_state_dict.items():
+        #     if 'audio_encoder' in k:
+        #         state_dict[k.replace('module.audio_encoder.', '')] = v
+        # self.audio_encoder.load_state_dict(state_dict)
+
+
+    def forward(self, audio_sequences):
+        # audio_sequences = (B, T, 1, 80, 16)
+        B = audio_sequences.size(0)
+
+        audio_sequences = torch.cat([audio_sequences[:, i] for i in range(audio_sequences.size(1))], dim=0)
+
+        audio_embedding = self.audio_encoder(audio_sequences)  # B, 512, 1, 1
+        dim = audio_embedding.shape[1]
+        audio_embedding = audio_embedding.reshape((B, -1, dim, 1, 1))
+
+        return audio_embedding.squeeze(-1).squeeze(-1)  # B seq_len+1 512
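A shape check for AudioEncoder.forward (not part of the commit); the wav2lip checkpoint is not loaded here because the loading block is commented out, so the constructor arguments are effectively unused.

import torch
from src.audio2pose_models.audio_encoder import AudioEncoder

# hypothetical shape check: (B, T, 1, 80, 16) mel chunks -> (B, T, 512) embeddings
enc = AudioEncoder(wav2lip_checkpoint=None, device='cpu')
mels = torch.randn(2, 33, 1, 80, 16)  # B, seq_len+1, 1, 80, 16
emb = enc(mels)
print(emb.shape)                       # torch.Size([2, 33, 512])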
src/audio2pose_models/cvae.py ADDED
@@ -0,0 +1,149 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+from src.audio2pose_models.res_unet import ResUnet
+
+def class2onehot(idx, class_num):
+
+    assert torch.max(idx).item() < class_num
+    onehot = torch.zeros(idx.size(0), class_num).to(idx.device)
+    onehot.scatter_(1, idx, 1)
+    return onehot
+
+class CVAE(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        encoder_layer_sizes = cfg.MODEL.CVAE.ENCODER_LAYER_SIZES
+        decoder_layer_sizes = cfg.MODEL.CVAE.DECODER_LAYER_SIZES
+        latent_size = cfg.MODEL.CVAE.LATENT_SIZE
+        num_classes = cfg.DATASET.NUM_CLASSES
+        audio_emb_in_size = cfg.MODEL.CVAE.AUDIO_EMB_IN_SIZE
+        audio_emb_out_size = cfg.MODEL.CVAE.AUDIO_EMB_OUT_SIZE
+        seq_len = cfg.MODEL.CVAE.SEQ_LEN
+
+        self.latent_size = latent_size
+
+        self.encoder = ENCODER(encoder_layer_sizes, latent_size, num_classes,
+                               audio_emb_in_size, audio_emb_out_size, seq_len)
+        self.decoder = DECODER(decoder_layer_sizes, latent_size, num_classes,
+                               audio_emb_in_size, audio_emb_out_size, seq_len)
+
+    def reparameterize(self, mu, logvar):
+        std = torch.exp(0.5 * logvar)
+        eps = torch.randn_like(std)
+        return mu + eps * std
+
+    def forward(self, batch):
+        batch = self.encoder(batch)
+        mu = batch['mu']
+        logvar = batch['logvar']
+        z = self.reparameterize(mu, logvar)
+        batch['z'] = z
+        return self.decoder(batch)
+
+    def test(self, batch):
+        '''
+        class_id = batch['class']
+        z = torch.randn([class_id.size(0), self.latent_size]).to(class_id.device)
+        batch['z'] = z
+        '''
+        return self.decoder(batch)
+
+class ENCODER(nn.Module):
+    def __init__(self, layer_sizes, latent_size, num_classes,
+                 audio_emb_in_size, audio_emb_out_size, seq_len):
+        super().__init__()
+
+        self.resunet = ResUnet()
+        self.num_classes = num_classes
+        self.seq_len = seq_len
+
+        self.MLP = nn.Sequential()
+        layer_sizes[0] += latent_size + seq_len*audio_emb_out_size + 6
+        for i, (in_size, out_size) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
+            self.MLP.add_module(
+                name="L{:d}".format(i), module=nn.Linear(in_size, out_size))
+            self.MLP.add_module(name="A{:d}".format(i), module=nn.ReLU())
+
+        self.linear_means = nn.Linear(layer_sizes[-1], latent_size)
+        self.linear_logvar = nn.Linear(layer_sizes[-1], latent_size)
+        self.linear_audio = nn.Linear(audio_emb_in_size, audio_emb_out_size)
+
+        self.classbias = nn.Parameter(torch.randn(self.num_classes, latent_size))
+
+    def forward(self, batch):
+        class_id = batch['class']
+        pose_motion_gt = batch['pose_motion_gt']  # bs seq_len 6
+        ref = batch['ref']  # bs 6
+        bs = pose_motion_gt.shape[0]
+        audio_in = batch['audio_emb']  # bs seq_len audio_emb_in_size
+
+        # pose encode
+        pose_emb = self.resunet(pose_motion_gt.unsqueeze(1))  # bs 1 seq_len 6
+        pose_emb = pose_emb.reshape(bs, -1)  # bs seq_len*6
+
+        # audio mapping
+        print(audio_in.shape)
+        audio_out = self.linear_audio(audio_in)  # bs seq_len audio_emb_out_size
+        audio_out = audio_out.reshape(bs, -1)
+
+        class_bias = self.classbias[class_id]  # bs latent_size
+        x_in = torch.cat([ref, pose_emb, audio_out, class_bias], dim=-1)  # bs seq_len*(audio_emb_out_size+6)+latent_size
+        x_out = self.MLP(x_in)
+
+        mu = self.linear_means(x_out)
+        logvar = self.linear_means(x_out)  # bs latent_size
+
+        batch.update({'mu': mu, 'logvar': logvar})
+        return batch
+
+class DECODER(nn.Module):
+    def __init__(self, layer_sizes, latent_size, num_classes,
+                 audio_emb_in_size, audio_emb_out_size, seq_len):
+        super().__init__()
+
+        self.resunet = ResUnet()
+        self.num_classes = num_classes
+        self.seq_len = seq_len
+
+        self.MLP = nn.Sequential()
+        input_size = latent_size + seq_len*audio_emb_out_size + 6
+        for i, (in_size, out_size) in enumerate(zip([input_size]+layer_sizes[:-1], layer_sizes)):
+            self.MLP.add_module(
+                name="L{:d}".format(i), module=nn.Linear(in_size, out_size))
+            if i+1 < len(layer_sizes):
+                self.MLP.add_module(name="A{:d}".format(i), module=nn.ReLU())
+            else:
+                self.MLP.add_module(name="sigmoid", module=nn.Sigmoid())
+
+        self.pose_linear = nn.Linear(6, 6)
+        self.linear_audio = nn.Linear(audio_emb_in_size, audio_emb_out_size)
+
+        self.classbias = nn.Parameter(torch.randn(self.num_classes, latent_size))
+
+    def forward(self, batch):
+
+        z = batch['z']  # bs latent_size
+        bs = z.shape[0]
+        class_id = batch['class']
+        ref = batch['ref']  # bs 6
+        audio_in = batch['audio_emb']  # bs seq_len audio_emb_in_size
+        # print('audio_in: ', audio_in[:, :, :10])
+
+        audio_out = self.linear_audio(audio_in)  # bs seq_len audio_emb_out_size
+        # print('audio_out: ', audio_out[:, :, :10])
+        audio_out = audio_out.reshape([bs, -1])  # bs seq_len*audio_emb_out_size
+        class_bias = self.classbias[class_id]  # bs latent_size
+
+        z = z + class_bias
+        x_in = torch.cat([ref, z, audio_out], dim=-1)
+        x_out = self.MLP(x_in)  # bs layer_sizes[-1]
+        x_out = x_out.reshape((bs, self.seq_len, -1))
+
+        # print('x_out: ', x_out)
+
+        pose_emb = self.resunet(x_out.unsqueeze(1))  # bs 1 seq_len 6
+
+        pose_motion_pred = self.pose_linear(pose_emb.squeeze(1))  # bs seq_len 6
+
+        batch.update({'pose_motion_pred': pose_motion_pred})
+        return batch
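The reparameterize() method above is the standard VAE reparameterization trick; in math form (not part of the commit):

\sigma = \exp\!\left(\tfrac{1}{2}\,\mathrm{logvar}\right), \qquad
\epsilon \sim \mathcal{N}(0, I), \qquad
z = \mu + \sigma \odot \epsilon

Because the randomness lives only in epsilon, z stays differentiable with respect to mu and logvar during training.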
src/audio2pose_models/discriminator.py ADDED
@@ -0,0 +1,76 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+class ConvNormRelu(nn.Module):
+    def __init__(self, conv_type='1d', in_channels=3, out_channels=64, downsample=False,
+                 kernel_size=None, stride=None, padding=None, norm='BN', leaky=False):
+        super().__init__()
+        if kernel_size is None:
+            if downsample:
+                kernel_size, stride, padding = 4, 2, 1
+            else:
+                kernel_size, stride, padding = 3, 1, 1
+
+        if conv_type == '2d':
+            self.conv = nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                padding,
+                bias=False,
+            )
+            if norm == 'BN':
+                self.norm = nn.BatchNorm2d(out_channels)
+            elif norm == 'IN':
+                self.norm = nn.InstanceNorm2d(out_channels)
+            else:
+                raise NotImplementedError
+        elif conv_type == '1d':
+            self.conv = nn.Conv1d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                padding,
+                bias=False,
+            )
+            if norm == 'BN':
+                self.norm = nn.BatchNorm1d(out_channels)
+            elif norm == 'IN':
+                self.norm = nn.InstanceNorm1d(out_channels)
+            else:
+                raise NotImplementedError
+        nn.init.kaiming_normal_(self.conv.weight)
+
+        self.act = nn.LeakyReLU(negative_slope=0.2, inplace=False) if leaky else nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        x = self.conv(x)
+        if isinstance(self.norm, nn.InstanceNorm1d):
+            x = self.norm(x.permute((0, 2, 1))).permute((0, 2, 1))  # normalize on [C]
+        else:
+            x = self.norm(x)
+        x = self.act(x)
+        return x
+
+
+class PoseSequenceDiscriminator(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.cfg = cfg
+        leaky = self.cfg.MODEL.DISCRIMINATOR.LEAKY_RELU
+
+        self.seq = nn.Sequential(
+            ConvNormRelu('1d', cfg.MODEL.DISCRIMINATOR.INPUT_CHANNELS, 256, downsample=True, leaky=leaky),  # B, 256, 64
+            ConvNormRelu('1d', 256, 512, downsample=True, leaky=leaky),  # B, 512, 32
+            ConvNormRelu('1d', 512, 1024, kernel_size=3, stride=1, padding=1, leaky=leaky),  # B, 1024, 16
+            nn.Conv1d(1024, 1, kernel_size=3, stride=1, padding=1, bias=True)  # B, 1, 16
+        )
+
+    def forward(self, x):
+        x = x.reshape(x.size(0), x.size(1), -1).transpose(1, 2)
+        x = self.seq(x)
+        x = x.squeeze(1)
+        return x
src/audio2pose_models/networks.py ADDED
@@ -0,0 +1,140 @@
+import torch.nn as nn
+import torch
+
+
+class ResidualConv(nn.Module):
+    def __init__(self, input_dim, output_dim, stride, padding):
+        super(ResidualConv, self).__init__()
+
+        self.conv_block = nn.Sequential(
+            nn.BatchNorm2d(input_dim),
+            nn.ReLU(),
+            nn.Conv2d(
+                input_dim, output_dim, kernel_size=3, stride=stride, padding=padding
+            ),
+            nn.BatchNorm2d(output_dim),
+            nn.ReLU(),
+            nn.Conv2d(output_dim, output_dim, kernel_size=3, padding=1),
+        )
+        self.conv_skip = nn.Sequential(
+            nn.Conv2d(input_dim, output_dim, kernel_size=3, stride=stride, padding=1),
+            nn.BatchNorm2d(output_dim),
+        )
+
+    def forward(self, x):
+
+        return self.conv_block(x) + self.conv_skip(x)
+
+
+class Upsample(nn.Module):
+    def __init__(self, input_dim, output_dim, kernel, stride):
+        super(Upsample, self).__init__()
+
+        self.upsample = nn.ConvTranspose2d(
+            input_dim, output_dim, kernel_size=kernel, stride=stride
+        )
+
+    def forward(self, x):
+        return self.upsample(x)
+
+
+class Squeeze_Excite_Block(nn.Module):
+    def __init__(self, channel, reduction=16):
+        super(Squeeze_Excite_Block, self).__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Sequential(
+            nn.Linear(channel, channel // reduction, bias=False),
+            nn.ReLU(inplace=True),
+            nn.Linear(channel // reduction, channel, bias=False),
+            nn.Sigmoid(),
+        )
+
+    def forward(self, x):
+        b, c, _, _ = x.size()
+        y = self.avg_pool(x).view(b, c)
+        y = self.fc(y).view(b, c, 1, 1)
+        return x * y.expand_as(x)
+
+
+class ASPP(nn.Module):
+    def __init__(self, in_dims, out_dims, rate=[6, 12, 18]):
+        super(ASPP, self).__init__()
+
+        self.aspp_block1 = nn.Sequential(
+            nn.Conv2d(
+                in_dims, out_dims, 3, stride=1, padding=rate[0], dilation=rate[0]
+            ),
+            nn.ReLU(inplace=True),
+            nn.BatchNorm2d(out_dims),
+        )
+        self.aspp_block2 = nn.Sequential(
+            nn.Conv2d(
+                in_dims, out_dims, 3, stride=1, padding=rate[1], dilation=rate[1]
+            ),
+            nn.ReLU(inplace=True),
+            nn.BatchNorm2d(out_dims),
+        )
+        self.aspp_block3 = nn.Sequential(
+            nn.Conv2d(
+                in_dims, out_dims, 3, stride=1, padding=rate[2], dilation=rate[2]
+            ),
+            nn.ReLU(inplace=True),
+            nn.BatchNorm2d(out_dims),
+        )
+
+        self.output = nn.Conv2d(len(rate) * out_dims, out_dims, 1)
+        self._init_weights()
+
+    def forward(self, x):
+        x1 = self.aspp_block1(x)
+        x2 = self.aspp_block2(x)
+        x3 = self.aspp_block3(x)
+        out = torch.cat([x1, x2, x3], dim=1)
+        return self.output(out)
+
+    def _init_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight)
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+
+class Upsample_(nn.Module):
+    def __init__(self, scale=2):
+        super(Upsample_, self).__init__()
+
+        self.upsample = nn.Upsample(mode="bilinear", scale_factor=scale)
+
+    def forward(self, x):
+        return self.upsample(x)
+
+
+class AttentionBlock(nn.Module):
+    def __init__(self, input_encoder, input_decoder, output_dim):
+        super(AttentionBlock, self).__init__()
+
+        self.conv_encoder = nn.Sequential(
+            nn.BatchNorm2d(input_encoder),
+            nn.ReLU(),
+            nn.Conv2d(input_encoder, output_dim, 3, padding=1),
+            nn.MaxPool2d(2, 2),
+        )
+
+        self.conv_decoder = nn.Sequential(
+            nn.BatchNorm2d(input_decoder),
+            nn.ReLU(),
+            nn.Conv2d(input_decoder, output_dim, 3, padding=1),
+        )
+
+        self.conv_attn = nn.Sequential(
+            nn.BatchNorm2d(output_dim),
+            nn.ReLU(),
+            nn.Conv2d(output_dim, 1, 1),
+        )
+
+    def forward(self, x1, x2):
+        out = self.conv_encoder(x1) + self.conv_decoder(x2)
+        out = self.conv_attn(out)
+        return out * x2
src/audio2pose_models/res_unet.py ADDED
@@ -0,0 +1,65 @@
+import torch
+import torch.nn as nn
+from src.audio2pose_models.networks import ResidualConv, Upsample
+
+
+class ResUnet(nn.Module):
+    def __init__(self, channel=1, filters=[32, 64, 128, 256]):
+        super(ResUnet, self).__init__()
+
+        self.input_layer = nn.Sequential(
+            nn.Conv2d(channel, filters[0], kernel_size=3, padding=1),
+            nn.BatchNorm2d(filters[0]),
+            nn.ReLU(),
+            nn.Conv2d(filters[0], filters[0], kernel_size=3, padding=1),
+        )
+        self.input_skip = nn.Sequential(
+            nn.Conv2d(channel, filters[0], kernel_size=3, padding=1)
+        )
+
+        self.residual_conv_1 = ResidualConv(filters[0], filters[1], stride=(2, 1), padding=1)
+        self.residual_conv_2 = ResidualConv(filters[1], filters[2], stride=(2, 1), padding=1)
+
+        self.bridge = ResidualConv(filters[2], filters[3], stride=(2, 1), padding=1)
+
+        self.upsample_1 = Upsample(filters[3], filters[3], kernel=(2, 1), stride=(2, 1))
+        self.up_residual_conv1 = ResidualConv(filters[3] + filters[2], filters[2], stride=1, padding=1)
+
+        self.upsample_2 = Upsample(filters[2], filters[2], kernel=(2, 1), stride=(2, 1))
+        self.up_residual_conv2 = ResidualConv(filters[2] + filters[1], filters[1], stride=1, padding=1)
+
+        self.upsample_3 = Upsample(filters[1], filters[1], kernel=(2, 1), stride=(2, 1))
+        self.up_residual_conv3 = ResidualConv(filters[1] + filters[0], filters[0], stride=1, padding=1)
+
+        self.output_layer = nn.Sequential(
+            nn.Conv2d(filters[0], 1, 1, 1),
+            nn.Sigmoid(),
+        )
+
+    def forward(self, x):
+        # Encode
+        x1 = self.input_layer(x) + self.input_skip(x)
+        x2 = self.residual_conv_1(x1)
+        x3 = self.residual_conv_2(x2)
+        # Bridge
+        x4 = self.bridge(x3)
+
+        # Decode
+        x4 = self.upsample_1(x4)
+        x5 = torch.cat([x4, x3], dim=1)
+
+        x6 = self.up_residual_conv1(x5)
+
+        x6 = self.upsample_2(x6)
+        x7 = torch.cat([x6, x2], dim=1)
+
+        x8 = self.up_residual_conv2(x7)
+
+        x8 = self.upsample_3(x8)
+        x9 = torch.cat([x8, x1], dim=1)
+
+        x10 = self.up_residual_conv3(x9)
+
+        output = self.output_layer(x10)
+
+        return output
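A shape check, not part of the commit: the (2, 1) strides only halve the sequence axis, and the three transposed-convolution upsampling stages restore it, so a pose-motion tensor keeps its (seq_len, 6) layout (seq_len should be divisible by 8 so the skip concatenations line up).

import torch
from src.audio2pose_models.res_unet import ResUnet

# hypothetical shape check on a (bs, 1, seq_len, 6) pose-motion "image"
net = ResUnet()
x = torch.randn(4, 1, 32, 6)
y = net(x)
print(y.shape)    # torch.Size([4, 1, 32, 6])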
src/config/auido2exp.yaml ADDED
@@ -0,0 +1,58 @@
+DATASET:
+  TRAIN_FILE_LIST: /apdcephfs_cq2/share_1290939/wenxuazhang/code/file_list/train.txt
+  EVAL_FILE_LIST: /apdcephfs_cq2/share_1290939/wenxuazhang/code/file_list/val.txt
+  TRAIN_BATCH_SIZE: 32
+  EVAL_BATCH_SIZE: 32
+  EXP: True
+  EXP_DIM: 64
+  FRAME_LEN: 32
+  COEFF_LEN: 73
+  NUM_CLASSES: 46
+  AUDIO_ROOT_PATH: /apdcephfs_cq2/share_1290939/wenxuazhang/voxceleb1/wav
+  COEFF_ROOT_PATH: /apdcephfs_cq2/share_1290939/wenxuazhang/voxceleb1/wav2lip_3dmm
+  LMDB_PATH: /apdcephfs_cq2/share_1290939/shadowcun/datasets/VoxCeleb/v1/imdb
+  DEBUG: True
+  NUM_REPEATS: 2
+  T: 40
+
+
+MODEL:
+  FRAMEWORK: V2
+  AUDIOENCODER:
+    LEAKY_RELU: True
+    NORM: 'IN'
+  DISCRIMINATOR:
+    LEAKY_RELU: False
+    INPUT_CHANNELS: 6
+  CVAE:
+    AUDIO_EMB_IN_SIZE: 512
+    AUDIO_EMB_OUT_SIZE: 128
+    SEQ_LEN: 32
+    LATENT_SIZE: 256
+    ENCODER_LAYER_SIZES: [192, 1024]
+    DECODER_LAYER_SIZES: [1024, 192]
+
+
+TRAIN:
+  MAX_EPOCH: 300
+  GENERATOR:
+    LR: 2.0e-5
+  DISCRIMINATOR:
+    LR: 1.0e-5
+  LOSS:
+    W_FEAT: 0
+    W_COEFF_EXP: 2
+    W_LM: 1.0e-2
+    W_LM_MOUTH: 0
+    W_REG: 0
+    W_SYNC: 0
+    W_COLOR: 0
+    W_EXPRESSION: 0
+    W_LIPREADING: 0.01
+    W_LIPREADING_VV: 0
+    W_EYE_BLINK: 4
+
+TAG:
+  NAME: small_dataset
+
+
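The nested keys above (e.g. MODEL.CVAE.SEQ_LEN) are read elsewhere through attribute access; a minimal loading sketch, assuming a yacs CfgNode is used for parsing (the loader itself is not part of this commit):

from yacs.config import CfgNode as CN

# assumption: the config is consumed via yacs, giving cfg.MODEL.CVAE.SEQ_LEN-style access
with open('src/config/auido2exp.yaml') as f:
    cfg = CN.load_cfg(f)
cfg.freeze()

print(cfg.MODEL.CVAE.SEQ_LEN)      # 32
print(cfg.DATASET.NUM_CLASSES)     # 46
print(cfg.TRAIN.LOSS.W_EYE_BLINK)  # 4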
src/config/auido2pose.yaml ADDED
@@ -0,0 +1,49 @@
+DATASET:
+  TRAIN_FILE_LIST: /apdcephfs_cq2/share_1290939/wenxuazhang/code/audio2pose_unet_noAudio/dataset/train_33.txt
+  EVAL_FILE_LIST: /apdcephfs_cq2/share_1290939/wenxuazhang/code/audio2pose_unet_noAudio/dataset/val.txt
+  TRAIN_BATCH_SIZE: 64
+  EVAL_BATCH_SIZE: 1
+  EXP: True
+  EXP_DIM: 64
+  FRAME_LEN: 32
+  COEFF_LEN: 73
+  NUM_CLASSES: 46
+  AUDIO_ROOT_PATH: /apdcephfs_cq2/share_1290939/wenxuazhang/voxceleb1/wav
+  COEFF_ROOT_PATH: /apdcephfs_cq2/share_1290939/shadowcun/datasets/VoxCeleb/v1/imdb
+  DEBUG: True
+
+
+MODEL:
+  AUDIOENCODER:
+    LEAKY_RELU: True
+    NORM: 'IN'
+  DISCRIMINATOR:
+    LEAKY_RELU: False
+    INPUT_CHANNELS: 6
+  CVAE:
+    AUDIO_EMB_IN_SIZE: 512
+    AUDIO_EMB_OUT_SIZE: 6
+    SEQ_LEN: 32
+    LATENT_SIZE: 64
+    ENCODER_LAYER_SIZES: [192, 128]
+    DECODER_LAYER_SIZES: [128, 192]
+
+
+TRAIN:
+  MAX_EPOCH: 150
+  GENERATOR:
+    LR: 1.0e-4
+  DISCRIMINATOR:
+    LR: 1.0e-4
+  LOSS:
+    LAMBDA_REG: 1
+    LAMBDA_LANDMARKS: 0
+    LAMBDA_VERTICES: 0
+    LAMBDA_GAN_MOTION: 0.7
+    LAMBDA_GAN_COEFF: 0
+    LAMBDA_KL: 1
+
+TAG:
+  NAME: cvae_UNET_useAudio_usewav2lipAudioEncoder
+
+
src/config/facerender.yaml ADDED
@@ -0,0 +1,45 @@
+model_params:
+  common_params:
+    num_kp: 15
+    image_channel: 3
+    feature_channel: 32
+    estimate_jacobian: False   # True
+  kp_detector_params:
+    temperature: 0.1
+    block_expansion: 32
+    max_features: 1024
+    scale_factor: 0.25         # 0.25
+    num_blocks: 5
+    reshape_channel: 16384     # 16384 = 1024 * 16
+    reshape_depth: 16
+  he_estimator_params:
+    block_expansion: 64
+    max_features: 2048
+    num_bins: 66
+  generator_params:
+    block_expansion: 64
+    max_features: 512
+    num_down_blocks: 2
+    reshape_channel: 32
+    reshape_depth: 16          # 512 = 32 * 16
+    num_resblocks: 6
+    estimate_occlusion_map: True
+    dense_motion_params:
+      block_expansion: 32
+      max_features: 1024
+      num_blocks: 5
+      reshape_depth: 16
+      compress: 4
+  discriminator_params:
+    scales: [1]
+    block_expansion: 32
+    max_features: 512
+    num_blocks: 4
+    sn: True
+  mapping_params:
+    coeff_nc: 70
+    descriptor_nc: 1024
+    layer: 3
+    num_kp: 15
+    num_bins: 66
+
src/config/facerender_still.yaml ADDED
@@ -0,0 +1,45 @@
+model_params:
+  common_params:
+    num_kp: 15
+    image_channel: 3
+    feature_channel: 32
+    estimate_jacobian: False   # True
+  kp_detector_params:
+    temperature: 0.1
+    block_expansion: 32
+    max_features: 1024
+    scale_factor: 0.25         # 0.25
+    num_blocks: 5
+    reshape_channel: 16384     # 16384 = 1024 * 16
+    reshape_depth: 16
+  he_estimator_params:
+    block_expansion: 64
+    max_features: 2048
+    num_bins: 66
+  generator_params:
+    block_expansion: 64
+    max_features: 512
+    num_down_blocks: 2
+    reshape_channel: 32
+    reshape_depth: 16          # 512 = 32 * 16
+    num_resblocks: 6
+    estimate_occlusion_map: True
+    dense_motion_params:
+      block_expansion: 32
+      max_features: 1024
+      num_blocks: 5
+      reshape_depth: 16
+      compress: 4
+  discriminator_params:
+    scales: [1]
+    block_expansion: 32
+    max_features: 512
+    num_blocks: 4
+    sn: True
+  mapping_params:
+    coeff_nc: 73
+    descriptor_nc: 1024
+    layer: 3
+    num_kp: 15
+    num_bins: 66
+
src/config/similarity_Lm3D_all.mat ADDED
Binary file (994 Bytes).
 
src/face3d/data/__init__.py ADDED
@@ -0,0 +1,116 @@
+"""This package includes all the modules related to data loading and preprocessing
+
+To add a custom dataset class called 'dummy', you need to add a file called 'dummy_dataset.py' and define a subclass 'DummyDataset' inherited from BaseDataset.
+You need to implement four functions:
+    -- <__init__>:                      initialize the class, first call BaseDataset.__init__(self, opt).
+    -- <__len__>:                       return the size of dataset.
+    -- <__getitem__>:                   get a data point from data loader.
+    -- <modify_commandline_options>:    (optionally) add dataset-specific options and set default options.
+
+Now you can use the dataset class by specifying flag '--dataset_mode dummy'.
+See our template dataset class 'template_dataset.py' for more details.
+"""
+import numpy as np
+import importlib
+import torch.utils.data
+from face3d.data.base_dataset import BaseDataset
+
+
+def find_dataset_using_name(dataset_name):
+    """Import the module "data/[dataset_name]_dataset.py".
+
+    In the file, the class called DatasetNameDataset() will
+    be instantiated. It has to be a subclass of BaseDataset,
+    and it is case-insensitive.
+    """
+    dataset_filename = "data." + dataset_name + "_dataset"
+    datasetlib = importlib.import_module(dataset_filename)
+
+    dataset = None
+    target_dataset_name = dataset_name.replace('_', '') + 'dataset'
+    for name, cls in datasetlib.__dict__.items():
+        if name.lower() == target_dataset_name.lower() \
+           and issubclass(cls, BaseDataset):
+            dataset = cls
+
+    if dataset is None:
+        raise NotImplementedError("In %s.py, there should be a subclass of BaseDataset with class name that matches %s in lowercase." % (dataset_filename, target_dataset_name))
+
+    return dataset
+
+
+def get_option_setter(dataset_name):
+    """Return the static method <modify_commandline_options> of the dataset class."""
+    dataset_class = find_dataset_using_name(dataset_name)
+    return dataset_class.modify_commandline_options
+
+
+def create_dataset(opt, rank=0):
+    """Create a dataset given the option.
+
+    This function wraps the class CustomDatasetDataLoader.
+    This is the main interface between this package and 'train.py'/'test.py'
+
+    Example:
+        >>> from data import create_dataset
+        >>> dataset = create_dataset(opt)
+    """
+    data_loader = CustomDatasetDataLoader(opt, rank=rank)
+    dataset = data_loader.load_data()
+    return dataset
+
+class CustomDatasetDataLoader():
+    """Wrapper class of Dataset class that performs multi-threaded data loading"""
+
+    def __init__(self, opt, rank=0):
+        """Initialize this class
+
+        Step 1: create a dataset instance given the name [dataset_mode]
+        Step 2: create a multi-threaded data loader.
+        """
+        self.opt = opt
+        dataset_class = find_dataset_using_name(opt.dataset_mode)
+        self.dataset = dataset_class(opt)
+        self.sampler = None
+        print("rank %d %s dataset [%s] was created" % (rank, self.dataset.name, type(self.dataset).__name__))
+        if opt.use_ddp and opt.isTrain:
+            world_size = opt.world_size
+            self.sampler = torch.utils.data.distributed.DistributedSampler(
+                self.dataset,
+                num_replicas=world_size,
+                rank=rank,
+                shuffle=not opt.serial_batches
+            )
+            self.dataloader = torch.utils.data.DataLoader(
+                self.dataset,
+                sampler=self.sampler,
+                num_workers=int(opt.num_threads / world_size),
+                batch_size=int(opt.batch_size / world_size),
+                drop_last=True)
+        else:
+            self.dataloader = torch.utils.data.DataLoader(
+                self.dataset,
+                batch_size=opt.batch_size,
+                shuffle=(not opt.serial_batches) and opt.isTrain,
+                num_workers=int(opt.num_threads),
+                drop_last=True
+            )
+
+    def set_epoch(self, epoch):
+        self.dataset.current_epoch = epoch
+        if self.sampler is not None:
+            self.sampler.set_epoch(epoch)
+
+    def load_data(self):
+        return self
+
+    def __len__(self):
+        """Return the number of data in the dataset"""
+        return min(len(self.dataset), self.opt.max_dataset_size)
+
+    def __iter__(self):
+        """Return a batch of data"""
+        for i, data in enumerate(self.dataloader):
+            if i * self.opt.batch_size >= self.opt.max_dataset_size:
+                break
+            yield data
src/face3d/data/base_dataset.py ADDED
@@ -0,0 +1,125 @@
+"""This module implements an abstract base class (ABC) 'BaseDataset' for datasets.
+
+It also includes common transformation functions (e.g., get_transform, __scale_width), which can be later used in subclasses.
+"""
+import random
+import numpy as np
+import torch.utils.data as data
+from PIL import Image
+import torchvision.transforms as transforms
+from abc import ABC, abstractmethod
+
+
+class BaseDataset(data.Dataset, ABC):
+    """This class is an abstract base class (ABC) for datasets.
+
+    To create a subclass, you need to implement the following four functions:
+    -- <__init__>:                      initialize the class, first call BaseDataset.__init__(self, opt).
+    -- <__len__>:                       return the size of dataset.
+    -- <__getitem__>:                   get a data point.
+    -- <modify_commandline_options>:    (optionally) add dataset-specific options and set default options.
+    """
+
+    def __init__(self, opt):
+        """Initialize the class; save the options in the class
+
+        Parameters:
+            opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions
+        """
+        self.opt = opt
+        # self.root = opt.dataroot
+        self.current_epoch = 0
+
+    @staticmethod
+    def modify_commandline_options(parser, is_train):
+        """Add new dataset-specific options, and rewrite default values for existing options.
+
+        Parameters:
+            parser          -- original option parser
+            is_train (bool) -- whether training phase or test phase. You can use this flag to add training-specific or test-specific options.
+
+        Returns:
+            the modified parser.
+        """
+        return parser
+
+    @abstractmethod
+    def __len__(self):
+        """Return the total number of images in the dataset."""
+        return 0
+
+    @abstractmethod
+    def __getitem__(self, index):
+        """Return a data point and its metadata information.
+
+        Parameters:
+            index -- a random integer for data indexing
+
+        Returns:
+            a dictionary of data with their names. It usually contains the data itself and its metadata information.
+        """
+        pass
+
+
+def get_transform(grayscale=False):
+    transform_list = []
+    if grayscale:
+        transform_list.append(transforms.Grayscale(1))
+    transform_list += [transforms.ToTensor()]
+    return transforms.Compose(transform_list)
+
+def get_affine_mat(opt, size):
+    shift_x, shift_y, scale, rot_angle, flip = 0., 0., 1., 0., False
+    w, h = size
+
+    if 'shift' in opt.preprocess:
+        shift_pixs = int(opt.shift_pixs)
+        shift_x = random.randint(-shift_pixs, shift_pixs)
+        shift_y = random.randint(-shift_pixs, shift_pixs)
+    if 'scale' in opt.preprocess:
+        scale = 1 + opt.scale_delta * (2 * random.random() - 1)
+    if 'rot' in opt.preprocess:
+        rot_angle = opt.rot_angle * (2 * random.random() - 1)
+        rot_rad = -rot_angle * np.pi/180
+    if 'flip' in opt.preprocess:
+        flip = random.random() > 0.5
+
+    shift_to_origin = np.array([1, 0, -w//2, 0, 1, -h//2, 0, 0, 1]).reshape([3, 3])
+    flip_mat = np.array([-1 if flip else 1, 0, 0, 0, 1, 0, 0, 0, 1]).reshape([3, 3])
+    shift_mat = np.array([1, 0, shift_x, 0, 1, shift_y, 0, 0, 1]).reshape([3, 3])
+    rot_mat = np.array([np.cos(rot_rad), np.sin(rot_rad), 0, -np.sin(rot_rad), np.cos(rot_rad), 0, 0, 0, 1]).reshape([3, 3])
+    scale_mat = np.array([scale, 0, 0, 0, scale, 0, 0, 0, 1]).reshape([3, 3])
+    shift_to_center = np.array([1, 0, w//2, 0, 1, h//2, 0, 0, 1]).reshape([3, 3])
+
+    affine = shift_to_center @ scale_mat @ rot_mat @ shift_mat @ flip_mat @ shift_to_origin
+    affine_inv = np.linalg.inv(affine)
+    return affine, affine_inv, flip
+
+def apply_img_affine(img, affine_inv, method=Image.BICUBIC):
+    return img.transform(img.size, Image.AFFINE, data=affine_inv.flatten()[:6], resample=Image.BICUBIC)
+
+def apply_lm_affine(landmark, affine, flip, size):
+    _, h = size
+    lm = landmark.copy()
+    lm[:, 1] = h - 1 - lm[:, 1]
+    lm = np.concatenate((lm, np.ones([lm.shape[0], 1])), -1)
+    lm = lm @ np.transpose(affine)
+    lm[:, :2] = lm[:, :2] / lm[:, 2:]
+    lm = lm[:, :2]
+    lm[:, 1] = h - 1 - lm[:, 1]
+    if flip:
+        lm_ = lm.copy()
+        lm_[:17] = lm[16::-1]
+        lm_[17:22] = lm[26:21:-1]
+        lm_[22:27] = lm[21:16:-1]
+        lm_[31:36] = lm[35:30:-1]
+        lm_[36:40] = lm[45:41:-1]
+        lm_[40:42] = lm[47:45:-1]
+        lm_[42:46] = lm[39:35:-1]
+        lm_[46:48] = lm[41:39:-1]
+        lm_[48:55] = lm[54:47:-1]
+        lm_[55:60] = lm[59:54:-1]
+        lm_[60:65] = lm[64:59:-1]
+        lm_[65:68] = lm[67:64:-1]
+        lm = lm_
+    return lm
src/face3d/data/flist_dataset.py ADDED
@@ -0,0 +1,125 @@
+"""This script defines the custom dataset for Deep3DFaceRecon_pytorch
+"""
+
+import os.path
+from data.base_dataset import BaseDataset, get_transform, get_affine_mat, apply_img_affine, apply_lm_affine
+from data.image_folder import make_dataset
+from PIL import Image
+import random
+import util.util as util
+import numpy as np
+import json
+import torch
+from scipy.io import loadmat, savemat
+import pickle
+from util.preprocess import align_img, estimate_norm
+from util.load_mats import load_lm3d
+
+
+def default_flist_reader(flist):
+    """
+    flist format: impath label\nimpath label\n ...(same to caffe's filelist)
+    """
+    imlist = []
+    with open(flist, 'r') as rf:
+        for line in rf.readlines():
+            impath = line.strip()
+            imlist.append(impath)
+
+    return imlist
+
+def jason_flist_reader(flist):
+    with open(flist, 'r') as fp:
+        info = json.load(fp)
+    return info
+
+def parse_label(label):
+    return torch.tensor(np.array(label).astype(np.float32))
+
+
+class FlistDataset(BaseDataset):
+    """
+    It requires one directory to host training images '/path/to/data/train'
+    You can train the model with the dataset flag '--dataroot /path/to/data'.
+    """
+
+    def __init__(self, opt):
+        """Initialize this dataset class.
+
+        Parameters:
+            opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions
+        """
+        BaseDataset.__init__(self, opt)
+
+        self.lm3d_std = load_lm3d(opt.bfm_folder)
+
+        msk_names = default_flist_reader(opt.flist)
+        self.msk_paths = [os.path.join(opt.data_root, i) for i in msk_names]
+
+        self.size = len(self.msk_paths)
+        self.opt = opt
+
+        self.name = 'train' if opt.isTrain else 'val'
+        if '_' in opt.flist:
+            self.name += '_' + opt.flist.split(os.sep)[-1].split('_')[0]
+
+
+    def __getitem__(self, index):
+        """Return a data point and its metadata information.
+
+        Parameters:
+            index (int) -- a random integer for data indexing
+
+        Returns a dictionary that contains A, B, A_paths and B_paths
+            img (tensor)       -- an image in the input domain
+            msk (tensor)       -- its corresponding attention mask
+            lm  (tensor)       -- its corresponding 3d landmarks
+            im_paths (str)     -- image paths
+            aug_flag (bool)    -- a flag used to tell whether its raw or augmented
+        """
+        msk_path = self.msk_paths[index % self.size]  # make sure index is within the range
+        img_path = msk_path.replace('mask/', '')
+        lm_path = '.'.join(msk_path.replace('mask', 'landmarks').split('.')[:-1]) + '.txt'
+
+        raw_img = Image.open(img_path).convert('RGB')
+        raw_msk = Image.open(msk_path).convert('RGB')
+        raw_lm = np.loadtxt(lm_path).astype(np.float32)
+
+        _, img, lm, msk = align_img(raw_img, raw_lm, self.lm3d_std, raw_msk)
+
+        aug_flag = self.opt.use_aug and self.opt.isTrain
+        if aug_flag:
+            img, lm, msk = self._augmentation(img, lm, self.opt, msk)
+
+        _, H = img.size
+        M = estimate_norm(lm, H)
+        transform = get_transform()
+        img_tensor = transform(img)
+        msk_tensor = transform(msk)[:1, ...]
+        lm_tensor = parse_label(lm)
+        M_tensor = parse_label(M)
+
+
+        return {'imgs': img_tensor,
+                'lms': lm_tensor,
+                'msks': msk_tensor,
+                'M': M_tensor,
+                'im_paths': img_path,
+                'aug_flag': aug_flag,
+                'dataset': self.name}
+
+    def _augmentation(self, img, lm, opt, msk=None):
+        affine, affine_inv, flip = get_affine_mat(opt, img.size)
+        img = apply_img_affine(img, affine_inv)
+        lm = apply_lm_affine(lm, affine, flip, img.size)
+        if msk is not None:
+            msk = apply_img_affine(msk, affine_inv, method=Image.BILINEAR)
+        return img, lm, msk
+
+
+
+
+    def __len__(self):
+        """Return the total number of images in the dataset.
+        """
+        return self.size
src/face3d/data/image_folder.py ADDED
@@ -0,0 +1,66 @@
+"""A modified image folder class
+
+We modify the official PyTorch image folder (https://github.com/pytorch/vision/blob/master/torchvision/datasets/folder.py)
+so that this class can load images from both current directory and its subdirectories.
+"""
+import numpy as np
+import torch.utils.data as data
+
+from PIL import Image
+import os
+import os.path
+
+IMG_EXTENSIONS = [
+    '.jpg', '.JPG', '.jpeg', '.JPEG',
+    '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP',
+    '.tif', '.TIF', '.tiff', '.TIFF',
+]
+
+
+def is_image_file(filename):
+    return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)
+
+
+def make_dataset(dir, max_dataset_size=float("inf")):
+    images = []
+    assert os.path.isdir(dir) or os.path.islink(dir), '%s is not a valid directory' % dir
+
+    for root, _, fnames in sorted(os.walk(dir, followlinks=True)):
+        for fname in fnames:
+            if is_image_file(fname):
+                path = os.path.join(root, fname)
+                images.append(path)
+    return images[:min(max_dataset_size, len(images))]
+
+
+def default_loader(path):
+    return Image.open(path).convert('RGB')
+
+
+class ImageFolder(data.Dataset):
+
+    def __init__(self, root, transform=None, return_paths=False,
+                 loader=default_loader):
+        imgs = make_dataset(root)
+        if len(imgs) == 0:
+            raise(RuntimeError("Found 0 images in: " + root + "\n"
+                               "Supported image extensions are: " + ",".join(IMG_EXTENSIONS)))
+
+        self.root = root
+        self.imgs = imgs
+        self.transform = transform
+        self.return_paths = return_paths
+        self.loader = loader
+
+    def __getitem__(self, index):
+        path = self.imgs[index]
+        img = self.loader(path)
+        if self.transform is not None:
+            img = self.transform(img)
+        if self.return_paths:
+            return img, path
+        else:
+            return img
+
+    def __len__(self):
+        return len(self.imgs)
src/face3d/data/template_dataset.py ADDED
@@ -0,0 +1,75 @@
+"""Dataset class template
+
+This module provides a template for users to implement custom datasets.
+You can specify '--dataset_mode template' to use this dataset.
+The class name should be consistent with both the filename and its dataset_mode option.
+The filename should be <dataset_mode>_dataset.py
+The class name should be <Dataset_mode>Dataset.py
+You need to implement the following functions:
+    -- <modify_commandline_options>:    Add dataset-specific options and rewrite default values for existing options.
+    -- <__init__>: Initialize this dataset class.
+    -- <__getitem__>: Return a data point and its metadata information.
+    -- <__len__>: Return the number of images.
+"""
+from data.base_dataset import BaseDataset, get_transform
+# from data.image_folder import make_dataset
+# from PIL import Image
+
+
+class TemplateDataset(BaseDataset):
+    """A template dataset class for you to implement custom datasets."""
+    @staticmethod
+    def modify_commandline_options(parser, is_train):
+        """Add new dataset-specific options, and rewrite default values for existing options.
+
+        Parameters:
+            parser          -- original option parser
+            is_train (bool) -- whether training phase or test phase. You can use this flag to add training-specific or test-specific options.
+
+        Returns:
+            the modified parser.
+        """
+        parser.add_argument('--new_dataset_option', type=float, default=1.0, help='new dataset option')
+        parser.set_defaults(max_dataset_size=10, new_dataset_option=2.0)  # specify dataset-specific default values
+        return parser
+
+    def __init__(self, opt):
+        """Initialize this dataset class.
+
+        Parameters:
+            opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions
+
+        A few things can be done here.
+        - save the options (have been done in BaseDataset)
+        - get image paths and meta information of the dataset.
+        - define the image transformation.
+        """
+        # save the option and dataset root
+        BaseDataset.__init__(self, opt)
+        # get the image paths of your dataset;
+        self.image_paths = []  # You can call sorted(make_dataset(self.root, opt.max_dataset_size)) to get all the image paths under the directory self.root
+        # define the default transform function. You can use <base_dataset.get_transform>; You can also define your custom transform function
+        self.transform = get_transform(opt)
+
+    def __getitem__(self, index):
+        """Return a data point and its metadata information.
+
+        Parameters:
+            index -- a random integer for data indexing
+
+        Returns:
+            a dictionary of data with their names. It usually contains the data itself and its metadata information.
+
+        Step 1: get a random image path: e.g., path = self.image_paths[index]
+        Step 2: load your data from the disk: e.g., image = Image.open(path).convert('RGB').
+        Step 3: convert your data to a PyTorch tensor. You can use helper functions such as self.transform. e.g., data = self.transform(image)
+        Step 4: return a data point as a dictionary.
+        """
+        path = 'temp'    # needs to be a string
+        data_A = None    # needs to be a tensor
+        data_B = None    # needs to be a tensor
+        return {'data_A': data_A, 'data_B': data_B, 'path': path}
+
+    def __len__(self):
+        """Return the total number of images."""
+        return len(self.image_paths)
src/face3d/extract_kp_videos.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import time
4
+ import glob
5
+ import argparse
6
+ import face_alignment
7
+ import numpy as np
8
+ from PIL import Image
9
+ from tqdm import tqdm
10
+ from itertools import cycle
11
+
12
+ from torch.multiprocessing import Pool, Process, set_start_method
13
+
14
+ class KeypointExtractor():
15
+ def __init__(self, device):
16
+ self.detector = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D,
17
+ device=device)
18
+
19
+ def extract_keypoint(self, images, name=None, info=True):
20
+ if isinstance(images, list):
21
+ keypoints = []
22
+ if info:
23
+ i_range = tqdm(images,desc='landmark Det:')
24
+ else:
25
+ i_range = images
26
+
27
+ for image in i_range:
28
+ current_kp = self.extract_keypoint(image)
29
+ if np.mean(current_kp) == -1 and keypoints:
30
+ keypoints.append(keypoints[-1])
31
+ else:
32
+ keypoints.append(current_kp[None])
33
+
34
+ keypoints = np.concatenate(keypoints, 0)
35
+ np.savetxt(os.path.splitext(name)[0]+'.txt', keypoints.reshape(-1))
36
+ return keypoints
37
+ else:
38
+ while True:
39
+ try:
40
+ keypoints = self.detector.get_landmarks_from_image(np.array(images))[0]
41
+ break
42
+ except RuntimeError as e:
43
+ if str(e).startswith('CUDA'):
44
+ print("Warning: out of memory, sleep for 1s")
45
+ time.sleep(1)
46
+ else:
47
+ print(e)
48
+ break
49
+ except TypeError:
50
+ print('No face detected in this image')
51
+ shape = [68, 2]
52
+ keypoints = -1. * np.ones(shape)
53
+ break
54
+ if name is not None:
55
+ np.savetxt(os.path.splitext(name)[0]+'.txt', keypoints.reshape(-1))
56
+ return keypoints
57
+
58
+ def read_video(filename):
59
+ frames = []
60
+ cap = cv2.VideoCapture(filename)
61
+ while cap.isOpened():
62
+ ret, frame = cap.read()
63
+ if ret:
64
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
65
+ frame = Image.fromarray(frame)
66
+ frames.append(frame)
67
+ else:
68
+ break
69
+ cap.release()
70
+ return frames
71
+
72
+ def run(data):
73
+ filename, opt, device = data
74
+ os.environ['CUDA_VISIBLE_DEVICES'] = device
75
+ kp_extractor = KeypointExtractor()
76
+ images = read_video(filename)
77
+ name = filename.split('/')[-2:]
78
+ os.makedirs(os.path.join(opt.output_dir, name[-2]), exist_ok=True)
79
+ kp_extractor.extract_keypoint(
80
+ images,
81
+ name=os.path.join(opt.output_dir, name[-2], name[-1])
82
+ )
83
+
84
+ if __name__ == '__main__':
85
+ set_start_method('spawn')
86
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
87
+ parser.add_argument('--input_dir', type=str, help='the folder of the input files')
88
+ parser.add_argument('--output_dir', type=str, help='the folder of the output files')
89
+ parser.add_argument('--device_ids', type=str, default='0,1')
90
+ parser.add_argument('--workers', type=int, default=4)
91
+
92
+ opt = parser.parse_args()
93
+ filenames = list()
94
+ VIDEO_EXTENSIONS_LOWERCASE = {'mp4'}
95
+ VIDEO_EXTENSIONS = VIDEO_EXTENSIONS_LOWERCASE.union({f.upper() for f in VIDEO_EXTENSIONS_LOWERCASE})
96
+ extensions = VIDEO_EXTENSIONS
97
+
98
+ for ext in extensions:
99
+ filenames.extend(sorted(glob.glob(f'{opt.input_dir}/*.{ext}')))
102
+ print('Total number of videos:', len(filenames))
103
+ pool = Pool(opt.workers)
104
+ args_list = cycle([opt])
105
+ device_ids = opt.device_ids.split(",")
106
+ device_ids = cycle(device_ids)
107
+ for data in tqdm(pool.imap_unordered(run, zip(filenames, args_list, device_ids))):
108
+ pass
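
A minimal usage sketch for the extractor above (illustrative only, not part of the uploaded file): the image path is a placeholder and the `face_alignment` package must be installed.

```python
# Hedged sketch: extract 68 facial landmarks from a single image.
# 'photo.png' is a placeholder path; pick 'cuda' if a GPU is available.
from PIL import Image
from src.face3d.extract_kp_videos import KeypointExtractor

extractor = KeypointExtractor(device='cpu')
image = Image.open('photo.png').convert('RGB')
keypoints = extractor.extract_keypoint(image, name='photo.png')  # also writes photo.txt
print(keypoints.shape)  # (68, 2); filled with -1 when no face is detected
```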
src/face3d/extract_kp_videos_safe.py ADDED
@@ -0,0 +1,151 @@
1
+ import os
2
+ import cv2
3
+ import time
4
+ import glob
5
+ import argparse
6
+ import numpy as np
7
+ from PIL import Image
8
+ import torch
9
+ from tqdm import tqdm
10
+ from itertools import cycle
11
+ from torch.multiprocessing import Pool, Process, set_start_method
12
+
13
+ from facexlib.alignment import landmark_98_to_68
14
+ from facexlib.detection import init_detection_model
15
+
16
+ from facexlib.utils import load_file_from_url
17
+ from src.face3d.util.my_awing_arch import FAN
18
+
19
+ def init_alignment_model(model_name, half=False, device='cuda', model_rootpath=None):
20
+ if model_name == 'awing_fan':
21
+ model = FAN(num_modules=4, num_landmarks=98, device=device)
22
+ model_url = 'https://github.com/xinntao/facexlib/releases/download/v0.1.0/alignment_WFLW_4HG.pth'
23
+ else:
24
+ raise NotImplementedError(f'{model_name} is not implemented.')
25
+
26
+ model_path = load_file_from_url(
27
+ url=model_url, model_dir='facexlib/weights', progress=True, file_name=None, save_dir=model_rootpath)
28
+ model.load_state_dict(torch.load(model_path, map_location=device)['state_dict'], strict=True)
29
+ model.eval()
30
+ model = model.to(device)
31
+ return model
32
+
33
+
34
+ class KeypointExtractor():
35
+ def __init__(self, device='cuda'):
36
+
37
+ ### gfpgan/weights
38
+ try:
39
+ import webui # in webui
40
+ root_path = 'extensions/SadTalker/gfpgan/weights'
41
+
42
+ except:
43
+ root_path = 'gfpgan/weights'
44
+
45
+ self.detector = init_alignment_model('awing_fan',device=device, model_rootpath=root_path)
46
+ self.det_net = init_detection_model('retinaface_resnet50', half=False,device=device, model_rootpath=root_path)
47
+
48
+ def extract_keypoint(self, images, name=None, info=True):
49
+ if isinstance(images, list):
50
+ keypoints = []
51
+ if info:
52
+ i_range = tqdm(images,desc='landmark Det:')
53
+ else:
54
+ i_range = images
55
+
56
+ for image in i_range:
57
+ current_kp = self.extract_keypoint(image)
58
+ # current_kp = self.detector.get_landmarks(np.array(image))
59
+ if np.mean(current_kp) == -1 and keypoints:
60
+ keypoints.append(keypoints[-1])
61
+ else:
62
+ keypoints.append(current_kp[None])
63
+
64
+ keypoints = np.concatenate(keypoints, 0)
65
+ np.savetxt(os.path.splitext(name)[0]+'.txt', keypoints.reshape(-1))
66
+ return keypoints
67
+ else:
68
+ while True:
69
+ try:
70
+ with torch.no_grad():
71
+ # face detection -> face alignment.
72
+ img = np.array(images)
73
+ bboxes = self.det_net.detect_faces(images, 0.97)
74
+
75
+ bboxes = bboxes[0]
76
+ img = img[int(bboxes[1]):int(bboxes[3]), int(bboxes[0]):int(bboxes[2]), :]
77
+
78
+ keypoints = landmark_98_to_68(self.detector.get_landmarks(img)) # [0]
79
+
80
+ #### keypoints to the original location
81
+ keypoints[:,0] += int(bboxes[0])
82
+ keypoints[:,1] += int(bboxes[1])
83
+
84
+ break
85
+ except RuntimeError as e:
86
+ if str(e).startswith('CUDA'):
87
+ print("Warning: out of memory, sleep for 1s")
88
+ time.sleep(1)
89
+ else:
90
+ print(e)
91
+ break
92
+ except TypeError:
93
+ print('No face detected in this image')
94
+ shape = [68, 2]
95
+ keypoints = -1. * np.ones(shape)
96
+ break
97
+ if name is not None:
98
+ np.savetxt(os.path.splitext(name)[0]+'.txt', keypoints.reshape(-1))
99
+ return keypoints
100
+
101
+ def read_video(filename):
102
+ frames = []
103
+ cap = cv2.VideoCapture(filename)
104
+ while cap.isOpened():
105
+ ret, frame = cap.read()
106
+ if ret:
107
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
108
+ frame = Image.fromarray(frame)
109
+ frames.append(frame)
110
+ else:
111
+ break
112
+ cap.release()
113
+ return frames
114
+
115
+ def run(data):
116
+ filename, opt, device = data
117
+ os.environ['CUDA_VISIBLE_DEVICES'] = device
118
+ kp_extractor = KeypointExtractor()
119
+ images = read_video(filename)
120
+ name = filename.split('/')[-2:]
121
+ os.makedirs(os.path.join(opt.output_dir, name[-2]), exist_ok=True)
122
+ kp_extractor.extract_keypoint(
123
+ images,
124
+ name=os.path.join(opt.output_dir, name[-2], name[-1])
125
+ )
126
+
127
+ if __name__ == '__main__':
128
+ set_start_method('spawn')
129
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
130
+ parser.add_argument('--input_dir', type=str, help='the folder of the input files')
131
+ parser.add_argument('--output_dir', type=str, help='the folder of the output files')
132
+ parser.add_argument('--device_ids', type=str, default='0,1')
133
+ parser.add_argument('--workers', type=int, default=4)
134
+
135
+ opt = parser.parse_args()
136
+ filenames = list()
137
+ VIDEO_EXTENSIONS_LOWERCASE = {'mp4'}
138
+ VIDEO_EXTENSIONS = VIDEO_EXTENSIONS_LOWERCASE.union({f.upper() for f in VIDEO_EXTENSIONS_LOWERCASE})
139
+ extensions = VIDEO_EXTENSIONS
140
+
141
+ for ext in extensions:
142
+ filenames.extend(sorted(glob.glob(f'{opt.input_dir}/*.{ext}')))
145
+ print('Total number of videos:', len(filenames))
146
+ pool = Pool(opt.workers)
147
+ args_list = cycle([opt])
148
+ device_ids = opt.device_ids.split(",")
149
+ device_ids = cycle(device_ids)
150
+ for data in tqdm(pool.imap_unordered(run, zip(filenames, args_list, device_ids))):
151
+ pass
src/face3d/models/__init__.py ADDED
@@ -0,0 +1,67 @@
1
+ """This package contains modules related to objective functions, optimizations, and network architectures.
2
+
3
+ To add a custom model class called 'dummy', you need to add a file called 'dummy_model.py' and define a subclass DummyModel that inherits from BaseModel.
4
+ You need to implement the following five functions:
5
+ -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt).
6
+ -- <set_input>: unpack data from dataset and apply preprocessing.
7
+ -- <forward>: produce intermediate results.
8
+ -- <optimize_parameters>: calculate loss, gradients, and update network weights.
9
+ -- <modify_commandline_options>: (optionally) add model-specific options and set default options.
10
+
11
+ In the function <__init__>, you need to define four lists:
12
+ -- self.loss_names (str list): specify the training losses that you want to plot and save.
13
+ -- self.model_names (str list): define networks used in our training.
14
+ -- self.visual_names (str list): specify the images that you want to display and save.
15
+ -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an example of its usage.
16
+
17
+ Now you can use the model class by specifying flag '--model dummy'.
18
+ See our template model class 'template_model.py' for more details.
19
+ """
20
+
21
+ import importlib
22
+ from src.face3d.models.base_model import BaseModel
23
+
24
+
25
+ def find_model_using_name(model_name):
26
+ """Import the module "models/[model_name]_model.py".
27
+
28
+ In the file, the class called DatasetNameModel() will
29
+ be instantiated. It has to be a subclass of BaseModel,
30
+ and it is case-insensitive.
31
+ """
32
+ model_filename = "face3d.models." + model_name + "_model"
33
+ modellib = importlib.import_module(model_filename)
34
+ model = None
35
+ target_model_name = model_name.replace('_', '') + 'model'
36
+ for name, cls in modellib.__dict__.items():
37
+ if name.lower() == target_model_name.lower() \
38
+ and issubclass(cls, BaseModel):
39
+ model = cls
40
+
41
+ if model is None:
42
+ print("In %s.py, there should be a subclass of BaseModel with class name that matches %s in lowercase." % (model_filename, target_model_name))
43
+ exit(0)
44
+
45
+ return model
46
+
47
+
48
+ def get_option_setter(model_name):
49
+ """Return the static method <modify_commandline_options> of the model class."""
50
+ model_class = find_model_using_name(model_name)
51
+ return model_class.modify_commandline_options
52
+
53
+
54
+ def create_model(opt):
55
+ """Create a model given the option.
56
+
57
+ This function wraps the model class returned by <find_model_using_name>.
58
+ This is the main interface between this package and 'train.py'/'test.py'
59
+
60
+ Example:
61
+ >>> from models import create_model
62
+ >>> model = create_model(opt)
63
+ """
64
+ model = find_model_using_name(opt.model)
65
+ instance = model(opt)
66
+ print("model [%s] was created" % type(instance).__name__)
67
+ return instance
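
The factory above relies on a naming convention: `<name>_model.py` must define a class whose lowercased name equals `<name>model`. The standalone snippet below replicates that case-insensitive lookup on an ordinary module purely to illustrate the convention; it is not part of the package, and `collections`/`ordereddict` are stand-in names.

```python
# Self-contained illustration of the case-insensitive class lookup performed
# by find_model_using_name().
import importlib

def find_class(module_name, target_name):
    module = importlib.import_module(module_name)
    for name, obj in vars(module).items():
        if name.lower() == target_name.lower() and isinstance(obj, type):
            return obj
    raise ImportError(f'{target_name} not found in {module_name}')

print(find_class('collections', 'ordereddict'))  # <class 'collections.OrderedDict'>
```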
src/face3d/models/arcface_torch/README.md ADDED
@@ -0,0 +1,164 @@
1
+ # Distributed Arcface Training in Pytorch
2
+
3
+ This is a deep learning library that makes face recognition training efficient and effective, and that can train tens of
+ millions of identities on a single server.
5
+
6
+ ## Requirements
7
+
8
+ - Install [pytorch](http://pytorch.org) (torch>=1.6.0); see our [install.md](docs/install.md) for details.
9
+ - `pip install -r requirements.txt`.
10
+ - Download the dataset from
+ [https://github.com/deepinsight/insightface/tree/master/recognition/_datasets_](https://github.com/deepinsight/insightface/tree/master/recognition/_datasets_).
13
+
14
+ ## How to Train
15
+
16
+ To train a model, run `train.py` with the path to the configs:
17
+
18
+ ### 1. Single node, 8 GPUs:
19
+
20
+ ```shell
21
+ python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py configs/ms1mv3_r50
22
+ ```
23
+
24
+ ### 2. Multiple nodes, each node 8 GPUs:
25
+
26
+ Node 0:
27
+
28
+ ```shell
29
+ python -m torch.distributed.launch --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr="ip1" --master_port=1234 train.py configs/ms1mv3_r50
30
+ ```
31
+
32
+ Node 1:
33
+
34
+ ```shell
35
+ python -m torch.distributed.launch --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr="ip1" --master_port=1234 train.py configs/ms1mv3_r50
36
+ ```
37
+
38
+ ### 3. Training resnet2060 with 8 GPUs:
39
+
40
+ ```shell
41
+ python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py configs/ms1mv3_r2060.py
42
+ ```
43
+
44
+ ## Model Zoo
45
+
46
+ - The models are available for non-commercial research purposes only.
47
+ - All models can be found here.
48
+ - [Baidu Yun Pan](https://pan.baidu.com/s/1CL-l4zWqsI1oDuEEYVhj-g): e8pw
49
+ - [onedrive](https://1drv.ms/u/s!AswpsDO2toNKq0lWY69vN58GR6mw?e=p9Ov5d)
50
+
51
+ ### Performance on [**ICCV2021-MFR**](http://iccv21-mfr.com/)
52
+
53
+ The ICCV2021-MFR test set consists of non-celebrities, so we can ensure that it has very little overlap with publicly
+ available face recognition training sets such as MS1M and CASIA, which are mostly collected from online celebrities.
+ As a result, we can fairly evaluate the performance of different algorithms.
56
+
57
+ For the **ICCV2021-MFR-ALL** set, TAR is measured on an all-to-all 1:1 protocol with FAR less than 0.000001 (1e-6). The
+ globalised multi-racial test set contains 242,143 identities and 1,624,305 images.
59
+
60
+ For the **ICCV2021-MFR-MASK** set, TAR is measured on a mask-to-nonmask 1:1 protocol with FAR less than 0.0001 (1e-4).
+ The mask test set contains 6,964 identities, 6,964 masked images and 13,928 non-masked images.
+ In total, there are 13,928 positive pairs and 96,983,824 negative pairs.
63
+
64
+ | Datasets | backbone | Training throughput (samples/sec) | Size / MB | **ICCV2021-MFR-MASK** | **ICCV2021-MFR-ALL** |
65
+ | :---: | :--- | :--- | :--- |:--- |:--- |
66
+ | MS1MV3 | r18 | - | 91 | **47.85** | **68.33** |
67
+ | Glint360k | r18 | 8536 | 91 | **53.32** | **72.07** |
68
+ | MS1MV3 | r34 | - | 130 | **58.72** | **77.36** |
69
+ | Glint360k | r34 | 6344 | 130 | **65.10** | **83.02** |
70
+ | MS1MV3 | r50 | 5500 | 166 | **63.85** | **80.53** |
71
+ | Glint360k | r50 | 5136 | 166 | **70.23** | **87.08** |
72
+ | MS1MV3 | r100 | - | 248 | **69.09** | **84.31** |
73
+ | Glint360k | r100 | 3332 | 248 | **75.57** | **90.66** |
74
+ | MS1MV3 | mobilefacenet | 12185 | 7.8 | **41.52** | **65.26** |
75
+ | Glint360k | mobilefacenet | 11197 | 7.8 | **44.52** | **66.48** |
76
+
77
+ ### Performance on IJB-C and Verification Datasets
78
+
79
+ | Datasets | backbone | IJBC(1e-05) | IJBC(1e-04) | agedb30 | cfp_fp | lfw | log |
80
+ | :---: | :--- | :--- | :--- | :--- |:--- |:--- |:--- |
81
+ | MS1MV3 | r18 | 92.07 | 94.66 | 97.77 | 97.73 | 99.77 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_arcface_r18_fp16/training.log)|
82
+ | MS1MV3 | r34 | 94.10 | 95.90 | 98.10 | 98.67 | 99.80 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_arcface_r34_fp16/training.log)|
83
+ | MS1MV3 | r50 | 94.79 | 96.46 | 98.35 | 98.96 | 99.83 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_arcface_r50_fp16/training.log)|
84
+ | MS1MV3 | r100 | 95.31 | 96.81 | 98.48 | 99.06 | 99.85 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_arcface_r100_fp16/training.log)|
85
+ | MS1MV3 | **r2060**| 95.34 | 97.11 | 98.67 | 99.24 | 99.87 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_arcface_r2060_fp16/training.log)|
86
+ | Glint360k |r18-0.1 | 93.16 | 95.33 | 97.72 | 97.73 | 99.77 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r18_fp16_0.1/training.log)|
87
+ | Glint360k |r34-0.1 | 95.16 | 96.56 | 98.33 | 98.78 | 99.82 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r34_fp16_0.1/training.log)|
88
+ | Glint360k |r50-0.1 | 95.61 | 96.97 | 98.38 | 99.20 | 99.83 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r50_fp16_0.1/training.log)|
89
+ | Glint360k |r100-0.1 | 95.88 | 97.32 | 98.48 | 99.29 | 99.82 |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r100_fp16_0.1/training.log)|
90
+
91
+ [comment]: <> (More details see [model.md]&#40;docs/modelzoo.md&#41; in docs.)
92
+
93
+
94
+ ## [Speed Benchmark](docs/speed_benchmark.md)
95
+
96
+ **Arcface Torch** can train large-scale face recognition training sets efficiently and quickly. When the number of
+ classes in the training set is greater than 300K and training is sufficient, the Partial FC sampling strategy reaches
+ the same accuracy with several times faster training and a smaller GPU memory footprint.
+ Partial FC is a sparse variant of the model-parallel architecture for large-scale face recognition. It uses a
+ sparse softmax, in which each batch dynamically samples a subset of class centers for training (a toy sketch of this
+ sampling is given below). In each iteration, only a sparse part of the parameters is updated, which greatly reduces
+ GPU memory and computation. With Partial FC, we can scale to a training set of 29 million identities, the largest to
+ date. Partial FC also supports multi-machine distributed training and mixed-precision training.
104
+
105
+ ![Image text](https://github.com/anxiangsir/insightface_arcface_log/blob/master/partial_fc_v2.png)
106
+
107
+ More details are given in
+ [speed_benchmark.md](docs/speed_benchmark.md) in docs.
109
+
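
The snippet below is a toy PyTorch sketch of the sampled-softmax idea described above, not the actual Partial FC implementation; all sizes are made up and there is no model parallelism.

```python
import torch
import torch.nn.functional as F

# Toy partial-FC step: keep the positive class centers plus random negatives,
# and compute logits only against that sampled subset.
num_classes, emb_dim, batch, sample_rate = 10_000, 512, 32, 0.1

centers = torch.randn(num_classes, emb_dim)      # full class-center matrix W
feats = torch.randn(batch, emb_dim)              # embeddings for this batch
labels = torch.randint(0, num_classes, (batch,))

num_sample = int(sample_rate * num_classes)
positive = labels.unique()
mask = torch.ones(num_classes, dtype=torch.bool)
mask[positive] = False
negatives = torch.nonzero(mask).squeeze(1)
negatives = negatives[torch.randperm(negatives.numel())[:num_sample - positive.numel()]]
sampled, _ = torch.sort(torch.cat([positive, negatives]))

logits = feats @ centers[sampled].t()            # [batch, num_sample] instead of [batch, num_classes]
loss = F.cross_entropy(logits, torch.searchsorted(sampled, labels))
```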
110
+ ### 1. Training speed of different parallel methods (samples / second), Tesla V100 32GB * 8. (Larger is better)
111
+
112
+ `-` means training failed because of gpu memory limitations.
113
+
114
+ | Number of Identities in Dataset | Data Parallel | Model Parallel | Partial FC 0.1 |
115
+ | :--- | :--- | :--- | :--- |
116
+ |125000 | 4681 | 4824 | 5004 |
117
+ |1400000 | **1672** | 3043 | 4738 |
118
+ |5500000 | **-** | **1389** | 3975 |
119
+ |8000000 | **-** | **-** | 3565 |
120
+ |16000000 | **-** | **-** | 2679 |
121
+ |29000000 | **-** | **-** | **1855** |
122
+
123
+ ### 2. GPU memory cost of different parallel methods (MB per GPU), Tesla V100 32GB * 8. (Smaller is better)
124
+
125
+ | Number of Identities in Dataset | Data Parallel | Model Parallel | Partial FC 0.1 |
126
+ | :--- | :--- | :--- | :--- |
127
+ |125000 | 7358 | 5306 | 4868 |
128
+ |1400000 | 32252 | 11178 | 6056 |
129
+ |5500000 | **-** | 32188 | 9854 |
130
+ |8000000 | **-** | **-** | 12310 |
131
+ |16000000 | **-** | **-** | 19950 |
132
+ |29000000 | **-** | **-** | 32324 |
133
+
134
+ ## Evaluation ICCV2021-MFR and IJB-C
135
+
136
+ More details are given in [eval.md](docs/eval.md) in docs.
137
+
138
+ ## Test
139
+
140
+ We tested many versions of PyTorch. Please create an issue if you are having trouble.
141
+
142
+ - [x] torch 1.6.0
143
+ - [x] torch 1.7.1
144
+ - [x] torch 1.8.0
145
+ - [x] torch 1.9.0
146
+
147
+ ## Citation
148
+
149
+ ```
150
+ @inproceedings{deng2019arcface,
151
+ title={Arcface: Additive angular margin loss for deep face recognition},
152
+ author={Deng, Jiankang and Guo, Jia and Xue, Niannan and Zafeiriou, Stefanos},
153
+ booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
154
+ pages={4690--4699},
155
+ year={2019}
156
+ }
157
+ @inproceedings{an2020partical_fc,
158
+ title={Partial FC: Training 10 Million Identities on a Single Machine},
159
+ author={An, Xiang and Zhu, Xuhan and Xiao, Yang and Wu, Lan and Zhang, Ming and Gao, Yuan and Qin, Bin and
160
+ Zhang, Debing and Fu Ying},
161
+ booktitle={Arxiv 2010.05222},
162
+ year={2020}
163
+ }
164
+ ```
src/face3d/models/arcface_torch/backbones/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ from .iresnet import iresnet18, iresnet34, iresnet50, iresnet100, iresnet200
2
+ from .mobilefacenet import get_mbf
3
+
4
+
5
+ def get_model(name, **kwargs):
6
+ # resnet
7
+ if name == "r18":
8
+ return iresnet18(False, **kwargs)
9
+ elif name == "r34":
10
+ return iresnet34(False, **kwargs)
11
+ elif name == "r50":
12
+ return iresnet50(False, **kwargs)
13
+ elif name == "r100":
14
+ return iresnet100(False, **kwargs)
15
+ elif name == "r200":
16
+ return iresnet200(False, **kwargs)
17
+ elif name == "r2060":
18
+ from .iresnet2060 import iresnet2060
19
+ return iresnet2060(False, **kwargs)
20
+ elif name == "mbf":
21
+ fp16 = kwargs.get("fp16", False)
22
+ num_features = kwargs.get("num_features", 512)
23
+ return get_mbf(fp16=fp16, num_features=num_features)
24
+ else:
25
+ raise ValueError()
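
A quick sanity-check sketch for the factory above (random weights, random input; the import path assumes the repository root is on `PYTHONPATH`).

```python
# Build an r50 backbone and run a dummy 112x112 face crop through it.
import torch
from src.face3d.models.arcface_torch.backbones import get_model

net = get_model('r50', fp16=False, num_features=512).eval()
with torch.no_grad():
    emb = net(torch.randn(1, 3, 112, 112))  # ArcFace backbones expect 112x112 RGB crops
print(emb.shape)  # torch.Size([1, 512])
```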
src/face3d/models/arcface_torch/backbones/iresnet.py ADDED
@@ -0,0 +1,187 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+ __all__ = ['iresnet18', 'iresnet34', 'iresnet50', 'iresnet100', 'iresnet200']
5
+
6
+
7
+ def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
8
+ """3x3 convolution with padding"""
9
+ return nn.Conv2d(in_planes,
10
+ out_planes,
11
+ kernel_size=3,
12
+ stride=stride,
13
+ padding=dilation,
14
+ groups=groups,
15
+ bias=False,
16
+ dilation=dilation)
17
+
18
+
19
+ def conv1x1(in_planes, out_planes, stride=1):
20
+ """1x1 convolution"""
21
+ return nn.Conv2d(in_planes,
22
+ out_planes,
23
+ kernel_size=1,
24
+ stride=stride,
25
+ bias=False)
26
+
27
+
28
+ class IBasicBlock(nn.Module):
29
+ expansion = 1
30
+ def __init__(self, inplanes, planes, stride=1, downsample=None,
31
+ groups=1, base_width=64, dilation=1):
32
+ super(IBasicBlock, self).__init__()
33
+ if groups != 1 or base_width != 64:
34
+ raise ValueError('BasicBlock only supports groups=1 and base_width=64')
35
+ if dilation > 1:
36
+ raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
37
+ self.bn1 = nn.BatchNorm2d(inplanes, eps=1e-05,)
38
+ self.conv1 = conv3x3(inplanes, planes)
39
+ self.bn2 = nn.BatchNorm2d(planes, eps=1e-05,)
40
+ self.prelu = nn.PReLU(planes)
41
+ self.conv2 = conv3x3(planes, planes, stride)
42
+ self.bn3 = nn.BatchNorm2d(planes, eps=1e-05,)
43
+ self.downsample = downsample
44
+ self.stride = stride
45
+
46
+ def forward(self, x):
47
+ identity = x
48
+ out = self.bn1(x)
49
+ out = self.conv1(out)
50
+ out = self.bn2(out)
51
+ out = self.prelu(out)
52
+ out = self.conv2(out)
53
+ out = self.bn3(out)
54
+ if self.downsample is not None:
55
+ identity = self.downsample(x)
56
+ out += identity
57
+ return out
58
+
59
+
60
+ class IResNet(nn.Module):
61
+ fc_scale = 7 * 7
62
+ def __init__(self,
63
+ block, layers, dropout=0, num_features=512, zero_init_residual=False,
64
+ groups=1, width_per_group=64, replace_stride_with_dilation=None, fp16=False):
65
+ super(IResNet, self).__init__()
66
+ self.fp16 = fp16
67
+ self.inplanes = 64
68
+ self.dilation = 1
69
+ if replace_stride_with_dilation is None:
70
+ replace_stride_with_dilation = [False, False, False]
71
+ if len(replace_stride_with_dilation) != 3:
72
+ raise ValueError("replace_stride_with_dilation should be None "
73
+ "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
74
+ self.groups = groups
75
+ self.base_width = width_per_group
76
+ self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
77
+ self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05)
78
+ self.prelu = nn.PReLU(self.inplanes)
79
+ self.layer1 = self._make_layer(block, 64, layers[0], stride=2)
80
+ self.layer2 = self._make_layer(block,
81
+ 128,
82
+ layers[1],
83
+ stride=2,
84
+ dilate=replace_stride_with_dilation[0])
85
+ self.layer3 = self._make_layer(block,
86
+ 256,
87
+ layers[2],
88
+ stride=2,
89
+ dilate=replace_stride_with_dilation[1])
90
+ self.layer4 = self._make_layer(block,
91
+ 512,
92
+ layers[3],
93
+ stride=2,
94
+ dilate=replace_stride_with_dilation[2])
95
+ self.bn2 = nn.BatchNorm2d(512 * block.expansion, eps=1e-05,)
96
+ self.dropout = nn.Dropout(p=dropout, inplace=True)
97
+ self.fc = nn.Linear(512 * block.expansion * self.fc_scale, num_features)
98
+ self.features = nn.BatchNorm1d(num_features, eps=1e-05)
99
+ nn.init.constant_(self.features.weight, 1.0)
100
+ self.features.weight.requires_grad = False
101
+
102
+ for m in self.modules():
103
+ if isinstance(m, nn.Conv2d):
104
+ nn.init.normal_(m.weight, 0, 0.1)
105
+ elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
106
+ nn.init.constant_(m.weight, 1)
107
+ nn.init.constant_(m.bias, 0)
108
+
109
+ if zero_init_residual:
110
+ for m in self.modules():
111
+ if isinstance(m, IBasicBlock):
112
+ nn.init.constant_(m.bn2.weight, 0)
113
+
114
+ def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
115
+ downsample = None
116
+ previous_dilation = self.dilation
117
+ if dilate:
118
+ self.dilation *= stride
119
+ stride = 1
120
+ if stride != 1 or self.inplanes != planes * block.expansion:
121
+ downsample = nn.Sequential(
122
+ conv1x1(self.inplanes, planes * block.expansion, stride),
123
+ nn.BatchNorm2d(planes * block.expansion, eps=1e-05, ),
124
+ )
125
+ layers = []
126
+ layers.append(
127
+ block(self.inplanes, planes, stride, downsample, self.groups,
128
+ self.base_width, previous_dilation))
129
+ self.inplanes = planes * block.expansion
130
+ for _ in range(1, blocks):
131
+ layers.append(
132
+ block(self.inplanes,
133
+ planes,
134
+ groups=self.groups,
135
+ base_width=self.base_width,
136
+ dilation=self.dilation))
137
+
138
+ return nn.Sequential(*layers)
139
+
140
+ def forward(self, x):
141
+ with torch.cuda.amp.autocast(self.fp16):
142
+ x = self.conv1(x)
143
+ x = self.bn1(x)
144
+ x = self.prelu(x)
145
+ x = self.layer1(x)
146
+ x = self.layer2(x)
147
+ x = self.layer3(x)
148
+ x = self.layer4(x)
149
+ x = self.bn2(x)
150
+ x = torch.flatten(x, 1)
151
+ x = self.dropout(x)
152
+ x = self.fc(x.float() if self.fp16 else x)
153
+ x = self.features(x)
154
+ return x
155
+
156
+
157
+ def _iresnet(arch, block, layers, pretrained, progress, **kwargs):
158
+ model = IResNet(block, layers, **kwargs)
159
+ if pretrained:
160
+ raise ValueError()
161
+ return model
162
+
163
+
164
+ def iresnet18(pretrained=False, progress=True, **kwargs):
165
+ return _iresnet('iresnet18', IBasicBlock, [2, 2, 2, 2], pretrained,
166
+ progress, **kwargs)
167
+
168
+
169
+ def iresnet34(pretrained=False, progress=True, **kwargs):
170
+ return _iresnet('iresnet34', IBasicBlock, [3, 4, 6, 3], pretrained,
171
+ progress, **kwargs)
172
+
173
+
174
+ def iresnet50(pretrained=False, progress=True, **kwargs):
175
+ return _iresnet('iresnet50', IBasicBlock, [3, 4, 14, 3], pretrained,
176
+ progress, **kwargs)
177
+
178
+
179
+ def iresnet100(pretrained=False, progress=True, **kwargs):
180
+ return _iresnet('iresnet100', IBasicBlock, [3, 13, 30, 3], pretrained,
181
+ progress, **kwargs)
182
+
183
+
184
+ def iresnet200(pretrained=False, progress=True, **kwargs):
185
+ return _iresnet('iresnet200', IBasicBlock, [6, 26, 60, 6], pretrained,
186
+ progress, **kwargs)
187
+
src/face3d/models/arcface_torch/backbones/iresnet2060.py ADDED
@@ -0,0 +1,176 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+ assert torch.__version__ >= "1.8.1"
5
+ from torch.utils.checkpoint import checkpoint_sequential
6
+
7
+ __all__ = ['iresnet2060']
8
+
9
+
10
+ def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
11
+ """3x3 convolution with padding"""
12
+ return nn.Conv2d(in_planes,
13
+ out_planes,
14
+ kernel_size=3,
15
+ stride=stride,
16
+ padding=dilation,
17
+ groups=groups,
18
+ bias=False,
19
+ dilation=dilation)
20
+
21
+
22
+ def conv1x1(in_planes, out_planes, stride=1):
23
+ """1x1 convolution"""
24
+ return nn.Conv2d(in_planes,
25
+ out_planes,
26
+ kernel_size=1,
27
+ stride=stride,
28
+ bias=False)
29
+
30
+
31
+ class IBasicBlock(nn.Module):
32
+ expansion = 1
33
+
34
+ def __init__(self, inplanes, planes, stride=1, downsample=None,
35
+ groups=1, base_width=64, dilation=1):
36
+ super(IBasicBlock, self).__init__()
37
+ if groups != 1 or base_width != 64:
38
+ raise ValueError('BasicBlock only supports groups=1 and base_width=64')
39
+ if dilation > 1:
40
+ raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
41
+ self.bn1 = nn.BatchNorm2d(inplanes, eps=1e-05, )
42
+ self.conv1 = conv3x3(inplanes, planes)
43
+ self.bn2 = nn.BatchNorm2d(planes, eps=1e-05, )
44
+ self.prelu = nn.PReLU(planes)
45
+ self.conv2 = conv3x3(planes, planes, stride)
46
+ self.bn3 = nn.BatchNorm2d(planes, eps=1e-05, )
47
+ self.downsample = downsample
48
+ self.stride = stride
49
+
50
+ def forward(self, x):
51
+ identity = x
52
+ out = self.bn1(x)
53
+ out = self.conv1(out)
54
+ out = self.bn2(out)
55
+ out = self.prelu(out)
56
+ out = self.conv2(out)
57
+ out = self.bn3(out)
58
+ if self.downsample is not None:
59
+ identity = self.downsample(x)
60
+ out += identity
61
+ return out
62
+
63
+
64
+ class IResNet(nn.Module):
65
+ fc_scale = 7 * 7
66
+
67
+ def __init__(self,
68
+ block, layers, dropout=0, num_features=512, zero_init_residual=False,
69
+ groups=1, width_per_group=64, replace_stride_with_dilation=None, fp16=False):
70
+ super(IResNet, self).__init__()
71
+ self.fp16 = fp16
72
+ self.inplanes = 64
73
+ self.dilation = 1
74
+ if replace_stride_with_dilation is None:
75
+ replace_stride_with_dilation = [False, False, False]
76
+ if len(replace_stride_with_dilation) != 3:
77
+ raise ValueError("replace_stride_with_dilation should be None "
78
+ "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
79
+ self.groups = groups
80
+ self.base_width = width_per_group
81
+ self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
82
+ self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05)
83
+ self.prelu = nn.PReLU(self.inplanes)
84
+ self.layer1 = self._make_layer(block, 64, layers[0], stride=2)
85
+ self.layer2 = self._make_layer(block,
86
+ 128,
87
+ layers[1],
88
+ stride=2,
89
+ dilate=replace_stride_with_dilation[0])
90
+ self.layer3 = self._make_layer(block,
91
+ 256,
92
+ layers[2],
93
+ stride=2,
94
+ dilate=replace_stride_with_dilation[1])
95
+ self.layer4 = self._make_layer(block,
96
+ 512,
97
+ layers[3],
98
+ stride=2,
99
+ dilate=replace_stride_with_dilation[2])
100
+ self.bn2 = nn.BatchNorm2d(512 * block.expansion, eps=1e-05, )
101
+ self.dropout = nn.Dropout(p=dropout, inplace=True)
102
+ self.fc = nn.Linear(512 * block.expansion * self.fc_scale, num_features)
103
+ self.features = nn.BatchNorm1d(num_features, eps=1e-05)
104
+ nn.init.constant_(self.features.weight, 1.0)
105
+ self.features.weight.requires_grad = False
106
+
107
+ for m in self.modules():
108
+ if isinstance(m, nn.Conv2d):
109
+ nn.init.normal_(m.weight, 0, 0.1)
110
+ elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
111
+ nn.init.constant_(m.weight, 1)
112
+ nn.init.constant_(m.bias, 0)
113
+
114
+ if zero_init_residual:
115
+ for m in self.modules():
116
+ if isinstance(m, IBasicBlock):
117
+ nn.init.constant_(m.bn2.weight, 0)
118
+
119
+ def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
120
+ downsample = None
121
+ previous_dilation = self.dilation
122
+ if dilate:
123
+ self.dilation *= stride
124
+ stride = 1
125
+ if stride != 1 or self.inplanes != planes * block.expansion:
126
+ downsample = nn.Sequential(
127
+ conv1x1(self.inplanes, planes * block.expansion, stride),
128
+ nn.BatchNorm2d(planes * block.expansion, eps=1e-05, ),
129
+ )
130
+ layers = []
131
+ layers.append(
132
+ block(self.inplanes, planes, stride, downsample, self.groups,
133
+ self.base_width, previous_dilation))
134
+ self.inplanes = planes * block.expansion
135
+ for _ in range(1, blocks):
136
+ layers.append(
137
+ block(self.inplanes,
138
+ planes,
139
+ groups=self.groups,
140
+ base_width=self.base_width,
141
+ dilation=self.dilation))
142
+
143
+ return nn.Sequential(*layers)
144
+
145
+ def checkpoint(self, func, num_seg, x):
146
+ if self.training:
147
+ return checkpoint_sequential(func, num_seg, x)
148
+ else:
149
+ return func(x)
150
+
151
+ def forward(self, x):
152
+ with torch.cuda.amp.autocast(self.fp16):
153
+ x = self.conv1(x)
154
+ x = self.bn1(x)
155
+ x = self.prelu(x)
156
+ x = self.layer1(x)
157
+ x = self.checkpoint(self.layer2, 20, x)
158
+ x = self.checkpoint(self.layer3, 100, x)
159
+ x = self.layer4(x)
160
+ x = self.bn2(x)
161
+ x = torch.flatten(x, 1)
162
+ x = self.dropout(x)
163
+ x = self.fc(x.float() if self.fp16 else x)
164
+ x = self.features(x)
165
+ return x
166
+
167
+
168
+ def _iresnet(arch, block, layers, pretrained, progress, **kwargs):
169
+ model = IResNet(block, layers, **kwargs)
170
+ if pretrained:
171
+ raise ValueError()
172
+ return model
173
+
174
+
175
+ def iresnet2060(pretrained=False, progress=True, **kwargs):
176
+ return _iresnet('iresnet2060', IBasicBlock, [3, 128, 1024 - 128, 3], pretrained, progress, **kwargs)
src/face3d/models/arcface_torch/backbones/mobilefacenet.py ADDED
@@ -0,0 +1,130 @@
1
+ '''
2
+ Adapted from https://github.com/cavalleria/cavaface.pytorch/blob/master/backbone/mobilefacenet.py
3
+ Original author cavalleria
4
+ '''
5
+
6
+ import torch.nn as nn
7
+ from torch.nn import Linear, Conv2d, BatchNorm1d, BatchNorm2d, PReLU, Sequential, Module
8
+ import torch
9
+
10
+
11
+ class Flatten(Module):
12
+ def forward(self, x):
13
+ return x.view(x.size(0), -1)
14
+
15
+
16
+ class ConvBlock(Module):
17
+ def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1):
18
+ super(ConvBlock, self).__init__()
19
+ self.layers = nn.Sequential(
20
+ Conv2d(in_c, out_c, kernel, groups=groups, stride=stride, padding=padding, bias=False),
21
+ BatchNorm2d(num_features=out_c),
22
+ PReLU(num_parameters=out_c)
23
+ )
24
+
25
+ def forward(self, x):
26
+ return self.layers(x)
27
+
28
+
29
+ class LinearBlock(Module):
30
+ def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1):
31
+ super(LinearBlock, self).__init__()
32
+ self.layers = nn.Sequential(
33
+ Conv2d(in_c, out_c, kernel, stride, padding, groups=groups, bias=False),
34
+ BatchNorm2d(num_features=out_c)
35
+ )
36
+
37
+ def forward(self, x):
38
+ return self.layers(x)
39
+
40
+
41
+ class DepthWise(Module):
42
+ def __init__(self, in_c, out_c, residual=False, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=1):
43
+ super(DepthWise, self).__init__()
44
+ self.residual = residual
45
+ self.layers = nn.Sequential(
46
+ ConvBlock(in_c, out_c=groups, kernel=(1, 1), padding=(0, 0), stride=(1, 1)),
47
+ ConvBlock(groups, groups, groups=groups, kernel=kernel, padding=padding, stride=stride),
48
+ LinearBlock(groups, out_c, kernel=(1, 1), padding=(0, 0), stride=(1, 1))
49
+ )
50
+
51
+ def forward(self, x):
52
+ short_cut = None
53
+ if self.residual:
54
+ short_cut = x
55
+ x = self.layers(x)
56
+ if self.residual:
57
+ output = short_cut + x
58
+ else:
59
+ output = x
60
+ return output
61
+
62
+
63
+ class Residual(Module):
64
+ def __init__(self, c, num_block, groups, kernel=(3, 3), stride=(1, 1), padding=(1, 1)):
65
+ super(Residual, self).__init__()
66
+ modules = []
67
+ for _ in range(num_block):
68
+ modules.append(DepthWise(c, c, True, kernel, stride, padding, groups))
69
+ self.layers = Sequential(*modules)
70
+
71
+ def forward(self, x):
72
+ return self.layers(x)
73
+
74
+
75
+ class GDC(Module):
76
+ def __init__(self, embedding_size):
77
+ super(GDC, self).__init__()
78
+ self.layers = nn.Sequential(
79
+ LinearBlock(512, 512, groups=512, kernel=(7, 7), stride=(1, 1), padding=(0, 0)),
80
+ Flatten(),
81
+ Linear(512, embedding_size, bias=False),
82
+ BatchNorm1d(embedding_size))
83
+
84
+ def forward(self, x):
85
+ return self.layers(x)
86
+
87
+
88
+ class MobileFaceNet(Module):
89
+ def __init__(self, fp16=False, num_features=512):
90
+ super(MobileFaceNet, self).__init__()
91
+ scale = 2
92
+ self.fp16 = fp16
93
+ self.layers = nn.Sequential(
94
+ ConvBlock(3, 64 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1)),
95
+ ConvBlock(64 * scale, 64 * scale, kernel=(3, 3), stride=(1, 1), padding=(1, 1), groups=64),
96
+ DepthWise(64 * scale, 64 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=128),
97
+ Residual(64 * scale, num_block=4, groups=128, kernel=(3, 3), stride=(1, 1), padding=(1, 1)),
98
+ DepthWise(64 * scale, 128 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=256),
99
+ Residual(128 * scale, num_block=6, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)),
100
+ DepthWise(128 * scale, 128 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=512),
101
+ Residual(128 * scale, num_block=2, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)),
102
+ )
103
+ self.conv_sep = ConvBlock(128 * scale, 512, kernel=(1, 1), stride=(1, 1), padding=(0, 0))
104
+ self.features = GDC(num_features)
105
+ self._initialize_weights()
106
+
107
+ def _initialize_weights(self):
108
+ for m in self.modules():
109
+ if isinstance(m, nn.Conv2d):
110
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
111
+ if m.bias is not None:
112
+ m.bias.data.zero_()
113
+ elif isinstance(m, nn.BatchNorm2d):
114
+ m.weight.data.fill_(1)
115
+ m.bias.data.zero_()
116
+ elif isinstance(m, nn.Linear):
117
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
118
+ if m.bias is not None:
119
+ m.bias.data.zero_()
120
+
121
+ def forward(self, x):
122
+ with torch.cuda.amp.autocast(self.fp16):
123
+ x = self.layers(x)
124
+ x = self.conv_sep(x.float() if self.fp16 else x)
125
+ x = self.features(x)
126
+ return x
127
+
128
+
129
+ def get_mbf(fp16, num_features):
130
+ return MobileFaceNet(fp16, num_features)
src/face3d/models/arcface_torch/configs/3millions.py ADDED
@@ -0,0 +1,23 @@
1
+ from easydict import EasyDict as edict
2
+
3
+ # configs for test speed
4
+
5
+ config = edict()
6
+ config.loss = "arcface"
7
+ config.network = "r50"
8
+ config.resume = False
9
+ config.output = None
10
+ config.embedding_size = 512
11
+ config.sample_rate = 1.0
12
+ config.fp16 = True
13
+ config.momentum = 0.9
14
+ config.weight_decay = 5e-4
15
+ config.batch_size = 128
16
+ config.lr = 0.1 # batch size is 512
17
+
18
+ config.rec = "synthetic"
19
+ config.num_classes = 300 * 10000
20
+ config.num_epoch = 30
21
+ config.warmup_epoch = -1
22
+ config.decay_epoch = [10, 16, 22]
23
+ config.val_targets = []
src/face3d/models/arcface_torch/configs/3millions_pfc.py ADDED
@@ -0,0 +1,23 @@
1
+ from easydict import EasyDict as edict
2
+
3
+ # configs for test speed
4
+
5
+ config = edict()
6
+ config.loss = "arcface"
7
+ config.network = "r50"
8
+ config.resume = False
9
+ config.output = None
10
+ config.embedding_size = 512
11
+ config.sample_rate = 0.1
12
+ config.fp16 = True
13
+ config.momentum = 0.9
14
+ config.weight_decay = 5e-4
15
+ config.batch_size = 128
16
+ config.lr = 0.1 # batch size is 512
17
+
18
+ config.rec = "synthetic"
19
+ config.num_classes = 300 * 10000
20
+ config.num_epoch = 30
21
+ config.warmup_epoch = -1
22
+ config.decay_epoch = [10, 16, 22]
23
+ config.val_targets = []
src/face3d/models/arcface_torch/configs/__init__.py ADDED
File without changes
src/face3d/models/arcface_torch/configs/base.py ADDED
@@ -0,0 +1,56 @@
1
+ from easydict import EasyDict as edict
2
+
3
+ # make training faster
4
+ # our RAM is 256G
5
+ # mount -t tmpfs -o size=140G tmpfs /train_tmp
6
+
7
+ config = edict()
8
+ config.loss = "arcface"
9
+ config.network = "r50"
10
+ config.resume = False
11
+ config.output = "ms1mv3_arcface_r50"
12
+
13
+ config.dataset = "ms1m-retinaface-t1"
14
+ config.embedding_size = 512
15
+ config.sample_rate = 1
16
+ config.fp16 = False
17
+ config.momentum = 0.9
18
+ config.weight_decay = 5e-4
19
+ config.batch_size = 128
20
+ config.lr = 0.1 # batch size is 512
21
+
22
+ if config.dataset == "emore":
23
+ config.rec = "/train_tmp/faces_emore"
24
+ config.num_classes = 85742
25
+ config.num_image = 5822653
26
+ config.num_epoch = 16
27
+ config.warmup_epoch = -1
28
+ config.decay_epoch = [8, 14, ]
29
+ config.val_targets = ["lfw", ]
30
+
31
+ elif config.dataset == "ms1m-retinaface-t1":
32
+ config.rec = "/train_tmp/ms1m-retinaface-t1"
33
+ config.num_classes = 93431
34
+ config.num_image = 5179510
35
+ config.num_epoch = 25
36
+ config.warmup_epoch = -1
37
+ config.decay_epoch = [11, 17, 22]
38
+ config.val_targets = ["lfw", "cfp_fp", "agedb_30"]
39
+
40
+ elif config.dataset == "glint360k":
41
+ config.rec = "/train_tmp/glint360k"
42
+ config.num_classes = 360232
43
+ config.num_image = 17091657
44
+ config.num_epoch = 20
45
+ config.warmup_epoch = -1
46
+ config.decay_epoch = [8, 12, 15, 18]
47
+ config.val_targets = ["lfw", "cfp_fp", "agedb_30"]
48
+
49
+ elif config.dataset == "webface":
50
+ config.rec = "/train_tmp/faces_webface_112x112"
51
+ config.num_classes = 10572
52
+ config.num_image = "forget"
53
+ config.num_epoch = 34
54
+ config.warmup_epoch = -1
55
+ config.decay_epoch = [20, 28, 32]
56
+ config.val_targets = ["lfw", "cfp_fp", "agedb_30"]
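
Each config file simply exposes an `edict` named `config`. Below is a hedged sketch of how one might load such a config module by name; the real `train.py` may use a different mechanism, and the example assumes it is run from the `arcface_torch` directory.

```python
# Hypothetical loader: import configs/glint360k_r100.py by name and read
# a few of the fields defined in these config files.
import importlib

def load_config(name):
    module = importlib.import_module(f'configs.{name}')
    return module.config

cfg = load_config('glint360k_r100')
print(cfg.network, cfg.loss, cfg.batch_size)  # r100 cosface 128
```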
src/face3d/models/arcface_torch/configs/glint360k_mbf.py ADDED
@@ -0,0 +1,26 @@
1
+ from easydict import EasyDict as edict
2
+
3
+ # make training faster
4
+ # our RAM is 256G
5
+ # mount -t tmpfs -o size=140G tmpfs /train_tmp
6
+
7
+ config = edict()
8
+ config.loss = "cosface"
9
+ config.network = "mbf"
10
+ config.resume = False
11
+ config.output = None
12
+ config.embedding_size = 512
13
+ config.sample_rate = 0.1
14
+ config.fp16 = True
15
+ config.momentum = 0.9
16
+ config.weight_decay = 2e-4
17
+ config.batch_size = 128
18
+ config.lr = 0.1 # batch size is 512
19
+
20
+ config.rec = "/train_tmp/glint360k"
21
+ config.num_classes = 360232
22
+ config.num_image = 17091657
23
+ config.num_epoch = 20
24
+ config.warmup_epoch = -1
25
+ config.decay_epoch = [8, 12, 15, 18]
26
+ config.val_targets = ["lfw", "cfp_fp", "agedb_30"]
src/face3d/models/arcface_torch/configs/glint360k_r100.py ADDED
@@ -0,0 +1,26 @@
1
+ from easydict import EasyDict as edict
2
+
3
+ # make training faster
4
+ # our RAM is 256G
5
+ # mount -t tmpfs -o size=140G tmpfs /train_tmp
6
+
7
+ config = edict()
8
+ config.loss = "cosface"
9
+ config.network = "r100"
10
+ config.resume = False
11
+ config.output = None
12
+ config.embedding_size = 512
13
+ config.sample_rate = 1.0
14
+ config.fp16 = True
15
+ config.momentum = 0.9
16
+ config.weight_decay = 5e-4
17
+ config.batch_size = 128
18
+ config.lr = 0.1 # batch size is 512
19
+
20
+ config.rec = "/train_tmp/glint360k"
21
+ config.num_classes = 360232
22
+ config.num_image = 17091657
23
+ config.num_epoch = 20
24
+ config.warmup_epoch = -1
25
+ config.decay_epoch = [8, 12, 15, 18]
26
+ config.val_targets = ["lfw", "cfp_fp", "agedb_30"]
src/face3d/models/arcface_torch/configs/glint360k_r18.py ADDED
@@ -0,0 +1,26 @@
1
+ from easydict import EasyDict as edict
2
+
3
+ # make training faster
4
+ # our RAM is 256G
5
+ # mount -t tmpfs -o size=140G tmpfs /train_tmp
6
+
7
+ config = edict()
8
+ config.loss = "cosface"
9
+ config.network = "r18"
10
+ config.resume = False
11
+ config.output = None
12
+ config.embedding_size = 512
13
+ config.sample_rate = 1.0
14
+ config.fp16 = True
15
+ config.momentum = 0.9
16
+ config.weight_decay = 5e-4
17
+ config.batch_size = 128
18
+ config.lr = 0.1 # batch size is 512
19
+
20
+ config.rec = "/train_tmp/glint360k"
21
+ config.num_classes = 360232
22
+ config.num_image = 17091657
23
+ config.num_epoch = 20
24
+ config.warmup_epoch = -1
25
+ config.decay_epoch = [8, 12, 15, 18]
26
+ config.val_targets = ["lfw", "cfp_fp", "agedb_30"]
src/face3d/models/arcface_torch/configs/glint360k_r34.py ADDED
@@ -0,0 +1,26 @@
1
+ from easydict import EasyDict as edict
2
+
3
+ # make training faster
4
+ # our RAM is 256G
5
+ # mount -t tmpfs -o size=140G tmpfs /train_tmp
6
+
7
+ config = edict()
8
+ config.loss = "cosface"
9
+ config.network = "r34"
10
+ config.resume = False
11
+ config.output = None
12
+ config.embedding_size = 512
13
+ config.sample_rate = 1.0
14
+ config.fp16 = True
15
+ config.momentum = 0.9
16
+ config.weight_decay = 5e-4
17
+ config.batch_size = 128
18
+ config.lr = 0.1 # batch size is 512
19
+
20
+ config.rec = "/train_tmp/glint360k"
21
+ config.num_classes = 360232
22
+ config.num_image = 17091657
23
+ config.num_epoch = 20
24
+ config.warmup_epoch = -1
25
+ config.decay_epoch = [8, 12, 15, 18]
26
+ config.val_targets = ["lfw", "cfp_fp", "agedb_30"]
src/face3d/models/arcface_torch/configs/glint360k_r50.py ADDED
@@ -0,0 +1,26 @@
1
+ from easydict import EasyDict as edict
2
+
3
+ # make training faster
4
+ # our RAM is 256G
5
+ # mount -t tmpfs -o size=140G tmpfs /train_tmp
6
+
7
+ config = edict()
8
+ config.loss = "cosface"
9
+ config.network = "r50"
10
+ config.resume = False
11
+ config.output = None
12
+ config.embedding_size = 512
13
+ config.sample_rate = 1.0
14
+ config.fp16 = True
15
+ config.momentum = 0.9
16
+ config.weight_decay = 5e-4
17
+ config.batch_size = 128
18
+ config.lr = 0.1 # batch size is 512
19
+
20
+ config.rec = "/train_tmp/glint360k"
21
+ config.num_classes = 360232
22
+ config.num_image = 17091657
23
+ config.num_epoch = 20
24
+ config.warmup_epoch = -1
25
+ config.decay_epoch = [8, 12, 15, 18]
26
+ config.val_targets = ["lfw", "cfp_fp", "agedb_30"]
src/face3d/models/arcface_torch/configs/ms1mv3_mbf.py ADDED
@@ -0,0 +1,26 @@
1
+ from easydict import EasyDict as edict
2
+
3
+ # make training faster
4
+ # our RAM is 256G
5
+ # mount -t tmpfs -o size=140G tmpfs /train_tmp
6
+
7
+ config = edict()
8
+ config.loss = "arcface"
9
+ config.network = "mbf"
10
+ config.resume = False
11
+ config.output = None
12
+ config.embedding_size = 512
13
+ config.sample_rate = 1.0
14
+ config.fp16 = True
15
+ config.momentum = 0.9
16
+ config.weight_decay = 2e-4
17
+ config.batch_size = 128
18
+ config.lr = 0.1 # batch size is 512
19
+
20
+ config.rec = "/train_tmp/ms1m-retinaface-t1"
21
+ config.num_classes = 93431
22
+ config.num_image = 5179510
23
+ config.num_epoch = 30
24
+ config.warmup_epoch = -1
25
+ config.decay_epoch = [10, 20, 25]
26
+ config.val_targets = ["lfw", "cfp_fp", "agedb_30"]
src/face3d/models/arcface_torch/configs/ms1mv3_r18.py ADDED
@@ -0,0 +1,26 @@
1
+ from easydict import EasyDict as edict
2
+
3
+ # make training faster
4
+ # our RAM is 256G
5
+ # mount -t tmpfs -o size=140G tmpfs /train_tmp
6
+
7
+ config = edict()
8
+ config.loss = "arcface"
9
+ config.network = "r18"
10
+ config.resume = False
11
+ config.output = None
12
+ config.embedding_size = 512
13
+ config.sample_rate = 1.0
14
+ config.fp16 = True
15
+ config.momentum = 0.9
16
+ config.weight_decay = 5e-4
17
+ config.batch_size = 128
18
+ config.lr = 0.1 # batch size is 512
19
+
20
+ config.rec = "/train_tmp/ms1m-retinaface-t1"
21
+ config.num_classes = 93431
22
+ config.num_image = 5179510
23
+ config.num_epoch = 25
24
+ config.warmup_epoch = -1
25
+ config.decay_epoch = [10, 16, 22]
26
+ config.val_targets = ["lfw", "cfp_fp", "agedb_30"]
src/face3d/models/arcface_torch/configs/ms1mv3_r2060.py ADDED
@@ -0,0 +1,26 @@
1
+ from easydict import EasyDict as edict
2
+
3
+ # make training faster
4
+ # our RAM is 256G
5
+ # mount -t tmpfs -o size=140G tmpfs /train_tmp
6
+
7
+ config = edict()
8
+ config.loss = "arcface"
9
+ config.network = "r2060"
10
+ config.resume = False
11
+ config.output = None
12
+ config.embedding_size = 512
13
+ config.sample_rate = 1.0
14
+ config.fp16 = True
15
+ config.momentum = 0.9
16
+ config.weight_decay = 5e-4
17
+ config.batch_size = 64
18
+ config.lr = 0.1 # batch size is 512
19
+
20
+ config.rec = "/train_tmp/ms1m-retinaface-t1"
21
+ config.num_classes = 93431
22
+ config.num_image = 5179510
23
+ config.num_epoch = 25
24
+ config.warmup_epoch = -1
25
+ config.decay_epoch = [10, 16, 22]
26
+ config.val_targets = ["lfw", "cfp_fp", "agedb_30"]
src/face3d/models/arcface_torch/configs/ms1mv3_r34.py ADDED
@@ -0,0 +1,26 @@
1
+ from easydict import EasyDict as edict
2
+
3
+ # make training faster
4
+ # our RAM is 256G
5
+ # mount -t tmpfs -o size=140G tmpfs /train_tmp
6
+
7
+ config = edict()
8
+ config.loss = "arcface"
9
+ config.network = "r34"
10
+ config.resume = False
11
+ config.output = None
12
+ config.embedding_size = 512
13
+ config.sample_rate = 1.0
14
+ config.fp16 = True
15
+ config.momentum = 0.9
16
+ config.weight_decay = 5e-4
17
+ config.batch_size = 128
18
+ config.lr = 0.1 # batch size is 512
19
+
20
+ config.rec = "/train_tmp/ms1m-retinaface-t1"
21
+ config.num_classes = 93431
22
+ config.num_image = 5179510
23
+ config.num_epoch = 25
24
+ config.warmup_epoch = -1
25
+ config.decay_epoch = [10, 16, 22]
26
+ config.val_targets = ["lfw", "cfp_fp", "agedb_30"]
src/face3d/models/arcface_torch/configs/ms1mv3_r50.py ADDED
@@ -0,0 +1,26 @@
1
+ from easydict import EasyDict as edict
2
+
3
+ # make training faster
4
+ # our RAM is 256G
5
+ # mount -t tmpfs -o size=140G tmpfs /train_tmp
6
+
7
+ config = edict()
8
+ config.loss = "arcface"
9
+ config.network = "r50"
10
+ config.resume = False
11
+ config.output = None
12
+ config.embedding_size = 512
13
+ config.sample_rate = 1.0
14
+ config.fp16 = True
15
+ config.momentum = 0.9
16
+ config.weight_decay = 5e-4
17
+ config.batch_size = 128
18
+ config.lr = 0.1 # batch size is 512
19
+
20
+ config.rec = "/train_tmp/ms1m-retinaface-t1"
21
+ config.num_classes = 93431
22
+ config.num_image = 5179510
23
+ config.num_epoch = 25
24
+ config.warmup_epoch = -1
25
+ config.decay_epoch = [10, 16, 22]
26
+ config.val_targets = ["lfw", "cfp_fp", "agedb_30"]
src/face3d/models/arcface_torch/configs/speed.py ADDED
@@ -0,0 +1,23 @@
1
+ from easydict import EasyDict as edict
2
+
3
+ # configs for test speed
4
+
5
+ config = edict()
6
+ config.loss = "arcface"
7
+ config.network = "r50"
8
+ config.resume = False
9
+ config.output = None
10
+ config.embedding_size = 512
11
+ config.sample_rate = 1.0
12
+ config.fp16 = True
13
+ config.momentum = 0.9
14
+ config.weight_decay = 5e-4
15
+ config.batch_size = 128
16
+ config.lr = 0.1 # batch size is 512
17
+
18
+ config.rec = "synthetic"
19
+ config.num_classes = 100 * 10000
20
+ config.num_epoch = 30
21
+ config.warmup_epoch = -1
22
+ config.decay_epoch = [10, 16, 22]
23
+ config.val_targets = []
src/face3d/models/arcface_torch/dataset.py ADDED
@@ -0,0 +1,124 @@
1
+ import numbers
2
+ import os
3
+ import queue as Queue
4
+ import threading
5
+
6
+ import mxnet as mx
7
+ import numpy as np
8
+ import torch
9
+ from torch.utils.data import DataLoader, Dataset
10
+ from torchvision import transforms
11
+
12
+
13
+ class BackgroundGenerator(threading.Thread):
14
+ def __init__(self, generator, local_rank, max_prefetch=6):
15
+ super(BackgroundGenerator, self).__init__()
16
+ self.queue = Queue.Queue(max_prefetch)
17
+ self.generator = generator
18
+ self.local_rank = local_rank
19
+ self.daemon = True
20
+ self.start()
21
+
22
+ def run(self):
23
+ torch.cuda.set_device(self.local_rank)
24
+ for item in self.generator:
25
+ self.queue.put(item)
26
+ self.queue.put(None)
27
+
28
+ def next(self):
29
+ next_item = self.queue.get()
30
+ if next_item is None:
31
+ raise StopIteration
32
+ return next_item
33
+
34
+ def __next__(self):
35
+ return self.next()
36
+
37
+ def __iter__(self):
38
+ return self
39
+
40
+
41
+ class DataLoaderX(DataLoader):
42
+
43
+ def __init__(self, local_rank, **kwargs):
44
+ super(DataLoaderX, self).__init__(**kwargs)
45
+ self.stream = torch.cuda.Stream(local_rank)
46
+ self.local_rank = local_rank
47
+
48
+ def __iter__(self):
49
+ self.iter = super(DataLoaderX, self).__iter__()
50
+ self.iter = BackgroundGenerator(self.iter, self.local_rank)
51
+ self.preload()
52
+ return self
53
+
54
+ def preload(self):
55
+ self.batch = next(self.iter, None)
56
+ if self.batch is None:
57
+ return None
58
+ with torch.cuda.stream(self.stream):
59
+ for k in range(len(self.batch)):
60
+ self.batch[k] = self.batch[k].to(device=self.local_rank, non_blocking=True)
61
+
62
+ def __next__(self):
63
+ torch.cuda.current_stream().wait_stream(self.stream)
64
+ batch = self.batch
65
+ if batch is None:
66
+ raise StopIteration
67
+ self.preload()
68
+ return batch
69
+
70
+
71
+ class MXFaceDataset(Dataset):
72
+ def __init__(self, root_dir, local_rank):
73
+ super(MXFaceDataset, self).__init__()
74
+ self.transform = transforms.Compose(
75
+ [transforms.ToPILImage(),
76
+ transforms.RandomHorizontalFlip(),
77
+ transforms.ToTensor(),
78
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
79
+ ])
80
+ self.root_dir = root_dir
81
+ self.local_rank = local_rank
82
+ path_imgrec = os.path.join(root_dir, 'train.rec')
83
+ path_imgidx = os.path.join(root_dir, 'train.idx')
84
+ self.imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r')
85
+ s = self.imgrec.read_idx(0)
86
+ header, _ = mx.recordio.unpack(s)
87
+ if header.flag > 0:
88
+ self.header0 = (int(header.label[0]), int(header.label[1]))
89
+ self.imgidx = np.array(range(1, int(header.label[0])))
90
+ else:
91
+ self.imgidx = np.array(list(self.imgrec.keys))
92
+
93
+ def __getitem__(self, index):
94
+ idx = self.imgidx[index]
95
+ s = self.imgrec.read_idx(idx)
96
+ header, img = mx.recordio.unpack(s)
97
+ label = header.label
98
+ if not isinstance(label, numbers.Number):
99
+ label = label[0]
100
+ label = torch.tensor(label, dtype=torch.long)
101
+ sample = mx.image.imdecode(img).asnumpy()
102
+ if self.transform is not None:
103
+ sample = self.transform(sample)
104
+ return sample, label
105
+
106
+ def __len__(self):
107
+ return len(self.imgidx)
108
+
109
+
110
+ class SyntheticDataset(Dataset):
111
+ def __init__(self, local_rank):
112
+ super(SyntheticDataset, self).__init__()
113
+ img = np.random.randint(0, 255, size=(112, 112, 3), dtype=np.int32)
114
+ img = np.transpose(img, (2, 0, 1))
115
+ img = torch.from_numpy(img).squeeze(0).float()
116
+ img = ((img / 255) - 0.5) / 0.5
117
+ self.img = img
118
+ self.label = 1
119
+
120
+ def __getitem__(self, index):
121
+ return self.img, self.label
122
+
123
+ def __len__(self):
124
+ return 1000000
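
A minimal sketch of consuming `SyntheticDataset` with a plain `DataLoader` (no `.rec` files or GPU needed, though importing the module still requires `mxnet` to be installed, since it is imported at module level).

```python
# Smoke-test the input pipeline with synthetic 112x112 images.
from torch.utils.data import DataLoader
from src.face3d.models.arcface_torch.dataset import SyntheticDataset

dataset = SyntheticDataset(local_rank=0)
loader = DataLoader(dataset, batch_size=8, num_workers=0)
images, labels = next(iter(loader))
print(images.shape, labels.shape)  # torch.Size([8, 3, 112, 112]) torch.Size([8])
```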
src/face3d/models/arcface_torch/docs/eval.md ADDED
@@ -0,0 +1,31 @@
1
+ ## Eval on ICCV2021-MFR
2
+
3
+ coming soon.
4
+
5
+
6
+ ## Eval IJBC
7
+ You can evaluate IJB-C with PyTorch or ONNX.
8
+
9
+
10
+ 1. Eval IJBC With Onnx
11
+ ```shell
12
+ CUDA_VISIBLE_DEVICES=0 python onnx_ijbc.py --model-root ms1mv3_arcface_r50 --image-path IJB_release/IJBC --result-dir ms1mv3_arcface_r50
13
+ ```
14
+
15
+ 2. Eval IJBC With Pytorch
16
+ ```shell
17
+ CUDA_VISIBLE_DEVICES=0,1 python eval_ijbc.py \
18
+ --model-prefix ms1mv3_arcface_r50/backbone.pth \
19
+ --image-path IJB_release/IJBC \
20
+ --result-dir ms1mv3_arcface_r50 \
21
+ --batch-size 128 \
22
+ --job ms1mv3_arcface_r50 \
23
+ --target IJBC \
24
+ --network iresnet50
25
+ ```
26
+
27
+ ## Inference
28
+
29
+ ```shell
30
+ python inference.py --weight ms1mv3_arcface_r50/backbone.pth --network r50
31
+ ```
src/face3d/models/arcface_torch/docs/install.md ADDED
@@ -0,0 +1,51 @@
1
+ ## v1.8.0
2
+ ### Linux and Windows
3
+ ```shell
4
+ # CUDA 11.0
5
+ pip --default-timeout=100 install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html
6
+
7
+ # CUDA 10.2
8
+ pip --default-timeout=100 install torch==1.8.0 torchvision==0.9.0 torchaudio==0.8.0
9
+
10
+ # CPU only
11
+ pip --default-timeout=100 install torch==1.8.0+cpu torchvision==0.9.0+cpu torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html
12
+
13
+ ```
14
+
15
+
16
+ ## v1.7.1
17
+ ### Linux and Windows
18
+ ```shell
19
+ # CUDA 11.0
20
+ pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
21
+
22
+ # CUDA 10.2
23
+ pip install torch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2
24
+
25
+ # CUDA 10.1
26
+ pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
27
+
28
+ # CUDA 9.2
29
+ pip install torch==1.7.1+cu92 torchvision==0.8.2+cu92 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
30
+
31
+ # CPU only
32
+ pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
33
+ ```
34
+
35
+
36
+ ## v1.6.0
37
+
38
+ ### Linux and Windows
39
+ ```shell
40
+ # CUDA 10.2
41
+ pip install torch==1.6.0 torchvision==0.7.0
42
+
43
+ # CUDA 10.1
44
+ pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
45
+
46
+ # CUDA 9.2
47
+ pip install torch==1.6.0+cu92 torchvision==0.7.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html
48
+
49
+ # CPU only
50
+ pip install torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
51
+ ```
src/face3d/models/arcface_torch/docs/modelzoo.md ADDED
File without changes
src/face3d/models/arcface_torch/docs/speed_benchmark.md ADDED
@@ -0,0 +1,93 @@
1
+ ## Test Training Speed
2
+
3
+ - Test Commands
4
+
5
+ You need to use the following two commands to test the Partial FC training performance.
6
+ The number of identities is **3 million** (synthetic data), mixed-precision training is enabled, the backbone is ResNet-50,
7
+ and the batch size is 1024.
8
+ ```shell
9
+ # Model Parallel
10
+ python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py configs/3millions
11
+ # Partial FC 0.1
12
+ python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py configs/3millions_pfc
13
+ ```
14
+
15
+ - GPU Memory
16
+
17
+ ```
18
+ # (Model Parallel) gpustat -i
19
+ [0] Tesla V100-SXM2-32GB | 64'C, 94 % | 30338 / 32510 MB
20
+ [1] Tesla V100-SXM2-32GB | 60'C, 99 % | 28876 / 32510 MB
21
+ [2] Tesla V100-SXM2-32GB | 60'C, 99 % | 28872 / 32510 MB
22
+ [3] Tesla V100-SXM2-32GB | 69'C, 99 % | 28872 / 32510 MB
23
+ [4] Tesla V100-SXM2-32GB | 66'C, 99 % | 28888 / 32510 MB
24
+ [5] Tesla V100-SXM2-32GB | 60'C, 99 % | 28932 / 32510 MB
25
+ [6] Tesla V100-SXM2-32GB | 68'C, 100 % | 28916 / 32510 MB
26
+ [7] Tesla V100-SXM2-32GB | 65'C, 99 % | 28860 / 32510 MB
27
+
28
+ # (Partial FC 0.1) gpustat -i
29
+ [0] Tesla V100-SXM2-32GB | 60'C, 95 % | 10488 / 32510 MB │·······················
30
+ [1] Tesla V100-SXM2-32GB | 60'C, 97 % | 10344 / 32510 MB │·······················
31
+ [2] Tesla V100-SXM2-32GB | 61'C, 95 % | 10340 / 32510 MB │·······················
32
+ [3] Tesla V100-SXM2-32GB | 66'C, 95 % | 10340 / 32510 MB │·······················
33
+ [4] Tesla V100-SXM2-32GB | 65'C, 94 % | 10356 / 32510 MB │·······················
34
+ [5] Tesla V100-SXM2-32GB | 61'C, 95 % | 10400 / 32510 MB │·······················
35
+ [6] Tesla V100-SXM2-32GB | 68'C, 96 % | 10384 / 32510 MB │·······················
36
+ [7] Tesla V100-SXM2-32GB | 64'C, 95 % | 10328 / 32510 MB │·······················
37
+ ```
38
+
39
+ - Training Speed
40
+
41
+ ```python
42
+ # (Model Parallel) training.log
43
+ Training: Speed 2271.33 samples/sec Loss 1.1624 LearningRate 0.2000 Epoch: 0 Global Step: 100
44
+ Training: Speed 2269.94 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 150
45
+ Training: Speed 2272.67 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 200
46
+ Training: Speed 2266.55 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 250
47
+ Training: Speed 2272.54 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 300
48
+
49
+ # (Partial FC 0.1) trainging.log
50
+ Training: Speed 5299.56 samples/sec Loss 1.0965 LearningRate 0.2000 Epoch: 0 Global Step: 100
51
+ Training: Speed 5296.37 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 150
52
+ Training: Speed 5304.37 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 200
53
+ Training: Speed 5274.43 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 250
54
+ Training: Speed 5300.10 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 300
55
+ ```
56
+
57
+ In this test case, Partial FC 0.1 uses only about 1/3 of the GPU memory of model parallelism,
58
+ and trains more than twice as fast (roughly 5300 vs. 2270 samples/sec); a toy sketch of the sampling idea behind this saving follows.
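The saving comes from each GPU holding and updating only a sampled subset of the class centers (all positives in the batch plus a random fraction of negatives). Below is a toy, self-contained sketch of that sampling idea, not the repository's `partial_fc` implementation; all names and sizes are illustrative.

```python
# Toy sketch of Partial FC-style sampling: keep the batch's positive class
# centers plus random negatives, and build logits only against that subset.
import torch

def sample_partial_centers(weight, labels, sample_rate=0.1):
    num_classes = weight.shape[0]
    num_sample = max(int(num_classes * sample_rate), labels.numel())
    positive = torch.unique(labels)
    score = torch.rand(num_classes, device=weight.device)
    score[positive] = 2.0                              # force positives into the sample
    index = torch.topk(score, k=num_sample)[1].sort()[0]
    sub_labels = torch.searchsorted(index, labels)     # remap labels into the sub-space
    return weight[index], sub_labels

W = torch.randn(100_000, 512)                          # 100k identities, 512-d centers
labels = torch.randint(0, 100_000, (128,))
sub_W, sub_labels = sample_partial_centers(W, labels)
print(sub_W.shape)                                     # ~ (10000, 512): a 10x smaller FC
```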
59
+
60
+
61
+ ## Speed Benchmark
62
+
63
+ 1. Training speed of different parallel methods (samples/second), Tesla V100 32GB * 8. (Larger is better)
64
+
65
+ | Number of Identities in Dataset | Data Parallel | Model Parallel | Partial FC 0.1 |
66
+ | :--- | :--- | :--- | :--- |
67
+ |125000 | 4681 | 4824 | 5004 |
68
+ |250000 | 4047 | 4521 | 4976 |
69
+ |500000 | 3087 | 4013 | 4900 |
70
+ |1000000 | 2090 | 3449 | 4803 |
71
+ |1400000 | 1672 | 3043 | 4738 |
72
+ |2000000 | - | 2593 | 4626 |
73
+ |4000000 | - | 1748 | 4208 |
74
+ |5500000 | - | 1389 | 3975 |
75
+ |8000000 | - | - | 3565 |
76
+ |16000000 | - | - | 2679 |
77
+ |29000000 | - | - | 1855 |
78
+
79
+ 2. GPU memory cost of different parallel methods (MB per GPU), Tesla V100 32GB * 8. (Smaller is better)
80
+
81
+ | Number of Identities in Dataset | Data Parallel | Model Parallel | Partial FC 0.1 |
82
+ | :--- | :--- | :--- | :--- |
83
+ |125000 | 7358 | 5306 | 4868 |
84
+ |250000 | 9940 | 5826 | 5004 |
85
+ |500000 | 14220 | 7114 | 5202 |
86
+ |1000000 | 23708 | 9966 | 5620 |
87
+ |1400000 | 32252 | 11178 | 6056 |
88
+ |2000000 | - | 13978 | 6472 |
89
+ |4000000 | - | 23238 | 8284 |
90
+ |5500000 | - | 32188 | 9854 |
91
+ |8000000 | - | - | 12310 |
92
+ |16000000 | - | - | 19950 |
93
+ |29000000 | - | - | 32324 |
src/face3d/models/arcface_torch/eval/__init__.py ADDED
File without changes
src/face3d/models/arcface_torch/eval/verification.py ADDED
@@ -0,0 +1,407 @@
1
+ """Helper for evaluation on the Labeled Faces in the Wild dataset
2
+ """
3
+
4
+ # MIT License
5
+ #
6
+ # Copyright (c) 2016 David Sandberg
7
+ #
8
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ # of this software and associated documentation files (the "Software"), to deal
10
+ # in the Software without restriction, including without limitation the rights
11
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ # copies of the Software, and to permit persons to whom the Software is
13
+ # furnished to do so, subject to the following conditions:
14
+ #
15
+ # The above copyright notice and this permission notice shall be included in all
16
+ # copies or substantial portions of the Software.
17
+ #
18
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24
+ # SOFTWARE.
25
+
26
+
27
+ import datetime
28
+ import os
29
+ import pickle
30
+
31
+ import mxnet as mx
32
+ import numpy as np
33
+ import sklearn
34
+ import torch
35
+ from mxnet import ndarray as nd
36
+ from scipy import interpolate
37
+ from sklearn.decomposition import PCA
38
+ from sklearn.model_selection import KFold
39
+
40
+
41
+ class LFold:
42
+ def __init__(self, n_splits=2, shuffle=False):
43
+ self.n_splits = n_splits
44
+ if self.n_splits > 1:
45
+ self.k_fold = KFold(n_splits=n_splits, shuffle=shuffle)
46
+
47
+ def split(self, indices):
48
+ if self.n_splits > 1:
49
+ return self.k_fold.split(indices)
50
+ else:
51
+ return [(indices, indices)]
52
+
53
+
54
+ def calculate_roc(thresholds,
55
+ embeddings1,
56
+ embeddings2,
57
+ actual_issame,
58
+ nrof_folds=10,
59
+ pca=0):
60
+ assert (embeddings1.shape[0] == embeddings2.shape[0])
61
+ assert (embeddings1.shape[1] == embeddings2.shape[1])
62
+ nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
63
+ nrof_thresholds = len(thresholds)
64
+ k_fold = LFold(n_splits=nrof_folds, shuffle=False)
65
+
66
+ tprs = np.zeros((nrof_folds, nrof_thresholds))
67
+ fprs = np.zeros((nrof_folds, nrof_thresholds))
68
+ accuracy = np.zeros((nrof_folds))
69
+ indices = np.arange(nrof_pairs)
70
+
71
+ if pca == 0:
72
+ diff = np.subtract(embeddings1, embeddings2)
73
+ dist = np.sum(np.square(diff), 1)
74
+
75
+ for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
76
+ if pca > 0:
77
+ print('doing pca on', fold_idx)
78
+ embed1_train = embeddings1[train_set]
79
+ embed2_train = embeddings2[train_set]
80
+ _embed_train = np.concatenate((embed1_train, embed2_train), axis=0)
81
+ pca_model = PCA(n_components=pca)
82
+ pca_model.fit(_embed_train)
83
+ embed1 = pca_model.transform(embeddings1)
84
+ embed2 = pca_model.transform(embeddings2)
85
+ embed1 = sklearn.preprocessing.normalize(embed1)
86
+ embed2 = sklearn.preprocessing.normalize(embed2)
87
+ diff = np.subtract(embed1, embed2)
88
+ dist = np.sum(np.square(diff), 1)
89
+
90
+ # Find the best threshold for the fold
91
+ acc_train = np.zeros((nrof_thresholds))
92
+ for threshold_idx, threshold in enumerate(thresholds):
93
+ _, _, acc_train[threshold_idx] = calculate_accuracy(
94
+ threshold, dist[train_set], actual_issame[train_set])
95
+ best_threshold_index = np.argmax(acc_train)
96
+ for threshold_idx, threshold in enumerate(thresholds):
97
+ tprs[fold_idx, threshold_idx], fprs[fold_idx, threshold_idx], _ = calculate_accuracy(
98
+ threshold, dist[test_set],
99
+ actual_issame[test_set])
100
+ _, _, accuracy[fold_idx] = calculate_accuracy(
101
+ thresholds[best_threshold_index], dist[test_set],
102
+ actual_issame[test_set])
103
+
104
+ tpr = np.mean(tprs, 0)
105
+ fpr = np.mean(fprs, 0)
106
+ return tpr, fpr, accuracy
107
+
108
+
109
+ def calculate_accuracy(threshold, dist, actual_issame):
110
+ predict_issame = np.less(dist, threshold)
111
+ tp = np.sum(np.logical_and(predict_issame, actual_issame))
112
+ fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
113
+ tn = np.sum(
114
+ np.logical_and(np.logical_not(predict_issame),
115
+ np.logical_not(actual_issame)))
116
+ fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame))
117
+
118
+ tpr = 0 if (tp + fn == 0) else float(tp) / float(tp + fn)
119
+ fpr = 0 if (fp + tn == 0) else float(fp) / float(fp + tn)
120
+ acc = float(tp + tn) / dist.size
121
+ return tpr, fpr, acc
122
+
123
+
124
+ def calculate_val(thresholds,
125
+ embeddings1,
126
+ embeddings2,
127
+ actual_issame,
128
+ far_target,
129
+ nrof_folds=10):
130
+ assert (embeddings1.shape[0] == embeddings2.shape[0])
131
+ assert (embeddings1.shape[1] == embeddings2.shape[1])
132
+ nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
133
+ nrof_thresholds = len(thresholds)
134
+ k_fold = LFold(n_splits=nrof_folds, shuffle=False)
135
+
136
+ val = np.zeros(nrof_folds)
137
+ far = np.zeros(nrof_folds)
138
+
139
+ diff = np.subtract(embeddings1, embeddings2)
140
+ dist = np.sum(np.square(diff), 1)
141
+ indices = np.arange(nrof_pairs)
142
+
143
+ for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
144
+
145
+ # Find the threshold that gives FAR = far_target
146
+ far_train = np.zeros(nrof_thresholds)
147
+ for threshold_idx, threshold in enumerate(thresholds):
148
+ _, far_train[threshold_idx] = calculate_val_far(
149
+ threshold, dist[train_set], actual_issame[train_set])
150
+ if np.max(far_train) >= far_target:
151
+ f = interpolate.interp1d(far_train, thresholds, kind='slinear')
152
+ threshold = f(far_target)
153
+ else:
154
+ threshold = 0.0
155
+
156
+ val[fold_idx], far[fold_idx] = calculate_val_far(
157
+ threshold, dist[test_set], actual_issame[test_set])
158
+
159
+ val_mean = np.mean(val)
160
+ far_mean = np.mean(far)
161
+ val_std = np.std(val)
162
+ return val_mean, val_std, far_mean
163
+
164
+
165
+ def calculate_val_far(threshold, dist, actual_issame):
166
+ predict_issame = np.less(dist, threshold)
167
+ true_accept = np.sum(np.logical_and(predict_issame, actual_issame))
168
+ false_accept = np.sum(
169
+ np.logical_and(predict_issame, np.logical_not(actual_issame)))
170
+ n_same = np.sum(actual_issame)
171
+ n_diff = np.sum(np.logical_not(actual_issame))
172
+ # print(true_accept, false_accept)
173
+ # print(n_same, n_diff)
174
+ val = float(true_accept) / float(n_same)
175
+ far = float(false_accept) / float(n_diff)
176
+ return val, far
177
+
178
+
179
+ def evaluate(embeddings, actual_issame, nrof_folds=10, pca=0):
180
+ # Calculate evaluation metrics
181
+ thresholds = np.arange(0, 4, 0.01)
182
+ embeddings1 = embeddings[0::2]
183
+ embeddings2 = embeddings[1::2]
184
+ tpr, fpr, accuracy = calculate_roc(thresholds,
185
+ embeddings1,
186
+ embeddings2,
187
+ np.asarray(actual_issame),
188
+ nrof_folds=nrof_folds,
189
+ pca=pca)
190
+ thresholds = np.arange(0, 4, 0.001)
191
+ val, val_std, far = calculate_val(thresholds,
192
+ embeddings1,
193
+ embeddings2,
194
+ np.asarray(actual_issame),
195
+ 1e-3,
196
+ nrof_folds=nrof_folds)
197
+ return tpr, fpr, accuracy, val, val_std, far
198
+
199
+ @torch.no_grad()
200
+ def load_bin(path, image_size):
201
+ try:
202
+ with open(path, 'rb') as f:
203
+ bins, issame_list = pickle.load(f) # py2
204
+ except UnicodeDecodeError as e:
205
+ with open(path, 'rb') as f:
206
+ bins, issame_list = pickle.load(f, encoding='bytes') # py3
207
+ data_list = []
208
+ for flip in [0, 1]:
209
+ data = torch.empty((len(issame_list) * 2, 3, image_size[0], image_size[1]))
210
+ data_list.append(data)
211
+ for idx in range(len(issame_list) * 2):
212
+ _bin = bins[idx]
213
+ img = mx.image.imdecode(_bin)
214
+ if img.shape[1] != image_size[0]:
215
+ img = mx.image.resize_short(img, image_size[0])
216
+ img = nd.transpose(img, axes=(2, 0, 1))
217
+ for flip in [0, 1]:
218
+ if flip == 1:
219
+ img = mx.ndarray.flip(data=img, axis=2)
220
+ data_list[flip][idx][:] = torch.from_numpy(img.asnumpy())
221
+ if idx % 1000 == 0:
222
+ print('loading bin', idx)
223
+ print(data_list[0].shape)
224
+ return data_list, issame_list
225
+
226
+ @torch.no_grad()
227
+ def test(data_set, backbone, batch_size, nfolds=10):
228
+ print('testing verification..')
229
+ data_list = data_set[0]
230
+ issame_list = data_set[1]
231
+ embeddings_list = []
232
+ time_consumed = 0.0
233
+ for i in range(len(data_list)):
234
+ data = data_list[i]
235
+ embeddings = None
236
+ ba = 0
237
+ while ba < data.shape[0]:
238
+ bb = min(ba + batch_size, data.shape[0])
239
+ count = bb - ba
240
+ _data = data[bb - batch_size: bb]
241
+ time0 = datetime.datetime.now()
242
+ img = ((_data / 255) - 0.5) / 0.5
243
+ net_out: torch.Tensor = backbone(img)
244
+ _embeddings = net_out.detach().cpu().numpy()
245
+ time_now = datetime.datetime.now()
246
+ diff = time_now - time0
247
+ time_consumed += diff.total_seconds()
248
+ if embeddings is None:
249
+ embeddings = np.zeros((data.shape[0], _embeddings.shape[1]))
250
+ embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :]
251
+ ba = bb
252
+ embeddings_list.append(embeddings)
253
+
254
+ _xnorm = 0.0
255
+ _xnorm_cnt = 0
256
+ for embed in embeddings_list:
257
+ for i in range(embed.shape[0]):
258
+ _em = embed[i]
259
+ _norm = np.linalg.norm(_em)
260
+ _xnorm += _norm
261
+ _xnorm_cnt += 1
262
+ _xnorm /= _xnorm_cnt
263
+
264
+ acc1 = 0.0
265
+ std1 = 0.0
266
+ embeddings = embeddings_list[0] + embeddings_list[1]
267
+ embeddings = sklearn.preprocessing.normalize(embeddings)
268
+ print(embeddings.shape)
269
+ print('infer time', time_consumed)
270
+ _, _, accuracy, val, val_std, far = evaluate(embeddings, issame_list, nrof_folds=nfolds)
271
+ acc2, std2 = np.mean(accuracy), np.std(accuracy)
272
+ return acc1, std1, acc2, std2, _xnorm, embeddings_list
273
+
274
+
275
+ def dumpR(data_set,
276
+ backbone,
277
+ batch_size,
278
+ name='',
279
+ data_extra=None,
280
+ label_shape=None):
281
+ print('dump verification embedding..')
282
+ data_list = data_set[0]
283
+ issame_list = data_set[1]
284
+ # this helper keeps the original MXNet API: `backbone` is used as an
+ # mx.mod.Module below, so define the batch label/extra-data tensors it expects
+ model = backbone
+ if data_extra is not None:
+ _data_extra = nd.array(data_extra)
+ _label = nd.ones((batch_size,)) if label_shape is None else nd.ones(label_shape)
+ embeddings_list = []
285
+ time_consumed = 0.0
286
+ for i in range(len(data_list)):
287
+ data = data_list[i]
288
+ embeddings = None
289
+ ba = 0
290
+ while ba < data.shape[0]:
291
+ bb = min(ba + batch_size, data.shape[0])
292
+ count = bb - ba
293
+
294
+ _data = nd.slice_axis(data, axis=0, begin=bb - batch_size, end=bb)
295
+ time0 = datetime.datetime.now()
296
+ if data_extra is None:
297
+ db = mx.io.DataBatch(data=(_data,), label=(_label,))
298
+ else:
299
+ db = mx.io.DataBatch(data=(_data, _data_extra),
300
+ label=(_label,))
301
+ model.forward(db, is_train=False)
302
+ net_out = model.get_outputs()
303
+ _embeddings = net_out[0].asnumpy()
304
+ time_now = datetime.datetime.now()
305
+ diff = time_now - time0
306
+ time_consumed += diff.total_seconds()
307
+ if embeddings is None:
308
+ embeddings = np.zeros((data.shape[0], _embeddings.shape[1]))
309
+ embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :]
310
+ ba = bb
311
+ embeddings_list.append(embeddings)
312
+ embeddings = embeddings_list[0] + embeddings_list[1]
313
+ embeddings = sklearn.preprocessing.normalize(embeddings)
314
+ actual_issame = np.asarray(issame_list)
315
+ outname = os.path.join('temp.bin')
316
+ with open(outname, 'wb') as f:
317
+ pickle.dump((embeddings, issame_list),
318
+ f,
319
+ protocol=pickle.HIGHEST_PROTOCOL)
320
+
321
+
322
+ # if __name__ == '__main__':
323
+ #
324
+ # parser = argparse.ArgumentParser(description='do verification')
325
+ # # general
326
+ # parser.add_argument('--data-dir', default='', help='')
327
+ # parser.add_argument('--model',
328
+ # default='../model/softmax,50',
329
+ # help='path to load model.')
330
+ # parser.add_argument('--target',
331
+ # default='lfw,cfp_ff,cfp_fp,agedb_30',
332
+ # help='test targets.')
333
+ # parser.add_argument('--gpu', default=0, type=int, help='gpu id')
334
+ # parser.add_argument('--batch-size', default=32, type=int, help='')
335
+ # parser.add_argument('--max', default='', type=str, help='')
336
+ # parser.add_argument('--mode', default=0, type=int, help='')
337
+ # parser.add_argument('--nfolds', default=10, type=int, help='')
338
+ # args = parser.parse_args()
339
+ # image_size = [112, 112]
340
+ # print('image_size', image_size)
341
+ # ctx = mx.gpu(args.gpu)
342
+ # nets = []
343
+ # vec = args.model.split(',')
344
+ # prefix = args.model.split(',')[0]
345
+ # epochs = []
346
+ # if len(vec) == 1:
347
+ # pdir = os.path.dirname(prefix)
348
+ # for fname in os.listdir(pdir):
349
+ # if not fname.endswith('.params'):
350
+ # continue
351
+ # _file = os.path.join(pdir, fname)
352
+ # if _file.startswith(prefix):
353
+ # epoch = int(fname.split('.')[0].split('-')[1])
354
+ # epochs.append(epoch)
355
+ # epochs = sorted(epochs, reverse=True)
356
+ # if len(args.max) > 0:
357
+ # _max = [int(x) for x in args.max.split(',')]
358
+ # assert len(_max) == 2
359
+ # if len(epochs) > _max[1]:
360
+ # epochs = epochs[_max[0]:_max[1]]
361
+ #
362
+ # else:
363
+ # epochs = [int(x) for x in vec[1].split('|')]
364
+ # print('model number', len(epochs))
365
+ # time0 = datetime.datetime.now()
366
+ # for epoch in epochs:
367
+ # print('loading', prefix, epoch)
368
+ # sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
369
+ # # arg_params, aux_params = ch_dev(arg_params, aux_params, ctx)
370
+ # all_layers = sym.get_internals()
371
+ # sym = all_layers['fc1_output']
372
+ # model = mx.mod.Module(symbol=sym, context=ctx, label_names=None)
373
+ # # model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))], label_shapes=[('softmax_label', (args.batch_size,))])
374
+ # model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0],
375
+ # image_size[1]))])
376
+ # model.set_params(arg_params, aux_params)
377
+ # nets.append(model)
378
+ # time_now = datetime.datetime.now()
379
+ # diff = time_now - time0
380
+ # print('model loading time', diff.total_seconds())
381
+ #
382
+ # ver_list = []
383
+ # ver_name_list = []
384
+ # for name in args.target.split(','):
385
+ # path = os.path.join(args.data_dir, name + ".bin")
386
+ # if os.path.exists(path):
387
+ # print('loading.. ', name)
388
+ # data_set = load_bin(path, image_size)
389
+ # ver_list.append(data_set)
390
+ # ver_name_list.append(name)
391
+ #
392
+ # if args.mode == 0:
393
+ # for i in range(len(ver_list)):
394
+ # results = []
395
+ # for model in nets:
396
+ # acc1, std1, acc2, std2, xnorm, embeddings_list = test(
397
+ # ver_list[i], model, args.batch_size, args.nfolds)
398
+ # print('[%s]XNorm: %f' % (ver_name_list[i], xnorm))
399
+ # print('[%s]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], acc1, std1))
400
+ # print('[%s]Accuracy-Flip: %1.5f+-%1.5f' % (ver_name_list[i], acc2, std2))
401
+ # results.append(acc2)
402
+ # print('Max of [%s] is %1.5f' % (ver_name_list[i], np.max(results)))
403
+ # elif args.mode == 1:
404
+ # raise ValueError
405
+ # else:
406
+ # model = nets[0]
407
+ # dumpR(ver_list[0], model, args.batch_size, args.target)
src/face3d/models/arcface_torch/eval_ijbc.py ADDED
@@ -0,0 +1,483 @@
1
+ # coding: utf-8
2
+
3
+ import os
4
+ import pickle
5
+
6
+ import matplotlib
7
+ import pandas as pd
8
+
9
+ matplotlib.use('Agg')
10
+ import matplotlib.pyplot as plt
11
+ import timeit
12
+ import sklearn
13
+ import argparse
14
+ import cv2
15
+ import numpy as np
16
+ import torch
17
+ from skimage import transform as trans
18
+ from backbones import get_model
19
+ from sklearn.metrics import roc_curve, auc
20
+
21
+ from menpo.visualize.viewmatplotlib import sample_colours_from_colourmap
22
+ from prettytable import PrettyTable
23
+ from pathlib import Path
24
+
25
+ import sys
26
+ import warnings
27
+
28
+ sys.path.insert(0, "../")
29
+ warnings.filterwarnings("ignore")
30
+
31
+ parser = argparse.ArgumentParser(description='do ijb test')
32
+ # general
33
+ parser.add_argument('--model-prefix', default='', help='path to load model.')
34
+ parser.add_argument('--image-path', default='', type=str, help='')
35
+ parser.add_argument('--result-dir', default='.', type=str, help='')
36
+ parser.add_argument('--batch-size', default=128, type=int, help='')
37
+ parser.add_argument('--network', default='iresnet50', type=str, help='')
38
+ parser.add_argument('--job', default='insightface', type=str, help='job name')
39
+ parser.add_argument('--target', default='IJBC', type=str, help='target, set to IJBC or IJBB')
40
+ args = parser.parse_args()
41
+
42
+ target = args.target
43
+ model_path = args.model_prefix
44
+ image_path = args.image_path
45
+ result_dir = args.result_dir
46
+ gpu_id = None
47
+ use_norm_score = True # if True, TestMode(N1)
48
+ use_detector_score = True # if True, TestMode(D1)
49
+ use_flip_test = True # if True, TestMode(F1)
50
+ job = args.job
51
+ batch_size = args.batch_size
52
+
53
+
54
+ class Embedding(object):
55
+ def __init__(self, prefix, data_shape, batch_size=1):
56
+ image_size = (112, 112)
57
+ self.image_size = image_size
58
+ weight = torch.load(prefix)
59
+ resnet = get_model(args.network, dropout=0, fp16=False).cuda()
60
+ resnet.load_state_dict(weight)
61
+ model = torch.nn.DataParallel(resnet)
62
+ self.model = model
63
+ self.model.eval()
64
+ src = np.array([
65
+ [30.2946, 51.6963],
66
+ [65.5318, 51.5014],
67
+ [48.0252, 71.7366],
68
+ [33.5493, 92.3655],
69
+ [62.7299, 92.2041]], dtype=np.float32)
70
+ src[:, 0] += 8.0
71
+ self.src = src
72
+ self.batch_size = batch_size
73
+ self.data_shape = data_shape
74
+
75
+ def get(self, rimg, landmark):
76
+
77
+ assert landmark.shape[0] == 68 or landmark.shape[0] == 5
78
+ assert landmark.shape[1] == 2
79
+ if landmark.shape[0] == 68:
80
+ landmark5 = np.zeros((5, 2), dtype=np.float32)
81
+ landmark5[0] = (landmark[36] + landmark[39]) / 2
82
+ landmark5[1] = (landmark[42] + landmark[45]) / 2
83
+ landmark5[2] = landmark[30]
84
+ landmark5[3] = landmark[48]
85
+ landmark5[4] = landmark[54]
86
+ else:
87
+ landmark5 = landmark
88
+ tform = trans.SimilarityTransform()
89
+ tform.estimate(landmark5, self.src)
90
+ M = tform.params[0:2, :]
91
+ img = cv2.warpAffine(rimg,
92
+ M, (self.image_size[1], self.image_size[0]),
93
+ borderValue=0.0)
94
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
95
+ img_flip = np.fliplr(img)
96
+ img = np.transpose(img, (2, 0, 1)) # 3*112*112, RGB
97
+ img_flip = np.transpose(img_flip, (2, 0, 1))
98
+ input_blob = np.zeros((2, 3, self.image_size[1], self.image_size[0]), dtype=np.uint8)
99
+ input_blob[0] = img
100
+ input_blob[1] = img_flip
101
+ return input_blob
102
+
103
+ @torch.no_grad()
104
+ def forward_db(self, batch_data):
105
+ imgs = torch.Tensor(batch_data).cuda()
106
+ imgs.div_(255).sub_(0.5).div_(0.5)
107
+ feat = self.model(imgs)
108
+ feat = feat.reshape([self.batch_size, 2 * feat.shape[1]])
109
+ return feat.cpu().numpy()
110
+
111
+
112
+ # Split a list as evenly as possible into n sublists (len(result) == n); if n exceeds the number of elements, the surplus sublists are empty []
113
+ def divideIntoNstrand(listTemp, n):
114
+ twoList = [[] for i in range(n)]
115
+ for i, e in enumerate(listTemp):
116
+ twoList[i % n].append(e)
117
+ return twoList
118
+
119
+
120
+ def read_template_media_list(path):
121
+ # ijb_meta = np.loadtxt(path, dtype=str)
122
+ ijb_meta = pd.read_csv(path, sep=' ', header=None).values
123
+ templates = ijb_meta[:, 1].astype(np.int)
124
+ medias = ijb_meta[:, 2].astype(np.int)
125
+ return templates, medias
126
+
127
+
128
+ # In[ ]:
129
+
130
+
131
+ def read_template_pair_list(path):
132
+ # pairs = np.loadtxt(path, dtype=str)
133
+ pairs = pd.read_csv(path, sep=' ', header=None).values
134
+ # print(pairs.shape)
135
+ # print(pairs[:, 0].astype(np.int))
136
+ t1 = pairs[:, 0].astype(np.int)
137
+ t2 = pairs[:, 1].astype(np.int)
138
+ label = pairs[:, 2].astype(np.int)
139
+ return t1, t2, label
140
+
141
+
142
+ # In[ ]:
143
+
144
+
145
+ def read_image_feature(path):
146
+ with open(path, 'rb') as fid:
147
+ img_feats = pickle.load(fid)
148
+ return img_feats
149
+
150
+
151
+ # In[ ]:
152
+
153
+
154
+ def get_image_feature(img_path, files_list, model_path, epoch, gpu_id):
155
+ batch_size = args.batch_size
156
+ data_shape = (3, 112, 112)
157
+
158
+ files = files_list
159
+ print('files:', len(files))
160
+ rare_size = len(files) % batch_size
161
+ faceness_scores = []
162
+ batch = 0
163
+ img_feats = np.empty((len(files), 1024), dtype=np.float32)
164
+
165
+ batch_data = np.empty((2 * batch_size, 3, 112, 112))
166
+ embedding = Embedding(model_path, data_shape, batch_size)
167
+ for img_index, each_line in enumerate(files[:len(files) - rare_size]):
168
+ name_lmk_score = each_line.strip().split(' ')
169
+ img_name = os.path.join(img_path, name_lmk_score[0])
170
+ img = cv2.imread(img_name)
171
+ lmk = np.array([float(x) for x in name_lmk_score[1:-1]],
172
+ dtype=np.float32)
173
+ lmk = lmk.reshape((5, 2))
174
+ input_blob = embedding.get(img, lmk)
175
+
176
+ batch_data[2 * (img_index - batch * batch_size)][:] = input_blob[0]
177
+ batch_data[2 * (img_index - batch * batch_size) + 1][:] = input_blob[1]
178
+ if (img_index + 1) % batch_size == 0:
179
+ print('batch', batch)
180
+ img_feats[batch * batch_size:batch * batch_size +
181
+ batch_size][:] = embedding.forward_db(batch_data)
182
+ batch += 1
183
+ faceness_scores.append(name_lmk_score[-1])
184
+
185
+ batch_data = np.empty((2 * rare_size, 3, 112, 112))
186
+ embedding = Embedding(model_path, data_shape, rare_size)
187
+ for img_index, each_line in enumerate(files[len(files) - rare_size:]):
188
+ name_lmk_score = each_line.strip().split(' ')
189
+ img_name = os.path.join(img_path, name_lmk_score[0])
190
+ img = cv2.imread(img_name)
191
+ lmk = np.array([float(x) for x in name_lmk_score[1:-1]],
192
+ dtype=np.float32)
193
+ lmk = lmk.reshape((5, 2))
194
+ input_blob = embedding.get(img, lmk)
195
+ batch_data[2 * img_index][:] = input_blob[0]
196
+ batch_data[2 * img_index + 1][:] = input_blob[1]
197
+ if (img_index + 1) % rare_size == 0:
198
+ print('batch', batch)
199
+ img_feats[len(files) -
200
+ rare_size:][:] = embedding.forward_db(batch_data)
201
+ batch += 1
202
+ faceness_scores.append(name_lmk_score[-1])
203
+ faceness_scores = np.array(faceness_scores).astype(np.float32)
204
+ # img_feats = np.ones( (len(files), 1024), dtype=np.float32) * 0.01
205
+ # faceness_scores = np.ones( (len(files), ), dtype=np.float32 )
206
+ return img_feats, faceness_scores
207
+
208
+
209
+ # In[ ]:
210
+
211
+
212
+ def image2template_feature(img_feats=None, templates=None, medias=None):
213
+ # ==========================================================
214
+ # 1. face image feature l2 normalization. img_feats:[number_image x feats_dim]
215
+ # 2. compute media feature.
216
+ # 3. compute template feature.
217
+ # ==========================================================
218
+ unique_templates = np.unique(templates)
219
+ template_feats = np.zeros((len(unique_templates), img_feats.shape[1]))
220
+
221
+ for count_template, uqt in enumerate(unique_templates):
222
+
223
+ (ind_t,) = np.where(templates == uqt)
224
+ face_norm_feats = img_feats[ind_t]
225
+ face_medias = medias[ind_t]
226
+ unique_medias, unique_media_counts = np.unique(face_medias,
227
+ return_counts=True)
228
+ media_norm_feats = []
229
+ for u, ct in zip(unique_medias, unique_media_counts):
230
+ (ind_m,) = np.where(face_medias == u)
231
+ if ct == 1:
232
+ media_norm_feats += [face_norm_feats[ind_m]]
233
+ else: # image features from the same video will be aggregated into one feature
234
+ media_norm_feats += [
235
+ np.mean(face_norm_feats[ind_m], axis=0, keepdims=True)
236
+ ]
237
+ media_norm_feats = np.array(media_norm_feats)
238
+ # media_norm_feats = media_norm_feats / np.sqrt(np.sum(media_norm_feats ** 2, -1, keepdims=True))
239
+ template_feats[count_template] = np.sum(media_norm_feats, axis=0)
240
+ if count_template % 2000 == 0:
241
+ print('Finish Calculating {} template features.'.format(
242
+ count_template))
243
+ # template_norm_feats = template_feats / np.sqrt(np.sum(template_feats ** 2, -1, keepdims=True))
244
+ template_norm_feats = sklearn.preprocessing.normalize(template_feats)
245
+ # print(template_norm_feats.shape)
246
+ return template_norm_feats, unique_templates
247
+
248
+
249
+ # In[ ]:
250
+
251
+
252
+ def verification(template_norm_feats=None,
253
+ unique_templates=None,
254
+ p1=None,
255
+ p2=None):
256
+ # ==========================================================
257
+ # Compute set-to-set Similarity Score.
258
+ # ==========================================================
259
+ template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int)
260
+ for count_template, uqt in enumerate(unique_templates):
261
+ template2id[uqt] = count_template
262
+
263
+ score = np.zeros((len(p1),)) # save cosine distance between pairs
264
+
265
+ total_pairs = np.array(range(len(p1)))
266
+ batchsize = 100000 # small batchsize instead of all pairs in one batch due to the memory limitation
267
+ sublists = [
268
+ total_pairs[i:i + batchsize] for i in range(0, len(p1), batchsize)
269
+ ]
270
+ total_sublists = len(sublists)
271
+ for c, s in enumerate(sublists):
272
+ feat1 = template_norm_feats[template2id[p1[s]]]
273
+ feat2 = template_norm_feats[template2id[p2[s]]]
274
+ similarity_score = np.sum(feat1 * feat2, -1)
275
+ score[s] = similarity_score.flatten()
276
+ if c % 10 == 0:
277
+ print('Finish {}/{} pairs.'.format(c, total_sublists))
278
+ return score
279
+
280
+
281
+ # In[ ]:
282
+ def verification2(template_norm_feats=None,
283
+ unique_templates=None,
284
+ p1=None,
285
+ p2=None):
286
+ template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int)
287
+ for count_template, uqt in enumerate(unique_templates):
288
+ template2id[uqt] = count_template
289
+ score = np.zeros((len(p1),)) # save cosine distance between pairs
290
+ total_pairs = np.array(range(len(p1)))
291
+ batchsize = 100000 # small batchsize instead of all pairs in one batch due to the memory limitation
292
+ sublists = [
293
+ total_pairs[i:i + batchsize] for i in range(0, len(p1), batchsize)
294
+ ]
295
+ total_sublists = len(sublists)
296
+ for c, s in enumerate(sublists):
297
+ feat1 = template_norm_feats[template2id[p1[s]]]
298
+ feat2 = template_norm_feats[template2id[p2[s]]]
299
+ similarity_score = np.sum(feat1 * feat2, -1)
300
+ score[s] = similarity_score.flatten()
301
+ if c % 10 == 0:
302
+ print('Finish {}/{} pairs.'.format(c, total_sublists))
303
+ return score
304
+
305
+
306
+ def read_score(path):
307
+ with open(path, 'rb') as fid:
308
+ img_feats = pickle.load(fid)
309
+ return img_feats
310
+
311
+
312
+ # # Step1: Load Meta Data
313
+
314
+ # In[ ]:
315
+
316
+ assert target == 'IJBC' or target == 'IJBB'
317
+
318
+ # =============================================================
319
+ # load image and template relationships for template feature embedding
320
+ # tid --> template id, mid --> media id
321
+ # format:
322
+ # image_name tid mid
323
+ # =============================================================
324
+ start = timeit.default_timer()
325
+ templates, medias = read_template_media_list(
326
+ os.path.join('%s/meta' % image_path,
327
+ '%s_face_tid_mid.txt' % target.lower()))
328
+ stop = timeit.default_timer()
329
+ print('Time: %.2f s. ' % (stop - start))
330
+
331
+ # In[ ]:
332
+
333
+ # =============================================================
334
+ # load template pairs for template-to-template verification
335
+ # tid : template id, label : 1/0
336
+ # format:
337
+ # tid_1 tid_2 label
338
+ # =============================================================
339
+ start = timeit.default_timer()
340
+ p1, p2, label = read_template_pair_list(
341
+ os.path.join('%s/meta' % image_path,
342
+ '%s_template_pair_label.txt' % target.lower()))
343
+ stop = timeit.default_timer()
344
+ print('Time: %.2f s. ' % (stop - start))
345
+
346
+ # # Step 2: Get Image Features
347
+
348
+ # In[ ]:
349
+
350
+ # =============================================================
351
+ # load image features
352
+ # format:
353
+ # img_feats: [image_num x feats_dim] (227630, 512)
354
+ # =============================================================
355
+ start = timeit.default_timer()
356
+ img_path = '%s/loose_crop' % image_path
357
+ img_list_path = '%s/meta/%s_name_5pts_score.txt' % (image_path, target.lower())
358
+ img_list = open(img_list_path)
359
+ files = img_list.readlines()
360
+ # files_list = divideIntoNstrand(files, rank_size)
361
+ files_list = files
362
+
363
+ # img_feats
364
+ # for i in range(rank_size):
365
+ img_feats, faceness_scores = get_image_feature(img_path, files_list,
366
+ model_path, 0, gpu_id)
367
+ stop = timeit.default_timer()
368
+ print('Time: %.2f s. ' % (stop - start))
369
+ print('Feature Shape: ({} , {}) .'.format(img_feats.shape[0],
370
+ img_feats.shape[1]))
371
+
372
+ # # Step3: Get Template Features
373
+
374
+ # In[ ]:
375
+
376
+ # =============================================================
377
+ # compute template features from image features.
378
+ # =============================================================
379
+ start = timeit.default_timer()
380
+ # ==========================================================
381
+ # Normalize features before aggregating them into a template feature?
382
+ # The feature norm from the embedding network and the faceness score can down-weight noisy (non-face) samples.
383
+ # ==========================================================
384
+ # 1. FaceScore (Feature Norm)
385
+ # 2. FaceScore (Detector)
386
+
387
+ if use_flip_test:
388
+ # concat --- F1
389
+ # img_input_feats = img_feats
390
+ # add --- F2
391
+ img_input_feats = img_feats[:, 0:img_feats.shape[1] //
392
+ 2] + img_feats[:, img_feats.shape[1] // 2:]
393
+ else:
394
+ img_input_feats = img_feats[:, 0:img_feats.shape[1] // 2]
395
+
396
+ if use_norm_score:
397
+ img_input_feats = img_input_feats
398
+ else:
399
+ # normalise features to remove norm information
400
+ img_input_feats = img_input_feats / np.sqrt(
401
+ np.sum(img_input_feats ** 2, -1, keepdims=True))
402
+
403
+ if use_detector_score:
404
+ print(img_input_feats.shape, faceness_scores.shape)
405
+ img_input_feats = img_input_feats * faceness_scores[:, np.newaxis]
406
+ else:
407
+ img_input_feats = img_input_feats
408
+
409
+ template_norm_feats, unique_templates = image2template_feature(
410
+ img_input_feats, templates, medias)
411
+ stop = timeit.default_timer()
412
+ print('Time: %.2f s. ' % (stop - start))
413
+
414
+ # # Step 4: Get Template Similarity Scores
415
+
416
+ # In[ ]:
417
+
418
+ # =============================================================
419
+ # compute verification scores between template pairs.
420
+ # =============================================================
421
+ start = timeit.default_timer()
422
+ score = verification(template_norm_feats, unique_templates, p1, p2)
423
+ stop = timeit.default_timer()
424
+ print('Time: %.2f s. ' % (stop - start))
425
+
426
+ # In[ ]:
427
+ save_path = os.path.join(result_dir, args.job)
428
+ # save_path = result_dir + '/%s_result' % target
429
+
430
+ if not os.path.exists(save_path):
431
+ os.makedirs(save_path)
432
+
433
+ score_save_file = os.path.join(save_path, "%s.npy" % target.lower())
434
+ np.save(score_save_file, score)
435
+
436
+ # # Step 5: Get ROC Curves and TPR@FPR Table
437
+
438
+ # In[ ]:
439
+
440
+ files = [score_save_file]
441
+ methods = []
442
+ scores = []
443
+ for file in files:
444
+ methods.append(Path(file).stem)
445
+ scores.append(np.load(file))
446
+
447
+ methods = np.array(methods)
448
+ scores = dict(zip(methods, scores))
449
+ colours = dict(
450
+ zip(methods, sample_colours_from_colourmap(methods.shape[0], 'Set2')))
451
+ x_labels = [10 ** -6, 10 ** -5, 10 ** -4, 10 ** -3, 10 ** -2, 10 ** -1]
452
+ tpr_fpr_table = PrettyTable(['Methods'] + [str(x) for x in x_labels])
453
+ fig = plt.figure()
454
+ for method in methods:
455
+ fpr, tpr, _ = roc_curve(label, scores[method])
456
+ roc_auc = auc(fpr, tpr)
457
+ fpr = np.flipud(fpr)
458
+ tpr = np.flipud(tpr) # select largest tpr at same fpr
459
+ plt.plot(fpr,
460
+ tpr,
461
+ color=colours[method],
462
+ lw=1,
463
+ label=('[%s (AUC = %0.4f %%)]' %
464
+ (method.split('-')[-1], roc_auc * 100)))
465
+ tpr_fpr_row = []
466
+ tpr_fpr_row.append("%s-%s" % (method, target))
467
+ for fpr_iter in np.arange(len(x_labels)):
468
+ _, min_index = min(
469
+ list(zip(abs(fpr - x_labels[fpr_iter]), range(len(fpr)))))
470
+ tpr_fpr_row.append('%.2f' % (tpr[min_index] * 100))
471
+ tpr_fpr_table.add_row(tpr_fpr_row)
472
+ plt.xlim([10 ** -6, 0.1])
473
+ plt.ylim([0.3, 1.0])
474
+ plt.grid(linestyle='--', linewidth=1)
475
+ plt.xticks(x_labels)
476
+ plt.yticks(np.linspace(0.3, 1.0, 8, endpoint=True))
477
+ plt.xscale('log')
478
+ plt.xlabel('False Positive Rate')
479
+ plt.ylabel('True Positive Rate')
480
+ plt.title('ROC on IJB')
481
+ plt.legend(loc="lower right")
482
+ fig.savefig(os.path.join(save_path, '%s.pdf' % target.lower()))
483
+ print(tpr_fpr_table)
src/face3d/models/arcface_torch/inference.py ADDED
@@ -0,0 +1,35 @@
1
+ import argparse
2
+
3
+ import cv2
4
+ import numpy as np
5
+ import torch
6
+
7
+ from backbones import get_model
8
+
9
+
10
+ @torch.no_grad()
11
+ def inference(weight, name, img):
12
+ if img is None:
13
+ img = np.random.randint(0, 255, size=(112, 112, 3), dtype=np.uint8)
14
+ else:
15
+ img = cv2.imread(img)
16
+ img = cv2.resize(img, (112, 112))
17
+
18
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
19
+ img = np.transpose(img, (2, 0, 1))
20
+ img = torch.from_numpy(img).unsqueeze(0).float()
21
+ img.div_(255).sub_(0.5).div_(0.5)
22
+ net = get_model(name, fp16=False)
23
+ net.load_state_dict(torch.load(weight))
24
+ net.eval()
25
+ feat = net(img).numpy()
26
+ print(feat)
27
+
28
+
29
+ if __name__ == "__main__":
30
+ parser = argparse.ArgumentParser(description='PyTorch ArcFace Training')
31
+ parser.add_argument('--network', type=str, default='r50', help='backbone network')
32
+ parser.add_argument('--weight', type=str, default='')
33
+ parser.add_argument('--img', type=str, default=None)
34
+ args = parser.parse_args()
35
+ inference(args.weight, args.network, args.img)