import argparse
import os, sys
import math
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(BASE_DIR)

import pprint
import time
import torch
import torch.nn.parallel
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.cuda import amp
import torch.distributed as dist
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import numpy as np
from lib.utils import DataLoaderX, torch_distributed_zero_first
from tensorboardX import SummaryWriter

import lib.dataset as dataset
from lib.config import cfg
from lib.config import update_config
from lib.core.loss import get_loss
from lib.core.function import train
from lib.core.function import validate
from lib.core.general import fitness
from lib.models import get_net
from lib.utils import is_parallel
from lib.utils.utils import get_optimizer
from lib.utils.utils import save_checkpoint
from lib.utils.utils import create_logger, select_device
from lib.utils import run_anchor

def parse_args():
    parser = argparse.ArgumentParser(description='Train Multitask network')

    parser.add_argument('--modelDir',
                        help='model directory',
                        type=str,
                        default='')
    parser.add_argument('--logDir',
                        help='log directory',
                        type=str,
                        default='runs/')
    parser.add_argument('--dataDir',
                        help='data directory',
                        type=str,
                        default='')
    parser.add_argument('--prevModelDir',
                        help='prev Model directory',
                        type=str,
                        default='')

    parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
    parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
    parser.add_argument('--conf-thres', type=float, default=0.001, help='object confidence threshold')
    parser.add_argument('--iou-thres', type=float, default=0.6, help='IOU threshold for NMS')
    args = parser.parse_args()

    return args

def main():
    args = parse_args()
    update_config(cfg, args)
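    # In DDP runs, WORLD_SIZE and RANK are set by the distributed launcher
    # (e.g. `python -m torch.distributed.launch --nproc_per_node=<N> <this script>`,
    # which also passes --local_rank); in single-process runs they are absent and rank stays -1.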
    world_size = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
    global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1
    rank = global_rank

    logger, final_output_dir, tb_log_dir = create_logger(
        cfg, cfg.LOG_DIR, 'train', rank=rank)
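
    # only the main process (rank -1 in single-process mode, rank 0 in DDP) writes
    # TensorBoard summaries, so event files are not duplicated across workers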
    if rank in [-1, 0]:
        logger.info(pprint.pformat(args))
        logger.info(cfg)

        writer_dict = {
            'writer': SummaryWriter(log_dir=tb_log_dir),
            'train_global_steps': 0,
            'valid_global_steps': 0,
        }
    else:
        writer_dict = None

    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

print("begin to bulid up model...") |

    device = select_device(logger, batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU * len(cfg.GPUS)) if not cfg.DEBUG \
        else select_device(logger, 'cpu')

    if args.local_rank != -1:
        assert torch.cuda.device_count() > args.local_rank
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda', args.local_rank)
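        # 'env://' initialization reads MASTER_ADDR / MASTER_PORT (plus rank and world size)
        # from the environment prepared by the launcher; NCCL is the usual backend for GPU training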
        dist.init_process_group(backend='nccl', init_method='env://')

    print("load model to device")
    model = get_net(cfg).to(device)

    criterion = get_loss(cfg, device=device)
    optimizer = get_optimizer(cfg, model)

    best_perf = 0.0
    best_model = False
    last_epoch = -1
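
    # parameters are named "model.<block_idx>....", so k.split(".")[1] below is the block index;
    # these ranges map blocks to the shared encoder and the three task heads
    # (detection, drivable-area segmentation, lane-line segmentation)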
    Encoder_para_idx = [str(i) for i in range(0, 17)]
    Det_Head_para_idx = [str(i) for i in range(17, 25)]
    Da_Seg_Head_para_idx = [str(i) for i in range(25, 34)]
    Ll_Seg_Head_para_idx = [str(i) for i in range(34, 43)]
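
    # cosine learning-rate schedule: the multiplier decays from 1.0 at epoch 0 to
    # cfg.TRAIN.LRF at cfg.TRAIN.END_EPOCH along a half cosine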
    lf = lambda x: ((1 + math.cos(x * math.pi / cfg.TRAIN.END_EPOCH)) / 2) * \
        (1 - cfg.TRAIN.LRF) + cfg.TRAIN.LRF
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    begin_epoch = cfg.TRAIN.BEGIN_EPOCH
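
    # checkpoint loading on the main process only; the three branches run in order, so an
    # AUTO_RESUME checkpoint (if present) overrides the pretrained weights loaded before it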
    if rank in [-1, 0]:
        checkpoint_file = os.path.join(
            os.path.join(cfg.LOG_DIR, cfg.DATASET.DATASET), 'checkpoint.pth'
        )
        if os.path.exists(cfg.MODEL.PRETRAINED):
            logger.info("=> loading model '{}'".format(cfg.MODEL.PRETRAINED))
            checkpoint = torch.load(cfg.MODEL.PRETRAINED)
            begin_epoch = checkpoint['epoch']
            last_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info("=> loaded checkpoint '{}' (epoch {})".format(
                cfg.MODEL.PRETRAINED, checkpoint['epoch']))

        if os.path.exists(cfg.MODEL.PRETRAINED_DET):
            logger.info("=> loading model weight in det branch from '{}'".format(cfg.MODEL.PRETRAINED_DET))
            det_idx_range = [str(i) for i in range(0, 25)]
            model_dict = model.state_dict()
            checkpoint_file = cfg.MODEL.PRETRAINED_DET
            checkpoint = torch.load(checkpoint_file)
            begin_epoch = checkpoint['epoch']
            last_epoch = checkpoint['epoch']
            checkpoint_dict = {k: v for k, v in checkpoint['state_dict'].items() if k.split(".")[1] in det_idx_range}
            model_dict.update(checkpoint_dict)
            model.load_state_dict(model_dict)
            logger.info("=> loaded det branch checkpoint '{}' ".format(checkpoint_file))

        if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
            logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
            checkpoint = torch.load(checkpoint_file)
            begin_epoch = checkpoint['epoch']
            last_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_file, checkpoint['epoch']))
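
    # each TRAIN.*_ONLY flag re-enables gradients for every parameter and then freezes the
    # listed block ranges, so only the remaining branch(es) are updated during training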
    if cfg.TRAIN.SEG_ONLY:
        logger.info('freeze encoder and Det head...')
        for k, v in model.named_parameters():
            v.requires_grad = True
            if k.split(".")[1] in Encoder_para_idx + Det_Head_para_idx:
                print('freezing %s' % k)
                v.requires_grad = False

    if cfg.TRAIN.DET_ONLY:
        logger.info('freeze encoder and two Seg heads...')
        for k, v in model.named_parameters():
            v.requires_grad = True
            if k.split(".")[1] in Encoder_para_idx + Da_Seg_Head_para_idx + Ll_Seg_Head_para_idx:
                print('freezing %s' % k)
                v.requires_grad = False

    if cfg.TRAIN.ENC_SEG_ONLY:
        logger.info('freeze Det head...')
        for k, v in model.named_parameters():
            v.requires_grad = True
            if k.split(".")[1] in Det_Head_para_idx:
                print('freezing %s' % k)
                v.requires_grad = False

    if cfg.TRAIN.ENC_DET_ONLY or cfg.TRAIN.DET_ONLY:
        logger.info('freeze two Seg heads...')
        for k, v in model.named_parameters():
            v.requires_grad = True
            if k.split(".")[1] in Da_Seg_Head_para_idx + Ll_Seg_Head_para_idx:
                print('freezing %s' % k)
                v.requires_grad = False

    if cfg.TRAIN.LANE_ONLY:
        logger.info('freeze encoder and Det head and Da_Seg heads...')
        for k, v in model.named_parameters():
            v.requires_grad = True
            if k.split(".")[1] in Encoder_para_idx + Da_Seg_Head_para_idx + Det_Head_para_idx:
                print('freezing %s' % k)
                v.requires_grad = False

    if cfg.TRAIN.DRIVABLE_ONLY:
        logger.info('freeze encoder and Det head and Ll_Seg heads...')
        for k, v in model.named_parameters():
            v.requires_grad = True
            if k.split(".")[1] in Encoder_para_idx + Ll_Seg_Head_para_idx + Det_Head_para_idx:
                print('freezing %s' % k)
                v.requires_grad = False
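
    # model parallelism: plain DataParallel for a single process with multiple GPUs,
    # DistributedDataParallel when launched with a distributed rank; find_unused_parameters=True
    # keeps gradient synchronization working when some branches receive no gradients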
    if rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model, device_ids=cfg.GPUS)

    if rank != -1:
        model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True)
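
    # attributes consumed by the YOLO-style detection loss: gr is the objectness/IoU ratio,
    # nc the number of detection classes (a single class here)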
    model.gr = 1.0
    model.nc = 1

    print("begin to load data")

    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
    )

    train_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg=cfg,
        is_train=True,
        inputsize=cfg.MODEL.IMAGE_SIZE,
        transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])
    )
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if rank != -1 else None
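
    # in DDP the DistributedSampler shards and shuffles the data per process, so loader-level
    # shuffling is only enabled in the non-distributed case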
    train_loader = DataLoaderX(
        train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=(cfg.TRAIN.SHUFFLE and rank == -1),
        num_workers=cfg.WORKERS,
        sampler=train_sampler,
        pin_memory=cfg.PIN_MEMORY,
        collate_fn=dataset.AutoDriveDataset.collate_fn
    )
    num_batch = len(train_loader)

    if rank in [-1, 0]:
        valid_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
            cfg=cfg,
            is_train=False,
            inputsize=cfg.MODEL.IMAGE_SIZE,
            transform=transforms.Compose([
                transforms.ToTensor(),
                normalize,
            ])
        )

        valid_loader = DataLoaderX(
            valid_dataset,
            batch_size=cfg.TEST.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
            shuffle=False,
            num_workers=cfg.WORKERS,
            pin_memory=cfg.PIN_MEMORY,
            collate_fn=dataset.AutoDriveDataset.collate_fn
        )
        print('load data finished')
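
    # anchor check (main process only): when NEED_AUTOANCHOR is set the detection anchors are
    # re-estimated from the training labels, otherwise the anchors already in the detection head are logged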
    if rank in [-1, 0]:
        if cfg.NEED_AUTOANCHOR:
            logger.info("begin check anchors")
            run_anchor(logger, train_dataset, model=model, thr=cfg.TRAIN.ANCHOR_THRESHOLD, imgsz=min(cfg.MODEL.IMAGE_SIZE))
        else:
            logger.info("anchors loaded successfully")
            det = model.module.model[model.module.detector_index] if is_parallel(model) \
                else model.model[model.detector_index]
            logger.info(str(det.anchors))
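
    # warm-up lasts WARMUP_EPOCHS worth of iterations (at least 1000); mixed precision is
    # enabled through GradScaler only when training on GPU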
    num_warmup = max(round(cfg.TRAIN.WARMUP_EPOCHS * num_batch), 1000)
    scaler = amp.GradScaler(enabled=device.type != 'cpu')
    print('=> start training...')
    for epoch in range(begin_epoch + 1, cfg.TRAIN.END_EPOCH + 1):
        if rank != -1:
            train_loader.sampler.set_epoch(epoch)

        train(cfg, train_loader, model, criterion, optimizer, scaler,
              epoch, num_batch, num_warmup, writer_dict, logger, device, rank)

        lr_scheduler.step()
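
        # evaluate every VAL_FREQ epochs (and on the last epoch) on the main process only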
        if (epoch % cfg.TRAIN.VAL_FREQ == 0 or epoch == cfg.TRAIN.END_EPOCH) and rank in [-1, 0]:
            da_segment_results, ll_segment_results, detect_results, total_loss, maps, times = validate(
                epoch, cfg, valid_loader, valid_dataset, model, criterion,
                final_output_dir, tb_log_dir, writer_dict,
                logger, device, rank
            )
            fi = fitness(np.array(detect_results).reshape(1, -1))

            msg = 'Epoch: [{0}] Loss({loss:.3f})\n' \
                  'Driving area Segment: Acc({da_seg_acc:.3f}) IOU ({da_seg_iou:.3f}) mIOU({da_seg_miou:.3f})\n' \
                  'Lane line Segment: Acc({ll_seg_acc:.3f}) IOU ({ll_seg_iou:.3f}) mIOU({ll_seg_miou:.3f})\n' \
                  'Detect: P({p:.3f}) R({r:.3f}) mAP@0.5({map50:.3f}) mAP@0.5:0.95({map:.3f})\n' \
                  'Time: inference({t_inf:.4f}s/frame) nms({t_nms:.4f}s/frame)'.format(
                      epoch, loss=total_loss, da_seg_acc=da_segment_results[0], da_seg_iou=da_segment_results[1], da_seg_miou=da_segment_results[2],
                      ll_seg_acc=ll_segment_results[0], ll_seg_iou=ll_segment_results[1], ll_seg_miou=ll_segment_results[2],
                      p=detect_results[0], r=detect_results[1], map50=detect_results[2], map=detect_results[3],
                      t_inf=times[0], t_nms=times[1])
            logger.info(msg)
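
        # save two checkpoints per epoch: an epoch-numbered snapshot in final_output_dir and a
        # rolling checkpoint.pth under LOG_DIR/<dataset> that AUTO_RESUME picks up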
        if rank in [-1, 0]:
            savepath = os.path.join(final_output_dir, f'epoch-{epoch}.pth')
            logger.info('=> saving checkpoint to {}'.format(savepath))
            save_checkpoint(
                epoch=epoch,
                name=cfg.MODEL.NAME,
                model=model,
                optimizer=optimizer,
                output_dir=final_output_dir,
                filename=f'epoch-{epoch}.pth'
            )
            save_checkpoint(
                epoch=epoch,
                name=cfg.MODEL.NAME,
                model=model,
                optimizer=optimizer,
                output_dir=os.path.join(cfg.LOG_DIR, cfg.DATASET.DATASET),
                filename='checkpoint.pth'
            )
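
    # after training, the main process saves the bare state_dict (unwrapping DataParallel/DDP via
    # .module) and closes the TensorBoard writer; the other ranks tear down the process group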
    if rank in [-1, 0]:
        final_model_state_file = os.path.join(
            final_output_dir, 'final_state.pth'
        )
        logger.info('=> saving final model state to {}'.format(
            final_model_state_file)
        )
        model_state = model.module.state_dict() if is_parallel(model) else model.state_dict()
        torch.save(model_state, final_model_state_file)
        writer_dict['writer'].close()
    else:
        dist.destroy_process_group()


if __name__ == '__main__':
    main()