diff --git a/OpenPSG/checkpoints/epoch_60.pth b/OpenPSG/checkpoints/epoch_60.pth
new file mode 100644
index 0000000000000000000000000000000000000000..9ca2d6cd2ca7532a08cd84df438dbdcece5049c5
--- /dev/null
+++ b/OpenPSG/checkpoints/epoch_60.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c4ddcbda74686568b7e6b8145f7f33030407e27e390c37c23206f95c51829ed
+size 531751994
diff --git a/OpenPSG/configs/_base_/custom_runtime.py b/OpenPSG/configs/_base_/custom_runtime.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c0898bafac0870b691dcfc1467a618973646e7f
--- /dev/null
+++ b/OpenPSG/configs/_base_/custom_runtime.py
@@ -0,0 +1,17 @@
+checkpoint_config = dict(interval=1, max_keep_ckpts=1)  # shared runtime defaults start here; detr4seg_r50_psg.py overrides this entry
+# yapf:disable
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ ])
+# yapf:enable
+custom_hooks = [dict(type='NumClassCheckHook')]
+
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+load_from = None  # no weights are preloaded by default; model configs set their own checkpoint
+resume_from = None
+
+workflow = [('train', 1), ('val', 1)]  # a 'train' phase followed by a 'val' phase, one unit each
diff --git a/OpenPSG/configs/_base_/datasets/psg.py b/OpenPSG/configs/_base_/datasets/psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..052dcd787578900f875b7f9d43729a188a4d2aca
--- /dev/null
+++ b/OpenPSG/configs/_base_/datasets/psg.py
@@ -0,0 +1,93 @@
+# Dataset settings for PSG scene graphs: one annotation file; images and segmentation maps both use coco_root as prefix.
+dataset_type = 'PanopticSceneGraphDataset'
+ann_file = './data/psg/psg.json'
+coco_root = 'data/coco'
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='LoadPanopticSceneGraphAnnotations',
+ with_bbox=True,
+ with_rel=True,
+ with_mask=True,
+ with_seg=True,
+ ),
+ dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+ dict(type='RandomFlip', flip_ratio=0.5),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='SegRescale', scale_factor=1 / 4),
+ dict(type='SceneGraphFormatBundle'),
+ dict(
+ type='Collect',
+ keys=[
+ 'img',
+ 'gt_bboxes',
+ 'gt_labels',
+ 'gt_rels',
+ 'gt_relmaps',
+ 'gt_masks',
+ 'gt_semantic_seg',
+ ],
+ ),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ # Since the forward process may need gt info, annos must be loaded.
+ dict(type='LoadPanopticSceneGraphAnnotations',
+ with_bbox=True,
+ with_rel=True),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1333, 800),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ # NOTE: Do not change the img to DC.
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']),
+ dict(
+ type='ToDataContainer',
+ fields=(dict(key='gt_bboxes'), dict(key='gt_labels')),
+ ),
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),  # gt boxes/labels are passed through at test time too
+ ],
+ ),
+]
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ img_prefix=coco_root,
+ seg_prefix=coco_root,
+ pipeline=train_pipeline,
+ split='train',
+ all_bboxes=True,
+ ),
+ val=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ img_prefix=coco_root,
+ seg_prefix=coco_root,
+ pipeline=test_pipeline,
+ split='test',  # val and test are configured identically and both read the 'test' split
+ all_bboxes=True,
+ ),
+ test=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ img_prefix=coco_root,
+ seg_prefix=coco_root,
+ pipeline=test_pipeline,
+ split='test',
+ all_bboxes=True,
+ ),
+)
diff --git a/OpenPSG/configs/_base_/datasets/psg_panoptic.py b/OpenPSG/configs/_base_/datasets/psg_panoptic.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e5ee5f27af854da81cc9b936a47d3ed7721502f
--- /dev/null
+++ b/OpenPSG/configs/_base_/datasets/psg_panoptic.py
@@ -0,0 +1,72 @@
+# Dataset settings for panoptic segmentation on PSG: boxes, masks and semantic maps are loaded, relations are not.
+dataset_type = 'PanopticSceneGraphDataset'
+ann_file = './data/psg/psg.json'
+coco_root = './data/coco'
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='LoadPanopticSceneGraphAnnotations',
+ with_bbox=True,
+ with_mask=True,
+ with_seg=True,
+ ),
+ dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+ dict(type='RandomFlip', flip_ratio=0.5),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='SegRescale', scale_factor=1 / 4),
+ dict(type='DefaultFormatBundle'),
+ dict(
+ type='Collect',
+ keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg'],
+ ),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),  # unlike the train pipeline, no annotations are loaded at test time
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1333, 800),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ],
+ ),
+]
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ img_prefix=coco_root,
+ seg_prefix=coco_root,
+ pipeline=train_pipeline,
+ split='train',
+ ),
+ val=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ img_prefix=coco_root,
+ seg_prefix=coco_root,
+ pipeline=test_pipeline,
+ split='test',  # val and test are configured identically and both read the 'test' split
+ ),
+ test=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ img_prefix=coco_root,
+ seg_prefix=coco_root,
+ pipeline=test_pipeline,
+ split='test',
+ ),
+)
+evaluation = dict(interval=1, metric='PQ')  # evaluate with the 'PQ' metric at interval 1
diff --git a/OpenPSG/configs/_base_/datasets/vg_detection.py b/OpenPSG/configs/_base_/datasets/vg_detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..d826ecca5ea9c9bfbaf08366b5b2a468c908363b
--- /dev/null
+++ b/OpenPSG/configs/_base_/datasets/vg_detection.py
@@ -0,0 +1,56 @@
+# Dataset settings for box detection on data/vg (only boxes and labels are loaded); evaluated with the 'bbox' metric.
+custom_imports = dict(imports=[
+ 'openpsg.datasets',
+ 'openpsg.datasets.pipelines',
+],
+ allow_failed_imports=False)
+
+dataset_type = 'SceneGraphDataset'
+ann_file = 'data/vg/data_openpsg.json'
+img_dir = 'data/vg/VG_100K'
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadSceneGraphAnnotations', with_bbox=True),
+ dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+ dict(type='RandomFlip', flip_ratio=0.5),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),  # no annotations are loaded at test time
+ dict(type='MultiScaleFlipAug',
+ img_scale=(1333, 800),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(type=dataset_type,
+ ann_file=ann_file,
+ img_prefix=img_dir,
+ pipeline=train_pipeline,
+ split='train'),
+ val=dict(type=dataset_type,
+ ann_file=ann_file,
+ img_prefix=img_dir,
+ pipeline=test_pipeline,
+ split='test'),  # val and test are configured identically and both read the 'test' split
+ test=dict(type=dataset_type,
+ ann_file=ann_file,
+ img_prefix=img_dir,
+ pipeline=test_pipeline,
+ split='test'))
+evaluation = dict(interval=1, metric='bbox')
diff --git a/OpenPSG/configs/_base_/datasets/vg_sg.py b/OpenPSG/configs/_base_/datasets/vg_sg.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f555ac70bc04c85cbeb9099fd792114ee2ed9a9
--- /dev/null
+++ b/OpenPSG/configs/_base_/datasets/vg_sg.py
@@ -0,0 +1,57 @@
+# dataset settings; paths are relative to the repository root, matching vg_detection.py, instead of one machine's /mnt/ssd layout
+dataset_type = 'SceneGraphDataset'
+ann_file = 'data/vg/data_openpsg.json'
+img_dir = 'data/vg/VG_100K'
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],  # per-channel mean/std passed to the Normalize steps below
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True),
+ dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+ dict(type='RandomFlip', flip_ratio=0.5),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='SceneGraphFormatBundle'),
+ dict(type='Collect',
+ keys=['img', 'gt_bboxes', 'gt_labels', 'gt_rels', 'gt_relmaps']),  # relation targets are collected alongside boxes and labels
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ # Since the forward process may need gt info, annos must be loaded.
+ dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1333, 800),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ # NOTE: Do not change the img to DC.
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']),
+ dict(type='ToDataContainer',
+ fields=(dict(key='gt_bboxes'), dict(key='gt_labels'))),
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+ ])
+]
+data = dict(samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(type=dataset_type,
+ ann_file=ann_file,
+ img_prefix=img_dir,
+ pipeline=train_pipeline,
+ split='train'),
+ val=dict(type=dataset_type,
+ ann_file=ann_file,
+ img_prefix=img_dir,
+ pipeline=test_pipeline,
+ split='test'),  # val and test are configured identically and both read the 'test' split
+ test=dict(type=dataset_type,
+ ann_file=ann_file,
+ img_prefix=img_dir,
+ pipeline=test_pipeline,
+ split='test'))
diff --git a/OpenPSG/configs/_base_/models/detr4seg_r101.py b/OpenPSG/configs/_base_/models/detr4seg_r101.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c366f686fe6b2467ec29613cb9f95a229d038cc
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/detr4seg_r101.py
@@ -0,0 +1,64 @@
+model = dict(  # DETR4seg: ResNet-101 backbone, detr4segHead with a 6-layer encoder / 6-layer decoder transformer
+ type='DETR4seg',
+ backbone=dict(type='ResNet',
+ depth=101,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='pytorch',
+ init_cfg=dict(type='Pretrained',
+ checkpoint='torchvision://resnet101')),
+ bbox_head=dict(type='detr4segHead',
+ num_classes=80,  # default; detr4seg_r101_psg.py replaces it with len(object_classes)
+ in_channels=2048,
+ transformer=dict(
+ type='Transformer',
+ encoder=dict(type='DetrTransformerEncoder',
+ num_layers=6,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=[
+ dict(type='MultiheadAttention',
+ embed_dims=256,
+ num_heads=8,
+ dropout=0.1)
+ ],
+ feedforward_channels=2048,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetrTransformerDecoder',
+ return_intermediate=True,
+ num_layers=6,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=dict(type='MultiheadAttention',
+ embed_dims=256,
+ num_heads=8,
+ dropout=0.1),
+ feedforward_channels=2048,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm',
+ 'cross_attn', 'norm', 'ffn',
+ 'norm')),
+ )),
+ positional_encoding=dict(type='SinePositionalEncoding',
+ num_feats=128,
+ normalize=True),
+ loss_cls=dict(type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0,
+ class_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+ dice_loss=dict(type='DiceLoss', loss_weight=1.0)),  # NOTE(review): detr4seg_r50.py also sets focal_loss and uses 'psgtrDiceLoss' — confirm the difference is intended
+ # training and testing settings
+ train_cfg=dict(assigner=dict(
+ type='HungarianAssigner',
+ cls_cost=dict(type='ClassificationCost', weight=1.),  # the three cost weights (1, 5, 2) equal the loss weights above
+ reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
+ test_cfg=dict(max_per_img=100))
diff --git a/OpenPSG/configs/_base_/models/detr4seg_r101_psg.py b/OpenPSG/configs/_base_/models/detr4seg_r101_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d21e75bc4fd8b693daeaa488a613feb052914fe
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/detr4seg_r101_psg.py
@@ -0,0 +1,137 @@
+_base_ = [  # resolved relative to this file's directory (configs/_base_/models/), same layout as detr4seg_r50_psg.py
+ './detr4seg_r101.py', '../datasets/psg.py',
+ '../custom_runtime.py'
+]
+
+custom_imports = dict(imports=[  # openpsg modules imported when the config is loaded; a failed import is an error
+ 'openpsg.models.frameworks.detr4seg',
+ 'openpsg.models.relation_heads.detr4seg_head', 'openpsg.datasets',
+ 'openpsg.datasets.pipelines.loading',
+ 'openpsg.datasets.pipelines.rel_randomcrop',
+ 'openpsg.models.relation_heads.approaches.matcher',
+ 'openpsg.models.losses.seg_losses'
+],
+ allow_failed_imports=False)
+
+object_classes = [  # 133 class names in total; the list length sets num_classes below
+ 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+ 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+ 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+ 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
+ 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
+ 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
+ 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
+ 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+ 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
+ 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
+ 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
+ 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
+ 'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
+ 'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
+ 'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
+ 'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
+ 'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
+ 'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
+ 'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
+ 'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
+ 'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
+ 'food-other-merged', 'building-other-merged', 'rock-merged',
+ 'wall-other-merged', 'rug-merged'
+]
+
+model = dict(bbox_head=dict(  # overrides only these two bbox_head keys of the base model config
+ num_classes=len(object_classes),
+ object_classes=object_classes,
+))
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True)
+# train_pipeline. NOTE: the img_scale and the Pad's size_divisor are different
+# from the default settings in mmdet.
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True),  # NOTE(review): detr4seg_r50_psg.py uses LoadPanopticSceneGraphAnnotations with masks and collects 'gt_masks' — confirm this loader is intended
+ dict(type='RandomFlip', flip_ratio=0.5),
+ dict(
+ type='AutoAugment',
+ policies=[  # two alternative policies: plain multi-scale resize, or resize -> random crop -> resize
+ [
+ dict(type='Resize',
+ img_scale=[(480, 1333), (512, 1333), (544, 1333),
+ (576, 1333), (608, 1333), (640, 1333),
+ (672, 1333), (704, 1333), (736, 1333),
+ (768, 1333), (800, 1333)],
+ multiscale_mode='value',
+ keep_ratio=True)
+ ],
+ [
+ dict(type='Resize',
+ img_scale=[(400, 1333), (500, 1333), (600, 1333)],
+ multiscale_mode='value',
+ keep_ratio=True),
+ dict(type='RandomCrop',
+ crop_type='absolute_range',
+ crop_size=(384, 600),
+ allow_negative_crop=False), # no empty relations
+ dict(type='Resize',
+ img_scale=[(480, 1333), (512, 1333), (544, 1333),
+ (576, 1333), (608, 1333), (640, 1333),
+ (672, 1333), (704, 1333), (736, 1333),
+ (768, 1333), (800, 1333)],
+ multiscale_mode='value',
+ override=True,
+ keep_ratio=True)
+ ]
+ ]),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=1),
+ dict(type='RelsFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+# test_pipeline. NOTE: the Pad's size_divisor is different from the default
+# setting (size_divisor=32), although it has little effect on the performance
+# whether we use the default setting or size_divisor=1.
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='MultiScaleFlipAug',
+ img_scale=(1333, 800),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=1),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+]
+data = dict(samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(pipeline=train_pipeline),  # only the pipelines are overridden; other dataset keys come from the base dataset config
+ val=dict(pipeline=test_pipeline),
+ test=dict(pipeline=test_pipeline))
+# optimizer: AdamW; parameters matched by the 'backbone' key get lr_mult=0.1
+optimizer = dict(
+ type='AdamW',
+ lr=0.0001,
+ weight_decay=0.0001,
+ paramwise_cfg=dict(
+ custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}))
+optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
+
+# learning policy: a single step at 110 within a 150-epoch run
+lr_config = dict(policy='step', step=110)
+runner = dict(type='EpochBasedRunner', max_epochs=150)
+
+project_name = 'detr4seg'
+expt_name = 'detr4seg_r101_coco'  # NOTE(review): name says 'coco' although this is the PSG config — confirm
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+ interval=50,
+ hooks=[dict(type='TextLoggerHook'),
+ dict(type='TensorboardLoggerHook')],
+)
+
+load_from = '/mnt/ssd/gzj/test/OpenPSG/detr_r50_fb_origin.pth'  # NOTE(review): machine-specific absolute path, and the file name says r50 while this config is r101 — confirm
diff --git a/OpenPSG/configs/_base_/models/detr4seg_r50.py b/OpenPSG/configs/_base_/models/detr4seg_r50.py
new file mode 100644
index 0000000000000000000000000000000000000000..326bc62336154ca94211a820406fb26025a9c544
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/detr4seg_r50.py
@@ -0,0 +1,65 @@
+model = dict(  # DETR4seg: ResNet-50 backbone, detr4segHead with a 6-layer encoder / 6-layer decoder transformer
+ type='DETR4seg',
+ backbone=dict(type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='pytorch',
+ init_cfg=dict(type='Pretrained',
+ checkpoint='torchvision://resnet50')),
+ bbox_head=dict(type='detr4segHead',
+ num_classes=80,  # default; detr4seg_r50_psg.py replaces it with len(object_classes)
+ in_channels=2048,
+ transformer=dict(
+ type='Transformer',
+ encoder=dict(type='DetrTransformerEncoder',
+ num_layers=6,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=[
+ dict(type='MultiheadAttention',
+ embed_dims=256,
+ num_heads=8,
+ dropout=0.1)
+ ],
+ feedforward_channels=2048,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetrTransformerDecoder',
+ return_intermediate=True,
+ num_layers=6,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=dict(type='MultiheadAttention',
+ embed_dims=256,
+ num_heads=8,
+ dropout=0.1),
+ feedforward_channels=2048,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm',
+ 'cross_attn', 'norm', 'ffn',
+ 'norm')),
+ )),
+ positional_encoding=dict(type='SinePositionalEncoding',
+ num_feats=128,
+ normalize=True),
+ loss_cls=dict(type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0,
+ class_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+ focal_loss=dict(type='BCEFocalLoss', loss_weight=1.0),  # focal and dice losses are configured in addition to the class/box/IoU losses
+ dice_loss=dict(type='psgtrDiceLoss', loss_weight=1.0)),
+ # training and testing settings
+ train_cfg=dict(assigner=dict(
+ type='HungarianAssigner',
+ cls_cost=dict(type='ClassificationCost', weight=1.),  # the three cost weights (1, 5, 2) equal the class/box/IoU loss weights above
+ reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
+ test_cfg=dict(max_per_img=100))
diff --git a/OpenPSG/configs/_base_/models/detr4seg_r50_psg.py b/OpenPSG/configs/_base_/models/detr4seg_r50_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..07324d4942419d7879ce771a19cc8215a45fd5d2
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/detr4seg_r50_psg.py
@@ -0,0 +1,152 @@
+_base_ = ['./detr4seg_r50.py', '../datasets/psg.py', '../custom_runtime.py']  # model, dataset and runtime bases, relative to this file
+
+custom_imports = dict(imports=[  # openpsg modules imported when the config is loaded; a failed import is an error
+ 'openpsg.models.frameworks.detr4seg',
+ 'openpsg.models.relation_heads.detr4seg_head', 'openpsg.datasets',
+ 'openpsg.datasets.pipelines.loading',
+ 'openpsg.datasets.pipelines.rel_randomcrop',
+ 'openpsg.models.relation_heads.approaches.matcher',
+ 'openpsg.models.losses.seg_losses'
+],
+ allow_failed_imports=False)
+
+object_classes = [  # 133 class names in total; the list length sets num_classes below
+ 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+ 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+ 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+ 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
+ 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
+ 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
+ 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
+ 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+ 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
+ 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
+ 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
+ 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
+ 'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
+ 'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
+ 'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
+ 'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
+ 'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
+ 'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
+ 'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
+ 'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
+ 'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
+ 'food-other-merged', 'building-other-merged', 'rock-merged',
+ 'wall-other-merged', 'rug-merged'
+]
+
+model = dict(bbox_head=dict(  # overrides only these two bbox_head keys of the base model config
+ num_classes=len(object_classes),
+ object_classes=object_classes,
+))
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True)
+# train_pipeline. NOTE: the img_scale and the Pad's size_divisor are different
+# from the default settings in mmdet.
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadPanopticSceneGraphAnnotations',
+ with_bbox=True,
+ with_mask=True,
+ with_seg=True),
+ dict(type='RandomFlip', flip_ratio=0.5),
+ dict(
+ type='AutoAugment',
+ policies=[  # two alternative policies: plain multi-scale resize, or resize -> random crop -> resize
+ [
+ dict(type='Resize',
+ img_scale=[(480, 1333), (512, 1333), (544, 1333),
+ (576, 1333), (608, 1333), (640, 1333),
+ (672, 1333), (704, 1333), (736, 1333),
+ (768, 1333), (800, 1333)],
+ multiscale_mode='value',
+ keep_ratio=True)
+ ],
+ [
+ dict(type='Resize',
+ img_scale=[(400, 1333), (500, 1333), (600, 1333)],
+ multiscale_mode='value',
+ keep_ratio=True),
+ dict(type='RandomCrop',
+ crop_type='absolute_range',
+ crop_size=(384, 600),
+ allow_negative_crop=False), # no empty relations
+ dict(type='Resize',
+ img_scale=[(480, 1333), (512, 1333), (544, 1333),
+ (576, 1333), (608, 1333), (640, 1333),
+ (672, 1333), (704, 1333), (736, 1333),
+ (768, 1333), (800, 1333)],
+ multiscale_mode='value',
+ override=True,
+ keep_ratio=True)
+ ]
+ ]),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=1),
+ dict(type='RelsFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])  # masks are collected; 'gt_semantic_seg' is loaded above but not collected
+]
+# test_pipeline. NOTE: the Pad's size_divisor is different from the default
+# setting (size_divisor=32), although it has little effect on the performance
+# whether we use the default setting or size_divisor=1.
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='MultiScaleFlipAug',
+ img_scale=(1333, 800),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=1),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+]
+data = dict(samples_per_gpu=1,
+ workers_per_gpu=1,
+ train=dict(pipeline=train_pipeline),  # only the pipelines are overridden; other dataset keys come from the base dataset config
+ val=dict(pipeline=test_pipeline),
+ test=dict(pipeline=test_pipeline))
+# optimizer: AdamW with per-key multipliers (backbone x0.1; bbox_attention and mask_head x10)
+optimizer = dict(type='AdamW',
+ lr=0.00001,
+ weight_decay=0.0001,
+ paramwise_cfg=dict(
+ custom_keys={
+ 'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+ 'bbox_attention': dict(lr_mult=10.0, decay_mult=1.0),
+ 'mask_head': dict(lr_mult=10.0, decay_mult=1.0)
+ }))
+optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
+
+# learning policy: a single step at 8 within a 10-epoch run
+lr_config = dict(policy='step', step=8)
+runner = dict(type='EpochBasedRunner', max_epochs=10)
+
+evaluation = dict(interval=1, metric='PQ')
+checkpoint_config = dict(interval=1, max_keep_ckpts=10)  # overrides custom_runtime.py, which sets max_keep_ckpts=1
+
+project_name = 'detr4seg'
+expt_name = 'test_detr4seg_r50_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(type='TensorboardLoggerHook'),
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ # config=work_dir + "/cfg.yaml"
+ ))
+ ],
+)
+
+load_from = 'detr_pan_r50.pth'  # bare relative file name — presumably resolved against the launch directory; confirm
diff --git a/OpenPSG/configs/_base_/models/detr_r50.py b/OpenPSG/configs/_base_/models/detr_r50.py
new file mode 100644
index 0000000000000000000000000000000000000000..b83d7d5e108ff52eb9c2c8701697684e1fd88844
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/detr_r50.py
@@ -0,0 +1,64 @@
+model = dict(  # DETR: ResNet-50 backbone, DETRHead with a 6-layer encoder / 6-layer decoder transformer
+ type='DETR',
+ backbone=dict(type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(3, ),  # only stage index 3 is requested here; the detr4seg configs request (0, 1, 2, 3)
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='pytorch',
+ init_cfg=dict(type='Pretrained',
+ checkpoint='torchvision://resnet50')),
+ bbox_head=dict(type='DETRHead',
+ num_classes=80,
+ in_channels=2048,
+ transformer=dict(
+ type='Transformer',
+ encoder=dict(type='DetrTransformerEncoder',
+ num_layers=6,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=[
+ dict(type='MultiheadAttention',
+ embed_dims=256,
+ num_heads=8,
+ dropout=0.1)
+ ],
+ feedforward_channels=2048,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetrTransformerDecoder',
+ return_intermediate=True,
+ num_layers=6,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=dict(type='MultiheadAttention',
+ embed_dims=256,
+ num_heads=8,
+ dropout=0.1),
+ feedforward_channels=2048,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm',
+ 'cross_attn', 'norm', 'ffn',
+ 'norm')),
+ )),
+ positional_encoding=dict(type='SinePositionalEncoding',
+ num_feats=128,
+ normalize=True),
+ loss_cls=dict(type='CrossEntropyLoss',
+ bg_cls_weight=0.1,
+ use_sigmoid=False,
+ loss_weight=1.0,
+ class_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
+ # training and testing settings
+ train_cfg=dict(assigner=dict(
+ type='HungarianAssigner',
+ cls_cost=dict(type='ClassificationCost', weight=1.),  # the three cost weights (1, 5, 2) equal the loss weights above
+ reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
+ test_cfg=dict(max_per_img=100))
diff --git a/OpenPSG/configs/_base_/models/mask_rcnn_r50_fpn.py b/OpenPSG/configs/_base_/models/mask_rcnn_r50_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..19b9c3d29c7af8ac828c25a1b388248aa23a2d77
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/mask_rcnn_r50_fpn.py
@@ -0,0 +1,107 @@
+# model settings: MaskRCNN with a ResNet-50 backbone, FPN neck, RPN head and a StandardRoIHead (bbox + mask branches)
+model = dict(
+ type='MaskRCNN',
+ backbone=dict(type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=True),
+ norm_eval=True,
+ style='pytorch',
+ init_cfg=dict(type='Pretrained',
+ checkpoint='torchvision://resnet50')),
+ neck=dict(type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=5),
+ rpn_head=dict(type='RPNHead',
+ in_channels=256,
+ feat_channels=256,
+ anchor_generator=dict(type='AnchorGenerator',
+ scales=[8],
+ ratios=[0.5, 1.0, 2.0],
+ strides=[4, 8, 16, 32, 64]),  # five strides, matching the neck's num_outs=5
+ bbox_coder=dict(type='DeltaXYWHBBoxCoder',
+ target_means=[.0, .0, .0, .0],
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
+ loss_cls=dict(type='CrossEntropyLoss',
+ use_sigmoid=True,
+ loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+ roi_head=dict(type='StandardRoIHead',
+ bbox_roi_extractor=dict(type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign',
+ output_size=7,
+ sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ bbox_head=dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=80,
+ bbox_coder=dict(type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
+ reg_class_agnostic=False,
+ loss_cls=dict(type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+ mask_roi_extractor=dict(type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign',
+ output_size=14,
+ sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ mask_head=dict(type='FCNMaskHead',
+ num_convs=4,
+ in_channels=256,
+ conv_out_channels=256,
+ num_classes=80,
+ loss_mask=dict(type='CrossEntropyLoss',
+ use_mask=True,
+ loss_weight=1.0))),
+ # model training and testing settings
+ train_cfg=dict(rpn=dict(assigner=dict(type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=-1,
+ pos_weight=-1,
+ debug=False),
+ rpn_proposal=dict(nms_pre=2000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(assigner=dict(type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.5,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ mask_size=28,
+ pos_weight=-1,
+ debug=False)),
+ test_cfg=dict(rpn=dict(nms_pre=1000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=100,
+ mask_thr_binary=0.5)))
diff --git a/OpenPSG/configs/_base_/models/panoptic_fpn_r101_fpn_psg.py b/OpenPSG/configs/_base_/models/panoptic_fpn_r101_fpn_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..449ec6c9ff81c8447bc74029fad68d1bb3dc9598
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/panoptic_fpn_r101_fpn_psg.py
@@ -0,0 +1,8 @@
+_base_ = './panoptic_fpn_r50_fpn_psg.py'  # inherit the r50 config and change only the keys below
+
+model = dict(backbone=dict(
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+expt_name = 'panoptic_fpn_r101_fpn_psg'  # NOTE(review): the base file builds work_dir and the Wandb run name from its own expt_name — confirm they pick up this value
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
diff --git a/OpenPSG/configs/_base_/models/panoptic_fpn_r50_fpn_psg.py b/OpenPSG/configs/_base_/models/panoptic_fpn_r50_fpn_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..44a01a4ea386ddb8c4264a6454da4d70ffde63fc
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/panoptic_fpn_r50_fpn_psg.py
@@ -0,0 +1,74 @@
+_base_ = [  # Mask R-CNN model + PSG panoptic dataset + 1x schedule + shared runtime, relative to this file
+ '../models/mask_rcnn_r50_fpn.py',
+ '../datasets/psg_panoptic.py',
+ '../schedules/schedule_1x.py',
+ '../custom_runtime.py',
+]
+
+model = dict(
+ type='PanopticFPN',  # replaces the base model type 'MaskRCNN' and adds the semantic and fusion heads below
+ semantic_head=dict(
+ type='PanopticFPNHead',
+ num_things_classes=80,
+ num_stuff_classes=53,
+ in_channels=256,
+ inner_channels=128,
+ start_level=0,
+ end_level=4,
+ norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
+ conv_cfg=None,
+ loss_seg=dict(type='CrossEntropyLoss',
+ ignore_index=255,
+ loss_weight=0.5),
+ ),
+ panoptic_fusion_head=dict(type='HeuristicFusionHead',
+ num_things_classes=80,
+ num_stuff_classes=53),
+ test_cfg=dict(panoptic=dict(
+ score_thr=0.6,
+ max_per_img=100,
+ mask_thr_binary=0.5,
+ mask_overlap=0.5,
+ nms=dict(type='nms', iou_threshold=0.5, class_agnostic=True),
+ stuff_area_limit=4096,
+ )),
+)
+
+custom_hooks = []  # overrides custom_runtime.py, which registers NumClassCheckHook
+
+# Change batch size and learning rate
+data = dict(samples_per_gpu=8,
+ # workers_per_gpu=2
+ )
+# optimizer = dict(lr=0.02)
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(_delete_=True,  # _delete_ discards the inherited optimizer_config instead of merging into it
+ grad_clip=dict(max_norm=35, norm_type=2))
+
+lr_config = dict(policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=1.0 / 3,
+ step=[8, 11])
+
+project_name = 'openpsg'
+expt_name = 'panoptic_fpn_r50_fpn_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ # config=work_dir + "/cfg.yaml"
+ ),
+ ),
+ ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r50_fpn_1x_coco_20210821_101153-9668fd13.pth'
diff --git a/OpenPSG/configs/_base_/models/psgtr_r101.py b/OpenPSG/configs/_base_/models/psgtr_r101.py
new file mode 100644
index 0000000000000000000000000000000000000000..28a043e12a54656ed52202a348058bd0dc3d6f9d
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/psgtr_r101.py
@@ -0,0 +1,5 @@
+_base_ = './psgtr_r50.py'
+
+model = dict(backbone=dict(
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
diff --git a/OpenPSG/configs/_base_/models/psgtr_r50.py b/OpenPSG/configs/_base_/models/psgtr_r50.py
new file mode 100644
index 0000000000000000000000000000000000000000..96eccd68df077c5de98613fe62d4bcacb5b7f5a4
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/psgtr_r50.py
@@ -0,0 +1,82 @@
+model = dict(
+ type='PSGTr',
+ backbone=dict(type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='pytorch',
+ init_cfg=dict(type='Pretrained',
+ checkpoint='torchvision://resnet50')),
+ bbox_head=dict(type='PSGTrHead',
+ num_classes=80,
+ num_relations=117,
+ in_channels=2048,
+ transformer=dict(
+ type='Transformer',
+ encoder=dict(type='DetrTransformerEncoder',
+ num_layers=6,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=[
+ dict(type='MultiheadAttention',
+ embed_dims=256,
+ num_heads=8,
+ dropout=0.1)
+ ],
+ feedforward_channels=2048,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetrTransformerDecoder',
+ return_intermediate=True,
+ num_layers=6,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=dict(type='MultiheadAttention',
+ embed_dims=256,
+ num_heads=8,
+ dropout=0.1),
+ feedforward_channels=2048,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm',
+ 'cross_attn', 'norm', 'ffn',
+ 'norm')),
+ )),
+ positional_encoding=dict(type='SinePositionalEncoding',
+ num_feats=128,
+ normalize=True),
+ sub_loss_cls=dict(type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0,
+ class_weight=1.0),
+ sub_loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+ sub_loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+ sub_focal_loss=dict(type='BCEFocalLoss', loss_weight=1.0),
+ sub_dice_loss=dict(type='psgtrDiceLoss', loss_weight=1.0),
+ obj_loss_cls=dict(type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0,
+ class_weight=1.0),
+ obj_loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+ obj_loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+ obj_focal_loss=dict(type='BCEFocalLoss', loss_weight=1.0),
+ obj_dice_loss=dict(type='psgtrDiceLoss', loss_weight=1.0),
+ rel_loss_cls=dict(type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=2.0,
+ class_weight=1.0)),
+ # training and testing settings
+ train_cfg=dict(assigner=dict(
+ type='HTriMatcher',
+ s_cls_cost=dict(type='ClassificationCost', weight=1.),
+ s_reg_cost=dict(type='BBoxL1Cost', weight=5.0),
+ s_iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+ o_cls_cost=dict(type='ClassificationCost', weight=1.),
+ o_reg_cost=dict(type='BBoxL1Cost', weight=5.0),
+ o_iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+ r_cls_cost=dict(type='ClassificationCost', weight=2.))),
+ test_cfg=dict(max_per_img=100))
diff --git a/OpenPSG/configs/_base_/schedules/schedule_1x.py b/OpenPSG/configs/_base_/schedules/schedule_1x.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c01d3df3d9169fee87ffeaa4e0fb60ac3f07b66
--- /dev/null
+++ b/OpenPSG/configs/_base_/schedules/schedule_1x.py
@@ -0,0 +1,10 @@
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[8, 11])
+runner = dict(type='EpochBasedRunner', max_epochs=12)
diff --git a/OpenPSG/configs/_base_/schedules/schedule_3x.py b/OpenPSG/configs/_base_/schedules/schedule_3x.py
new file mode 100644
index 0000000000000000000000000000000000000000..4109da969702ecb2962606ec3891cedfcd4cd2ae
--- /dev/null
+++ b/OpenPSG/configs/_base_/schedules/schedule_3x.py
@@ -0,0 +1,10 @@
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(policy='step',
+ warmup='linear',
+ warmup_iters=1000,
+ warmup_ratio=0.001,
+ step=[27, 33])
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_predcls_psg.py b/OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_predcls_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..1be5fdcf74eeb3e941ef2829546cfb14338face8
--- /dev/null
+++ b/OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_predcls_psg.py
@@ -0,0 +1,26 @@
+_base_ = './panoptic_fpn_r50_fpn_1x_predcls_psg.py'
+
+model = dict(backbone=dict(
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'gpsnet_panoptic_fpn_r101_fpn_1x_predcls_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ ),
+ ),
+ ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
diff --git a/OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_sgdet_psg.py b/OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ae604515cefc1aa3849ee328c1667408f08cab4
--- /dev/null
+++ b/OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
@@ -0,0 +1,26 @@
+_base_ = './panoptic_fpn_r50_fpn_1x_sgdet_psg.py'
+
+model = dict(backbone=dict(
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'gpsnet_panoptic_fpn_r101_fpn_1x_sgdet_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ ),
+ ),
+ ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
diff --git a/OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_predcls_psg.py b/OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_predcls_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd06ebcd9c19aec5210937600af4db0d66d99def
--- /dev/null
+++ b/OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_predcls_psg.py
@@ -0,0 +1,41 @@
+_base_ = [
+ '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py',
+]
+
+model = dict(relation_head=dict(
+ type='GPSHead',
+ head_config=dict(
+ # NOTE: Evaluation type is predcls, so GT boxes and labels are enabled.
+ use_gt_box=True,
+ use_gt_label=True,
+ ),
+))
+
+evaluation = dict(interval=1,
+ metric='predcls',
+ relation_mode=True,
+ classwise=True,
+ detection_method='pan_seg')
+
+# Change batch size and learning rate
+data = dict(samples_per_gpu=16, workers_per_gpu=0)
+optimizer = dict(type='SGD', lr=0.03, momentum=0.9, weight_decay=0.0001)
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'gpsnet_panoptic_fpn_r50_fpn_1x_predcls_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ ),
+ ),
+ ],
+)
diff --git a/OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_sgdet_psg.py b/OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..78165a4ce56b57819445d8d58840c6f9fca5f4a8
--- /dev/null
+++ b/OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
@@ -0,0 +1,45 @@
+_base_ = [
+ '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py',
+]
+
+model = dict(
+ relation_head=dict(
+ type='GPSHead',
+ head_config=dict(
+ # NOTE: Evaluation type is sgdet, so GT boxes and labels are disabled.
+ use_gt_box=False,
+ use_gt_label=False,
+ ),
+ ),
+ roi_head=dict(bbox_head=dict(type='SceneGraphBBoxHead'), ),
+)
+
+evaluation = dict(
+ interval=1,
+ metric='sgdet',
+ relation_mode=True,
+ classwise=True,
+ iou_thrs=0.5,
+ detection_method='pan_seg',
+)
+
+data = dict(samples_per_gpu=16)
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'gpsnet_panoptic_fpn_r50_fpn_1x_sgdet_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ ),
+ ),
+ ],
+)
diff --git a/OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_predcls_psg.py b/OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_predcls_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..28bc0487451535069f9301853e0190fc9025bb85
--- /dev/null
+++ b/OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_predcls_psg.py
@@ -0,0 +1,28 @@
+_base_ = './panoptic_fpn_r50_fpn_1x_predcls_psg.py'
+
+model = dict(backbone=dict(
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'imp_panoptic_fpn_r101_fpn_1x_predcls_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ # config=work_dir + "/cfg.yaml"
+ ),
+ ),
+ ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
diff --git a/OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_sgdet_psg.py b/OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f0f96866d423e0f6a214e98462c721626744309
--- /dev/null
+++ b/OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
@@ -0,0 +1,26 @@
+_base_ = './panoptic_fpn_r50_fpn_1x_sgdet_psg.py'
+
+model = dict(backbone=dict(
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'imp_panoptic_fpn_r101_fpn_1x_sgdet_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ ),
+ ),
+ ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
diff --git a/OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_predcls_psg.py b/OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_predcls_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..93189cfd37a51374fe62e29b0bc8550559da3a27
--- /dev/null
+++ b/OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_predcls_psg.py
@@ -0,0 +1,44 @@
+_base_ = [
+ '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py',
+]
+
+model = dict(relation_head=dict(
+ type='IMPHead',
+ head_config=dict(
+ # NOTE: Evaluation type is predcls, so GT boxes and labels are enabled.
+ use_gt_box=True,
+ use_gt_label=True,
+ num_iter=2,
+ ),
+))
+
+evaluation = dict(interval=1,
+ metric='predcls',
+ relation_mode=True,
+ classwise=True)
+
+# Change batch size and learning rate
+data = dict(samples_per_gpu=16, )
+# FIXME: workers_per_gpu=0 is disabled; not confirmed to be the problem.
+optimizer = dict(type='SGD', lr=0.001, momentum=0.9)
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'imp_panoptic_fpn_r50_fpn_1x_predcls_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ # config=work_dir + "/cfg.yaml"
+ ),
+ ),
+ ],
+)
diff --git a/OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_sgdet_psg.py b/OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ec83492bfccc1b706723b6de680392f9b0e2c7a
--- /dev/null
+++ b/OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
@@ -0,0 +1,48 @@
+_base_ = [
+ '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py',
+]
+
+model = dict(relation_head=dict(
+ type='IMPHead',
+ head_config=dict(
+ # NOTE: Evaluation type is sgdet, so GT boxes and labels are disabled.
+ use_gt_box=False,
+ use_gt_label=False,
+ num_iter=2,
+ ),
+))
+
+evaluation = dict(
+ interval=1,
+ metric='sgdet',
+ relation_mode=True,
+ classwise=True,
+ iou_thrs=0.5,
+ detection_method='pan_seg',
+)
+
+# Change batch size and learning rate
+data = dict(samples_per_gpu=16, )
+# FIXME: workers_per_gpu=0 is disabled; not confirmed to be the problem.
+optimizer = dict(type='SGD', lr=0.001, momentum=0.9)
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'imp_panoptic_fpn_r50_fpn_1x_sgdet_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ # config=work_dir + "/cfg.yaml"
+ ),
+ ),
+ ],
+)
diff --git a/OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_predcls_psg.py b/OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_predcls_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..d125d475b96e26c7862d16b5335798ee9defab44
--- /dev/null
+++ b/OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_predcls_psg.py
@@ -0,0 +1,28 @@
+_base_ = './panoptic_fpn_r50_fpn_1x_predcls_psg.py'
+
+model = dict(backbone=dict(
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'motifs_panoptic_fpn_r101_fpn_1x_predcls_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ # config=work_dir + "/cfg.yaml"
+ ),
+ ),
+ ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
diff --git a/OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_sgdet_psg.py b/OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..55b1f9eadee9904706504b57f896e2e6482d6385
--- /dev/null
+++ b/OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
@@ -0,0 +1,28 @@
+_base_ = './panoptic_fpn_r50_fpn_1x_sgdet_psg.py'
+
+model = dict(backbone=dict(
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'motifs_panoptic_fpn_r101_fpn_1x_sgdet_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ # config=work_dir + "/cfg.yaml"
+ ),
+ ),
+ ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
diff --git a/OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py b/OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..72e2afc7e139a93749fcb28f8f8a7b4c3612478d
--- /dev/null
+++ b/OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py
@@ -0,0 +1,241 @@
+_base_ = [
+ '../_base_/models/mask_rcnn_r50_fpn.py',
+ '../_base_/datasets/psg.py',
+ '../_base_/schedules/schedule_1x.py',
+ '../_base_/custom_runtime.py',
+]
+
+find_unused_parameters = True
+dataset_type = 'PanopticSceneGraphDataset'
+
+# HACK: class names are hard-coded here and passed to relation_head below.
+object_classes = [
+ 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+ 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+ 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+ 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
+ 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
+ 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
+ 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
+ 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+ 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
+ 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
+ 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
+ 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
+ 'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
+ 'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
+ 'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
+ 'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
+ 'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
+ 'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
+ 'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
+ 'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
+ 'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
+ 'food-other-merged', 'building-other-merged', 'rock-merged',
+ 'wall-other-merged', 'rug-merged'
+]
+
+predicate_classes = [
+ 'over',
+ 'in front of',
+ 'beside',
+ 'on',
+ 'in',
+ 'attached to',
+ 'hanging from',
+ 'on back of',
+ 'falling off',
+ 'going down',
+ 'painted on',
+ 'walking on',
+ 'running on',
+ 'crossing',
+ 'standing on',
+ 'lying on',
+ 'sitting on',
+ 'flying over',
+ 'jumping over',
+ 'jumping from',
+ 'wearing',
+ 'holding',
+ 'carrying',
+ 'looking at',
+ 'guiding',
+ 'kissing',
+ 'eating',
+ 'drinking',
+ 'feeding',
+ 'biting',
+ 'catching',
+ 'picking',
+ 'playing with',
+ 'chasing',
+ 'climbing',
+ 'cleaning',
+ 'playing',
+ 'touching',
+ 'pushing',
+ 'pulling',
+ 'opening',
+ 'cooking',
+ 'talking to',
+ 'throwing',
+ 'slicing',
+ 'driving',
+ 'riding',
+ 'parked on',
+ 'driving on',
+ 'about to hit',
+ 'kicking',
+ 'swinging',
+ 'entering',
+ 'exiting',
+ 'enclosing',
+ 'leaning on',
+]
+
+model = dict(
+ type='SceneGraphPanopticFPN',
+ semantic_head=dict(
+ type='PanopticFPNHead',
+ num_things_classes=80,
+ num_stuff_classes=53,
+ in_channels=256,
+ inner_channels=128,
+ start_level=0,
+ end_level=4,
+ norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
+ conv_cfg=None,
+ loss_seg=dict(type='CrossEntropyLoss',
+ ignore_index=255,
+ loss_weight=0.5),
+ ),
+ panoptic_fusion_head=dict(type='HeuristicFusionHead',
+ num_things_classes=80,
+ num_stuff_classes=53),
+ test_cfg=dict(panoptic=dict(
+ score_thr=0.6,
+ max_per_img=100,
+ mask_thr_binary=0.5,
+ mask_overlap=0.5,
+ nms=dict(type='nms', iou_threshold=0.5, class_agnostic=True),
+ stuff_area_limit=4096,
+ )),
+ relation_head=dict(
+ type='MotifHead',
+ object_classes=object_classes,
+ predicate_classes=predicate_classes,
+ num_classes=len(object_classes) + 1, # with background class
+ num_predicates=len(predicate_classes) + 1,
+ use_bias=False, # NOTE: whether to use frequency bias
+ head_config=dict(
+ # NOTE: Evaluation type is predcls, so GT boxes and labels are enabled.
+ use_gt_box=True,
+ use_gt_label=True,
+ use_vision=True,
+ embed_dim=200,
+ hidden_dim=512,
+ roi_dim=1024,
+ context_pooling_dim=4096,
+ dropout_rate=0.2,
+ context_object_layer=1,
+ context_edge_layer=1,
+ glove_dir='data/glove/',
+ causal_effect_analysis=False,
+ ),
+ bbox_roi_extractor=dict(
+ type='VisualSpatialExtractor',
+ bbox_roi_layer=dict(type='RoIAlign',
+ output_size=7,
+ sampling_ratio=2),
+ with_visual_bbox=True,
+ with_visual_mask=False,
+ with_visual_point=False,
+ with_spatial=False,
+ in_channels=256,
+ fc_out_channels=1024,
+ featmap_strides=[4, 8, 16, 32],
+ ),
+ relation_roi_extractor=dict(
+ type='VisualSpatialExtractor',
+ bbox_roi_layer=dict(type='RoIAlign',
+ output_size=7,
+ sampling_ratio=2),
+ with_visual_bbox=True,
+ with_visual_mask=False,
+ with_visual_point=False,
+ with_spatial=True,
+ separate_spatial=False,
+ in_channels=256,
+ fc_out_channels=1024,
+ featmap_strides=[4, 8, 16, 32],
+ ),
+ relation_sampler=dict(
+ type='Motif',
+ pos_iou_thr=0.5,
+ require_overlap=False, # for sgdet training, not require
+ num_sample_per_gt_rel=4,
+ num_rel_per_image=1024,
+ pos_fraction=0.25,
+ # NOTE: To only include overlapping bboxes?
+ test_overlap=False, # for testing
+ ),
+ loss_object=dict(type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_relation=dict(type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ ),
+)
+
+custom_hooks = []
+
+# To freeze modules
+freeze_modules = [
+ 'backbone',
+ 'neck',
+ 'rpn_head',
+ 'roi_head',
+ 'semantic_head',
+ 'panoptic_fusion_head',
+]
+
+evaluation = dict(interval=1,
+ metric='predcls',
+ relation_mode=True,
+ classwise=True)
+
+# Change batch size and learning rate
+data = dict(samples_per_gpu=16, )
+# optimizer = dict(lr=0.003)
+optimizer = dict(type='SGD', lr=0.03, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(_delete_=True,
+ grad_clip=dict(max_norm=35, norm_type=2))
+
+lr_config = dict(policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=1.0 / 3,
+ step=[7, 10])
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'motifs_panoptic_fpn_r50_fpn_1x_predcls_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ ),
+ ),
+ ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r50_fpn_1x_coco_20210821_101153-9668fd13.pth'
diff --git a/OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_sgdet_psg.py b/OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..55586140a9723c83b0b347bbfde042822ae8618b
--- /dev/null
+++ b/OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
@@ -0,0 +1,44 @@
+_base_ = [
+ './panoptic_fpn_r50_fpn_1x_predcls_psg.py',
+]
+
+model = dict(
+ relation_head=dict(
+ head_config=dict(
+ # NOTE: Evaluation type is sgdet, so GT boxes and labels are disabled.
+ use_gt_box=False,
+ use_gt_label=False,
+ ), ),
+ roi_head=dict(bbox_head=dict(type='SceneGraphBBoxHead'), ),
+)
+
+evaluation = dict(interval=1,
+ metric='sgdet',
+ relation_mode=True,
+ classwise=True,
+ iou_thrs=0.5,
+ detection_method='pan_seg')
+
+# Change batch size (the optimizer is not overridden in this file)
+data = dict(samples_per_gpu=8,
+ # workers_per_gpu=2
+ )
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'motifs_panoptic_fpn_r50_fpn_1x_sgdet_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ ),
+ ),
+ ],
+)
diff --git a/OpenPSG/configs/psgformer/psgformer_r101_psg.py b/OpenPSG/configs/psgformer/psgformer_r101_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..7055248f2307ca9b32f7efe3c6a65f118019a0c7
--- /dev/null
+++ b/OpenPSG/configs/psgformer/psgformer_r101_psg.py
@@ -0,0 +1,16 @@
+_base_ = './psgformer_r50_psg.py'
+
+model = dict(backbone=dict(
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+# learning policy
+lr_config = dict(policy='step', step=48)
+runner = dict(type='EpochBasedRunner', max_epochs=60)
+
+project_name = 'psgformer'
+expt_name = 'psgformer_r101_psg'
+work_dir = f'./work_dirs/{expt_name}'
+checkpoint_config = dict(interval=12, max_keep_ckpts=10)
+
+load_from = './work_dirs/checkpoints/detr4psgformer_r101.pth'
diff --git a/OpenPSG/configs/psgformer/psgformer_r50.py b/OpenPSG/configs/psgformer/psgformer_r50.py
new file mode 100644
index 0000000000000000000000000000000000000000..31f77e61bf46c57f8b064ca94d6a5d35b8008411
--- /dev/null
+++ b/OpenPSG/configs/psgformer/psgformer_r50.py
@@ -0,0 +1,96 @@
+model = dict(
+ type='PSGTr',
+ backbone=dict(type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='pytorch',
+ init_cfg=dict(type='Pretrained',
+ checkpoint='torchvision://resnet50')),
+ bbox_head=dict(
+ type='PSGFormerHead',
+ num_classes=80,
+ num_relations=117,
+ in_channels=2048,
+ transformer=dict(
+ type='DualTransformer',
+ encoder=dict(type='DetrTransformerEncoder',
+ num_layers=6,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=[
+ dict(type='MultiheadAttention',
+ embed_dims=256,
+ num_heads=8,
+ dropout=0.1)
+ ],
+ feedforward_channels=2048,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'ffn',
+ 'norm'))),
+ decoder1=dict(type='DetrTransformerDecoder',
+ return_intermediate=True,
+ num_layers=6,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=dict(type='MultiheadAttention',
+ embed_dims=256,
+ num_heads=8,
+ dropout=0.1),
+ feedforward_channels=2048,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm',
+ 'cross_attn', 'norm', 'ffn',
+ 'norm'))),
+ decoder2=dict(type='DetrTransformerDecoder',
+ return_intermediate=True,
+ num_layers=6,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=dict(type='MultiheadAttention',
+ embed_dims=256,
+ num_heads=8,
+ dropout=0.1),
+ feedforward_channels=2048,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm',
+ 'cross_attn', 'norm', 'ffn',
+ 'norm'))),
+ ),
+ positional_encoding=dict(type='SinePositionalEncoding',
+ num_feats=128,
+ normalize=True),
+ rel_loss_cls=dict(type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=2.0,
+ class_weight=1.0),
+ sub_id_loss=dict(type='MultilabelCrossEntropy', loss_weight=2.0),
+ obj_id_loss=dict(type='MultilabelCrossEntropy', loss_weight=2.0),
+ loss_cls=dict(type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=4.0,
+ class_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=3.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+ focal_loss=dict(type='BCEFocalLoss', loss_weight=1.0),
+ dice_loss=dict(type='psgtrDiceLoss', loss_weight=1.0)),
+ # training and testing settings
+ train_cfg=dict(id_assigner=dict(type='IdMatcher',
+ sub_id_cost=dict(type='ClassificationCost',
+ weight=1.),
+ obj_id_cost=dict(type='ClassificationCost',
+ weight=1.),
+ r_cls_cost=dict(type='ClassificationCost',
+ weight=1.)),
+ bbox_assigner=dict(type='HungarianAssigner',
+ cls_cost=dict(type='ClassificationCost',
+ weight=4.0),
+ reg_cost=dict(type='BBoxL1Cost',
+ weight=3.0),
+ iou_cost=dict(type='IoUCost',
+ iou_mode='giou',
+ weight=2.0))),
+ test_cfg=dict(max_per_img=100))
diff --git a/OpenPSG/configs/psgformer/psgformer_r50_psg.py b/OpenPSG/configs/psgformer/psgformer_r50_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..6452d39335427fe40de8c8a869dedeb5992da2f9
--- /dev/null
+++ b/OpenPSG/configs/psgformer/psgformer_r50_psg.py
@@ -0,0 +1,244 @@
+_base_ = [
+ './psgformer_r50.py', '../_base_/datasets/psg.py',
+ '../_base_/custom_runtime.py'
+]
+
+find_unused_parameters = True
+
+custom_imports = dict(imports=[
+ 'openpsg.models.frameworks.psgtr', 'openpsg.models.losses.seg_losses',
+ 'openpsg.models.frameworks.dual_transformer',
+ 'openpsg.models.relation_heads.psgformer_head', 'openpsg.datasets',
+ 'openpsg.datasets.pipelines.loading',
+ 'openpsg.datasets.pipelines.rel_randomcrop',
+ 'openpsg.models.relation_heads.approaches.matcher', 'openpsg.utils'
+],
+ allow_failed_imports=False)
+
+dataset_type = 'PanopticSceneGraphDataset'
+
+# HACK: class names are hard-coded here and passed to bbox_head below.
+object_classes = [
+ 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+ 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+ 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+ 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
+ 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
+ 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
+ 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
+ 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+ 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
+ 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
+ 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
+ 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
+ 'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
+ 'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
+ 'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
+ 'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
+ 'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
+ 'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
+ 'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
+ 'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
+ 'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
+ 'food-other-merged', 'building-other-merged', 'rock-merged',
+ 'wall-other-merged', 'rug-merged'
+]
+
+predicate_classes = [
+ 'over',
+ 'in front of',
+ 'beside',
+ 'on',
+ 'in',
+ 'attached to',
+ 'hanging from',
+ 'on back of',
+ 'falling off',
+ 'going down',
+ 'painted on',
+ 'walking on',
+ 'running on',
+ 'crossing',
+ 'standing on',
+ 'lying on',
+ 'sitting on',
+ 'flying over',
+ 'jumping over',
+ 'jumping from',
+ 'wearing',
+ 'holding',
+ 'carrying',
+ 'looking at',
+ 'guiding',
+ 'kissing',
+ 'eating',
+ 'drinking',
+ 'feeding',
+ 'biting',
+ 'catching',
+ 'picking',
+ 'playing with',
+ 'chasing',
+ 'climbing',
+ 'cleaning',
+ 'playing',
+ 'touching',
+ 'pushing',
+ 'pulling',
+ 'opening',
+ 'cooking',
+ 'talking to',
+ 'throwing',
+ 'slicing',
+ 'driving',
+ 'riding',
+ 'parked on',
+ 'driving on',
+ 'about to hit',
+ 'kicking',
+ 'swinging',
+ 'entering',
+ 'exiting',
+ 'enclosing',
+ 'leaning on',
+]
+
+model = dict(bbox_head=dict(
+ num_classes=len(object_classes),
+ num_relations=len(predicate_classes),
+ object_classes=object_classes,
+ predicate_classes=predicate_classes,
+ num_obj_query=100,
+ num_rel_query=100,
+), )
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True)
+# train_pipeline. NOTE: the img_scale and the Pad's size_divisor are different
+# from the default settings in mmdet.
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadPanopticSceneGraphAnnotations',
+ with_bbox=True,
+ with_rel=True,
+ with_mask=True,
+ with_seg=True),
+ dict(type='RandomFlip', flip_ratio=0.5),
+ dict(
+ type='AutoAugment',
+ policies=[
+ [
+ dict(type='Resize',
+ img_scale=[(480, 1333), (512, 1333), (544, 1333),
+ (576, 1333), (608, 1333), (640, 1333),
+ (672, 1333), (704, 1333), (736, 1333),
+ (768, 1333), (800, 1333)],
+ multiscale_mode='value',
+ keep_ratio=True)
+ ],
+ [
+ dict(type='Resize',
+ img_scale=[(400, 1333), (500, 1333), (600, 1333)],
+ multiscale_mode='value',
+ keep_ratio=True),
+ dict(type='RelRandomCrop',
+ crop_type='absolute_range',
+ crop_size=(384, 600),
+ allow_negative_crop=False), # no empty relations
+ dict(type='Resize',
+ img_scale=[(480, 1333), (512, 1333), (544, 1333),
+ (576, 1333), (608, 1333), (640, 1333),
+ (672, 1333), (704, 1333), (736, 1333),
+ (768, 1333), (800, 1333)],
+ multiscale_mode='value',
+ override=True,
+ keep_ratio=True)
+ ]
+ ]),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=1),
+ dict(type='RelsFormatBundle'),
+ dict(type='Collect',
+ keys=['img', 'gt_bboxes', 'gt_labels', 'gt_rels', 'gt_masks'])
+]
+# test_pipeline. NOTE: the Pad's size_divisor differs from the default
+# setting (size_divisor=32), although it has little effect on performance
+# whether we use the default setting or size_divisor=1.
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True),
+ dict(type='MultiScaleFlipAug',
+ img_scale=(1333, 800),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=1),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']),
+ dict(type='ToDataContainer',
+ fields=(dict(key='gt_bboxes'), dict(key='gt_labels'))),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+
+evaluation = dict(
+ interval=1,
+ metric='sgdet',
+ relation_mode=True,
+ classwise=True,
+ iou_thrs=0.5,
+ detection_method='pan_seg',
+)
+
+data = dict(samples_per_gpu=1,
+ workers_per_gpu=2,
+ train=dict(pipeline=train_pipeline),
+ val=dict(pipeline=test_pipeline),
+ test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+ type='AdamW',
+ lr=0.001,
+ weight_decay=0.0001,
+ paramwise_cfg=dict(
+ custom_keys={
+ 'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+ 'transformer.encoder': dict(lr_mult=0.1, decay_mult=1.0),
+ 'transformer.decoder1': dict(lr_mult=0.1, decay_mult=1.0),
+ 'obj_query_embed': dict(lr_mult=0.1, decay_mult=1.0),
+ 'input_proj': dict(lr_mult=0.1, decay_mult=1.0),
+ 'class_embed': dict(lr_mult=0.1, decay_mult=1.0),
+ 'box_embed': dict(lr_mult=0.1, decay_mult=1.0),
+ 'bbox_attention': dict(lr_mult=0.1, decay_mult=1.0),
+ 'mask_head': dict(lr_mult=0.1, decay_mult=1.0),
+ }))
+
+optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
+
+# learning policy
+lr_config = dict(policy='step', step=40)
+runner = dict(type='EpochBasedRunner', max_epochs=60)
+
+project_name = 'psgformer'
+expt_name = 'psgformer_r50_psg'
+work_dir = f'./work_dirs/{expt_name}'
+checkpoint_config = dict(interval=1, max_keep_ckpts=15)
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ ),
+ )
+ ],
+)
+
+load_from = './work_dirs/checkpoints/detr4psgformer_r50.pth'
diff --git a/OpenPSG/configs/psgformer/psgformer_r50_psg_inference.py b/OpenPSG/configs/psgformer/psgformer_r50_psg_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..37bebaf42627dc17503986567b18fc6a9770f427
--- /dev/null
+++ b/OpenPSG/configs/psgformer/psgformer_r50_psg_inference.py
@@ -0,0 +1,31 @@
+_base_ = [
+ './psgformer_r50_psg.py'
+]
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True)
+pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1333, 800),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ # NOTE: Do not change the img to DC.
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+
+ ],
+ ),
+]
+
+data = dict(
+ test=dict(
+ pipeline=pipeline,
+ ),
+)
\ No newline at end of file
diff --git a/OpenPSG/configs/psgtr/psgtr_r101_psg.py b/OpenPSG/configs/psgtr/psgtr_r101_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..916dc05998c72b83fb5c3221be10af3f5a7f7827
--- /dev/null
+++ b/OpenPSG/configs/psgtr/psgtr_r101_psg.py
@@ -0,0 +1,231 @@
+_base_ = [
+ '../_base_/models/psgtr_r101.py', '../_base_/datasets/psg.py',
+ '../_base_/custom_runtime.py'
+]
+
+custom_imports = dict(imports=[
+ 'openpsg.models.frameworks.psgtr', 'openpsg.models.losses.seg_losses',
+ 'openpsg.models.relation_heads.psgtr_head', 'openpsg.datasets',
+ 'openpsg.datasets.pipelines.loading',
+ 'openpsg.datasets.pipelines.rel_randomcrop',
+ 'openpsg.models.relation_heads.approaches.matcher', 'openpsg.utils'
+],
+ allow_failed_imports=False)
+
+dataset_type = 'PanopticSceneGraphDataset'
+
+# HACK:
+object_classes = [
+ 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+ 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+ 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+ 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
+ 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
+ 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
+ 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
+ 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+ 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
+ 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
+ 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
+ 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
+ 'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
+ 'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
+ 'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
+ 'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
+ 'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
+ 'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
+ 'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
+ 'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
+ 'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
+ 'food-other-merged', 'building-other-merged', 'rock-merged',
+ 'wall-other-merged', 'rug-merged'
+]
+
+predicate_classes = [
+ 'over',
+ 'in front of',
+ 'beside',
+ 'on',
+ 'in',
+ 'attached to',
+ 'hanging from',
+ 'on back of',
+ 'falling off',
+ 'going down',
+ 'painted on',
+ 'walking on',
+ 'running on',
+ 'crossing',
+ 'standing on',
+ 'lying on',
+ 'sitting on',
+ 'flying over',
+ 'jumping over',
+ 'jumping from',
+ 'wearing',
+ 'holding',
+ 'carrying',
+ 'looking at',
+ 'guiding',
+ 'kissing',
+ 'eating',
+ 'drinking',
+ 'feeding',
+ 'biting',
+ 'catching',
+ 'picking',
+ 'playing with',
+ 'chasing',
+ 'climbing',
+ 'cleaning',
+ 'playing',
+ 'touching',
+ 'pushing',
+ 'pulling',
+ 'opening',
+ 'cooking',
+ 'talking to',
+ 'throwing',
+ 'slicing',
+ 'driving',
+ 'riding',
+ 'parked on',
+ 'driving on',
+ 'about to hit',
+ 'kicking',
+ 'swinging',
+ 'entering',
+ 'exiting',
+ 'enclosing',
+ 'leaning on',
+]
+
+model = dict(bbox_head=dict(
+ num_classes=len(object_classes),
+ num_relations=len(predicate_classes),
+ object_classes=object_classes,
+ predicate_classes=predicate_classes,
+ use_mask=True,
+ num_query=100,
+), )
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True)
+# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
+# from the default setting in mmdet.
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadPanopticSceneGraphAnnotations',
+ with_bbox=True,
+ with_rel=True,
+ with_mask=True,
+ with_seg=True),
+ dict(type='RandomFlip', flip_ratio=0.5),
+ dict(
+ type='AutoAugment',
+ policies=[
+ [
+ dict(type='Resize',
+ img_scale=[(480, 1333), (512, 1333), (544, 1333),
+ (576, 1333), (608, 1333), (640, 1333),
+ (672, 1333), (704, 1333), (736, 1333),
+ (768, 1333), (800, 1333)],
+ multiscale_mode='value',
+ keep_ratio=True)
+ ],
+ [
+ dict(type='Resize',
+ img_scale=[(400, 1333), (500, 1333), (600, 1333)],
+ multiscale_mode='value',
+ keep_ratio=True),
+ dict(type='RelRandomCrop',
+ crop_type='absolute_range',
+ crop_size=(384, 600),
+ allow_negative_crop=False), # no empty relations
+ dict(type='Resize',
+ img_scale=[(480, 1333), (512, 1333), (544, 1333),
+ (576, 1333), (608, 1333), (640, 1333),
+ (672, 1333), (704, 1333), (736, 1333),
+ (768, 1333), (800, 1333)],
+ multiscale_mode='value',
+ override=True,
+ keep_ratio=True)
+ ]
+ ]),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=1),
+ dict(type='RelsFormatBundle'),
+ dict(type='Collect',
+ keys=['img', 'gt_bboxes', 'gt_labels', 'gt_rels', 'gt_masks'])
+]
+# test_pipeline, NOTE the Pad's size_divisor is different from the default
+# setting (size_divisor=32). While there is little effect on the performance
+# whether we use the default setting or use size_divisor=1.
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ # dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1333, 800),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=1),
+ dict(type='ImageToTensor', keys=['img']),
+ # dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']),
+ # dict(type='ToDataContainer', fields=(dict(key='gt_bboxes'), dict(key='gt_labels'))),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+
+evaluation = dict(
+ interval=1,
+ metric='sgdet',
+ relation_mode=True,
+ classwise=True,
+ iou_thrs=0.5,
+ detection_method='pan_seg',
+)
+
+data = dict(samples_per_gpu=1,
+ workers_per_gpu=2,
+ train=dict(pipeline=train_pipeline),
+ val=dict(pipeline=test_pipeline),
+ test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+ type='AdamW',
+ lr=0.0001,
+ weight_decay=0.0001,
+ paramwise_cfg=dict(custom_keys={
+ 'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+ }))
+optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
+
+# learning policy
+lr_config = dict(policy='step', step=40)
+runner = dict(type='EpochBasedRunner', max_epochs=60)
+
+project_name = 'psgtr'
+expt_name = 'psgtr_r101_psg'
+work_dir = f'./work_dirs/{expt_name}'
+checkpoint_config = dict(interval=2, max_keep_ckpts=10)
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ ),
+ )
+ ],
+)
+
+load_from = 'work_dirs/checkpoints/detr_pan_r101.pth'
diff --git a/OpenPSG/configs/psgtr/psgtr_r50.py b/OpenPSG/configs/psgtr/psgtr_r50.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8827bbb9461a34a9d894c2aee9fb6286503898d
--- /dev/null
+++ b/OpenPSG/configs/psgtr/psgtr_r50.py
@@ -0,0 +1,82 @@
+model = dict(
+ type='PSGTr',
+ backbone=dict(type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='pytorch',
+ init_cfg=dict(type='Pretrained',
+ checkpoint='torchvision://resnet50')),
+ bbox_head=dict(type='PSGTrHead',
+ num_classes=80,
+ num_relations=117,
+ in_channels=2048,
+ transformer=dict(
+ type='Transformer',
+ encoder=dict(type='DetrTransformerEncoder',
+ num_layers=6,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=[
+ dict(type='MultiheadAttention',
+ embed_dims=256,
+ num_heads=8,
+ dropout=0.1)
+ ],
+ feedforward_channels=2048,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetrTransformerDecoder',
+ return_intermediate=True,
+ num_layers=6,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=dict(type='MultiheadAttention',
+ embed_dims=256,
+ num_heads=8,
+ dropout=0.1),
+ feedforward_channels=2048,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm',
+ 'cross_attn', 'norm', 'ffn',
+ 'norm')),
+ )),
+ positional_encoding=dict(type='SinePositionalEncoding',
+ num_feats=128,
+ normalize=True),
+ sub_loss_cls=dict(type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0,
+ class_weight=1.0),
+ sub_loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+ sub_loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+ sub_focal_loss=dict(type='BCEFocalLoss', loss_weight=2.0),
+ sub_dice_loss=dict(type='psgtrDiceLoss', loss_weight=2.0),
+ obj_loss_cls=dict(type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0,
+ class_weight=1.0),
+ obj_loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+ obj_loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+ obj_focal_loss=dict(type='BCEFocalLoss', loss_weight=2.0),
+ obj_dice_loss=dict(type='psgtrDiceLoss', loss_weight=2.0),
+ rel_loss_cls=dict(type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=2.0,
+ class_weight=1.0)),
+ # training and testing settings
+ train_cfg=dict(assigner=dict(
+ type='HTriMatcher',
+ s_cls_cost=dict(type='ClassificationCost', weight=1.),
+ s_reg_cost=dict(type='BBoxL1Cost', weight=5.0),
+ s_iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+ o_cls_cost=dict(type='ClassificationCost', weight=1.),
+ o_reg_cost=dict(type='BBoxL1Cost', weight=5.0),
+ o_iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+ r_cls_cost=dict(type='ClassificationCost', weight=2.))),
+ test_cfg=dict(max_per_img=100))
diff --git a/OpenPSG/configs/psgtr/psgtr_r50_psg.py b/OpenPSG/configs/psgtr/psgtr_r50_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..6440149836d4eadd912b5c00412e247ee4637e68
--- /dev/null
+++ b/OpenPSG/configs/psgtr/psgtr_r50_psg.py
@@ -0,0 +1,233 @@
+_base_ = [
+ '../_base_/models/psgtr_r50.py', '../_base_/datasets/psg.py',
+ '../_base_/custom_runtime.py'
+]
+
+custom_imports = dict(imports=[
+ 'openpsg.models.frameworks.psgtr', 'openpsg.models.losses.seg_losses',
+ 'openpsg.models.relation_heads.psgtr_head', 'openpsg.datasets',
+ 'openpsg.datasets.pipelines.loading',
+ 'openpsg.datasets.pipelines.rel_randomcrop',
+ 'openpsg.models.relation_heads.approaches.matcher', 'openpsg.utils'
+],
+ allow_failed_imports=False)
+
+dataset_type = 'PanopticSceneGraphDataset'
+
+# HACK:
+object_classes = [
+ 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+ 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+ 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+ 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
+ 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
+ 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
+ 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
+ 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+ 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
+ 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
+ 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
+ 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
+ 'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
+ 'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
+ 'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
+ 'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
+ 'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
+ 'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
+ 'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
+ 'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
+ 'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
+ 'food-other-merged', 'building-other-merged', 'rock-merged',
+ 'wall-other-merged', 'rug-merged'
+]
+
+predicate_classes = [
+ 'over',
+ 'in front of',
+ 'beside',
+ 'on',
+ 'in',
+ 'attached to',
+ 'hanging from',
+ 'on back of',
+ 'falling off',
+ 'going down',
+ 'painted on',
+ 'walking on',
+ 'running on',
+ 'crossing',
+ 'standing on',
+ 'lying on',
+ 'sitting on',
+ 'flying over',
+ 'jumping over',
+ 'jumping from',
+ 'wearing',
+ 'holding',
+ 'carrying',
+ 'looking at',
+ 'guiding',
+ 'kissing',
+ 'eating',
+ 'drinking',
+ 'feeding',
+ 'biting',
+ 'catching',
+ 'picking',
+ 'playing with',
+ 'chasing',
+ 'climbing',
+ 'cleaning',
+ 'playing',
+ 'touching',
+ 'pushing',
+ 'pulling',
+ 'opening',
+ 'cooking',
+ 'talking to',
+ 'throwing',
+ 'slicing',
+ 'driving',
+ 'riding',
+ 'parked on',
+ 'driving on',
+ 'about to hit',
+ 'kicking',
+ 'swinging',
+ 'entering',
+ 'exiting',
+ 'enclosing',
+ 'leaning on',
+]
+
+model = dict(bbox_head=dict(
+ num_classes=len(object_classes),
+ num_relations=len(predicate_classes),
+ object_classes=object_classes,
+ predicate_classes=predicate_classes,
+ use_mask=True,
+ num_query=100,
+), )
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True)
+# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
+# from the default setting in mmdet.
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadPanopticSceneGraphAnnotations',
+ with_bbox=True,
+ with_rel=True,
+ with_mask=True,
+ with_seg=True),
+ dict(type='RandomFlip', flip_ratio=0.5),
+ dict(
+ type='AutoAugment',
+ policies=[
+ [
+ dict(type='Resize',
+ img_scale=[(480, 1333), (512, 1333), (544, 1333),
+ (576, 1333), (608, 1333), (640, 1333),
+ (672, 1333), (704, 1333), (736, 1333),
+ (768, 1333), (800, 1333)],
+ multiscale_mode='value',
+ keep_ratio=True)
+ ],
+ [
+ dict(type='Resize',
+ img_scale=[(400, 1333), (500, 1333), (600, 1333)],
+ multiscale_mode='value',
+ keep_ratio=True),
+ dict(type='RelRandomCrop',
+ crop_type='absolute_range',
+ crop_size=(384, 600),
+ allow_negative_crop=False), # no empty relations
+ dict(type='Resize',
+ img_scale=[(480, 1333), (512, 1333), (544, 1333),
+ (576, 1333), (608, 1333), (640, 1333),
+ (672, 1333), (704, 1333), (736, 1333),
+ (768, 1333), (800, 1333)],
+ multiscale_mode='value',
+ override=True,
+ keep_ratio=True)
+ ]
+ ]),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=1),
+ dict(type='RelsFormatBundle'),
+ dict(type='Collect',
+ keys=['img', 'gt_bboxes', 'gt_labels', 'gt_rels', 'gt_masks'])
+]
+# test_pipeline, NOTE the Pad's size_divisor is different from the default
+# setting (size_divisor=32). While there is little effect on the performance
+# whether we use the default setting or use size_divisor=1.
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ # dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1333, 800),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=1),
+ dict(type='ImageToTensor', keys=['img']),
+ # dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']),
+ # dict(type='ToDataContainer', fields=(dict(key='gt_bboxes'), dict(key='gt_labels'))),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+
+evaluation = dict(
+ interval=1,
+ metric='sgdet',
+ relation_mode=True,
+ classwise=True,
+ iou_thrs=0.5,
+ detection_method='pan_seg',
+)
+
+data = dict(samples_per_gpu=1,
+ workers_per_gpu=2,
+ train=dict(pipeline=train_pipeline),
+ val=dict(pipeline=test_pipeline),
+ test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+ type='AdamW',
+ lr=0.0001,
+ weight_decay=0.0001,
+ paramwise_cfg=dict(custom_keys={
+ 'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+ }))
+optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
+
+# learning policy
+lr_config = dict(policy='step', step=40)
+runner = dict(type='EpochBasedRunner', max_epochs=60)
+
+project_name = 'psgformer'
+expt_name = 'psgtr_r50_psg_0.5_scale_mask'
+work_dir = f'./work_dirs/{expt_name}'
+checkpoint_config = dict(interval=2, max_keep_ckpts=10)
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook'),
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ # config=work_dir + "/cfg.yaml"
+ ),
+ )
+ ],
+)
+
+load_from = 'work_dirs/checkpoints/detr_pan_r50.pth'
diff --git a/OpenPSG/configs/psgtr/psgtr_r50_psg_inference.py b/OpenPSG/configs/psgtr/psgtr_r50_psg_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d32a233c2690c53b40a60a69d10b6fa58d0ea7f
--- /dev/null
+++ b/OpenPSG/configs/psgtr/psgtr_r50_psg_inference.py
@@ -0,0 +1,31 @@
+_base_ = [
+ './psgtr_r50_psg.py'
+]
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True)
+pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1333, 800),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ # NOTE: Do not change the img to DC.
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+
+ ],
+ ),
+]
+
+data = dict(
+ test=dict(
+ pipeline=pipeline,
+ ),
+)
\ No newline at end of file
diff --git a/OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_predcls_psg.py b/OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_predcls_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..faabe0d659a7e1b24b2f58dda644a9a0fe8faf08
--- /dev/null
+++ b/OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_predcls_psg.py
@@ -0,0 +1,28 @@
+_base_ = './panoptic_fpn_r50_fpn_1x_predcls_psg.py'
+
+model = dict(backbone=dict(
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'vctree_panoptic_fpn_r101_fpn_1x_predcls_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ # config=work_dir + "/cfg.yaml"
+ ),
+ ),
+ ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
diff --git a/OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_sgdet_psg.py b/OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc49e1368baa36b8fcc2c14a3fb7703e51c854f2
--- /dev/null
+++ b/OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
@@ -0,0 +1,28 @@
+_base_ = './panoptic_fpn_r50_fpn_1x_sgdet_psg.py'
+
+model = dict(backbone=dict(
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'vctree_panoptic_fpn_r101_fpn_1x_sgdet_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ # config=work_dir + "/cfg.yaml"
+ ),
+ ),
+ ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
diff --git a/OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_predcls_psg.py b/OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_predcls_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..e78db15d48d404634713181231bb498ed27b936b
--- /dev/null
+++ b/OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_predcls_psg.py
@@ -0,0 +1,43 @@
+_base_ = [
+ '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py',
+]
+
+model = dict(relation_head=dict(
+ type='VCTreeHead',
+ head_config=dict(
+ # NOTE: Evaluation type
+ use_gt_box=True,
+ use_gt_label=True,
+ ),
+))
+
+evaluation = dict(interval=1,
+ metric='predcls',
+ relation_mode=True,
+ classwise=True)
+
+# Change batch size and learning rate
+data = dict(samples_per_gpu=16,
+ workers_per_gpu=0) # FIXME: Is this the problem?
+# optimizer = dict(lr=0.001)
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'vctree_panoptic_fpn_r50_fpn_1x_predcls_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ # config=work_dir + "/cfg.yaml"
+ ),
+ ),
+ ],
+)
diff --git a/OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_sgdet_psg.py b/OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0f05d87f47ebc28920183e317aa26d0abb15026
--- /dev/null
+++ b/OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
@@ -0,0 +1,49 @@
+_base_ = [
+ '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py',
+]
+
+model = dict(
+ relation_head=dict(
+ type='VCTreeHead',
+ head_config=dict(
+ # NOTE: Evaluation type
+ use_gt_box=False,
+ use_gt_label=False,
+ ),
+ ),
+ roi_head=dict(bbox_head=dict(type='SceneGraphBBoxHead'), ),
+)
+
+evaluation = dict(interval=1,
+ metric='sgdet',
+ relation_mode=True,
+ classwise=True,
+ iou_thrs=0.5,
+ detection_method='pan_seg')
+
+# Change batch size and learning rate
+data = dict(samples_per_gpu=16,
+ # workers_per_gpu=2
+ )
+# optimizer = dict(lr=0.003)
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'vctree_panoptic_fpn_r50_fpn_1x_sgdet_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ dict(
+ type='WandbLoggerHook',
+ init_kwargs=dict(
+ project=project_name,
+ name=expt_name,
+ # config=work_dir + "/cfg.yaml"
+ ),
+ ),
+ ],
+)
diff --git a/README.md b/README.md
index 24242ef29f59b4de0e9631b0475d05e42dc73a05..56c55de1fab90a8642f9378c9dfef888302d9530 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
---
title: OpenPSG
-emoji: 🐠
-colorFrom: green
-colorTo: gray
+emoji: 🖼️🏙️🌄🌉
+colorFrom: yellow
+colorTo: blue
sdk: gradio
sdk_version: 3.1.4
app_file: app.py
diff --git a/app.py b/app.py
index f55e9e6c34fad83380e6d0562df75076cdd3883e..a471833d791aa71a92bad57908e9e8dc7e703dd7 100644
--- a/app.py
+++ b/app.py
@@ -1,15 +1,135 @@
-import numpy as np
+#!/usr/bin/env python
+
+from __future__ import annotations
+
+import argparse
+import os
+import pathlib
+import subprocess
+import tarfile
+
+if os.getenv('SYSTEM') == 'spaces':
+ import mim
+
+ mim.uninstall('mmcv-full', confirm_yes=True)
+ mim.install('mmcv-full==1.5.2', is_yes=True)
+
+ subprocess.call('pip uninstall -y opencv-python'.split())
+ subprocess.call('pip uninstall -y opencv-python-headless'.split())
+ subprocess.call('pip install opencv-python-headless==4.5.5.64'.split())
+
+import cv2
import gradio as gr
+import numpy as np
+
+from mmdet.apis import init_detector, inference_detector
+from utils import show_result
+import mmcv
+from mmcv import Config
+import os.path as osp
+
+DESCRIPTION = '''# OpenPSG
+
+This is an official demo for [OpenPSG](https://github.com/Jingkang50/OpenPSG).
+
+'''
+FOOTER = ''  # NOTE(review): footer markup is missing from this patch (string was unterminated); restore it if available
+
+
+def parse_args() -> argparse.Namespace:  # command-line options for the demo
+    cli = argparse.ArgumentParser()
+    cli.add_argument('--device', type=str, default='cpu')
+    cli.add_argument('--theme', type=str)
+    cli.add_argument('--share', action='store_true')
+    cli.add_argument('--port', type=int)
+    # --disable-queue clears enable_queue, which is True by default.
+    cli.add_argument(
+        '--disable-queue', dest='enable_queue', action='store_false')
+    return cli.parse_args()
+
+
+def update_input_image(image: np.ndarray) -> dict:
+    if image is None:  # no image (e.g. input cleared): pass the empty value through
+        return gr.Image.update(value=None)
+    scale = 1500 / max(image.shape[:2])  # ratio that brings the larger of the first two dims to 1500
+    if scale < 1:  # only shrink oversized images; smaller ones are left untouched
+        image = cv2.resize(image, None, fx=scale, fy=scale)
+    return gr.Image.update(value=image)
+
+
+def set_example_image(example: list) -> dict:  # load a clicked example into the image input
+ return gr.Image.update(value=example[0])  # first field of the sample; presumably the image path - confirm against the Dataset samples
+
+
+def infer(model, input_image, num_rel):  # run the detector on one image and return the rendered result
+ result = inference_detector(model, input_image)  # raw prediction from mmdet
+ return show_result(input_image,
+ result,
+ is_one_stage=True,
+ num_rel=num_rel,  # presumably the number of relations to draw - confirm in utils.show_result
+ show=True  # NOTE(review): meaning depends on utils.show_result; confirm it is safe on a headless server
+ )
+
+
+def main():  # load the PSGTR checkpoint, build the Gradio Blocks UI and launch it
+    args = parse_args()
+
+    model_ckt = 'OpenPSG/checkpoints/epoch_60.pth'
+    cfg = Config.fromfile('OpenPSG/configs/psgtr/psgtr_r50_psg_inference.py')
+
+    model = init_detector(cfg, model_ckt, device=args.device)
+
+    with gr.Blocks(theme=args.theme, css='style.css') as demo:
+        gr.Markdown(DESCRIPTION)
+
+        with gr.Row():
+            with gr.Column():
+                with gr.Row():
+                    input_image = gr.Image(label='Input Image', type='numpy')
+                with gr.Group():
+                    with gr.Row():
+                        num_rel = gr.Slider(
+                            5,
+                            100,
+                            step=5,
+                            value=20,
+                            label='Number of Relations')
+                with gr.Row():
+                    run_button = gr.Button(value='Run')
+                    # prediction_results = gr.Variable()
+            with gr.Column():
+                with gr.Row():
+                    # visualization = gr.Image(label='Result', type='numpy')
+                    result = gr.Gallery(label='Result', type='numpy')
+
+        with gr.Row():
+            paths = sorted(pathlib.Path('images').rglob('*.jpg'))
+            example_images = gr.Dataset(components=[input_image],
+                                        samples=[[path.as_posix()]
+                                                 for path in paths])
+
+        gr.Markdown(FOOTER)
+
+        input_image.change(fn=update_input_image,
+                           inputs=input_image,
+                           outputs=input_image)
+
+        run_button.click(
+            # Event inputs must be components, so the model is bound by
+            # closure and the slider supplies infer()'s num_rel argument.
+            fn=lambda image, rels: infer(model, image, rels),
+            inputs=[input_image, num_rel], outputs=result)
+
+        example_images.click(fn=set_example_image,
+                             inputs=example_images,
+                             outputs=input_image)
+
+    demo.launch(
+        enable_queue=args.enable_queue,
+        server_port=args.port,
+        share=args.share,
+    )
+
-def sepia(input_img):
- sepia_filter = np.array([
- [0.393, 0.769, 0.189],
- [0.349, 0.686, 0.168],
- [0.272, 0.534, 0.131]
- ])
- sepia_img = input_img.dot(sepia_filter.T)
- sepia_img /= sepia_img.max()
- return sepia_img
-
-demo = gr.Interface(sepia, gr.Image(shape=(200, 200)), "image")
-demo.launch(share=True)
\ No newline at end of file
+if __name__ == '__main__':
+ main()
diff --git a/fake_gan.py b/fake_gan.py
new file mode 100644
index 0000000000000000000000000000000000000000..723fa422afdd4a4323fff964b7b48d68315a76e5
--- /dev/null
+++ b/fake_gan.py
@@ -0,0 +1,56 @@
+# another demo
+# https://huggingface.co/spaces/dalle-mini/dalle-mini/blob/21944e2a8508568387951fc66a30e90f1d58819d/app/gradio/app.py
+
+# This demo needs to be run from the repo folder.
+# python fake_gan.py
+import os
+import random
+import time
+
+import gradio as gr
+
+
+def fake_gan(count, *args):  # return int(count) randomly chosen image URLs; *args is accepted but never read
+ time.sleep(1)  # fixed one-second pause (presumably to mimic model latency)
+ images = [
+ random.choice(
+ [
+ "https://images.unsplash.com/photo-1507003211169-0a1dd7228f2d?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=387&q=80",
+ "https://images.unsplash.com/photo-1554151228-14d9def656e4?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=386&q=80",
+ "https://images.unsplash.com/photo-1542909168-82c3e7fdca5c?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxzZWFyY2h8MXx8aHVtYW4lMjBmYWNlfGVufDB8fDB8fA%3D%3D&w=1000&q=80",
+ "https://images.unsplash.com/photo-1546456073-92b9f0a8d413?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=387&q=80",
+ "https://images.unsplash.com/photo-1601412436009-d964bd02edbc?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=464&q=80",
+ ]
+ )
+ for _ in range(int(count))  # one independent draw per requested image, so URLs may repeat
+ ]
+ return images
+
+
+cheetah = os.path.join(os.path.dirname(__file__), "files/cheetah1.jpg")
+
+demo = gr.Interface(
+ fn=fake_gan,
+ inputs=[
+ gr.Number(label="Generation Count"),
+ gr.Image(label="Initial Image (optional)"),
+ gr.Slider(0, 50, 25, label="TV_scale (for smoothness)"),
+ gr.Slider(0, 50, 25, label="Range_Scale (out of range RBG)"),
+ gr.Number(label="Seed"),
+ gr.Number(label="Respacing"),
+ ],
+ outputs=gr.Gallery(label="Generated Images"),
+ title="FD-GAN",
+ description="This is a fake demo of a GAN. In reality, the images are randomly chosen from Unsplash.",
+ examples=[
+ [2, cheetah, 12, None, None, None],
+ [1, cheetah, 2, None, None, None],
+ [4, cheetah, 42, None, None, None],
+ [5, cheetah, 23, None, None, None],
+ [4, cheetah, 11, None, None, None],
+ [3, cheetah, 1, None, None, None],
+ ],
+)
+
+if __name__ == "__main__":
+ demo.launch()
\ No newline at end of file
diff --git a/images/cooking.jpg b/images/cooking.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5e6a0026995854873875ca7831b0b88e473db837
Binary files /dev/null and b/images/cooking.jpg differ
diff --git a/images/forrest-gump.jpg b/images/forrest-gump.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..1c5b19b774bc3b9f9292aa870f7897c69aae89f4
Binary files /dev/null and b/images/forrest-gump.jpg differ
diff --git a/images/friends.jpg b/images/friends.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..92cb34b3671d620c7034331d3ff67300c686b74a
Binary files /dev/null and b/images/friends.jpg differ
diff --git a/images/mbappe.jpg b/images/mbappe.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b5f8c42f389696ad2a176c0282c5f705f01e0ca9
Binary files /dev/null and b/images/mbappe.jpg differ
diff --git a/images/messi.jpg b/images/messi.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..bf3e1b4141b376c85df73f4c14c4cac9be858ff3
Binary files /dev/null and b/images/messi.jpg differ
diff --git a/images/neymar-jr-angers-x-psg-160121.jpg b/images/neymar-jr-angers-x-psg-160121.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2e048c6998a15fa1c2af034ac259522b23d34348
Binary files /dev/null and b/images/neymar-jr-angers-x-psg-160121.jpg differ
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..dac17be281ce349af3d16e22fda95372e2930116
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+mmcv-full==1.5.2
+mmdet==2.25.0
+numpy==1.22.4
+opencv-python-headless==4.5.5.64
+openmim==0.1.5
+torch==1.11.0
+torchvision==0.12.0
diff --git a/style.css b/style.css
new file mode 100644
index 0000000000000000000000000000000000000000..22ad0be91ed35841bc456be4a0044474affc9a17
--- /dev/null
+++ b/style.css
@@ -0,0 +1,16 @@
+h1 {
+ text-align: center;
+}
+#input-image {
+ max-height: 300px;
+}
+#label-image {
+ height: 300px;
+}
+#result-image {
+ height: 300px;
+}
+img#visitor-badge {
+ display: block;
+ margin: auto;
+}
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f966daf6533811ab2a96bcdc84c2cf9e7360ad8
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,288 @@
+from typing import Tuple
+import os.path as osp
+import PIL
+import mmcv
+import numpy as np
+from detectron2.utils.colormap import colormap
+from detectron2.utils.visualizer import VisImage, Visualizer
+from mmdet.datasets.coco_panoptic import INSTANCE_OFFSET
+
+# Object category names, looked up by integer label in show_result
+# (CLASSES[label] for panoptic ids, CLASSES[label - 1] for relation objects),
+# so the order of the entries must not change.
+# NOTE(review): looks like the COCO panoptic thing/stuff vocabulary followed
+# by a trailing 'background' entry -- confirm against the model config.
+CLASSES = [
+ 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+ 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+ 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+ 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
+ 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
+ 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
+ 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
+ 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+ 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
+ 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
+ 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
+ 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
+ 'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
+ 'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
+ 'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
+ 'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
+ 'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
+ 'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
+ 'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
+ 'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
+ 'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
+ 'food-other-merged', 'building-other-merged', 'rock-merged',
+ 'wall-other-merged', 'rug-merged', 'background'
+]
+
+# Relation predicate names, looked up by index in show_result
+# (PREDICATES[rel_id], where rel_id is the argmax over rel_dists[:, 1:],
+# i.e. after the leading background column has been dropped), so the order
+# of the entries must not change.
+PREDICATES = [
+ 'over',
+ 'in front of',
+ 'beside',
+ 'on',
+ 'in',
+ 'attached to',
+ 'hanging from',
+ 'on back of',
+ 'falling off',
+ 'going down',
+ 'painted on',
+ 'walking on',
+ 'running on',
+ 'crossing',
+ 'standing on',
+ 'lying on',
+ 'sitting on',
+ 'flying over',
+ 'jumping over',
+ 'jumping from',
+ 'wearing',
+ 'holding',
+ 'carrying',
+ 'looking at',
+ 'guiding',
+ 'kissing',
+ 'eating',
+ 'drinking',
+ 'feeding',
+ 'biting',
+ 'catching',
+ 'picking',
+ 'playing with',
+ 'chasing',
+ 'climbing',
+ 'cleaning',
+ 'playing',
+ 'touching',
+ 'pushing',
+ 'pulling',
+ 'opening',
+ 'cooking',
+ 'talking to',
+ 'throwing',
+ 'slicing',
+ 'driving',
+ 'riding',
+ 'parked on',
+ 'driving on',
+ 'about to hit',
+ 'kicking',
+ 'swinging',
+ 'entering',
+ 'exiting',
+ 'enclosing',
+ 'leaning on',
+]
+
+
+def get_colormap(num_colors: int):
+ return (np.resize(colormap(), (num_colors, 3))).tolist()
+
+
+def draw_text(
+ viz_img: VisImage = None,
+ text: str = None,
+ x: float = None,
+ y: float = None,
+ color: Tuple[float, float, float] = [0, 0, 0],
+ size: float = 10,
+ padding: float = 5,
+ box_color: str = 'black',
+ font: str = None,
+) -> float:
+ text_obj = viz_img.ax.text(
+ x,
+ y,
+ text,
+ size=size,
+ # family="sans-serif",
+ bbox={
+ 'facecolor': box_color,
+ 'alpha': 0.8,
+ 'pad': padding,
+ 'edgecolor': 'none',
+ },
+ verticalalignment='top',
+ horizontalalignment='left',
+ color=color,
+ zorder=10,
+ rotation=0,
+ )
+ viz_img.get_image()
+ text_dims = text_obj.get_bbox_patch().get_extents()
+
+ return text_dims.width
+
+
+def show_result(img,
+ result,
+ is_one_stage,
+ num_rel=20,
+ show=False,
+ out_dir=None,
+ out_file=None):
+ # Load image
+ img = mmcv.imread(img)
+ img = img.copy() # (H, W, 3)
+ img_h, img_w = img.shape[:-1]
+
+ # Decrease contrast
+ img = PIL.Image.fromarray(img)
+ converter = PIL.ImageEnhance.Color(img)
+ img = converter.enhance(0.01)
+ if out_file is not None:
+ mmcv.imwrite(np.asarray(img), 'bw'+out_file)
+
+ # Draw masks
+ pan_results = result.pan_results
+
+ ids = np.unique(pan_results)[::-1]
+ num_classes = 133
+ legal_indices = (ids != num_classes) # for VOID label
+ ids = ids[legal_indices]
+
+ # Get predicted labels
+ labels = np.array([id % INSTANCE_OFFSET for id in ids], dtype=np.int64)
+ labels = [CLASSES[l] for l in labels]
+
+ #For psgtr
+ rel_obj_labels = result.labels
+ rel_obj_labels = [CLASSES[l - 1] for l in rel_obj_labels]
+
+ # (N_m, H, W)
+ segms = pan_results[None] == ids[:, None, None]
+ # Resize predicted masks
+ segms = [
+ mmcv.image.imresize(m.astype(float), (img_w, img_h)) for m in segms
+ ]
+ # One stage segmentation
+ masks = result.masks
+
+ # Choose colors for each instance in coco
+ colormap_coco = get_colormap(len(masks)) if is_one_stage else get_colormap(len(segms))
+ colormap_coco = (np.array(colormap_coco) / 255).tolist()
+
+ # Viualize masks
+ viz = Visualizer(img)
+ viz.overlay_instances(
+ labels=rel_obj_labels if is_one_stage else labels,
+ masks=masks if is_one_stage else segms,
+ assigned_colors=colormap_coco,
+ )
+ viz_img = viz.get_output().get_image()
+ if out_file is not None:
+ mmcv.imwrite(viz_img, out_file)
+
+ # Draw relations
+
+ # Filter out relations
+ n_rel_topk = num_rel
+ # Exclude background class
+ rel_dists = result.rel_dists[:, 1:]
+ # rel_dists = result.rel_dists
+ rel_scores = rel_dists.max(1)
+ # rel_scores = result.triplet_scores
+ # Extract relations with top scores
+ rel_topk_idx = np.argpartition(rel_scores, -n_rel_topk)[-n_rel_topk:]
+ rel_labels_topk = rel_dists[rel_topk_idx].argmax(1)
+ rel_pair_idxes_topk = result.rel_pair_idxes[rel_topk_idx]
+ relations = np.concatenate(
+ [rel_pair_idxes_topk, rel_labels_topk[..., None]], axis=1)
+ n_rels = len(relations)
+
+ top_padding = 20
+ bottom_padding = 20
+ left_padding = 20
+ text_size = 10
+ text_padding = 5
+ text_height = text_size + 2 * text_padding
+ row_padding = 10
+ height = (top_padding + bottom_padding + n_rels *
+ (text_height + row_padding) - row_padding)
+ width = img_w
+ curr_x = left_padding
+ curr_y = top_padding
+
+ # # Adjust colormaps
+ # colormap_coco = [adjust_text_color(c, viz) for c in colormap_coco]
+ viz_graph = VisImage(np.full((height, width, 3), 255))
+
+ all_rel_vis = []
+
+ for i, r in enumerate(relations):
+ s_idx, o_idx, rel_id = r
+ s_label = rel_obj_labels[s_idx]
+ o_label = rel_obj_labels[o_idx]
+ rel_label = PREDICATES[rel_id]
+ viz = Visualizer(img)
+ viz.overlay_instances(
+ labels=[s_label, o_label],
+ masks=[masks[s_idx], masks[o_idx]],
+ assigned_colors=[colormap_coco[s_idx], colormap_coco[o_idx]],
+ )
+ viz_masked_img = viz.get_output().get_image()
+
+ viz_graph = VisImage(np.full((40, width, 3), 255))
+ curr_x = 2
+ curr_y = 2
+ text_size = 25
+ text_padding = 20
+ font = 36
+ text_width = draw_text(
+ viz_img=viz_graph,
+ text=s_label,
+ x=curr_x,
+ y=curr_y,
+ color=colormap_coco[s_idx],
+ size=text_size,
+ padding=text_padding,
+ font=font,
+ )
+ curr_x += text_width
+ # Draw relation text
+ text_width = draw_text(
+ viz_img=viz_graph,
+ text=rel_label,
+ x=curr_x,
+ y=curr_y,
+ size=text_size,
+ padding=text_padding,
+ box_color='gainsboro',
+ font=font,
+ )
+ curr_x += text_width
+
+ # Draw object text
+ text_width = draw_text(
+ viz_img=viz_graph,
+ text=o_label,
+ x=curr_x,
+ y=curr_y,
+ color=colormap_coco[o_idx],
+ size=text_size,
+ padding=text_padding,
+ font=font,
+ )
+ output_viz_graph = np.vstack([viz_masked_img, viz_graph.get_image()])
+ if show:
+ all_rel_vis.append(output_viz_graph)
+
+ return all_rel_vis
\ No newline at end of file