File size: 1,202 Bytes
dcc8c59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# Per-channel mean and standard deviation, three values each.
# NOTE(review): presumably used to normalize input pixels before encoding;
# the numbers coincide with the commonly used ImageNet RGB statistics for
# inputs scaled to [0, 1] -- confirm channel order and scaling in the loader.
pixel_mean: [0.485, 0.456, 0.406]
pixel_std: [0.229, 0.224, 0.225]

# Shared size settings. embed_dim is the value object_transformer.embed_dim
# interpolates via ${model.embed_dim}, and mask_decoder.up_dims notes that its
# first entry must equal it.
# NOTE(review): what each remaining size controls is decided by the consuming
# code -- presumably feature/channel widths; confirm before changing one.
pixel_dim: 256
key_dim: 64
value_dim: 256
sensory_dim: 256
embed_dim: 256

# Settings for the pixel encoder.
pixel_encoder:
  # Architecture identifier; how it is resolved is up to the consuming code.
  type: resnet50
  # One size per feature level, ordered f16, f8, f4, f2, f1.
  # NOTE(review): the fN labels presumably denote the downsampling stride of
  # each level, and the trailing 3 the raw image channels -- confirm.
  ms_dims: [1024, 512, 256, 64, 3]  # f16, f8, f4, f2, f1

# Settings for the mask encoder.
mask_encoder:
  # Architecture identifier; a different ResNet variant than pixel_encoder.type.
  type: resnet18
  # NOTE(review): presumably the width of the encoder's final output; it has
  # the same value as value_dim (256) -- confirm whether the two must match.
  final_dim: 256

# Scale and temperature parameters for the pixel "pe".
# NOTE(review): "pe" presumably stands for positional encoding (the same
# abbreviation appears in the add_pe / add_pe_to_qkv keys of this file); the
# exact effect of each number is defined by the consuming code -- confirm.
pixel_pe_scale: 32
pixel_pe_temperature: 128

# Object transformer settings.
object_transformer:
  # Tracks the top-level embed_dim through interpolation rather than a literal.
  embed_dim: ${model.embed_dim}
  ff_dim: 2048
  num_heads: 8
  num_blocks: 3
  num_queries: 16
  # Each read_from_* / *_self_attention section below carries a three-flag
  # add_pe_to_qkv list; every section in this file uses [true, true, false].
  # NOTE(review): the flags presumably map to (query, key, value) in that
  # order -- confirm against the attention implementation.
  read_from_pixel:
    input_norm: false
    input_add_pe: false
    add_pe_to_qkv: [true, true, false]
  read_from_past:
    add_pe_to_qkv: [true, true, false]
  read_from_memory:
    add_pe_to_qkv: [true, true, false]
  read_from_query:
    add_pe_to_qkv: [true, true, false]
    output_norm: false
  query_self_attention:
    add_pe_to_qkv: [true, true, false]
  pixel_self_attention:
    add_pe_to_qkv: [true, true, false]

# Object summarizer settings. Both sizes are interpolated from
# object_transformer, so they change together with it.
object_summarizer:
  embed_dim: ${model.object_transformer.embed_dim}
  num_summaries: ${model.object_transformer.num_queries}
  add_pe: true

# Auxiliary loss settings: each entry has an on/off switch and a weight.
# NOTE(review): the weight presumably scales that term in the total loss --
# confirm in the training code.
aux_loss:
  sensory:
    enabled: true
    weight: 0.01
  query:
    enabled: true
    weight: 0.01

# Settings for the mask decoder.
mask_decoder:
  # The first value must equal embed_dim (256 in this file). It is written as
  # a literal here, so update it by hand whenever embed_dim changes.
  # NOTE(review): the remaining values are presumably the widths of successive
  # upsampling stages -- confirm against the decoder implementation.
  up_dims: [256, 128, 128, 64, 16]