{ "cfg": { "amp": false, "chunk_size": -1, "flip_aug": false, "long_term": { "buffer_tokens": 2000, "count_usage": true, "max_mem_frames": 10, "max_num_tokens": 10000, "min_mem_frames": 5, "num_prototypes": 128 }, "max_internal_size": -1, "max_mem_frames": 5, "mem_every": 5, "model": { "aux_loss": { "query": { "enabled": true, "weight": 0.01 }, "sensory": { "enabled": true, "weight": 0.01 } }, "embed_dim": 256, "key_dim": 64, "mask_decoder": { "up_dims": [ 256, 128, 128, 64, 16 ] }, "mask_encoder": { "final_dim": 256, "type": "resnet18" }, "object_summarizer": { "add_pe": true, "embed_dim": "${model.object_transformer.embed_dim}", "num_summaries": "${model.object_transformer.num_queries}" }, "object_transformer": { "embed_dim": "${model.embed_dim}", "ff_dim": 2048, "num_blocks": 3, "num_heads": 8, "num_queries": 16, "pixel_self_attention": { "add_pe_to_qkv": [ true, true, false ] }, "query_self_attention": { "add_pe_to_qkv": [ true, true, false ] }, "read_from_memory": { "add_pe_to_qkv": [ true, true, false ] }, "read_from_past": { "add_pe_to_qkv": [ true, true, false ] }, "read_from_pixel": { "add_pe_to_qkv": [ true, true, false ], "input_add_pe": false, "input_norm": false }, "read_from_query": { "add_pe_to_qkv": [ true, true, false ], "output_norm": false } }, "pixel_dim": 256, "pixel_encoder": { "ms_dims": [ 1024, 512, 256, 64, 3 ], "type": "resnet50" }, "pixel_mean": [ 0.485, 0.456, 0.406 ], "pixel_pe_scale": 32, "pixel_pe_temperature": 128, "pixel_std": [ 0.229, 0.224, 0.225 ], "sensory_dim": 256, "value_dim": 256 }, "output_dir": null, "pretrained_resnet": false, "save_all": true, "save_aux": false, "save_scores": false, "stagger_updates": 5, "top_k": 30, "use_all_masks": false, "use_long_term": false, "visualize": false, "weights": "pretrained_models/matanyone.pth" }, "single_object": true }