Safetensors
custom_code
Files changed (4)
  1. common.py +7 -0
  2. config.json +20 -33
  3. extra_timm_models.py +16 -6
  4. model.safetensors +3 -0
common.py CHANGED
@@ -59,6 +59,13 @@ RESOURCE_MAP = {
         preferred_resolution=(896, 896),
         vitdet_num_global=8,
     ),
+    "c-radio_v2.5-g": RadioResource(
+        "https://huggingface.co/nvidia/RADIO/resolve/main/radio_v2.5-g.pth.tar?download=true",
+        patch_size=16,
+        max_resolution=2048,
+        preferred_resolution=(768, 768),
+        vitdet_num_global=8,
+    ),
     # RADIO
     "radio_v2.1": RadioResource(
         "https://huggingface.co/nvidia/RADIO/resolve/main/radio_v2.1_bf16.pth.tar?download=true",
config.json CHANGED
@@ -16,7 +16,7 @@
     "cache_dir": null,
     "channels_last": false,
     "checkpoint_hist": 10,
-    "chk_keep_forever": 100,
+    "chk_keep_forever": 50,
     "class_map": "",
     "clip_grad": null,
     "clip_mode": "norm",
@@ -25,8 +25,7 @@
     "coco_image_dir": "/datasets/coco2017-adlsa/val2017",
     "color_jitter": 0.4,
     "cooldown_epochs": 0,
-    "cpe_max_size": 1792,
-    "cpe_num_registers": 4,
+    "cpe_max_size": 2048,
     "crd_loss": false,
     "crd_loss_weight": 0.8,
     "crop_pct": null,
@@ -59,7 +58,7 @@
     "eval_throughput": false,
     "fast_norm": false,
     "fd_loss_fn": "MSE",
-    "feature_normalization": "SHIP_NORM",
+    "feature_normalization": "PHI_STANDARDIZE",
     "feature_summarizer": "cls_token",
     "feature_upscale_factor": null,
     "force_new_wandb_id": false,
@@ -74,8 +73,8 @@
     "head_init_bias": null,
     "head_init_scale": null,
     "head_lr": null,
-    "head_warmup": 0,
-    "head_weight_decay": 0.2,
+    "head_warmup": 5,
+    "head_weight_decay": 0.03,
     "hflip": 0.5,
     "img_size": null,
     "in_chans": null,
@@ -106,10 +105,10 @@
     "mixup_off_epoch": 0,
     "mixup_prob": 1.0,
     "mixup_switch_prob": 0.5,
-    "mlp_hidden_size": 3328,
-    "mlp_num_inner": 2,
+    "mlp_hidden_size": 1520,
+    "mlp_num_inner": 1,
     "mlp_version": "v2",
-    "model": "vit_bigG_patch14_224",
+    "model": "vit_giant_patch16_224",
     "model_kwargs": {},
     "model_norm": false,
     "momentum": 0.9,
@@ -137,10 +136,10 @@
     ],
     "recount": 1,
     "recovery_interval": 0,
-    "register_multiple": 0,
+    "register_multiple": 8,
     "remode": "pixel",
     "reprob": 0.0,
-    "reset_loss_state": false,
+    "reset_loss_state": true,
     "resplit": false,
     "sample_tracking": false,
     "save_images": false,
@@ -169,29 +168,17 @@
         "model": "ViT-H-14-378-quickgelu",
         "name": "clip",
         "pretrained": "dfn5b",
-        "type": "open_clip"
-      },
-      {
-        "feature_distillation": true,
-        "input_size": 448,
-        "name": "paligemma-448",
-        "type": "paligemma",
-        "use_summary": false
+        "type": "open_clip",
+        "use_summary": true
       },
       {
         "fd_normalize": false,
         "feature_distillation": true,
-        "input_size": 378,
+        "input_size": 448,
         "model": "dinov2_vitg14_reg",
         "name": "dino_v2",
-        "type": "dino_v2"
-      },
-      {
-        "feature_distillation": true,
-        "input_size": 378,
-        "name": "aimv2",
-        "type": "aimv2",
-        "use_summary": false
+        "type": "dino_v2",
+        "use_summary": true
       },
       {
         "fd_normalize": false,
@@ -230,14 +217,14 @@
   },
   "feature_normalizer_config": null,
   "inter_feature_normalizer_config": null,
-  "max_resolution": 1792,
-  "patch_size": 14,
+  "max_resolution": 2048,
+  "patch_size": 16,
   "preferred_resolution": [
-    896,
-    896
+    768,
+    768
   ],
   "torch_dtype": "float32",
   "transformers_version": "4.47.0.dev0",
-  "version": "radio_v2.5-g",
+  "version": "c-radio_v2.5-g",
   "vitdet_window_size": null
 }
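The geometry fields changed here (patch_size 16, max_resolution 2048, preferred_resolution 768x768, model "vit_giant_patch16_224") are what downstream loaders see. A minimal sketch, assuming the checkpoint is loaded through transformers' custom-code path (the repo is tagged custom_code); the repo id "nvidia/RADIO" is taken from the checkpoint URLs in common.py and may differ for this release.

from transformers import AutoConfig

# Repo id is an assumption based on the URLs in common.py; adjust as needed.
config = AutoConfig.from_pretrained('nvidia/RADIO', trust_remote_code=True)

# After this change the top-level geometry fields should read:
#   patch_size=16, max_resolution=2048, preferred_resolution=[768, 768]
print(config.patch_size, config.max_resolution, config.preferred_resolution)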
extra_timm_models.py CHANGED
@@ -24,7 +24,7 @@ from . import dinov2_arch
 def vit_tiny_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Tiny (Vit-Ti/16)
     """
-    model_args = dict(patch_size=14, embed_dim=192, depth=12, num_heads=3)
+    model_args = dict(patch_size=14, embed_dim=192, depth=12, num_heads=3, weight_init='skip')
     model = _create_vision_transformer('vit_tiny_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
     return model

@@ -33,7 +33,7 @@ def vit_tiny_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
 def vit_small_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Small (ViT-S/16)
     """
-    model_args = dict(patch_size=14, embed_dim=384, depth=12, num_heads=6)
+    model_args = dict(patch_size=14, embed_dim=384, depth=12, num_heads=6, weight_init='skip')
     model = _create_vision_transformer('vit_small_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
     return model

@@ -43,7 +43,7 @@ def vit_base_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Base (ViT-B/14) from original paper (https://arxiv.org/abs/2010.11929).
     ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
     """
-    model_args = dict(patch_size=14, embed_dim=768, depth=12, num_heads=12)
+    model_args = dict(patch_size=14, embed_dim=768, depth=12, num_heads=12, weight_init='skip')
     model = _create_vision_transformer('vit_base_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
     return model

@@ -52,7 +52,7 @@ def vit_base_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
 def vit_huge_patch16_224(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Huge model (ViT-H/16) from original paper (https://arxiv.org/abs/2010.11929).
     """
-    model_args = dict(patch_size=16, embed_dim=1280, depth=32, num_heads=16)
+    model_args = dict(patch_size=16, embed_dim=1280, depth=32, num_heads=16, weight_init='skip')
     if pretrained:
         # There is no pretrained version of ViT-H/16, but we can adapt a ViT-H/14 for this purpose
         model = _create_vision_transformer('vit_huge_patch14_224', pretrained=True, **dict(model_args, **kwargs))

@@ -65,7 +65,7 @@ def vit_huge_patch16_224(pretrained=False, **kwargs) -> VisionTransformer:
 def vit_huge_patch16_224_mlpnorm(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Huge model (ViT-H/16) from original paper (https://arxiv.org/abs/2010.11929).
     """
-    model = vit_huge_patch16_224(pretrained=pretrained, **kwargs)
+    model = vit_huge_patch16_224(pretrained=pretrained, weight_init='skip', **kwargs)

     for m in model.modules():
         if isinstance(m, Mlp) and not isinstance(m.norm, nn.LayerNorm):

@@ -74,9 +74,18 @@ def vit_huge_patch16_224_mlpnorm(pretrained=False, **kwargs) -> VisionTransforme
     return model


+@register_model
+def vit_giant_patch16_224(pretrained=False, **kwargs) -> VisionTransformer:
+    """ ViT-giant model (ViT-g/16) from original paper (https://arxiv.org/abs/2010.11929).
+    """
+    model_args = dict(patch_size=16, embed_dim=1536, depth=40, num_heads=24, weight_init='skip')
+    model = _create_vision_transformer('vit_giant_patch16_224', pretrained=False, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_bigG_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
-    model_args = dict(patch_size=14, embed_dim=1664, depth=48, num_heads=16, init_values=1e-6)
+    model_args = dict(patch_size=14, embed_dim=1664, depth=48, num_heads=16, init_values=1e-6, weight_init='skip')
     model = _create_vision_transformer('vit_bigG_patch14', pretrained=False, **dict(model_args, **kwargs))
     return model

@@ -102,3 +111,4 @@ def _patch_layer_scale(model: VisionTransformer):
             if isinstance(mod.ls2, TIMMLayerScale):
                 mod.ls2 = replace_ls(mod.ls2)
     pass
+
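The new @register_model entry registers the ViT-g/16 backbone named in config.json ("model": "vit_giant_patch16_224") with timm. A minimal sketch of exercising that registration, assuming this repo's extra_timm_models module has already been imported (importing it is what runs the @register_model decorators).

import timm
import torch

# Assumes extra_timm_models was imported so 'vit_giant_patch16_224' is registered.
model = timm.create_model('vit_giant_patch16_224', pretrained=False)
model.eval()

with torch.no_grad():
    tokens = model.forward_features(torch.randn(1, 3, 224, 224))

# ViT-g/16 at 224x224: 14x14 = 196 patch tokens (plus any prefix tokens), embed_dim 1536.
print(tokens.shape)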
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa6e741d60c99c87d8be4f74439daaadf1eb831bf78d4cfbe1e97ce672204bd1
+size 4638530048
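The three added lines are a Git LFS pointer, not the weights themselves; the actual ~4.6 GB file lives in LFS storage. A minimal sketch for checking a locally downloaded model.safetensors against the pointer's oid and size (the local file path is an assumption).

import hashlib
import os

EXPECTED_OID = 'fa6e741d60c99c87d8be4f74439daaadf1eb831bf78d4cfbe1e97ce672204bd1'
EXPECTED_SIZE = 4638530048  # bytes, from the pointer above

path = 'model.safetensors'  # assumed local download location
assert os.path.getsize(path) == EXPECTED_SIZE, 'size mismatch'

digest = hashlib.sha256()
with open(path, 'rb') as f:
    for chunk in iter(lambda: f.read(1 << 20), b''):
        digest.update(chunk)
assert digest.hexdigest() == EXPECTED_OID, 'sha256 mismatch'
print('model.safetensors matches the LFS pointer')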