Safetensors
custom_code
Files changed (4)
  1. common.py +7 -0
  2. config.json +20 -33
  3. extra_timm_models.py +16 -6
  4. model.safetensors +3 -0
common.py CHANGED
@@ -59,6 +59,13 @@ RESOURCE_MAP = {
         preferred_resolution=(896, 896),
         vitdet_num_global=8,
     ),
+    "c-radio_v2.5-g": RadioResource(
+        "https://huggingface.co/nvidia/RADIO/resolve/main/radio_v2.5-g.pth.tar?download=true",
+        patch_size=16,
+        max_resolution=2048,
+        preferred_resolution=(768, 768),
+        vitdet_num_global=8,
+    ),
     # RADIO
     "radio_v2.1": RadioResource(
         "https://huggingface.co/nvidia/RADIO/resolve/main/radio_v2.1_bf16.pth.tar?download=true",
config.json CHANGED
@@ -16,7 +16,7 @@
     "cache_dir": null,
     "channels_last": false,
     "checkpoint_hist": 10,
-    "chk_keep_forever": 100,
+    "chk_keep_forever": 50,
     "class_map": "",
     "clip_grad": null,
     "clip_mode": "norm",
@@ -25,8 +25,7 @@
     "coco_image_dir": "/datasets/coco2017-adlsa/val2017",
     "color_jitter": 0.4,
     "cooldown_epochs": 0,
-    "cpe_max_size": 1792,
-    "cpe_num_registers": 4,
+    "cpe_max_size": 2048,
     "crd_loss": false,
     "crd_loss_weight": 0.8,
     "crop_pct": null,
@@ -59,7 +58,7 @@
     "eval_throughput": false,
     "fast_norm": false,
     "fd_loss_fn": "MSE",
-    "feature_normalization": "SHIP_NORM",
+    "feature_normalization": "PHI_STANDARDIZE",
     "feature_summarizer": "cls_token",
     "feature_upscale_factor": null,
     "force_new_wandb_id": false,
@@ -74,8 +73,8 @@
     "head_init_bias": null,
     "head_init_scale": null,
     "head_lr": null,
-    "head_warmup": 0,
-    "head_weight_decay": 0.2,
+    "head_warmup": 5,
+    "head_weight_decay": 0.03,
     "hflip": 0.5,
     "img_size": null,
     "in_chans": null,
@@ -106,10 +105,10 @@
     "mixup_off_epoch": 0,
     "mixup_prob": 1.0,
     "mixup_switch_prob": 0.5,
-    "mlp_hidden_size": 3328,
-    "mlp_num_inner": 2,
+    "mlp_hidden_size": 1520,
+    "mlp_num_inner": 1,
     "mlp_version": "v2",
-    "model": "vit_bigG_patch14_224",
+    "model": "vit_giant_patch16_224",
     "model_kwargs": {},
     "model_norm": false,
     "momentum": 0.9,
@@ -137,10 +136,10 @@
     ],
     "recount": 1,
     "recovery_interval": 0,
-    "register_multiple": 0,
+    "register_multiple": 8,
     "remode": "pixel",
     "reprob": 0.0,
-    "reset_loss_state": false,
+    "reset_loss_state": true,
     "resplit": false,
     "sample_tracking": false,
     "save_images": false,
@@ -169,29 +168,17 @@
         "model": "ViT-H-14-378-quickgelu",
         "name": "clip",
         "pretrained": "dfn5b",
-        "type": "open_clip"
-      },
-      {
-        "feature_distillation": true,
-        "input_size": 448,
-        "name": "paligemma-448",
-        "type": "paligemma",
-        "use_summary": false
+        "type": "open_clip",
+        "use_summary": true
       },
       {
         "fd_normalize": false,
         "feature_distillation": true,
-        "input_size": 378,
+        "input_size": 448,
         "model": "dinov2_vitg14_reg",
         "name": "dino_v2",
-        "type": "dino_v2"
-      },
-      {
-        "feature_distillation": true,
-        "input_size": 378,
-        "name": "aimv2",
-        "type": "aimv2",
-        "use_summary": false
+        "type": "dino_v2",
+        "use_summary": true
       },
       {
         "fd_normalize": false,
@@ -230,14 +217,14 @@
   },
   "feature_normalizer_config": null,
   "inter_feature_normalizer_config": null,
-  "max_resolution": 1792,
-  "patch_size": 14,
+  "max_resolution": 2048,
+  "patch_size": 16,
   "preferred_resolution": [
-    896,
-    896
+    768,
+    768
   ],
   "torch_dtype": "float32",
   "transformers_version": "4.47.0.dev0",
-  "version": "radio_v2.5-g",
+  "version": "c-radio_v2.5-g",
   "vitdet_window_size": null
 }
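The geometry fields changed here (patch_size 16, max_resolution 2048, preferred_resolution 768x768, model "vit_giant_patch16_224") are what downstream loaders see. A minimal sketch, assuming the checkpoint is loaded through transformers' custom-code path (the repo is tagged custom_code); the repo id "nvidia/RADIO" is taken from the checkpoint URLs in common.py and may differ for this release.

from transformers import AutoConfig

# Repo id is an assumption based on the URLs in common.py; adjust as needed.
config = AutoConfig.from_pretrained('nvidia/RADIO', trust_remote_code=True)

# After this change the top-level geometry fields should read:
#   patch_size=16, max_resolution=2048, preferred_resolution=[768, 768]
print(config.patch_size, config.max_resolution, config.preferred_resolution)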
extra_timm_models.py CHANGED
@@ -24,7 +24,7 @@ from . import dinov2_arch
 def vit_tiny_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Tiny (Vit-Ti/16)
     """
-    model_args = dict(patch_size=14, embed_dim=192, depth=12, num_heads=3)
+    model_args = dict(patch_size=14, embed_dim=192, depth=12, num_heads=3, weight_init='skip')
     model = _create_vision_transformer('vit_tiny_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
     return model

@@ -33,7 +33,7 @@ def vit_tiny_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
 def vit_small_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Small (ViT-S/16)
     """
-    model_args = dict(patch_size=14, embed_dim=384, depth=12, num_heads=6)
+    model_args = dict(patch_size=14, embed_dim=384, depth=12, num_heads=6, weight_init='skip')
     model = _create_vision_transformer('vit_small_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
     return model

@@ -43,7 +43,7 @@ def vit_base_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Base (ViT-B/14) from original paper (https://arxiv.org/abs/2010.11929).
     ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
     """
-    model_args = dict(patch_size=14, embed_dim=768, depth=12, num_heads=12)
+    model_args = dict(patch_size=14, embed_dim=768, depth=12, num_heads=12, weight_init='skip')
     model = _create_vision_transformer('vit_base_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
     return model

@@ -52,7 +52,7 @@ def vit_base_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
 def vit_huge_patch16_224(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Huge model (ViT-H/16) from original paper (https://arxiv.org/abs/2010.11929).
     """
-    model_args = dict(patch_size=16, embed_dim=1280, depth=32, num_heads=16)
+    model_args = dict(patch_size=16, embed_dim=1280, depth=32, num_heads=16, weight_init='skip')
     if pretrained:
         # There is no pretrained version of ViT-H/16, but we can adapt a ViT-H/14 for this purpose
         model = _create_vision_transformer('vit_huge_patch14_224', pretrained=True, **dict(model_args, **kwargs))

@@ -65,7 +65,7 @@ def vit_huge_patch16_224(pretrained=False, **kwargs) -> VisionTransformer:
 def vit_huge_patch16_224_mlpnorm(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Huge model (ViT-H/16) from original paper (https://arxiv.org/abs/2010.11929).
     """
-    model = vit_huge_patch16_224(pretrained=pretrained, **kwargs)
+    model = vit_huge_patch16_224(pretrained=pretrained, weight_init='skip', **kwargs)

     for m in model.modules():
         if isinstance(m, Mlp) and not isinstance(m.norm, nn.LayerNorm):

@@ -74,9 +74,18 @@ def vit_huge_patch16_224_mlpnorm(pretrained=False, **kwargs) -> VisionTransforme
     return model


+@register_model
+def vit_giant_patch16_224(pretrained=False, **kwargs) -> VisionTransformer:
+    """ ViT-giant model (ViT-g/16) from original paper (https://arxiv.org/abs/2010.11929).
+    """
+    model_args = dict(patch_size=16, embed_dim=1536, depth=40, num_heads=24, weight_init='skip')
+    model = _create_vision_transformer('vit_giant_patch16_224', pretrained=False, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_bigG_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
-    model_args = dict(patch_size=14, embed_dim=1664, depth=48, num_heads=16, init_values=1e-6)
+    model_args = dict(patch_size=14, embed_dim=1664, depth=48, num_heads=16, init_values=1e-6, weight_init='skip')
     model = _create_vision_transformer('vit_bigG_patch14', pretrained=False, **dict(model_args, **kwargs))
     return model

@@ -102,3 +111,4 @@ def _patch_layer_scale(model: VisionTransformer):
             if isinstance(mod.ls2, TIMMLayerScale):
                 mod.ls2 = replace_ls(mod.ls2)
     pass
+
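The new @register_model entry registers the ViT-g/16 backbone named in config.json ("model": "vit_giant_patch16_224") with timm. A minimal sketch of exercising that registration, assuming this repo's extra_timm_models module has already been imported (importing it is what runs the @register_model decorators).

import timm
import torch

# Assumes extra_timm_models was imported so 'vit_giant_patch16_224' is registered.
model = timm.create_model('vit_giant_patch16_224', pretrained=False)
model.eval()

with torch.no_grad():
    tokens = model.forward_features(torch.randn(1, 3, 224, 224))

# ViT-g/16 at 224x224: 14x14 = 196 patch tokens (plus any prefix tokens), embed_dim 1536.
print(tokens.shape)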
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa6e741d60c99c87d8be4f74439daaadf1eb831bf78d4cfbe1e97ce672204bd1
+size 4638530048
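The three added lines are a Git LFS pointer, not the weights themselves; the actual ~4.6 GB file lives in LFS storage. A minimal sketch for checking a locally downloaded model.safetensors against the pointer's oid and size (the local file path is an assumption).

import hashlib
import os

EXPECTED_OID = 'fa6e741d60c99c87d8be4f74439daaadf1eb831bf78d4cfbe1e97ce672204bd1'
EXPECTED_SIZE = 4638530048  # bytes, from the pointer above

path = 'model.safetensors'  # assumed local download location
assert os.path.getsize(path) == EXPECTED_SIZE, 'size mismatch'

digest = hashlib.sha256()
with open(path, 'rb') as f:
    for chunk in iter(lambda: f.read(1 << 20), b''):
        digest.update(chunk)
assert digest.hexdigest() == EXPECTED_OID, 'sha256 mismatch'
print('model.safetensors matches the LFS pointer')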