Upload model #2
opened by gheinrich
- common.py +7 -0
- config.json +20 -33
- extra_timm_models.py +16 -6
- model.safetensors +3 -0
common.py
CHANGED
@@ -59,6 +59,13 @@ RESOURCE_MAP = {
         preferred_resolution=(896, 896),
         vitdet_num_global=8,
     ),
+    "c-radio_v2.5-g": RadioResource(
+        "https://huggingface.co/nvidia/RADIO/resolve/main/radio_v2.5-g.pth.tar?download=true",
+        patch_size=16,
+        max_resolution=2048,
+        preferred_resolution=(768, 768),
+        vitdet_num_global=8,
+    ),
     # RADIO
     "radio_v2.1": RadioResource(
         "https://huggingface.co/nvidia/RADIO/resolve/main/radio_v2.1_bf16.pth.tar?download=true",
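For reference, the new RESOURCE_MAP entry is what makes the checkpoint resolvable by name. A minimal usage sketch, assuming the upstream torch.hub entry point `radio_model` and that `version` selects a RESOURCE_MAP key (as in the NVlabs/RADIO README); the exact return signature may differ:

```python
import torch

# Sketch only: load the newly registered checkpoint by its RESOURCE_MAP key.
# Assumes the NVlabs/RADIO hubconf exposes `radio_model(version=..., progress=...)`.
model = torch.hub.load('NVlabs/RADIO', 'radio_model',
                       version='c-radio_v2.5-g', progress=True)
model.eval()

# The new resource declares patch_size=16 and preferred_resolution=(768, 768),
# so spatial dims should be multiples of 16; 768x768 is the preferred size.
x = torch.rand(1, 3, 768, 768)
with torch.no_grad():
    summary, spatial_features = model(x)  # (summary, spatial features), per the upstream API
```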
config.json
CHANGED
@@ -16,7 +16,7 @@
   "cache_dir": null,
   "channels_last": false,
   "checkpoint_hist": 10,
-  "chk_keep_forever":
+  "chk_keep_forever": 50,
   "class_map": "",
   "clip_grad": null,
   "clip_mode": "norm",
@@ -25,8 +25,7 @@
   "coco_image_dir": "/datasets/coco2017-adlsa/val2017",
   "color_jitter": 0.4,
   "cooldown_epochs": 0,
-  "cpe_max_size":
-  "cpe_num_registers": 4,
+  "cpe_max_size": 2048,
   "crd_loss": false,
   "crd_loss_weight": 0.8,
   "crop_pct": null,
@@ -59,7 +58,7 @@
   "eval_throughput": false,
   "fast_norm": false,
   "fd_loss_fn": "MSE",
-  "feature_normalization": "
+  "feature_normalization": "PHI_STANDARDIZE",
   "feature_summarizer": "cls_token",
   "feature_upscale_factor": null,
   "force_new_wandb_id": false,
@@ -74,8 +73,8 @@
   "head_init_bias": null,
   "head_init_scale": null,
   "head_lr": null,
-  "head_warmup":
-  "head_weight_decay": 0.
+  "head_warmup": 5,
+  "head_weight_decay": 0.03,
   "hflip": 0.5,
   "img_size": null,
   "in_chans": null,
@@ -106,10 +105,10 @@
   "mixup_off_epoch": 0,
   "mixup_prob": 1.0,
   "mixup_switch_prob": 0.5,
-  "mlp_hidden_size":
-  "mlp_num_inner":
+  "mlp_hidden_size": 1520,
+  "mlp_num_inner": 1,
   "mlp_version": "v2",
-  "model": "
+  "model": "vit_giant_patch16_224",
   "model_kwargs": {},
   "model_norm": false,
   "momentum": 0.9,
@@ -137,10 +136,10 @@
   ],
   "recount": 1,
   "recovery_interval": 0,
-  "register_multiple":
+  "register_multiple": 8,
   "remode": "pixel",
   "reprob": 0.0,
-  "reset_loss_state":
+  "reset_loss_state": true,
   "resplit": false,
   "sample_tracking": false,
   "save_images": false,
@@ -169,29 +168,17 @@
       "model": "ViT-H-14-378-quickgelu",
       "name": "clip",
       "pretrained": "dfn5b",
-      "type": "open_clip"
-    },
-    {
-      "feature_distillation": true,
-      "input_size": 448,
-      "name": "paligemma-448",
-      "type": "paligemma",
-      "use_summary": false
+      "type": "open_clip",
+      "use_summary": true
     },
     {
       "fd_normalize": false,
       "feature_distillation": true,
-      "input_size":
+      "input_size": 448,
       "model": "dinov2_vitg14_reg",
       "name": "dino_v2",
-      "type": "dino_v2"
-    },
-    {
-      "feature_distillation": true,
-      "input_size": 378,
-      "name": "aimv2",
-      "type": "aimv2",
-      "use_summary": false
+      "type": "dino_v2",
+      "use_summary": true
     },
     {
       "fd_normalize": false,
@@ -230,14 +217,14 @@
   },
   "feature_normalizer_config": null,
   "inter_feature_normalizer_config": null,
-  "max_resolution":
-  "patch_size":
+  "max_resolution": 2048,
+  "patch_size": 16,
   "preferred_resolution": [
-
-
+    768,
+    768
   ],
   "torch_dtype": "float32",
   "transformers_version": "4.47.0.dev0",
-  "version": "radio_v2.5-g",
+  "version": "c-radio_v2.5-g",
   "vitdet_window_size": null
 }
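Most of the fields changed above are training-time hyperparameters recorded for provenance; the ones the Hugging Face wrapper consumes at load time are `model`, `patch_size`, `max_resolution`, `preferred_resolution`, and `version`. A loading sketch, assuming the repo's custom modeling code (the repo id below is a placeholder):

```python
import torch
from transformers import AutoModel

# Sketch only: config.json is consumed by the repo's custom model class,
# so loading goes through trust_remote_code. Replace the placeholder repo id
# with the repository this PR targets.
model = AutoModel.from_pretrained("<this-repo-id>", trust_remote_code=True)
model.eval()

# Updated config: patch_size=16, preferred_resolution=[768, 768].
pixel_values = torch.rand(1, 3, 768, 768)
with torch.no_grad():
    output = model(pixel_values)
```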
extra_timm_models.py
CHANGED
@@ -24,7 +24,7 @@ from . import dinov2_arch
 def vit_tiny_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Tiny (Vit-Ti/16)
     """
-    model_args = dict(patch_size=14, embed_dim=192, depth=12, num_heads=3)
+    model_args = dict(patch_size=14, embed_dim=192, depth=12, num_heads=3, weight_init='skip')
     model = _create_vision_transformer('vit_tiny_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
     return model
 
@@ -33,7 +33,7 @@ def vit_tiny_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
 def vit_small_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Small (ViT-S/16)
     """
-    model_args = dict(patch_size=14, embed_dim=384, depth=12, num_heads=6)
+    model_args = dict(patch_size=14, embed_dim=384, depth=12, num_heads=6, weight_init='skip')
     model = _create_vision_transformer('vit_small_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
     return model
 
@@ -43,7 +43,7 @@ def vit_base_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Base (ViT-B/14) from original paper (https://arxiv.org/abs/2010.11929).
     ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
     """
-    model_args = dict(patch_size=14, embed_dim=768, depth=12, num_heads=12)
+    model_args = dict(patch_size=14, embed_dim=768, depth=12, num_heads=12, weight_init='skip')
     model = _create_vision_transformer('vit_base_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
     return model
 
@@ -52,7 +52,7 @@ def vit_base_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
 def vit_huge_patch16_224(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Huge model (ViT-H/16) from original paper (https://arxiv.org/abs/2010.11929).
     """
-    model_args = dict(patch_size=16, embed_dim=1280, depth=32, num_heads=16)
+    model_args = dict(patch_size=16, embed_dim=1280, depth=32, num_heads=16, weight_init='skip')
     if pretrained:
         # There is no pretrained version of ViT-H/16, but we can adapt a ViT-H/14 for this purpose
         model = _create_vision_transformer('vit_huge_patch14_224', pretrained=True, **dict(model_args, **kwargs))
@@ -65,7 +65,7 @@ def vit_huge_patch16_224(pretrained=False, **kwargs) -> VisionTransformer:
 def vit_huge_patch16_224_mlpnorm(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Huge model (ViT-H/16) from original paper (https://arxiv.org/abs/2010.11929).
     """
-    model = vit_huge_patch16_224(pretrained=pretrained, **kwargs)
+    model = vit_huge_patch16_224(pretrained=pretrained, weight_init='skip', **kwargs)
 
     for m in model.modules():
         if isinstance(m, Mlp) and not isinstance(m.norm, nn.LayerNorm):
@@ -74,9 +74,18 @@ def vit_huge_patch16_224_mlpnorm(pretrained=False, **kwargs) -> VisionTransformer:
     return model
 
 
+@register_model
+def vit_giant_patch16_224(pretrained=False, **kwargs) -> VisionTransformer:
+    """ ViT-giant model (ViT-g/16) from original paper (https://arxiv.org/abs/2010.11929).
+    """
+    model_args = dict(patch_size=16, embed_dim=1536, depth=40, num_heads=24, weight_init='skip')
+    model = _create_vision_transformer('vit_giant_patch16_224', pretrained=False, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_bigG_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
-    model_args = dict(patch_size=14, embed_dim=1664, depth=48, num_heads=16, init_values=1e-6)
+    model_args = dict(patch_size=14, embed_dim=1664, depth=48, num_heads=16, init_values=1e-6, weight_init='skip')
     model = _create_vision_transformer('vit_bigG_patch14', pretrained=False, **dict(model_args, **kwargs))
     return model
 
@@ -102,3 +111,4 @@ def _patch_layer_scale(model: VisionTransformer):
             if isinstance(mod.ls2, TIMMLayerScale):
                 mod.ls2 = replace_ls(mod.ls2)
     pass
+
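The newly registered `vit_giant_patch16_224` is the backbone named by the `model` field in the updated config.json. A construction sketch, assuming the module is importable as shown (the import path depends on how the repo is packaged):

```python
# Sketch only: importing the module runs the @register_model decorators,
# after which the new variant can also be built via timm.create_model().
from extra_timm_models import vit_giant_patch16_224  # illustrative import path

backbone = vit_giant_patch16_224(pretrained=False)  # ViT-g/16: embed_dim=1536, depth=40, num_heads=24
n_params = sum(p.numel() for p in backbone.parameters())
print(f"{n_params / 1e9:.2f}B parameters")
```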
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa6e741d60c99c87d8be4f74439daaadf1eb831bf78d4cfbe1e97ce672204bd1
+size 4638530048
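The entry above is a Git LFS pointer; the actual ~4.6 GB weights are fetched by git-lfs or huggingface_hub. A quick inspection sketch once the file is present locally:

```python
from safetensors import safe_open

# Sketch only: list tensor names without loading the full checkpoint into memory.
with safe_open("model.safetensors", framework="pt", device="cpu") as f:
    names = list(f.keys())
print(f"{len(names)} tensors, e.g. {names[:3]}")
```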