@@ -1919,6 +1919,31 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         crop_pct=1.0, input_size=(3, 378, 378), num_classes=1024),

+    'vit_large_patch14_clip_224.metaclip2_worldwide': _cfg(
+        hf_hub_id='timm/',
+        license='cc-by-nc-4.0',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
+    'vit_huge_patch14_clip_224.metaclip2_worldwide': _cfg(
+        hf_hub_id='timm/',
+        license='cc-by-nc-4.0',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
+    'vit_huge_patch14_clip_378.metaclip2_worldwide': _cfg(
+        hf_hub_id='timm/',
+        license='cc-by-nc-4.0',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash', num_classes=1024),
+    'vit_gigantic_patch14_clip_224.metaclip2_worldwide': _cfg(
+        hf_hub_id='timm/',
+        license='cc-by-nc-4.0',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),
+    'vit_gigantic_patch14_clip_378.metaclip2_worldwide': _cfg(
+        hf_hub_id='timm/',
+        license='cc-by-nc-4.0',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash', num_classes=1280),
+
     'vit_base_patch32_clip_224.metaclip_2pt5b': _cfg(
         hf_hub_id='timm/',
         license='cc-by-nc-4.0',
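For reference, a minimal usage sketch for the new pretrained tags. This is hypothetical until the `timm/` Hub weights for the `metaclip2_worldwide` tags are actually published; the model name and preprocessing below are read directly from the cfgs added above, everything else is standard timm API.

import timm
import torch

# New tag from this diff; per the cfg notes the model is natively QuickGELU,
# so a *_clip_quickgelu_* variant (if published with the same tag) would
# reproduce the original activation exactly.
model = timm.create_model('vit_huge_patch14_clip_224.metaclip2_worldwide', pretrained=True)
model = model.eval()

# Preprocessing follows the cfg: OPENAI_CLIP_MEAN/STD, crop_pct=1.0.
data_cfg = timm.data.resolve_model_data_config(model)
transform = timm.data.create_transform(**data_cfg, is_training=False)

with torch.inference_mode():
    out = model(torch.randn(1, *data_cfg['input_size']))
print(out.shape)  # torch.Size([1, 1024]) -- matches num_classes=1024 in the cfg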
@@ -3178,6 +3203,20 @@ def vit_gigantic_patch14_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model


+@register_model
+def vit_gigantic_patch14_clip_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ ViT-bigG model (ViT-G/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560
+    Pretrained weights from CLIP image tower.
+    """
+    model_args = dict(
+        patch_size=14, embed_dim=1664, mlp_ratio=64/13, depth=48, num_heads=16, pre_norm=True,
+        norm_layer=partial(LayerNorm, eps=1e-5),
+    )
+    model = _create_vision_transformer(
+        'vit_gigantic_patch14_clip_378', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_base_patch32_clip_quickgelu_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     """ ViT-B/32 CLIP image tower @ 224x224
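And a shape-level smoke test for the newly registered `vit_gigantic_patch14_clip_378` entry point. Passing `img_size=378` and `num_classes=0` are illustrative choices here: the registered function pins only the bigG architecture hyper-parameters, not the input resolution or head.

import timm
import torch

# Instantiate the architecture without weights; set the resolution to match
# the 378x378 pretrained cfgs added above.
model = timm.create_model(
    'vit_gigantic_patch14_clip_378',
    pretrained=False,
    img_size=378,
    num_classes=0,  # pooled features instead of a classification/projection head
).eval()

with torch.inference_mode():
    feats = model(torch.randn(1, 3, 378, 378))
print(feats.shape)  # torch.Size([1, 1664]) -- bigG embed_dim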