# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""ViT"""
from typing import Optional, Union, List
import numpy as np
import mindspore as ms
from mindspore import nn
from mindspore import ops
from mindspore.common.initializer import Normal
from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter
from mindspore import Tensor
from mindspore import ops as P
from .utils import load_pretrained, ConfigDict
from .registry import register_model
__all__ = ['ViT',
'vit_b_16_224',
'vit_b_16_384',
'vit_l_16_224', #train
'vit_l_16_384',
'vit_b_32_224', #train
'vit_b_32_384',
'vit_l_32_224', #train
]
def _cfg(url='', **kwargs):
return {
'url': url,
'num_classes': 1000,
'input_size': (3, 224, 224),
'first_conv': 'patch_embed.proj', 'classifier': 'classifier',
**kwargs
}
default_cfgs = {
"vit_b_16_224": _cfg(url="https://download.mindspore.cn/vision/classification/vit_b_16_224.ckpt"),
"vit_b_16_384": _cfg(url="https://download.mindspore.cn/vision/classification/vit_b_16_384.ckpt", input_size=(3, 384, 384)),
"vit_l_16_224": _cfg(url="https://download.mindspore.cn/toolkits/mindcv/vit/vit_l_16_224.ckpt"),
"vit_l_16_384": _cfg(url="https://download.mindspore.cn/vision/classification/vit_l_16_384.ckpt", input_size=(3, 384, 384)),
"vit_b_32_224": _cfg(url="https://download.mindspore.cn/toolkits/mindcv/vit/vit_b_32_224.ckpt"),
"vit_b_32_384": _cfg(url="https://download.mindspore.cn/vision/classification/vit_b_32_384.ckpt", input_size=(3, 384, 384)),
"vit_l_32_224": _cfg(url="https://download.mindspore.cn/toolkits/mindcv/vit/vit_l_32_224.ckpt"),
}
class PatchEmbedding(nn.Cell):
"""
Path embedding layer for ViT. First rearrange b c (h p) (w p) -> b (h w) (p p c).
Args:
image_size (int): Input image size. Default: 224.
patch_size (int): Patch size of image. Default: 16.
embed_dim (int): The dimension of embedding. Default: 768.
input_channels (int): The number of input channel. Default: 3.
Returns:
Tensor, output tensor.
Examples:
>>> ops = PathEmbedding(224, 16, 768, 3)
"""
MIN_NUM_PATCHES = 4
def __init__(self,
image_size: int = 224,
patch_size: int = 16,
embed_dim: int = 768,
input_channels: int = 3):
super().__init__()
self.image_size = image_size
self.patch_size = patch_size
self.num_patches = (image_size // patch_size) ** 2
self.conv = nn.Conv2d(input_channels, embed_dim, kernel_size=patch_size, stride=patch_size, has_bias=True)
self.reshape = ops.Reshape()
self.transpose = ops.Transpose()
def construct(self, x):
"""Path Embedding construct."""
x = self.conv(x)
b, c, h, w = x.shape
x = self.reshape(x, (b, c, h * w))
x = self.transpose(x, (0, 2, 1))
return x
class Attention(nn.Cell):
"""
Attention layer implementation, Rearrange Input -> B x N x hidden size.
Args:
dim (int): The dimension of input features.
num_heads (int): The number of attention heads. Default: 8.
keep_prob (float): The keep rate, greater than 0 and less equal than 1. Default: 1.0.
attention_keep_prob (float): The keep rate for attention. Default: 1.0.
Returns:
Tensor, output tensor.
Examples:
>>> ops = Attention(768, 12)
"""
def __init__(self,
dim: int,
num_heads: int = 8,
keep_prob: float = 1.0,
attention_keep_prob: float = 1.0):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = Tensor(head_dim ** -0.5)
self.qkv = nn.Dense(dim, dim * 3)
self.attn_drop = nn.Dropout(attention_keep_prob)
self.out = nn.Dense(dim, dim)
self.out_drop = nn.Dropout(keep_prob)
self.mul = ops.Mul()
self.reshape = ops.Reshape()
self.transpose = ops.Transpose()
self.unstack = ops.Unstack(axis=0)
self.attn_matmul_v = ops.BatchMatMul()
self.q_matmul_k = ops.BatchMatMul(transpose_b=True)
self.softmax = nn.Softmax(axis=-1)
def construct(self, x):
"""Attention construct."""
b, n, c = x.shape
qkv = self.qkv(x)
qkv = self.reshape(qkv, (b, n, 3, self.num_heads, c // self.num_heads))
qkv = self.transpose(qkv, (2, 0, 3, 1, 4))
q, k, v = self.unstack(qkv)
attn = self.q_matmul_k(q, k)
attn = self.mul(attn, self.scale)
attn = self.softmax(attn)
attn = self.attn_drop(attn)
out = self.attn_matmul_v(attn, v)
out = self.transpose(out, (0, 2, 1, 3))
out = self.reshape(out, (b, n, c))
out = self.out(out)
out = self.out_drop(out)
return out
class FeedForward(nn.Cell):
"""
Feed Forward layer implementation.
Args:
in_features (int): The dimension of input features.
hidden_features (int): The dimension of hidden features. Default: None.
out_features (int): The dimension of output features. Default: None
activation (nn.Cell): Activation function which will be stacked on top of the
normalization layer (if not None), otherwise on top of the conv layer. Default: nn.GELU.
keep_prob (float): The keep rate, greater than 0 and less equal than 1. Default: 1.0.
Returns:
Tensor, output tensor.
Examples:
>>> ops = FeedForward(768, 3072)
"""
def __init__(self,
in_features: int,
hidden_features: Optional[int] = None,
out_features: Optional[int] = None,
activation: nn.Cell = nn.GELU,
keep_prob: float = 1.0):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.dense1 = nn.Dense(in_features, hidden_features)
self.activation = activation()
self.dense2 = nn.Dense(hidden_features, out_features)
self.dropout = nn.Dropout(keep_prob)
def construct(self, x):
"""Feed Forward construct."""
x = self.dense1(x)
x = self.activation(x)
x = self.dropout(x)
x = self.dense2(x)
x = self.dropout(x)
return x
class ResidualCell(nn.Cell):
"""
Cell which implements Residual function:
$$output = x + f(x)$$
Args:
cell (Cell): Cell needed to add residual block.
Returns:
Tensor, output tensor.
Examples:
>>> ops = ResidualCell(nn.Dense(3,4))
"""
def __init__(self, cell):
super().__init__()
self.cell = cell
def construct(self, x):
"""ResidualCell construct."""
return self.cell(x) + x
class DropPath(nn.Cell):
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""
def __init__(self, keep_prob=None, seed=0):
super().__init__()
self.keep_prob = 1 - keep_prob
seed = min(seed, 0)
self.rand = P.UniformReal(seed=seed)
self.shape = P.Shape()
self.floor = P.Floor()
def construct(self, x):
if self.training:
x_shape = self.shape(x)
random_tensor = self.rand((x_shape[0], 1, 1))
random_tensor = random_tensor + self.keep_prob
random_tensor = self.floor(random_tensor)
x = x / self.keep_prob
x = x * random_tensor
return x
class TransformerEncoder(nn.Cell):
"""
TransformerEncoder implementation.
Args:
dim (int): The dimension of embedding.
num_layers (int): The depth of transformer.
num_heads (int): The number of attention heads.
mlp_dim (int): The dimension of MLP hidden layer.
keep_prob (float): The keep rate, greater than 0 and less equal than 1. Default: 1.0.
attention_keep_prob (float): The keep rate for attention. Default: 1.0.
drop_path_keep_prob (float): The keep rate for drop path. Default: 1.0.
activation (nn.Cell): Activation function which will be stacked on top of the
normalization layer (if not None), otherwise on top of the conv layer. Default: nn.GELU.
norm (nn.Cell, optional): Norm layer that will be stacked on top of the convolution
layer. Default: nn.LayerNorm.
Returns:
Tensor, output tensor.
Examples:
>>> ops = TransformerEncoder(768, 12, 12, 3072)
"""
def __init__(self,
dim: int,
num_layers: int,
num_heads: int,
mlp_dim: int,
keep_prob: float = 1.,
attention_keep_prob: float = 1.0,
drop_path_keep_prob: float = 1.0,
activation: nn.Cell = nn.GELU,
norm: nn.Cell = nn.LayerNorm):
super().__init__()
drop_path_rate = 1 - drop_path_keep_prob
dpr = [i.item() for i in np.linspace(0, drop_path_rate, num_layers)]
attn_seeds = [np.random.randint(1024) for _ in range(num_layers)]
mlp_seeds = [np.random.randint(1024) for _ in range(num_layers)]
layers = []
for i in range(num_layers):
normalization1 = norm((dim,))
normalization2 = norm((dim,))
attention = Attention(dim=dim,
num_heads=num_heads,
keep_prob=keep_prob,
attention_keep_prob=attention_keep_prob)
feedforward = FeedForward(in_features=dim,
hidden_features=mlp_dim,
activation=activation,
keep_prob=keep_prob)
if drop_path_rate > 0:
layers.append(
nn.SequentialCell([
ResidualCell(nn.SequentialCell([normalization1,
attention,
DropPath(dpr[i], attn_seeds[i])])),
ResidualCell(nn.SequentialCell([normalization2,
feedforward,
DropPath(dpr[i], mlp_seeds[i])]))]))
else:
layers.append(
nn.SequentialCell([
ResidualCell(nn.SequentialCell([normalization1,
attention])),
ResidualCell(nn.SequentialCell([normalization2,
feedforward]))
])
)
self.layers = nn.SequentialCell(layers)
def construct(self, x):
"""Transformer construct."""
return self.layers(x)
class DenseHead(nn.Cell):
"""
LinearClsHead architecture.
Args:
input_channel (int): The number of input channel.
num_classes (int): Number of classes.
has_bias (bool): Specifies whether the layer uses a bias vector. Default: True.
activation (Union[str, Cell, Primitive]): activate function applied to the output. Eg. `ReLU`. Default: None.
keep_prob (float): Dropout keeping rate, between [0, 1]. E.g. rate=0.9, means dropping out 10% of input.
Default: 1.0.
Returns:
Tensor, output tensor.
"""
def __init__(self,
input_channel: int,
num_classes: int,
has_bias: bool = True,
activation: Optional[Union[str, nn.Cell]] = None,
keep_prob: float = 1.0
) -> None:
super().__init__()
self.dropout = nn.Dropout(keep_prob)
self.classifier = nn.Dense(input_channel, num_classes, has_bias=has_bias, activation=activation)
def construct(self, x):
if self.training:
x = self.dropout(x)
x = self.classifier(x)
return x
class MultilayerDenseHead(nn.Cell):
"""
MultilayerDenseHead architecture.
Args:
input_channel (int): The number of input channel.
num_classes (int): Number of classes.
mid_channel (list): Number of channels in the hidden fc layers.
keep_prob (list): Dropout keeping rate, between [0, 1]. E.g. rate=0.9, means dropping out 10% of
input.
activation (list): activate function applied to the output. Eg. `ReLU`.
Returns:
Tensor, output tensor.
"""
def __init__(self,
input_channel: int,
num_classes: int,
mid_channel: List[int],
keep_prob: List[float],
activation: List[Optional[Union[str, nn.Cell]]],
) -> None:
super().__init__()
mid_channel.append(num_classes)
assert len(mid_channel) == len(activation) == len(keep_prob), "The length of the list should be the same."
length = len(activation)
head = []
for i in range(length):
linear = DenseHead(input_channel,
mid_channel[i],
activation=activation[i],
keep_prob=keep_prob[i],
)
head.append(linear)
input_channel = mid_channel[i]
self.classifier = nn.SequentialCell(head)
def construct(self, x):
x = self.classifier(x)
return x
class BaseClassifier(nn.Cell):
"""
generate classifier to combine the backbone and head
"""
def __init__(self, backbone, neck=None, head=None):
super().__init__()
self.backbone = backbone
if neck:
self.neck = neck
self.with_neck = True
else:
self.with_neck = False
if head:
self.head = head
self.with_head = True
else:
self.with_head = False
def forward_features(self, x: Tensor) -> Tensor:
x = self.backbone(x)
return x
def forward_head(self, x: Tensor) -> Tensor:
x = self.head(x)
return x
def construct(self, x):
x = self.forward_features(x)
if self.with_neck:
x = self.neck(x)
if self.with_head:
x = self.forward_head(x)
return x
def init(init_type, shape, dtype, name, requires_grad):
initial = initializer(init_type, shape, dtype).init_data()
return Parameter(initial, name=name, requires_grad=requires_grad)
[文档]class ViT(nn.Cell):
"""
Vision Transformer architecture implementation.
Args:
image_size (int): Input image size. Default: 224.
input_channels (int): The number of input channel. Default: 3.
patch_size (int): Patch size of image. Default: 16.
embed_dim (int): The dimension of embedding. Default: 768.
num_layers (int): The depth of transformer. Default: 12.
num_heads (int): The number of attention heads. Default: 12.
mlp_dim (int): The dimension of MLP hidden layer. Default: 3072.
keep_prob (float): The keep rate, greater than 0 and less equal than 1. Default: 1.0.
attention_keep_prob (float): The keep rate for attention layer. Default: 1.0.
drop_path_keep_prob (float): The keep rate for drop path. Default: 1.0.
activation (nn.Cell): Activation function which will be stacked on top of the
normalization layer (if not None), otherwise on top of the conv layer. Default: nn.GELU.
norm (nn.Cell, optional): Norm layer that will be stacked on top of the convolution
layer. Default: nn.LayerNorm.
pool (str): The method of pooling. Default: 'cls'.
Inputs:
- **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
Outputs:
Tensor of shape :math:`(N, 768)`
Raises:
ValueError: If `split` is not 'train', "test or 'infer'.
Supported Platforms:
``GPU``
Examples:
>>> net = ViT()
>>> x = ms.Tensor(np.ones([1, 3, 224, 224]), ms.float32)
>>> output = net(x)
>>> print(output.shape)
(1, 768)
About ViT:
Vision Transformer (ViT) shows that a pure transformer applied directly to sequences of image
patches can perform very well on image classification tasks. When pre-trained on large amounts
of data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet,
CIFAR-100, VTAB, etc.), Vision Transformer (ViT) attains excellent results compared to state-of-the-art
convolutional networks while requiring substantially fewer computational resources to train.
Citation:
.. code-block::
@article{2020An,
title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
author={Dosovitskiy, A. and Beyer, L. and Kolesnikov, A. and Weissenborn, D. and Houlsby, N.},
year={2020},
}
"""
def __init__(self,
image_size: int = 224,
input_channels: int = 3,
patch_size: int = 16,
embed_dim: int = 768,
num_layers: int = 12,
num_heads: int = 12,
mlp_dim: int = 3072,
keep_prob: float = 1.0,
attention_keep_prob: float = 1.0,
drop_path_keep_prob: float = 1.0,
activation: nn.Cell = nn.GELU,
norm: Optional[nn.Cell] = nn.LayerNorm,
pool: str = 'cls') -> None:
super().__init__()
#Validator.check_string(pool, ["cls", "mean"], "pool type")
#self.image_size = image_size
self.patch_embedding = PatchEmbedding(image_size=image_size,
patch_size=patch_size,
embed_dim=embed_dim,
input_channels=input_channels)
num_patches = self.patch_embedding.num_patches
if pool == "cls":
self.cls_token = init(init_type=Normal(sigma=1.0),
shape=(1, 1, embed_dim),
dtype=ms.float32,
name='cls',
requires_grad=True)
self.pos_embedding = init(init_type=Normal(sigma=1.0),
shape=(1, num_patches + 1, embed_dim),
dtype=ms.float32,
name='pos_embedding',
requires_grad=True)
self.concat = ops.Concat(axis=1)
else:
self.pos_embedding = init(init_type=Normal(sigma=1.0),
shape=(1, num_patches, embed_dim),
dtype=ms.float32,
name='pos_embedding',
requires_grad=True)
self.mean = ops.ReduceMean(keep_dims=False)
self.pool = pool
self.pos_dropout = nn.Dropout(keep_prob)
self.norm = norm((embed_dim,))
self.tile = ops.Tile()
self.transformer = TransformerEncoder(dim=embed_dim,
num_layers=num_layers,
num_heads=num_heads,
mlp_dim=mlp_dim,
keep_prob=keep_prob,
attention_keep_prob=attention_keep_prob,
drop_path_keep_prob=drop_path_keep_prob,
activation=activation,
norm=norm)
def construct(self, x):
"""ViT construct."""
x = self.patch_embedding(x)
if self.pool == "cls":
cls_tokens = self.tile(self.cls_token, (x.shape[0], 1, 1))
x = self.concat((cls_tokens, x))
x += self.pos_embedding
else:
x += self.pos_embedding
x = self.pos_dropout(x)
x = self.transformer(x)
x = self.norm(x)
if self.pool == "cls":
x = x[:, 0]
else:
x = self.mean(x, (1, 2)) # (1,) or (1,2)
return x
def vit(image_size: int,
input_channels: int,
patch_size: int,
embed_dim: int,
num_layers: int,
num_heads: int,
num_classes: int,
mlp_dim: int,
dropout: float = 0.,
attention_dropout: float = 0.,
drop_path_rate: float = 0.,
activation: nn.Cell = nn.GELU,
norm: nn.Cell = nn.LayerNorm,
pool: str = 'cls',
representation_size: Optional[int] = None,
pretrained: bool = False,
url_cfg: dict = None) -> ViT:
"""Vision Transformer architecture."""
backbone = ViT(image_size=image_size,
input_channels=input_channels,
patch_size=patch_size,
embed_dim=embed_dim,
num_layers=num_layers,
num_heads=num_heads,
mlp_dim=mlp_dim,
keep_prob=1.0 - dropout,
attention_keep_prob=1.0 - attention_dropout,
drop_path_keep_prob=1.0 - drop_path_rate,
activation=activation,
norm=norm,
pool=pool,
)
if representation_size:
head = MultilayerDenseHead(input_channel=embed_dim,
num_classes=num_classes,
mid_channel=[representation_size],
activation=['tanh', None],
keep_prob=[1.0, 1.0])
else:
head = DenseHead(input_channel=embed_dim,
num_classes=num_classes)
model = BaseClassifier(backbone=backbone, head=head)
model.image_size = image_size
if pretrained:
# Download the pre-trained checkpoint file from url, and load ckpt file.
load_pretrained(model, url_cfg, num_classes=num_classes, in_channels=input_channels)
return model
@register_model
def vit_b_16_224(
pretrained: bool = False,
num_classes: int = 1000,
in_channels: int = 3,
image_size: int = 224,
has_logits: bool = False,
drop_rate: float = 0.0,
#attention-dropout: float = 0.0,
drop_path_rate: float = 0.0
) -> ViT:
"""
Constructs a vit_b_16 architecture from
`An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`_.
Args:
pretrained (bool): Whether to download and load the pre-trained model. Default: False.
num_classes (int): The number of classification. Default: 1000.
in_channels (int): The number of input channels. Default: 3.
image_size (int): The input image size. Default: 224 for ImageNet.
has_logits (bool): Whether has logits or not. Default: False.
drop_rate (float): The drop out rate. Default: 0.0.s
drop_path_rate (float): The stochastic depth rate. Default: 0.0.
Returns:
ViT network, MindSpore.nn.Cell
Inputs:
- **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
Examples:
>>> net = vit_b_16_224()
>>> x = ms.Tensor(np.ones([1, 3, 224, 224]), ms.float32)
>>> output = net(x)
>>> print(output.shape)
(1, 1000)
Outputs:
Tensor of shape :math:`(N, CLASSES_{out})`
Supported Platforms:
``GPU``
"""
config = ConfigDict()
config.image_size = image_size
config.num_classes = num_classes
config.patch_size = 16
config.embed_dim = 768
config.mlp_dim = 3072
config.num_heads = 12
config.num_layers = 12
config.dropout = drop_rate
config.attention_dropout = drop_rate #attention-dropout
config.drop_path_rate = drop_path_rate
config.pretrained = pretrained
config.input_channels = in_channels
config.pool = 'cls'
config.representation_size = 768 if has_logits else None
config.url_cfg = default_cfgs['vit_b_16_224']
return vit(**config)
@register_model
def vit_b_16_384(
pretrained: bool = False,
num_classes: int = 1000,
in_channels: int = 3,
image_size: int = 384,
has_logits: bool = False,
drop_rate: float = 0.0,
#attention-dropout: float = 0.0,
drop_path_rate: float = 0.0
) -> ViT:
'''construct and return a ViT network'''
config = ConfigDict()
config.image_size = image_size
config.num_classes = num_classes
config.patch_size = 16
config.embed_dim = 768
config.mlp_dim = 3072
config.num_heads = 12
config.num_layers = 12
config.dropout = drop_rate
config.attention_dropout = drop_rate#attention-dropout
config.drop_path_rate = drop_path_rate
config.pretrained = pretrained
config.input_channels = in_channels
config.pool = 'cls'
config.representation_size = 768 if has_logits else None
config.url_cfg = default_cfgs['vit_b_16_384']
return vit(**config)
@register_model
def vit_l_16_224(
pretrained: bool = False,
num_classes: int = 1000,
in_channels: int = 3,
image_size: int = 224,
has_logits: bool = False,
drop_rate: float = 0.0,
#attention-dropout: float = 0.0,
drop_path_rate: float = 0.0
) -> ViT:
'''construct and return a ViT network'''
config = ConfigDict()
config.image_size = image_size
config.num_classes = num_classes
config.patch_size = 16
config.embed_dim = 1024
config.mlp_dim = 4096
config.num_heads = 16
config.num_layers = 24
config.dropout = drop_rate
config.attention_dropout = drop_rate #attention-dropout
config.drop_path_rate = drop_path_rate
config.input_channels = in_channels
config.pool = 'cls'
config.pretrained = pretrained
config.representation_size = 1024 if has_logits else None
config.url_cfg = default_cfgs['vit_l_16_224']
return vit(**config)
@register_model
def vit_l_16_384(
pretrained: bool = False,
num_classes: int = 1000,
in_channels: int = 3,
image_size: int = 384,
has_logits: bool = False,
drop_rate: float = 0.0,
#attention-dropout: float = 0.0,
drop_path_rate: float = 0.0
) -> ViT:
'''construct and return a ViT network'''
config = ConfigDict()
config.image_size = image_size
config.num_classes = num_classes
config.patch_size = 16
config.embed_dim = 1024
config.mlp_dim = 4096
config.num_heads = 16
config.num_layers = 24
config.dropout = drop_rate
config.attention_dropout = drop_rate #attention-dropout
config.drop_path_rate = drop_path_rate
config.input_channels = in_channels
config.pool = 'cls'
config.pretrained = pretrained
config.representation_size = 1024 if has_logits else None
config.url_cfg = default_cfgs['vit_l_16_384']
return vit(**config)
@register_model
def vit_b_32_224(
pretrained: bool = False,
num_classes: int = 1000,
in_channels: int = 3,
image_size: int = 224,
has_logits: bool = False,
drop_rate: float = 0.0,
#attention-dropout: float = 0.0,
drop_path_rate: float = 0.0
) -> ViT:
'''construct and return a ViT network'''
config = ConfigDict()
config.image_size = image_size
config.num_classes = num_classes
config.patch_size = 32
config.embed_dim = 768
config.mlp_dim = 3072
config.num_heads = 12
config.num_layers = 12
config.dropout = drop_rate
config.attention_dropout = drop_rate #attention-dropout
config.drop_path_rate = drop_path_rate
config.pretrained = pretrained
config.input_channels = in_channels
config.pool = 'cls'
config.representation_size = 768 if has_logits else None
config.url_cfg = default_cfgs['vit_b_32_224']
return vit(**config)
@register_model
def vit_b_32_384(
pretrained: bool = False,
num_classes: int = 1000,
in_channels: int = 3,
image_size: int = 384,
has_logits: bool = False,
drop_rate: float = 0.0,
#attention_dropout: float = 0.0,
drop_path_rate: float = 0.0
) -> ViT:
'''construct and return a ViT network'''
config = ConfigDict()
config.image_size = image_size
config.num_classes = num_classes
config.patch_size = 32
config.embed_dim = 768
config.mlp_dim = 3072
config.num_heads = 12
config.num_layers = 12
config.dropout = drop_rate
config.attention_dropout = drop_rate #attention_dropout
config.drop_path_rate = drop_path_rate
config.pretrained = pretrained
config.input_channels = in_channels
config.pool = 'cls'
config.representation_size = 768 if has_logits else None
config.url_cfg = default_cfgs['vit_b_32_384']
return vit(**config)
@register_model
def vit_l_32_224(
pretrained: bool = False,
num_classes: int = 1000,
in_channels: int = 3,
image_size: int = 224,
has_logits: bool = False,
drop_rate: float = 0.0,
#attention-dropout: float = 0.0,
drop_path_rate: float = 0.0
) -> ViT:
'''construct and return a ViT network'''
config = ConfigDict()
config.image_size = image_size
config.num_classes = num_classes
config.patch_size = 32
config.embed_dim = 1024
config.mlp_dim = 4096
config.num_heads = 16
config.num_layers = 24
config.dropout = drop_rate
config.attention_dropout = drop_rate #attention-dropout
config.drop_path_rate = drop_path_rate
config.pretrained = pretrained
config.input_channels = in_channels
config.pool = 'cls'
config.representation_size = 1024 if has_logits else None
config.url_cfg = default_cfgs['vit_l_32_224']
return vit(**config)