From 7cdfdca1e553aa3d45f81abe9f51e3e01d5ac264 Mon Sep 17 00:00:00 2001 From: BernardZach Date: Thu, 5 Dec 2024 10:24:04 -0500 Subject: [PATCH] rebased files for modular dinov2 --- .../configuration_dinov2_with_registers.py | 22 +-- .../modeling_dinov2_with_registers.py | 179 +++++++++++------- ...rs.py => modular_dinov2_with_registers.py} | 4 +- 3 files changed, 117 insertions(+), 88 deletions(-) rename src/transformers/models/dinov2_with_registers/{diff_dinov2_with_registers.py => modular_dinov2_with_registers.py} (99%) diff --git a/src/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py b/src/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py index dfc58374745..a2a838d145e 100644 --- a/src/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +++ b/src/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py @@ -1,9 +1,9 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from . -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the diff. If any change should be done, please apply the change to the -# diff.py file directly. -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_dinov2_with_registers.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 # Copyright 2024 Meta Inc. and the HuggingFace Inc. team. All rights reserved. # @@ -20,18 +20,10 @@ # See the License for the specific language governing permissions and # limitations under the License. - -from transformers import PretrainedConfig - -from ...utils import ( - logging, -) +from ...configuration_utils import PretrainedConfig from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices -logger = logging.get_logger(__name__) - - class Dinov2WithRegistersConfig(BackboneConfigMixin, PretrainedConfig): r""" This is the configuration class to store the configuration of a [`Dinov2WithRegistersModel`]. It is used to instantiate an diff --git a/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py b/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py index ed7a9e79655..a92e33ecac0 100644 --- a/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +++ b/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py @@ -1,9 +1,9 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from . -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the diff. If any change should be done, please apply the change to the -# diff.py file directly. -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_dinov2_with_registers.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 # Copyright 2024 Meta Inc. and the HuggingFace Inc. team. All rights reserved. # @@ -19,23 +19,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import collections.abc import math from typing import Dict, List, Optional, Set, Tuple, Union import torch -import torch.utils.checkpoint from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN -from ...modeling_outputs import ( - BackboneOutput, - BaseModelOutput, - BaseModelOutputWithPooling, - ImageClassifierOutput, -) +from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( @@ -51,6 +44,12 @@ logger = logging.get_logger(__name__) +# Base docstring +_CHECKPOINT_FOR_DOC = "facebook/dinov2_with_registers-base" + +# General docstring +_CONFIG_FOR_DOC = "Dinov2WithRegistersConfig" + class Dinov2WithRegistersPatchEmbeddings(nn.Module): """ @@ -85,14 +84,6 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings -# General docstring -_CONFIG_FOR_DOC = "Dinov2WithRegistersConfig" - -# Base docstring -_CHECKPOINT_FOR_DOC = "facebook/dinov2_with_registers-base" -_EXPECTED_OUTPUT_SHAPE = [1, 257, 768] - - class Dinov2WithRegistersEmbeddings(nn.Module): """ Construct the CLS token, mask token, register tokens, position and patch embeddings. @@ -103,11 +94,7 @@ def __init__(self, config: Dinov2WithRegistersConfig) -> None: self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size)) - self.register_tokens = ( - nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) - if config.num_register_tokens - else None - ) + self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) self.patch_embeddings = Dinov2WithRegistersPatchEmbeddings(config) num_patches = self.patch_embeddings.num_patches self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) @@ -144,7 +131,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: mode="bicubic", align_corners=False, antialias=self.config.interpolate_antialias, - ).to(dtype=target_dtype) + ) + patch_pos_embed = patch_pos_embed.to(dtype=target_dtype) if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: raise ValueError("Width or height does not match with the interpolated position embeddings") patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) @@ -167,22 +155,15 @@ def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Te # add positional encoding to each token embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) - if self.register_tokens is not None: - embeddings = torch.cat( - ( - embeddings[:, :1], - self.register_tokens.expand(embeddings.shape[0], -1, -1), - embeddings[:, 1:], - ), - dim=1, - ) + # add register tokens + embeddings = torch.cat((embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1)), dim=1) + embeddings = torch.cat((embeddings, embeddings[:, 1:]), dim=1) embeddings = self.dropout(embeddings) return embeddings -# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->Dinov2WithRegisters class Dinov2WithRegistersSelfAttention(nn.Module): def __init__(self, config: Dinov2WithRegistersConfig) -> None: super().__init__() @@ -243,7 +224,47 @@ def forward( return outputs -# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->Dinov2WithRegisters +class Dinov2WithRegistersSdpaSelfAttention(Dinov2WithRegistersSelfAttention): + def __init__(self, config: Dinov2WithRegistersConfig) -> None: + super().__init__(config) + self.attention_probs_dropout_prob = config.attention_probs_dropout_prob + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "Dinov2WithRegistersModel is using Dinov2WithRegistersSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions + ) + + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + head_mask, + self.attention_probs_dropout_prob if self.training else 0.0, + is_causal=False, + scale=None, + ) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + return context_layer, None + + class Dinov2WithRegistersSelfOutput(nn.Module): """ The residual connection is defined in Dinov2WithRegistersLayer instead of here (as is the case with other models), due to the @@ -262,7 +283,6 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->Dinov2WithRegisters class Dinov2WithRegistersAttention(nn.Module): def __init__(self, config: Dinov2WithRegistersConfig) -> None: super().__init__() @@ -302,6 +322,12 @@ def forward( return outputs +class Dinov2WithRegistersSdpaAttention(Dinov2WithRegistersAttention): + def __init__(self, config: Dinov2WithRegistersConfig) -> None: + super().__init__(config) + self.attention = Dinov2WithRegistersSdpaSelfAttention(config) + + class Dinov2WithRegistersLayerScale(nn.Module): def __init__(self, config) -> None: super().__init__() @@ -380,6 +406,12 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: return self.weights_out(hidden) +DINOV2_WITH_REGISTERS_ATTENTION_CLASSES = { + "eager": Dinov2WithRegistersAttention, + "sdpa": Dinov2WithRegistersSdpaAttention, +} + + class Dinov2WithRegistersLayer(nn.Module): """This corresponds to the Block class in the original implementation.""" @@ -387,7 +419,7 @@ def __init__(self, config: Dinov2WithRegistersConfig) -> None: super().__init__() self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.attention = Dinov2WithRegistersAttention(config) + self.attention = DINOV2_WITH_REGISTERS_ATTENTION_CLASSES[config._attn_implementation](config) self.layer_scale1 = Dinov2WithRegistersLayerScale(config) self.drop_path = ( Dinov2WithRegistersDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() @@ -433,7 +465,6 @@ def forward( return outputs -# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->Dinov2WithRegisters class Dinov2WithRegistersEncoder(nn.Module): def __init__(self, config: Dinov2WithRegistersConfig) -> None: super().__init__() @@ -485,10 +516,6 @@ def forward( ) -# Image classification docstring -_IMAGE_CLASS_CHECKPOINT = "facebook/dinov2_with_registers-small-imagenet1k-1-layer" - - class Dinov2WithRegistersPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -500,6 +527,7 @@ class Dinov2WithRegistersPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" supports_gradient_checkpointing = True _no_split_modules = ["Dinov2WithRegistersSwiGLUFFN"] + _supports_sdpa = True def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: """Initialize the weights""" @@ -528,7 +556,7 @@ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> No ).to(module.cls_token.dtype) -_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" +_EXPECTED_OUTPUT_SHAPE = [1, 257, 768] DINOV2_WITH_REGISTERS_START_DOCSTRING = r""" @@ -542,28 +570,6 @@ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> No configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ -DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See - [`BitImageProcessor.preprocess`] for details. - - head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - DINOV2_WITH_REGISTERS_BASE_INPUTS_DOCSTRING = r""" Args: pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): @@ -677,6 +683,33 @@ def forward( ) +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "facebook/dinov2_with_registers-small-imagenet1k-1-layer" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" + +DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`BitImageProcessor.preprocess`] for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + @add_start_docstrings( """ Dinov2WithRegisters Model transformer with an image classification head on top (a linear layer on top of the final hidden state @@ -787,14 +820,14 @@ class Dinov2WithRegistersBackbone(Dinov2WithRegistersPreTrainedModel, BackboneMi def __init__(self, config): super().__init__(config) super()._init_backbone(config) - - self.num_register_tokens = config.num_register_tokens self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)] self.embeddings = Dinov2WithRegistersEmbeddings(config) self.encoder = Dinov2WithRegistersEncoder(config) self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.num_register_tokens = config.num_register_tokens + # Initialize weights and apply final processing self.post_init() @@ -814,6 +847,10 @@ def forward( Returns: Examples: + Returns: + + Examples: + ```python >>> from transformers import AutoImageProcessor, AutoBackbone diff --git a/src/transformers/models/dinov2_with_registers/diff_dinov2_with_registers.py b/src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py similarity index 99% rename from src/transformers/models/dinov2_with_registers/diff_dinov2_with_registers.py rename to src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py index bf93cc8742a..295456b52e1 100644 --- a/src/transformers/models/dinov2_with_registers/diff_dinov2_with_registers.py +++ b/src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py @@ -20,8 +20,7 @@ import torch.utils.checkpoint from torch import nn -from transformers import PretrainedConfig -from transformers.models.dinov2.modeling_dinov2 import ( +from ....transformers.models.dinov2.modeling_dinov2 import ( Dinov2Backbone, Dinov2Encoder, Dinov2ForImageClassification, @@ -29,6 +28,7 @@ Dinov2PatchEmbeddings, ) +from ...configuration_utils import PretrainedConfig from ...modeling_outputs import BackboneOutput from ...utils import logging from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices