huggingface
diff --git a/Makefile b/Makefile
Lines changed: 1 addition & 1 deletion
diff --git a/src/transformers/models/chameleon/image_processing_chameleon.py b/src/transformers/models/chameleon/image_processing_chameleon.py
Lines changed: 5 additions & 5 deletions
diff --git a/src/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py b/src/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py
Lines changed: 4 additions & 3 deletions
diff --git a/src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py b/src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py
Lines changed: 3 additions & 3 deletions
diff --git a/src/transformers/models/falcon_h1/modeling_falcon_h1.py b/src/transformers/models/falcon_h1/modeling_falcon_h1.py
Lines changed: 3 additions & 4 deletions
diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py
Lines changed: 3 additions & 3 deletions
diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py
Lines changed: 3 additions & 3 deletions
diff --git a/src/transformers/models/got_ocr2/configuration_got_ocr2.py b/src/transformers/models/got_ocr2/configuration_got_ocr2.py
Lines changed: 3 additions & 3 deletions
diff --git a/src/transformers/models/got_ocr2/modular_got_ocr2.py b/src/transformers/models/got_ocr2/modular_got_ocr2.py
Lines changed: 3 additions & 3 deletions
diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
Lines changed: 6 additions & 6 deletions
diff --git a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/transformers/models/janus/configuration_janus.py b/src/transformers/models/janus/configuration_janus.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/transformers/models/janus/image_processing_janus.py b/src/transformers/models/janus/image_processing_janus.py
Lines changed: 3 additions & 3 deletions
@@ -86,11 +86,11 @@ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency
 
 fix-copies:
 	python utils/check_copies.py --fix_and_overwrite
+	python utils/check_docstrings.py --fix_and_overwrite
 	python utils/check_modular_conversion.py --fix_and_overwrite
 	python utils/check_dummies.py --fix_and_overwrite
 	python utils/check_pipeline_typing.py --fix_and_overwrite
 	python utils/check_doctest_list.py --fix_and_overwrite
-	python utils/check_docstrings.py --fix_and_overwrite
 
 # Run tests for the library
 
 
@@ -49,7 +49,7 @@ class ChameleonImageProcessor(BaseImageProcessor):
     Constructs a Chameleon image processor.
 
     Args:
-        do_resize (`bool`, *optional*, defaults to `True`):
+        do_resize (`bool`, *optional*, defaults to True):
             Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
             `do_resize` in the `preprocess` method.
         size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 512}`):
@@ -58,19 +58,19 @@ class ChameleonImageProcessor(BaseImageProcessor):
             method.
         resample (`PILImageResampling`, *optional*, defaults to 1):
             Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
-        do_center_crop (`bool`, *optional*, defaults to `True`):
+        do_center_crop (`bool`, *optional*, defaults to True):
             Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
             `preprocess` method.
         crop_size (`dict[str, int]` *optional*, defaults to {"height": 512, "width": 512}):
             Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
             method.
-        do_rescale (`bool`, *optional*, defaults to `True`):
+        do_rescale (`bool`, *optional*, defaults to True):
             Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
             the `preprocess` method.
         rescale_factor (`int` or `float`, *optional*, defaults to 0.0078):
             Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
             method.
-        do_normalize (`bool`, *optional*, defaults to `True`):
+        do_normalize (`bool`, *optional*, defaults to True):
             Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
         image_mean (`float` or `list[float]`, *optional*, defaults to `[1.0, 1.0, 1.0]`):
             Mean to use if normalizing the image. This is a float or list of floats the length of the number of
@@ -79,7 +79,7 @@ class ChameleonImageProcessor(BaseImageProcessor):
             Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
             number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
             Can be overridden by the `image_std` parameter in the `preprocess` method.
-        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+        do_convert_rgb (`bool`, *optional*, defaults to True):
             Whether to convert the image to RGB.
     """
 
 
@@ -20,6 +20,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+
 from ...configuration_utils import PretrainedConfig
 from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
 
@@ -60,7 +61,7 @@ class Dinov2WithRegistersConfig(BackboneConfigMixin, PretrainedConfig):
             The size (resolution) of each patch.
         num_channels (`int`, *optional*, defaults to 3):
             The number of input channels.
-        qkv_bias (`bool`, *optional*, defaults to `True`):
+        qkv_bias (`bool`, *optional*, defaults to True):
             Whether to add a bias to the queries, keys and values.
         layerscale_value (`float`, *optional*, defaults to 1.0):
            Initial value to use for layer scale.
@@ -80,9 +81,9 @@ class Dinov2WithRegistersConfig(BackboneConfigMixin, PretrainedConfig):
             many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
             If unset and `out_features` is unset, will default to the last stage. Must be in the
             same order as defined in the `stage_names` attribute.
-        apply_layernorm (`bool`, *optional*, defaults to `True`):
+        apply_layernorm (`bool`, *optional*, defaults to True):
             Whether to apply layer normalization to the feature maps in case the model is used as backbone.
-        reshape_hidden_states (`bool`, *optional*, defaults to `True`):
+        reshape_hidden_states (`bool`, *optional*, defaults to True):
             Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in
             case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size,
             seq_len, hidden_size)`.
 
@@ -74,7 +74,7 @@ class Dinov2WithRegistersConfig(BackboneConfigMixin, PretrainedConfig):
             The size (resolution) of each patch.
         num_channels (`int`, *optional*, defaults to 3):
             The number of input channels.
-        qkv_bias (`bool`, *optional*, defaults to `True`):
+        qkv_bias (`bool`, *optional*, defaults to True):
             Whether to add a bias to the queries, keys and values.
         layerscale_value (`float`, *optional*, defaults to 1.0):
            Initial value to use for layer scale.
@@ -94,9 +94,9 @@ class Dinov2WithRegistersConfig(BackboneConfigMixin, PretrainedConfig):
             many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
             If unset and `out_features` is unset, will default to the last stage. Must be in the
             same order as defined in the `stage_names` attribute.
-        apply_layernorm (`bool`, *optional*, defaults to `True`):
+        apply_layernorm (`bool`, *optional*, defaults to True):
             Whether to apply layer normalization to the feature maps in case the model is used as backbone.
-        reshape_hidden_states (`bool`, *optional*, defaults to `True`):
+        reshape_hidden_states (`bool`, *optional*, defaults to True):
             Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in
             case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size,
             seq_len, hidden_size)`.
 
@@ -30,8 +30,7 @@
 import torch.nn.functional as F
 from torch import nn
 
-from transformers.activations import ACT2FN
-
+from ...activations import ACT2FN
 from ...cache_utils import (
     Cache,
     DynamicCache,  # we need __iter__ and __len__ of pkv
@@ -503,7 +502,7 @@ def apply_mask_to_padding_states(hidden_states, attention_mask):
     return hidden_states
 
 
-# Adapted from transformers.models.mamba2.modeling_mamba2.Mamba2Mixer
+# Adapted from ..mamba2.modeling_mamba2.Mamba2Mixer
 class FalconH1Mixer(nn.Module):
     """
     FalconH1Mixer is identical to classic Mamba2 mixer classes but differs on two different things
@@ -1219,7 +1218,7 @@ def compute_mup_vector(config):
 
 
 @auto_docstring
-# Adapted from transformers.models.jamba.modeling_jamba.JambaModel
+# Adapted from ..jamba.modeling_jamba.JambaModel
 class FalconH1Model(FalconH1PreTrainedModel):
     def __init__(self, config: FalconH1Config):
         super().__init__(config)
 
@@ -69,7 +69,7 @@ class Gemma3TextConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
             The epsilon used by the rms normalization layers.
-        use_cache (`bool`, *optional*, defaults to `True`):
+        use_cache (`bool`, *optional*, defaults to True):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
         pad_token_id (`int`, *optional*, defaults to 0):
@@ -78,11 +78,11 @@ class Gemma3TextConfig(PretrainedConfig):
             End of stream token id.
         bos_token_id (`int`, *optional*, defaults to 2):
             Beginning of stream token id.
-        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+        tie_word_embeddings (`bool`, *optional*, defaults to True):
             Whether to tie weight embeddings
         rope_theta (`float`, *optional*, defaults to 1000000.0):
             The base period of the RoPE embeddings.
-        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to False):
             Whether to use a bias in the query, key, value and output projection layers during self-attention.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
 
@@ -95,7 +95,7 @@ class Gemma3TextConfig(Gemma2Config, PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
             The epsilon used by the rms normalization layers.
-        use_cache (`bool`, *optional*, defaults to `True`):
+        use_cache (`bool`, *optional*, defaults to True):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
         pad_token_id (`int`, *optional*, defaults to 0):
@@ -104,11 +104,11 @@ class Gemma3TextConfig(Gemma2Config, PretrainedConfig):
             End of stream token id.
         bos_token_id (`int`, *optional*, defaults to 2):
             Beginning of stream token id.
-        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+        tie_word_embeddings (`bool`, *optional*, defaults to True):
             Whether to tie weight embeddings
         rope_theta (`float`, *optional*, defaults to 1000000.0):
             The base period of the RoPE embeddings.
-        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to False):
             Whether to use a bias in the query, key, value and output projection layers during self-attention.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
 
@@ -57,11 +57,11 @@ class GotOcr2VisionConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         initializer_range (`float`, *optional*, defaults to 1e-10):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        qkv_bias (`bool`, *optional*, defaults to `True`):
+        qkv_bias (`bool`, *optional*, defaults to True):
             Whether to add a bias to query, key, value projections.
-        use_abs_pos (`bool`, *optional*, defaults to `True`):
+        use_abs_pos (`bool`, *optional*, defaults to True):
             Whether to use absolute position embedding.
-        use_rel_pos (`bool`, *optional*, defaults to `True`):
+        use_rel_pos (`bool`, *optional*, defaults to True):
             Whether to use relative position embedding.
         window_size (`int`, *optional*, defaults to 14):
             Window size for relative position.
 
@@ -72,11 +72,11 @@ class GotOcr2VisionConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         initializer_range (`float`, *optional*, defaults to 1e-10):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        qkv_bias (`bool`, *optional*, defaults to `True`):
+        qkv_bias (`bool`, *optional*, defaults to True):
             Whether to add a bias to query, key, value projections.
-        use_abs_pos (`bool`, *optional*, defaults to `True`):
+        use_abs_pos (`bool`, *optional*, defaults to True):
             Whether to use absolute position embedding.
-        use_rel_pos (`bool`, *optional*, defaults to `True`):
+        use_rel_pos (`bool`, *optional*, defaults to True):
             Whether to use relative position embedding.
         window_size (`int`, *optional*, defaults to 14):
             Window size for relative position.
 
@@ -253,10 +253,10 @@ class Idefics3ImageProcessor(BaseImageProcessor):
     r"""
     Constructs a Idefics3 image processor.
     Args:
-        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+        do_convert_rgb (`bool`, *optional*, defaults to True):
             Whether to convert the image to RGB. This is useful if the input image is of a different format e.g. RGBA.
             Only has an effect if the input image is in the PIL format.
-        do_resize (`bool`, *optional*, defaults to `True`):
+        do_resize (`bool`, *optional*, defaults to True):
             Whether to resize the image. The longest edge of the image is resized to  be <= `size["longest_edge"]`, with the
             shortest edge resized to keep the input aspect ratio.
         size (`Dict`, *optional*, defaults to `{"longest_edge": 4 * 364}`):
@@ -265,16 +265,16 @@ class Idefics3ImageProcessor(BaseImageProcessor):
             to keep the input aspect ratio.
         resample (`Resampling`, *optional*, defaults to `Resampling.LANCZOS`):
             Resampling filter to use when resizing the image.
-        do_image_splitting (`bool`, *optional*, defaults to `True`):
+        do_image_splitting (`bool`, *optional*, defaults to True):
             Whether to split the image into sub-images concatenated with the original image. They are split into patches
             such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`.
         max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`):
             Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge".
-        do_rescale (`bool`, *optional*, defaults to `True`):
+        do_rescale (`bool`, *optional*, defaults to True):
             Whether to rescale the image. If set to `True`, the image is rescaled to have pixel values between 0 and 1.
         rescale_factor (`float`, *optional*, defaults to `1/255`):
             Rescale factor to rescale the image by if `do_rescale` is set to `True`.
-        do_normalize (`bool`, *optional*, defaults to `True`):
+        do_normalize (`bool`, *optional*, defaults to True):
             Whether to normalize the image. If set to `True`, the image is normalized to have a mean of `image_mean` and
             a standard deviation of `image_std`.
         image_mean (`float` or `list[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
@@ -285,7 +285,7 @@ class Idefics3ImageProcessor(BaseImageProcessor):
             Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
             number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
             Can be overridden by the `image_std` parameter in the `preprocess` method.
-        do_pad (`bool`, *optional*, defaults to `True`):
+        do_pad (`bool`, *optional*, defaults to True):
             Whether or not to pad the images to the largest height and width in the batch and number of images per
             sample in the batch, such that the returned tensor is of shape (batch_size, max_num_images, num_channels, max_height, max_width).
     """
 
@@ -21,9 +21,9 @@
 
 
 from ...configuration_utils import PretrainedConfig
-from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 from ...utils import logging
 from ..auto import CONFIG_MAPPING, AutoConfig
+from ..auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 
 
 logger = logging.get_logger(__name__)
 
@@ -56,7 +56,7 @@ class JanusVisionConfig(PretrainedConfig):
             `"relu"`, `"selu"`, and `"gelu_new"` are supported.
         mlp_ratio (`float`, *optional*, defaults to 4.0):
             Ratio of MLP hidden dimensionality to embedding dimensionality.
-        attention_bias (`bool`, *optional*, defaults to `True`):
+        attention_bias (`bool`, *optional*, defaults to True):
             Whether to add a bias to the queries, keys, and values in the attention layers.
         hidden_dropout_rate (`float`, *optional*, defaults to 0.0):
             The dropout probability for fully connected layers in the encoder.
 
@@ -61,7 +61,7 @@ class JanusImageProcessor(BaseImageProcessor):
     Constructs a JANUS image processor.
 
     Args:
-        do_resize (`bool`, *optional*, defaults to `True`):
+        do_resize (`bool`, *optional*, defaults to True):
             Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
             `do_resize` parameter in the `preprocess` method.
         size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
@@ -73,13 +73,13 @@ class JanusImageProcessor(BaseImageProcessor):
         resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
             Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
             overridden by the `resample` parameter in the `preprocess` method.
-        do_rescale (`bool`, *optional*, defaults to `True`):
+        do_rescale (`bool`, *optional*, defaults to True):
             Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
             `do_rescale` parameter in the `preprocess` method.
         rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
             Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
             overridden by the `rescale_factor` parameter in the `preprocess` method.
-        do_normalize (`bool`, *optional*, defaults to `True`):
+        do_normalize (`bool`, *optional*, defaults to True):
             Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
             method. Can be overridden by the `do_normalize` parameter in the `preprocess` method.
         image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):