
Commit 0bddcb3

Apply all diffs
1 parent 79f0cc6 commit 0bddcb3

12 files changed: +189 −205 lines changed

src/transformers/models/altclip/modeling_altclip.py

Lines changed: 12 additions & 13 deletions
@@ -198,15 +198,15 @@ def to_tuple(self) -> Tuple[Any]:
 
 # Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->AltRoberta
 class AltRobertaEmbeddings(nn.Module):
-    """
-    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
-    """
+    """Construct the embeddings from word, position and token_type embeddings."""
 
-    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
     def __init__(self, config):
         super().__init__()
+        self.padding_idx = config.pad_token_id
         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
-        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
+        )
         self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
 
         # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
@@ -222,15 +222,14 @@ def __init__(self, config):
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
         )
 
-        # End copy
-        self.padding_idx = config.pad_token_id
-        self.position_embeddings = nn.Embedding(
-            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
-        )
-
     def forward(
-        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
-    ):
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
         if position_ids is None:
             if input_ids is not None:
                 # Create the position ids from the input token ids. Any padded tokens remain padded.
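
Note on the embeddings change (same pattern in the BridgeTower, Camembert, Clap and Data2Vec copies below): `padding_idx` is now set before the embedding tables are built, so `position_embeddings` is constructed once with `padding_idx=self.padding_idx` instead of being re-created after the copied BERT `__init__`; the resulting module is unchanged. A rough sketch of the padding-aware position ids this pairs with, assuming RoBERTa-style numbering (the helper below is illustrative, not part of the diff):

import torch

def position_ids_from_input_ids(input_ids: torch.Tensor, padding_idx: int) -> torch.Tensor:
    # Positions count up over non-padding tokens only; padded slots keep padding_idx,
    # so an nn.Embedding built with the same padding_idx maps them to an all-zero vector.
    mask = input_ids.ne(padding_idx).int()
    incremental = torch.cumsum(mask, dim=1).type_as(mask) * mask
    return incremental.long() + padding_idx

ids = torch.tensor([[0, 31, 47, 1, 1]])      # assume pad_token_id == 1
print(position_ids_from_input_ids(ids, 1))   # tensor([[2, 3, 4, 1, 1]])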

src/transformers/models/bridgetower/modeling_bridgetower.py

Lines changed: 12 additions & 13 deletions
@@ -925,15 +925,15 @@ def forward(
 
 # Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->BridgeTowerText
 class BridgeTowerTextEmbeddings(nn.Module):
-    """
-    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
-    """
+    """Construct the embeddings from word, position and token_type embeddings."""
 
-    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
     def __init__(self, config):
         super().__init__()
+        self.padding_idx = config.pad_token_id
         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
-        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
+        )
         self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
 
         # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
@@ -949,15 +949,14 @@ def __init__(self, config):
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
         )
 
-        # End copy
-        self.padding_idx = config.pad_token_id
-        self.position_embeddings = nn.Embedding(
-            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
-        )
-
     def forward(
-        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
-    ):
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
         if position_ids is None:
             if input_ids is not None:
                 # Create the position ids from the input token ids. Any padded tokens remain padded.

src/transformers/models/camembert/modeling_camembert.py

Lines changed: 20 additions & 21 deletions
@@ -78,15 +78,15 @@
 
 # Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->Camembert
 class CamembertEmbeddings(nn.Module):
-    """
-    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
-    """
+    """Construct the embeddings from word, position and token_type embeddings."""
 
-    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
     def __init__(self, config):
         super().__init__()
+        self.padding_idx = config.pad_token_id
         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
-        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
+        )
         self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
 
         # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
@@ -102,15 +102,14 @@ def __init__(self, config):
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
         )
 
-        # End copy
-        self.padding_idx = config.pad_token_id
-        self.position_embeddings = nn.Embedding(
-            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
-        )
-
     def forward(
-        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
-    ):
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
         if position_ids is None:
             if input_ids is not None:
                 # Create the position ids from the input token ids. Any padded tokens remain padded.
@@ -1468,14 +1467,14 @@ def __init__(self, config):
     )
     def forward(
         self,
-        input_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        token_type_ids: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        start_positions: Optional[torch.LongTensor] = None,
-        end_positions: Optional[torch.LongTensor] = None,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        start_positions: Optional[torch.Tensor] = None,
+        end_positions: Optional[torch.Tensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,

src/transformers/models/clap/modeling_clap.py

Lines changed: 12 additions & 13 deletions
@@ -1139,15 +1139,15 @@ def forward(self, hidden_states):
 
 # Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->ClapText, persistent=False->persistent=True
 class ClapTextEmbeddings(nn.Module):
-    """
-    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
-    """
+    """Construct the embeddings from word, position and token_type embeddings."""
 
-    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
     def __init__(self, config):
         super().__init__()
+        self.padding_idx = config.pad_token_id
         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
-        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
+        )
         self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
 
         # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
@@ -1163,15 +1163,14 @@ def __init__(self, config):
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=True
         )
 
-        # End copy
-        self.padding_idx = config.pad_token_id
-        self.position_embeddings = nn.Embedding(
-            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
-        )
-
     def forward(
-        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
-    ):
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
         if position_ids is None:
             if input_ids is not None:
                 # Create the position ids from the input token ids. Any padded tokens remain padded.
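
The one divergence from the other copies is flagged in the `# Copied from` line: ClapText registers its `token_type_ids` buffer with `persistent=True`, so it is included in the checkpoint's `state_dict`, while the other models use `persistent=False`. A minimal illustration of that flag (toy module, not the Clap code):

import torch
import torch.nn as nn

m = nn.Module()
m.register_buffer("kept", torch.zeros(3), persistent=True)      # saved by state_dict()
m.register_buffer("dropped", torch.zeros(3), persistent=False)  # kept on the module, not saved
print(list(m.state_dict().keys()))                               # ['kept']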

src/transformers/models/data2vec/modeling_data2vec_text.py

Lines changed: 13 additions & 14 deletions
@@ -56,17 +56,17 @@
 _CONFIG_FOR_DOC = "Data2VecTextConfig"
 
 
-# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->Data2VecText
+# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->Data2VecTextForText
 class Data2VecTextForTextEmbeddings(nn.Module):
-    """
-    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
-    """
+    """Construct the embeddings from word, position and token_type embeddings."""
 
-    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
     def __init__(self, config):
         super().__init__()
+        self.padding_idx = config.pad_token_id
         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
-        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
+        )
         self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
 
         # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
@@ -82,15 +82,14 @@ def __init__(self, config):
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
         )
 
-        # End copy
-        self.padding_idx = config.pad_token_id
-        self.position_embeddings = nn.Embedding(
-            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
-        )
-
     def forward(
-        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
-    ):
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
         if position_ids is None:
             if input_ids is not None:
                 # Create the position ids from the input token ids. Any padded tokens remain padded.

src/transformers/models/gemma2/modeling_gemma2.py

Lines changed: 3 additions & 2 deletions
@@ -195,12 +195,12 @@ def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
         self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
         self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
-        self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None
         self.rotary_emb = Gemma2RotaryEmbedding(
             self.head_dim,
             max_position_embeddings=self.max_position_embeddings,
             base=self.rope_theta,
         )
+        self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None
 
     def forward(
         self,
@@ -492,12 +492,12 @@ def __init__(self, config: Gemma2Config, layer_idx: int):
         self.self_attn = GEMMA2_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
         self.mlp = Gemma2MLP(config)
         self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.config = config
         self.is_sliding = not bool(layer_idx % 2)
         self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.sliding_window = config.sliding_window
-        self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
         self,
@@ -862,6 +862,7 @@ def forward(
             attentions=all_self_attns,
         )
 
+    @torch.no_grad()
     def _update_causal_mask(
         self,
         attention_mask: torch.Tensor,
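
Two of the Gemma2 edits only reorder `__init__` assignments (attribute and submodule registration order does not affect the computation), while the third adds `@torch.no_grad()` to `_update_causal_mask`, keeping mask construction out of the autograd graph. A toy illustration of the decorator's effect (not the Gemma2 code):

import torch

@torch.no_grad()
def build_mask(scores: torch.Tensor) -> torch.Tensor:
    # Everything created inside runs with gradient tracking disabled.
    return torch.tril(torch.ones_like(scores))

x = torch.ones(3, 3, requires_grad=True)
print(build_mask(x).requires_grad)  # False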

src/transformers/models/llava_next_video/modular_llava_next_video.py

Lines changed: 0 additions & 2 deletions
@@ -30,7 +30,6 @@
 from ...configuration_utils import PretrainedConfig
 from ...utils import (
     logging,
-    replace_return_docstrings,
 )
 from ..auto import CONFIG_MAPPING
 
@@ -269,7 +268,6 @@ def _get_video_features(self, pixel_values):
         image_features = torch.split(image_features, frames, dim=0)
         return image_features
 
-    @replace_return_docstrings(output_type=LlavaNextVideoCausalLMOutputWithPast, config_class="LlavaNextVideoConfig")
     def forward(
         self,
         input_ids: torch.LongTensor = None,
