
Commit 296a4c8

update
1 parent a13e790 commit 296a4c8

File tree

2 files changed: +40 −271 lines changed


src/transformers/models/roberta/modular_roberta.py

+14 −257
@@ -50,6 +50,9 @@
     BertSdpaSelfAttention,
     BertSelfAttention,
     BertSelfOutput,
+    BertForMultipleChoice,
+    BertForTokenClassification,
+    BertForQuestionAnswering,
 )
 from .configuration_roberta import RobertaConfig
 
@@ -79,8 +82,8 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l
 
 class RobertaEmbeddings(BertEmbeddings):
     def __init__(self, config):
-        super().__init__(config)
         self.padding_idx = config.pad_token_id
+        super().__init__(config)
         self.position_embeddings = nn.Embedding(
             config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
         )
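
A note on the reordering above: `self.padding_idx` is now assigned before `super().__init__(config)` runs, so the attribute already exists by the time the parent constructor executes (and, presumably, by the time the converter-generated modeling file reaches the equivalent statements — an inference about the modular tooling, not something this commit states), and the `nn.Embedding` built right after can read it via `padding_idx=self.padding_idx`. A minimal, self-contained sketch of the ordering constraint, using toy stand-in classes rather than the real Transformers ones:

import torch.nn as nn


class ParentEmbeddings(nn.Module):
    """Toy stand-in for BertEmbeddings."""

    def __init__(self, hidden_size: int, vocab_size: int):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, hidden_size)


class ChildEmbeddings(ParentEmbeddings):
    """Toy stand-in for RobertaEmbeddings: set the plain attribute first,
    then run the parent constructor, then build the layer that needs it."""

    def __init__(self, hidden_size: int, vocab_size: int, pad_token_id: int, max_positions: int):
        # A plain (non-module, non-parameter) attribute may be set before
        # nn.Module.__init__ has run; a submodule may not, which is why only
        # the padding_idx assignment can move above super().__init__.
        self.padding_idx = pad_token_id
        super().__init__(hidden_size, vocab_size)
        self.position_embeddings = nn.Embedding(max_positions, hidden_size, padding_idx=self.padding_idx)


emb = ChildEmbeddings(hidden_size=8, vocab_size=100, pad_token_id=1, max_positions=16)
assert emb.position_embeddings.padding_idx == 1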
@@ -184,10 +187,6 @@ class RobertaLayer(BertLayer):
     def __init__(self, config):
         super().__init__(config)
         self.attention = RobertaAttention(config)
-        if self.add_cross_attention:
-            if not self.is_decoder:
-                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
-            self.crossattention = RobertaAttention(config, position_embedding_type="absolute")
         self.intermediate = RobertaIntermediate(config)
         self.output = RobertaOutput(config)
 
@@ -237,8 +236,6 @@ def __init__(self, config, add_pooling_layer=True):
         self.embeddings = RobertaEmbeddings(config)
         self.encoder = RobertaEncoder(config)
         self.pooler = RobertaPooler(config) if add_pooling_layer else None
-        # Initialize weights and apply final processing
-        self.post_init()
 
 
 ROBERTA_INPUTS_DOCSTRING = None  # To use value from modeling_bert
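
The hunks below are where most of the 257 deleted lines come from: each RoBERTa task head becomes a thin subclass of the corresponding Bert class imported at the top of this diff. As plain Python, a subclass with an empty (or nearly empty) body already inherits the parent's __init__ and forward wholesale, which is what makes the copied implementations removable. The extra idioms that remain (`del classifier_dropout`, `def forward(**super_kwargs): super().forward()`) look like directives for the library's modular-file converter rather than ordinary runtime code; that reading is an inference, not something stated in this commit. A toy illustration of the plain-inheritance half, with stand-in classes rather than the real Transformers ones:

import torch
import torch.nn as nn


class BertStyleHead(nn.Module):
    """Stand-in for a Bert task class whose implementation is being reused."""

    def __init__(self, hidden_size: int, num_labels: int):
        super().__init__()
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.classifier(hidden_states)


class RobertaStyleHead(BertStyleHead):
    """Nothing overridden: __init__ and forward both come from the parent,
    which is why the duplicated task-head code below can simply be deleted."""


head = RobertaStyleHead(hidden_size=8, num_labels=3)
assert head(torch.randn(2, 8)).shape == (2, 3)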
@@ -631,119 +628,16 @@ def forward(
         )
 
 
-@add_start_docstrings(
-    """
-    Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
-    softmax) e.g. for RocStories/SWAG tasks.
-    """,
-    ROBERTA_START_DOCSTRING,
-)
-class RobertaForMultipleChoice(RobertaPreTrainedModel):
+class RobertaForMultipleChoice(BertForMultipleChoice):
     def __init__(self, config):
         super().__init__(config)
-
         self.roberta = RobertaModel(config)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, 1)
+        del classifier_dropout
 
-        # Initialize weights and apply final processing
-        self.post_init()
 
-    @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
-    @add_code_sample_docstrings(
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=MultipleChoiceModelOutput,
-        config_class=_CONFIG_FOR_DOC,
-    )
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        token_type_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
-            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
-            `input_ids` above)
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
-
-        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
-        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
-        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
-        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
-        flat_inputs_embeds = (
-            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
-            if inputs_embeds is not None
-            else None
-        )
-
-        outputs = self.roberta(
-            flat_input_ids,
-            position_ids=flat_position_ids,
-            token_type_ids=flat_token_type_ids,
-            attention_mask=flat_attention_mask,
-            head_mask=head_mask,
-            inputs_embeds=flat_inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        pooled_output = outputs[1]
-
-        pooled_output = self.dropout(pooled_output)
-        logits = self.classifier(pooled_output)
-        reshaped_logits = logits.view(-1, num_choices)
-
-        loss = None
-        if labels is not None:
-            # move labels to correct device to enable model parallelism
-            labels = labels.to(reshaped_logits.device)
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(reshaped_logits, labels)
-
-        if not return_dict:
-            output = (reshaped_logits,) + outputs[2:]
-            return ((loss,) + output) if loss is not None else output
-
-        return MultipleChoiceModelOutput(
-            loss=loss,
-            logits=reshaped_logits,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
-
-
-@add_start_docstrings(
-    """
-    Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
-    Named-Entity-Recognition (NER) tasks.
-    """,
-    ROBERTA_START_DOCSTRING,
-)
-class RobertaForTokenClassification(RobertaPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.roberta = RobertaModel(config, add_pooling_layer=False)
-        classifier_dropout = (
-            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
-        )
-        self.dropout = nn.Dropout(classifier_dropout)
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-
-        # Initialize weights and apply final processing
-        self.post_init()
+class RobertaForTokenClassification(BertForTokenClassification):
 
     @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
@@ -753,59 +647,8 @@ def __init__(self, config):
         expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
         expected_loss=0.01,
     )
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        token_type_ids: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        outputs = self.roberta(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        sequence_output = outputs[0]
-
-        sequence_output = self.dropout(sequence_output)
-        logits = self.classifier(sequence_output)
-
-        loss = None
-        if labels is not None:
-            # move labels to correct device to enable model parallelism
-            labels = labels.to(logits.device)
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-
-        if not return_dict:
-            output = (logits,) + outputs[2:]
-            return ((loss,) + output) if loss is not None else output
-
-        return TokenClassifierOutput(
-            loss=loss,
-            logits=logits,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
+    def forward(**super_kwargs) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
+        super().forward()
 
 
 class RobertaClassificationHead(nn.Module):
@@ -830,24 +673,8 @@ def forward(self, features, **kwargs):
         return x
 
 
-@add_start_docstrings(
-    """
-    Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
-    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
-    """,
-    ROBERTA_START_DOCSTRING,
-)
-class RobertaForQuestionAnswering(RobertaPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.roberta = RobertaModel(config, add_pooling_layer=False)
-        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
+class RobertaForQuestionAnswering(BertForQuestionAnswering):
+
     @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         checkpoint="deepset/roberta-base-squad2",
@@ -856,76 +683,6 @@ def __init__(self, config):
         expected_output="' puppet'",
         expected_loss=0.86,
     )
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        token_type_ids: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        start_positions: Optional[torch.LongTensor] = None,
-        end_positions: Optional[torch.LongTensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
-        r"""
-        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
-            are not taken into account for computing the loss.
-        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
-            are not taken into account for computing the loss.
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        outputs = self.roberta(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        sequence_output = outputs[0]
-
-        logits = self.qa_outputs(sequence_output)
-        start_logits, end_logits = logits.split(1, dim=-1)
-        start_logits = start_logits.squeeze(-1).contiguous()
-        end_logits = end_logits.squeeze(-1).contiguous()
-
-        total_loss = None
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions = start_positions.clamp(0, ignored_index)
-            end_positions = end_positions.clamp(0, ignored_index)
-
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-
-        if not return_dict:
-            output = (start_logits, end_logits) + outputs[2:]
-            return ((total_loss,) + output) if total_loss is not None else output
-
-        return QuestionAnsweringModelOutput(
-            loss=total_loss,
-            start_logits=start_logits,
-            end_logits=end_logits,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
+    def forward(**super_kwargs) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
+        super().forward()
+
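
Because the rewritten RobertaForQuestionAnswering is meant to be behaviourally identical to the implementation it replaces, a quick smoke test against the checkpoint named in the retained docstring sample (deepset/roberta-base-squad2) is one way to sanity-check the change. A sketch, assuming a standard transformers install with network access; the question/context pair mirrors the classic docstring example, whose expected answer is the "' puppet'" span quoted in the sample above:

import torch
from transformers import AutoTokenizer, RobertaForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

question, context = "Who was Jim Henson?", "Jim Henson was a nice puppet"
inputs = tokenizer(question, context, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# Take the most likely start/end token positions and decode the span between them.
start = int(outputs.start_logits.argmax())
end = int(outputs.end_logits.argmax())
answer = tokenizer.decode(inputs["input_ids"][0, start : end + 1])
print(repr(answer))  # the docstring sample expects "' puppet'"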
