     BertSdpaSelfAttention,
     BertSelfAttention,
     BertSelfOutput,
+    BertForMultipleChoice,
+    BertForTokenClassification,
+    BertForQuestionAnswering,
 )
 from .configuration_roberta import RobertaConfig
 
@@ -79,8 +82,8 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l
 
 class RobertaEmbeddings(BertEmbeddings):
     def __init__(self, config):
-        super().__init__(config)
         self.padding_idx = config.pad_token_id
+        super().__init__(config)
         self.position_embeddings = nn.Embedding(
             config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
         )
@@ -184,10 +187,6 @@ class RobertaLayer(BertLayer):
     def __init__(self, config):
         super().__init__(config)
         self.attention = RobertaAttention(config)
-        if self.add_cross_attention:
-            if not self.is_decoder:
-                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
-            self.crossattention = RobertaAttention(config, position_embedding_type="absolute")
         self.intermediate = RobertaIntermediate(config)
         self.output = RobertaOutput(config)
 
@@ -237,8 +236,6 @@ def __init__(self, config, add_pooling_layer=True):
         self.embeddings = RobertaEmbeddings(config)
         self.encoder = RobertaEncoder(config)
         self.pooler = RobertaPooler(config) if add_pooling_layer else None
-        # Initialize weights and apply final processing
-        self.post_init()
 
 
 ROBERTA_INPUTS_DOCSTRING = None  # To use value from modeling_bert
@@ -631,119 +628,16 @@ def forward(
         )
 
 
-@add_start_docstrings(
-    """
-    Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
-    softmax) e.g. for RocStories/SWAG tasks.
-    """,
-    ROBERTA_START_DOCSTRING,
-)
-class RobertaForMultipleChoice(RobertaPreTrainedModel):
+class RobertaForMultipleChoice(BertForMultipleChoice):
     def __init__(self, config):
         super().__init__(config)
-
         self.roberta = RobertaModel(config)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, 1)
+        del classifier_dropout
 
-        # Initialize weights and apply final processing
-        self.post_init()
 
-    @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
-    @add_code_sample_docstrings(
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=MultipleChoiceModelOutput,
-        config_class=_CONFIG_FOR_DOC,
-    )
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        token_type_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
-            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
-            `input_ids` above)
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
-
-        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
-        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
-        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
-        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
-        flat_inputs_embeds = (
-            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
-            if inputs_embeds is not None
-            else None
-        )
-
-        outputs = self.roberta(
-            flat_input_ids,
-            position_ids=flat_position_ids,
-            token_type_ids=flat_token_type_ids,
-            attention_mask=flat_attention_mask,
-            head_mask=head_mask,
-            inputs_embeds=flat_inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        pooled_output = outputs[1]
-
-        pooled_output = self.dropout(pooled_output)
-        logits = self.classifier(pooled_output)
-        reshaped_logits = logits.view(-1, num_choices)
-
-        loss = None
-        if labels is not None:
-            # move labels to correct device to enable model parallelism
-            labels = labels.to(reshaped_logits.device)
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(reshaped_logits, labels)
-
-        if not return_dict:
-            output = (reshaped_logits,) + outputs[2:]
-            return ((loss,) + output) if loss is not None else output
-
-        return MultipleChoiceModelOutput(
-            loss=loss,
-            logits=reshaped_logits,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
-
-
-@add_start_docstrings(
-    """
-    Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
-    Named-Entity-Recognition (NER) tasks.
-    """,
-    ROBERTA_START_DOCSTRING,
-)
-class RobertaForTokenClassification(RobertaPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.roberta = RobertaModel(config, add_pooling_layer=False)
-        classifier_dropout = (
-            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
-        )
-        self.dropout = nn.Dropout(classifier_dropout)
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-
-        # Initialize weights and apply final processing
-        self.post_init()
+class RobertaForTokenClassification(BertForTokenClassification):
 
     @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
@@ -753,59 +647,8 @@ def __init__(self, config):
         expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
         expected_loss=0.01,
     )
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        token_type_ids: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        outputs = self.roberta(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        sequence_output = outputs[0]
-
-        sequence_output = self.dropout(sequence_output)
-        logits = self.classifier(sequence_output)
-
-        loss = None
-        if labels is not None:
-            # move labels to correct device to enable model parallelism
-            labels = labels.to(logits.device)
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-
-        if not return_dict:
-            output = (logits,) + outputs[2:]
-            return ((loss,) + output) if loss is not None else output
-
-        return TokenClassifierOutput(
-            loss=loss,
-            logits=logits,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
+    def forward(**super_kwargs) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
+        super().forward()
 
 
 class RobertaClassificationHead(nn.Module):
@@ -830,24 +673,8 @@ def forward(self, features, **kwargs):
         return x
 
 
-@add_start_docstrings(
-    """
-    Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
-    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
-    """,
-    ROBERTA_START_DOCSTRING,
-)
-class RobertaForQuestionAnswering(RobertaPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.roberta = RobertaModel(config, add_pooling_layer=False)
-        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
+class RobertaForQuestionAnswering(BertForQuestionAnswering):
+
     @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         checkpoint="deepset/roberta-base-squad2",
@@ -856,76 +683,6 @@ def __init__(self, config):
         expected_output="' puppet'",
         expected_loss=0.86,
     )
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        token_type_ids: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        start_positions: Optional[torch.LongTensor] = None,
-        end_positions: Optional[torch.LongTensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
-        r"""
-        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
-            are not taken into account for computing the loss.
-        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
-            are not taken into account for computing the loss.
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        outputs = self.roberta(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        sequence_output = outputs[0]
-
-        logits = self.qa_outputs(sequence_output)
-        start_logits, end_logits = logits.split(1, dim=-1)
-        start_logits = start_logits.squeeze(-1).contiguous()
-        end_logits = end_logits.squeeze(-1).contiguous()
-
-        total_loss = None
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions = start_positions.clamp(0, ignored_index)
-            end_positions = end_positions.clamp(0, ignored_index)
-
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-
-        if not return_dict:
-            output = (start_logits, end_logits) + outputs[2:]
-            return ((total_loss,) + output) if total_loss is not None else output
-
-        return QuestionAnsweringModelOutput(
-            loss=total_loss,
-            start_logits=start_logits,
-            end_logits=end_logits,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
+    def forward(**super_kwargs) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
+        super().forward()
+
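For context, the diff relies on plain Python inheritance: each RoBERTa head keeps the Bert* forward() implementation and overrides only what differs. Below is a minimal, self-contained sketch of that pattern under hypothetical names (plain PyTorch stand-ins, not the real BertFor*/RobertaFor* classes or the modular converter).

import torch
from torch import nn


class BaseQAHead(nn.Module):
    """Stands in for a Bert*-style head that owns the full forward pass."""

    def __init__(self, hidden_size: int, num_labels: int = 2):
        super().__init__()
        self.backbone = nn.Linear(hidden_size, hidden_size)  # placeholder for the encoder
        self.qa_outputs = nn.Linear(hidden_size, num_labels)

    def forward(self, hidden_states: torch.Tensor):
        sequence_output = self.backbone(hidden_states)
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        return start_logits.squeeze(-1), end_logits.squeeze(-1)


class DerivedQAHead(BaseQAHead):
    """Stands in for a Roberta*-style head: new __init__, inherited forward()."""

    def __init__(self, hidden_size: int, num_labels: int = 2):
        super().__init__(hidden_size, num_labels)
        # Swap only the component that differs; forward() is reused verbatim.
        self.backbone = nn.Identity()


if __name__ == "__main__":
    head = DerivedQAHead(hidden_size=8)
    start, end = head(torch.randn(2, 5, 8))
    print(start.shape, end.shape)  # torch.Size([2, 5]) torch.Size([2, 5])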