  1. # coding=utf-8
  2. # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
  3. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. """TF 2.0 RoBERTa model."""
  17. from __future__ import annotations
  18. import math
  19. import warnings
  20. from typing import Optional, Tuple, Union
  21. import numpy as np
  22. import tensorflow as tf
  23. from ...activations_tf import get_tf_activation
  24. from ...modeling_tf_outputs import (
  25. TFBaseModelOutputWithPastAndCrossAttentions,
  26. TFBaseModelOutputWithPoolingAndCrossAttentions,
  27. TFCausalLMOutputWithCrossAttentions,
  28. TFMaskedLMOutput,
  29. TFMultipleChoiceModelOutput,
  30. TFQuestionAnsweringModelOutput,
  31. TFSequenceClassifierOutput,
  32. TFTokenClassifierOutput,
  33. )
  34. from ...modeling_tf_utils import (
  35. TFCausalLanguageModelingLoss,
  36. TFMaskedLanguageModelingLoss,
  37. TFModelInputType,
  38. TFMultipleChoiceLoss,
  39. TFPreTrainedModel,
  40. TFQuestionAnsweringLoss,
  41. TFSequenceClassificationLoss,
  42. TFTokenClassificationLoss,
  43. get_initializer,
  44. keras,
  45. keras_serializable,
  46. unpack_inputs,
  47. )
  48. from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
  49. from ...utils import (
  50. add_code_sample_docstrings,
  51. add_start_docstrings,
  52. add_start_docstrings_to_model_forward,
  53. logging,
  54. )
  55. from .configuration_roberta import RobertaConfig
  56. logger = logging.get_logger(__name__)
  57. _CHECKPOINT_FOR_DOC = "FacebookAI/roberta-base"
  58. _CONFIG_FOR_DOC = "RobertaConfig"
  59. class TFRobertaEmbeddings(keras.layers.Layer):
  60. """
  61. Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
  62. """
  63. def __init__(self, config, **kwargs):
  64. super().__init__(**kwargs)
  65. self.padding_idx = 1
  66. self.config = config
  67. self.hidden_size = config.hidden_size
  68. self.max_position_embeddings = config.max_position_embeddings
  69. self.initializer_range = config.initializer_range
  70. self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
  71. self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
  72. def build(self, input_shape=None):
  73. with tf.name_scope("word_embeddings"):
  74. self.weight = self.add_weight(
  75. name="weight",
  76. shape=[self.config.vocab_size, self.hidden_size],
  77. initializer=get_initializer(self.initializer_range),
  78. )
  79. with tf.name_scope("token_type_embeddings"):
  80. self.token_type_embeddings = self.add_weight(
  81. name="embeddings",
  82. shape=[self.config.type_vocab_size, self.hidden_size],
  83. initializer=get_initializer(self.initializer_range),
  84. )
  85. with tf.name_scope("position_embeddings"):
  86. self.position_embeddings = self.add_weight(
  87. name="embeddings",
  88. shape=[self.max_position_embeddings, self.hidden_size],
  89. initializer=get_initializer(self.initializer_range),
  90. )
  91. if self.built:
  92. return
  93. self.built = True
  94. if getattr(self, "LayerNorm", None) is not None:
  95. with tf.name_scope(self.LayerNorm.name):
  96. self.LayerNorm.build([None, None, self.config.hidden_size])
  97. def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
  98. """
  99. Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
  100. symbols are ignored. This is modified from fairseq's `utils.make_positions`.
  101. Args:
  102. input_ids: tf.Tensor
  103. Returns: tf.Tensor
  104. """
  105. mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype)
  106. incremental_indices = (tf.math.cumsum(mask, axis=1) + past_key_values_length) * mask
  107. return incremental_indices + self.padding_idx
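# Illustrative sketch (not part of the original file): how the padding-aware position ids
# computed above behave for a small batch, assuming padding_idx == 1 as set in __init__.
# >>> import tensorflow as tf
# >>> input_ids = tf.constant([[0, 31414, 232, 2, 1, 1]])  # trailing 1s are padding
# >>> mask = tf.cast(tf.math.not_equal(input_ids, 1), dtype=input_ids.dtype)
# >>> (tf.math.cumsum(mask, axis=1) + 0) * mask + 1
# <tf.Tensor: ..., numpy=array([[2, 3, 4, 5, 1, 1]], dtype=int32)>  # padded slots stay at padding_idx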
  108. def call(
  109. self,
  110. input_ids=None,
  111. position_ids=None,
  112. token_type_ids=None,
  113. inputs_embeds=None,
  114. past_key_values_length=0,
  115. training=False,
  116. ):
  117. """
  118. Applies embedding based on inputs tensor.
  119. Returns:
  120. final_embeddings (`tf.Tensor`): output embedding tensor.
  121. """
  122. assert not (input_ids is None and inputs_embeds is None)
  123. if input_ids is not None:
  124. check_embeddings_within_bounds(input_ids, self.config.vocab_size)
  125. inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
  126. input_shape = shape_list(inputs_embeds)[:-1]
  127. if token_type_ids is None:
  128. token_type_ids = tf.fill(dims=input_shape, value=0)
  129. if position_ids is None:
  130. if input_ids is not None:
  131. # Create the position ids from the input token ids. Any padded tokens remain padded.
  132. position_ids = self.create_position_ids_from_input_ids(
  133. input_ids=input_ids, past_key_values_length=past_key_values_length
  134. )
  135. else:
  136. position_ids = tf.expand_dims(
  137. tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0
  138. )
  139. position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
  140. token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
  141. final_embeddings = inputs_embeds + position_embeds + token_type_embeds
  142. final_embeddings = self.LayerNorm(inputs=final_embeddings)
  143. final_embeddings = self.dropout(inputs=final_embeddings, training=training)
  144. return final_embeddings
  145. # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Roberta
  146. class TFRobertaPooler(keras.layers.Layer):
  147. def __init__(self, config: RobertaConfig, **kwargs):
  148. super().__init__(**kwargs)
  149. self.dense = keras.layers.Dense(
  150. units=config.hidden_size,
  151. kernel_initializer=get_initializer(config.initializer_range),
  152. activation="tanh",
  153. name="dense",
  154. )
  155. self.config = config
  156. def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
  157. # We "pool" the model by simply taking the hidden state corresponding
  158. # to the first token.
  159. first_token_tensor = hidden_states[:, 0]
  160. pooled_output = self.dense(inputs=first_token_tensor)
  161. return pooled_output
  162. def build(self, input_shape=None):
  163. if self.built:
  164. return
  165. self.built = True
  166. if getattr(self, "dense", None) is not None:
  167. with tf.name_scope(self.dense.name):
  168. self.dense.build([None, None, self.config.hidden_size])
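# Shape sketch (illustrative, assuming hidden_size == 768): the pooler above keeps only the
# first token's hidden state before the tanh-activated dense projection.
# >>> import tensorflow as tf
# >>> hidden_states = tf.random.normal((2, 7, 768))  # (batch_size, seq_length, hidden_size)
# >>> hidden_states[:, 0].shape                      # first token of every sequence
# TensorShape([2, 768])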
  169. # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Roberta
  170. class TFRobertaSelfAttention(keras.layers.Layer):
  171. def __init__(self, config: RobertaConfig, **kwargs):
  172. super().__init__(**kwargs)
  173. if config.hidden_size % config.num_attention_heads != 0:
  174. raise ValueError(
  175. f"The hidden size ({config.hidden_size}) is not a multiple of the number "
  176. f"of attention heads ({config.num_attention_heads})"
  177. )
  178. self.num_attention_heads = config.num_attention_heads
  179. self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
  180. self.all_head_size = self.num_attention_heads * self.attention_head_size
  181. self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
  182. self.query = keras.layers.Dense(
  183. units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
  184. )
  185. self.key = keras.layers.Dense(
  186. units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
  187. )
  188. self.value = keras.layers.Dense(
  189. units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
  190. )
  191. self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
  192. self.is_decoder = config.is_decoder
  193. self.config = config
  194. def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
  195. # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
  196. tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
  197. # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
  198. return tf.transpose(tensor, perm=[0, 2, 1, 3])
  199. def call(
  200. self,
  201. hidden_states: tf.Tensor,
  202. attention_mask: tf.Tensor,
  203. head_mask: tf.Tensor,
  204. encoder_hidden_states: tf.Tensor,
  205. encoder_attention_mask: tf.Tensor,
  206. past_key_value: Tuple[tf.Tensor],
  207. output_attentions: bool,
  208. training: bool = False,
  209. ) -> Tuple[tf.Tensor]:
  210. batch_size = shape_list(hidden_states)[0]
  211. mixed_query_layer = self.query(inputs=hidden_states)
  212. # If this is instantiated as a cross-attention module, the keys
  213. # and values come from an encoder; the attention mask needs to be
  214. # such that the encoder's padding tokens are not attended to.
  215. is_cross_attention = encoder_hidden_states is not None
  216. if is_cross_attention and past_key_value is not None:
  217. # reuse k,v, cross_attentions
  218. key_layer = past_key_value[0]
  219. value_layer = past_key_value[1]
  220. attention_mask = encoder_attention_mask
  221. elif is_cross_attention:
  222. key_layer = self.transpose_for_scores(self.key(inputs=encoder_hidden_states), batch_size)
  223. value_layer = self.transpose_for_scores(self.value(inputs=encoder_hidden_states), batch_size)
  224. attention_mask = encoder_attention_mask
  225. elif past_key_value is not None:
  226. key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
  227. value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
  228. key_layer = tf.concat([past_key_value[0], key_layer], axis=2)
  229. value_layer = tf.concat([past_key_value[1], value_layer], axis=2)
  230. else:
  231. key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
  232. value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
  233. query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
  234. if self.is_decoder:
  235. # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states.
  236. # Further calls to cross_attention layer can then reuse all cross-attention
  237. # key/value_states (first "if" case)
  238. # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of
  239. # all previous decoder key/value_states. Further calls to uni-directional self-attention
  240. # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
  241. # if encoder bi-directional self-attention `past_key_value` is always `None`
  242. past_key_value = (key_layer, value_layer)
  243. # Take the dot product between "query" and "key" to get the raw attention scores.
  244. # (batch size, num_heads, seq_len_q, seq_len_k)
  245. attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
  246. dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
  247. attention_scores = tf.divide(attention_scores, dk)
  248. if attention_mask is not None:
249. # Apply the attention mask (precomputed for all layers in the TFRobertaModel call() function)
  250. attention_scores = tf.add(attention_scores, attention_mask)
  251. # Normalize the attention scores to probabilities.
  252. attention_probs = stable_softmax(logits=attention_scores, axis=-1)
  253. # This is actually dropping out entire tokens to attend to, which might
  254. # seem a bit unusual, but is taken from the original Transformer paper.
  255. attention_probs = self.dropout(inputs=attention_probs, training=training)
  256. # Mask heads if we want to
  257. if head_mask is not None:
  258. attention_probs = tf.multiply(attention_probs, head_mask)
  259. attention_output = tf.matmul(attention_probs, value_layer)
  260. attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
  261. # (batch_size, seq_len_q, all_head_size)
  262. attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size))
  263. outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
  264. if self.is_decoder:
  265. outputs = outputs + (past_key_value,)
  266. return outputs
  267. def build(self, input_shape=None):
  268. if self.built:
  269. return
  270. self.built = True
  271. if getattr(self, "query", None) is not None:
  272. with tf.name_scope(self.query.name):
  273. self.query.build([None, None, self.config.hidden_size])
  274. if getattr(self, "key", None) is not None:
  275. with tf.name_scope(self.key.name):
  276. self.key.build([None, None, self.config.hidden_size])
  277. if getattr(self, "value", None) is not None:
  278. with tf.name_scope(self.value.name):
  279. self.value.build([None, None, self.config.hidden_size])
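# Shape sketch (illustrative, using the roberta-base sizes hidden_size == 768 and
# num_attention_heads == 12): what transpose_for_scores does before the scaled dot-product above.
# >>> import tensorflow as tf
# >>> batch_size, seq_len, hidden, heads = 2, 5, 768, 12
# >>> x = tf.random.normal((batch_size, seq_len, hidden))
# >>> x = tf.reshape(x, (batch_size, -1, heads, hidden // heads))
# >>> tf.transpose(x, perm=[0, 2, 1, 3]).shape  # (batch_size, num_heads, seq_len, head_size)
# TensorShape([2, 12, 5, 64])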
  280. # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Roberta
  281. class TFRobertaSelfOutput(keras.layers.Layer):
  282. def __init__(self, config: RobertaConfig, **kwargs):
  283. super().__init__(**kwargs)
  284. self.dense = keras.layers.Dense(
  285. units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
  286. )
  287. self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
  288. self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
  289. self.config = config
  290. def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
  291. hidden_states = self.dense(inputs=hidden_states)
  292. hidden_states = self.dropout(inputs=hidden_states, training=training)
  293. hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
  294. return hidden_states
  295. def build(self, input_shape=None):
  296. if self.built:
  297. return
  298. self.built = True
  299. if getattr(self, "dense", None) is not None:
  300. with tf.name_scope(self.dense.name):
  301. self.dense.build([None, None, self.config.hidden_size])
  302. if getattr(self, "LayerNorm", None) is not None:
  303. with tf.name_scope(self.LayerNorm.name):
  304. self.LayerNorm.build([None, None, self.config.hidden_size])
  305. # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Roberta
  306. class TFRobertaAttention(keras.layers.Layer):
  307. def __init__(self, config: RobertaConfig, **kwargs):
  308. super().__init__(**kwargs)
  309. self.self_attention = TFRobertaSelfAttention(config, name="self")
  310. self.dense_output = TFRobertaSelfOutput(config, name="output")
  311. def prune_heads(self, heads):
  312. raise NotImplementedError
  313. def call(
  314. self,
  315. input_tensor: tf.Tensor,
  316. attention_mask: tf.Tensor,
  317. head_mask: tf.Tensor,
  318. encoder_hidden_states: tf.Tensor,
  319. encoder_attention_mask: tf.Tensor,
  320. past_key_value: Tuple[tf.Tensor],
  321. output_attentions: bool,
  322. training: bool = False,
  323. ) -> Tuple[tf.Tensor]:
  324. self_outputs = self.self_attention(
  325. hidden_states=input_tensor,
  326. attention_mask=attention_mask,
  327. head_mask=head_mask,
  328. encoder_hidden_states=encoder_hidden_states,
  329. encoder_attention_mask=encoder_attention_mask,
  330. past_key_value=past_key_value,
  331. output_attentions=output_attentions,
  332. training=training,
  333. )
  334. attention_output = self.dense_output(
  335. hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
  336. )
  337. # add attentions (possibly with past_key_value) if we output them
  338. outputs = (attention_output,) + self_outputs[1:]
  339. return outputs
  340. def build(self, input_shape=None):
  341. if self.built:
  342. return
  343. self.built = True
  344. if getattr(self, "self_attention", None) is not None:
  345. with tf.name_scope(self.self_attention.name):
  346. self.self_attention.build(None)
  347. if getattr(self, "dense_output", None) is not None:
  348. with tf.name_scope(self.dense_output.name):
  349. self.dense_output.build(None)
  350. # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Roberta
  351. class TFRobertaIntermediate(keras.layers.Layer):
  352. def __init__(self, config: RobertaConfig, **kwargs):
  353. super().__init__(**kwargs)
  354. self.dense = keras.layers.Dense(
  355. units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
  356. )
  357. if isinstance(config.hidden_act, str):
  358. self.intermediate_act_fn = get_tf_activation(config.hidden_act)
  359. else:
  360. self.intermediate_act_fn = config.hidden_act
  361. self.config = config
  362. def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
  363. hidden_states = self.dense(inputs=hidden_states)
  364. hidden_states = self.intermediate_act_fn(hidden_states)
  365. return hidden_states
  366. def build(self, input_shape=None):
  367. if self.built:
  368. return
  369. self.built = True
  370. if getattr(self, "dense", None) is not None:
  371. with tf.name_scope(self.dense.name):
  372. self.dense.build([None, None, self.config.hidden_size])
  373. # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Roberta
  374. class TFRobertaOutput(keras.layers.Layer):
  375. def __init__(self, config: RobertaConfig, **kwargs):
  376. super().__init__(**kwargs)
  377. self.dense = keras.layers.Dense(
  378. units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
  379. )
  380. self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
  381. self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
  382. self.config = config
  383. def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
  384. hidden_states = self.dense(inputs=hidden_states)
  385. hidden_states = self.dropout(inputs=hidden_states, training=training)
  386. hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
  387. return hidden_states
  388. def build(self, input_shape=None):
  389. if self.built:
  390. return
  391. self.built = True
  392. if getattr(self, "dense", None) is not None:
  393. with tf.name_scope(self.dense.name):
  394. self.dense.build([None, None, self.config.intermediate_size])
  395. if getattr(self, "LayerNorm", None) is not None:
  396. with tf.name_scope(self.LayerNorm.name):
  397. self.LayerNorm.build([None, None, self.config.hidden_size])
  398. # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Roberta
  399. class TFRobertaLayer(keras.layers.Layer):
  400. def __init__(self, config: RobertaConfig, **kwargs):
  401. super().__init__(**kwargs)
  402. self.attention = TFRobertaAttention(config, name="attention")
  403. self.is_decoder = config.is_decoder
  404. self.add_cross_attention = config.add_cross_attention
  405. if self.add_cross_attention:
  406. if not self.is_decoder:
  407. raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
  408. self.crossattention = TFRobertaAttention(config, name="crossattention")
  409. self.intermediate = TFRobertaIntermediate(config, name="intermediate")
  410. self.bert_output = TFRobertaOutput(config, name="output")
  411. def call(
  412. self,
  413. hidden_states: tf.Tensor,
  414. attention_mask: tf.Tensor,
  415. head_mask: tf.Tensor,
  416. encoder_hidden_states: tf.Tensor | None,
  417. encoder_attention_mask: tf.Tensor | None,
  418. past_key_value: Tuple[tf.Tensor] | None,
  419. output_attentions: bool,
  420. training: bool = False,
  421. ) -> Tuple[tf.Tensor]:
  422. # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
  423. self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
  424. self_attention_outputs = self.attention(
  425. input_tensor=hidden_states,
  426. attention_mask=attention_mask,
  427. head_mask=head_mask,
  428. encoder_hidden_states=None,
  429. encoder_attention_mask=None,
  430. past_key_value=self_attn_past_key_value,
  431. output_attentions=output_attentions,
  432. training=training,
  433. )
  434. attention_output = self_attention_outputs[0]
  435. # if decoder, the last output is tuple of self-attn cache
  436. if self.is_decoder:
  437. outputs = self_attention_outputs[1:-1]
  438. present_key_value = self_attention_outputs[-1]
  439. else:
  440. outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
  441. cross_attn_present_key_value = None
  442. if self.is_decoder and encoder_hidden_states is not None:
  443. if not hasattr(self, "crossattention"):
  444. raise ValueError(
  445. f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
  446. " by setting `config.add_cross_attention=True`"
  447. )
  448. # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
  449. cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
  450. cross_attention_outputs = self.crossattention(
  451. input_tensor=attention_output,
  452. attention_mask=attention_mask,
  453. head_mask=head_mask,
  454. encoder_hidden_states=encoder_hidden_states,
  455. encoder_attention_mask=encoder_attention_mask,
  456. past_key_value=cross_attn_past_key_value,
  457. output_attentions=output_attentions,
  458. training=training,
  459. )
  460. attention_output = cross_attention_outputs[0]
  461. outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
  462. # add cross-attn cache to positions 3,4 of present_key_value tuple
  463. cross_attn_present_key_value = cross_attention_outputs[-1]
  464. present_key_value = present_key_value + cross_attn_present_key_value
  465. intermediate_output = self.intermediate(hidden_states=attention_output)
  466. layer_output = self.bert_output(
  467. hidden_states=intermediate_output, input_tensor=attention_output, training=training
  468. )
  469. outputs = (layer_output,) + outputs # add attentions if we output them
  470. # if decoder, return the attn key/values as the last output
  471. if self.is_decoder:
  472. outputs = outputs + (present_key_value,)
  473. return outputs
  474. def build(self, input_shape=None):
  475. if self.built:
  476. return
  477. self.built = True
  478. if getattr(self, "attention", None) is not None:
  479. with tf.name_scope(self.attention.name):
  480. self.attention.build(None)
  481. if getattr(self, "intermediate", None) is not None:
  482. with tf.name_scope(self.intermediate.name):
  483. self.intermediate.build(None)
  484. if getattr(self, "bert_output", None) is not None:
  485. with tf.name_scope(self.bert_output.name):
  486. self.bert_output.build(None)
  487. if getattr(self, "crossattention", None) is not None:
  488. with tf.name_scope(self.crossattention.name):
  489. self.crossattention.build(None)
  490. # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Roberta
  491. class TFRobertaEncoder(keras.layers.Layer):
  492. def __init__(self, config: RobertaConfig, **kwargs):
  493. super().__init__(**kwargs)
  494. self.config = config
  495. self.layer = [TFRobertaLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
  496. def call(
  497. self,
  498. hidden_states: tf.Tensor,
  499. attention_mask: tf.Tensor,
  500. head_mask: tf.Tensor,
  501. encoder_hidden_states: tf.Tensor | None,
  502. encoder_attention_mask: tf.Tensor | None,
  503. past_key_values: Tuple[Tuple[tf.Tensor]] | None,
  504. use_cache: Optional[bool],
  505. output_attentions: bool,
  506. output_hidden_states: bool,
  507. return_dict: bool,
  508. training: bool = False,
  509. ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
  510. all_hidden_states = () if output_hidden_states else None
  511. all_attentions = () if output_attentions else None
  512. all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
  513. next_decoder_cache = () if use_cache else None
  514. for i, layer_module in enumerate(self.layer):
  515. if output_hidden_states:
  516. all_hidden_states = all_hidden_states + (hidden_states,)
  517. past_key_value = past_key_values[i] if past_key_values is not None else None
  518. layer_outputs = layer_module(
  519. hidden_states=hidden_states,
  520. attention_mask=attention_mask,
  521. head_mask=head_mask[i],
  522. encoder_hidden_states=encoder_hidden_states,
  523. encoder_attention_mask=encoder_attention_mask,
  524. past_key_value=past_key_value,
  525. output_attentions=output_attentions,
  526. training=training,
  527. )
  528. hidden_states = layer_outputs[0]
  529. if use_cache:
  530. next_decoder_cache += (layer_outputs[-1],)
  531. if output_attentions:
  532. all_attentions = all_attentions + (layer_outputs[1],)
  533. if self.config.add_cross_attention and encoder_hidden_states is not None:
  534. all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
  535. # Add last layer
  536. if output_hidden_states:
  537. all_hidden_states = all_hidden_states + (hidden_states,)
  538. if not return_dict:
  539. return tuple(
  540. v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None
  541. )
  542. return TFBaseModelOutputWithPastAndCrossAttentions(
  543. last_hidden_state=hidden_states,
  544. past_key_values=next_decoder_cache,
  545. hidden_states=all_hidden_states,
  546. attentions=all_attentions,
  547. cross_attentions=all_cross_attentions,
  548. )
  549. def build(self, input_shape=None):
  550. if self.built:
  551. return
  552. self.built = True
  553. if getattr(self, "layer", None) is not None:
  554. for layer in self.layer:
  555. with tf.name_scope(layer.name):
  556. layer.build(None)
  557. @keras_serializable
  558. class TFRobertaMainLayer(keras.layers.Layer):
  559. config_class = RobertaConfig
  560. def __init__(self, config, add_pooling_layer=True, **kwargs):
  561. super().__init__(**kwargs)
  562. self.config = config
  563. self.is_decoder = config.is_decoder
  564. self.num_hidden_layers = config.num_hidden_layers
  565. self.initializer_range = config.initializer_range
  566. self.output_attentions = config.output_attentions
  567. self.output_hidden_states = config.output_hidden_states
  568. self.return_dict = config.use_return_dict
  569. self.encoder = TFRobertaEncoder(config, name="encoder")
  570. self.pooler = TFRobertaPooler(config, name="pooler") if add_pooling_layer else None
  571. # The embeddings must be the last declaration in order to follow the weights order
  572. self.embeddings = TFRobertaEmbeddings(config, name="embeddings")
  573. # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings
  574. def get_input_embeddings(self) -> keras.layers.Layer:
  575. return self.embeddings
  576. # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings
  577. def set_input_embeddings(self, value: tf.Variable):
  578. self.embeddings.weight = value
  579. self.embeddings.vocab_size = shape_list(value)[0]
  580. # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads
  581. def _prune_heads(self, heads_to_prune):
  582. """
  583. Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
  584. class PreTrainedModel
  585. """
  586. raise NotImplementedError
  587. @unpack_inputs
  588. # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.call
  589. def call(
  590. self,
  591. input_ids: TFModelInputType | None = None,
  592. attention_mask: np.ndarray | tf.Tensor | None = None,
  593. token_type_ids: np.ndarray | tf.Tensor | None = None,
  594. position_ids: np.ndarray | tf.Tensor | None = None,
  595. head_mask: np.ndarray | tf.Tensor | None = None,
  596. inputs_embeds: np.ndarray | tf.Tensor | None = None,
  597. encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
  598. encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
  599. past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
  600. use_cache: Optional[bool] = None,
  601. output_attentions: Optional[bool] = None,
  602. output_hidden_states: Optional[bool] = None,
  603. return_dict: Optional[bool] = None,
  604. training: bool = False,
  605. ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]:
  606. if not self.config.is_decoder:
  607. use_cache = False
  608. if input_ids is not None and inputs_embeds is not None:
  609. raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
  610. elif input_ids is not None:
  611. input_shape = shape_list(input_ids)
  612. elif inputs_embeds is not None:
  613. input_shape = shape_list(inputs_embeds)[:-1]
  614. else:
  615. raise ValueError("You have to specify either input_ids or inputs_embeds")
  616. batch_size, seq_length = input_shape
  617. if past_key_values is None:
  618. past_key_values_length = 0
  619. past_key_values = [None] * len(self.encoder.layer)
  620. else:
  621. past_key_values_length = shape_list(past_key_values[0][0])[-2]
  622. if attention_mask is None:
  623. attention_mask = tf.fill(dims=(batch_size, seq_length + past_key_values_length), value=1)
  624. if token_type_ids is None:
  625. token_type_ids = tf.fill(dims=input_shape, value=0)
  626. embedding_output = self.embeddings(
  627. input_ids=input_ids,
  628. position_ids=position_ids,
  629. token_type_ids=token_type_ids,
  630. inputs_embeds=inputs_embeds,
  631. past_key_values_length=past_key_values_length,
  632. training=training,
  633. )
  634. # We create a 3D attention mask from a 2D tensor mask.
  635. # Sizes are [batch_size, 1, 1, to_seq_length]
  636. # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
637. # This attention mask is simpler than the triangular masking of causal attention
638. # used in OpenAI GPT; we just need to prepare the broadcast dimension here.
  639. attention_mask_shape = shape_list(attention_mask)
  640. mask_seq_length = seq_length + past_key_values_length
  641. # Copied from `modeling_tf_t5.py`
  642. # Provided a padding mask of dimensions [batch_size, mask_seq_length]
  643. # - if the model is a decoder, apply a causal mask in addition to the padding mask
  644. # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length]
  645. if self.is_decoder:
  646. seq_ids = tf.range(mask_seq_length)
  647. causal_mask = tf.less_equal(
  648. tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)),
  649. seq_ids[None, :, None],
  650. )
  651. causal_mask = tf.cast(causal_mask, dtype=attention_mask.dtype)
  652. extended_attention_mask = causal_mask * attention_mask[:, None, :]
  653. attention_mask_shape = shape_list(extended_attention_mask)
  654. extended_attention_mask = tf.reshape(
  655. extended_attention_mask, (attention_mask_shape[0], 1, attention_mask_shape[1], attention_mask_shape[2])
  656. )
  657. if past_key_values[0] is not None:
658. # attention_mask needs to be sliced to the shape `[batch_size, 1, from_seq_length - cached_seq_length, to_seq_length]`
  659. extended_attention_mask = extended_attention_mask[:, :, -seq_length:, :]
  660. else:
  661. extended_attention_mask = tf.reshape(
  662. attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1])
  663. )
  664. # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
  665. # masked positions, this operation will create a tensor which is 0.0 for
  666. # positions we want to attend and -10000.0 for masked positions.
  667. # Since we are adding it to the raw scores before the softmax, this is
  668. # effectively the same as removing these entirely.
  669. extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype)
  670. one_cst = tf.constant(1.0, dtype=embedding_output.dtype)
  671. ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype)
  672. extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst)
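# Illustrative sketch (not part of the original file): the additive mask built above maps a
# 2D padding mask to broadcastable biases, roughly 0.0 where attention is allowed and -10000.0 where masked.
# >>> import tensorflow as tf
# >>> attention_mask = tf.constant([[1.0, 1.0, 1.0, 0.0]])
# >>> extended = tf.reshape(attention_mask, (1, 1, 1, 4))
# >>> tf.multiply(tf.subtract(1.0, extended), -10000.0)  # ~0.0 for real tokens, -10000.0 for padding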
  673. # Copied from `modeling_tf_t5.py` with -1e9 -> -10000
  674. if self.is_decoder and encoder_attention_mask is not None:
675. # If a 2D or 3D attention mask is provided for the cross-attention,
676. # we need to make it broadcastable to
677. # [batch_size, num_heads, seq_length, seq_length]
  678. encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=extended_attention_mask.dtype)
  679. num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
  680. if num_dims_encoder_attention_mask == 3:
  681. encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
  682. if num_dims_encoder_attention_mask == 2:
  683. encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
  684. # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
  685. # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
  686. # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask,
  687. # tf.transpose(encoder_extended_attention_mask, perm=(-1, -2)))
  688. encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
  689. else:
  690. encoder_extended_attention_mask = None
  691. # Prepare head mask if needed
693. # 1.0 in head_mask indicates we keep the head
  693. # attention_probs has shape bsz x n_heads x N x N
  694. # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
  695. # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
  696. if head_mask is not None:
  697. raise NotImplementedError
  698. else:
  699. head_mask = [None] * self.config.num_hidden_layers
  700. encoder_outputs = self.encoder(
  701. hidden_states=embedding_output,
  702. attention_mask=extended_attention_mask,
  703. head_mask=head_mask,
  704. encoder_hidden_states=encoder_hidden_states,
  705. encoder_attention_mask=encoder_extended_attention_mask,
  706. past_key_values=past_key_values,
  707. use_cache=use_cache,
  708. output_attentions=output_attentions,
  709. output_hidden_states=output_hidden_states,
  710. return_dict=return_dict,
  711. training=training,
  712. )
  713. sequence_output = encoder_outputs[0]
  714. pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None
  715. if not return_dict:
  716. return (
  717. sequence_output,
  718. pooled_output,
  719. ) + encoder_outputs[1:]
  720. return TFBaseModelOutputWithPoolingAndCrossAttentions(
  721. last_hidden_state=sequence_output,
  722. pooler_output=pooled_output,
  723. past_key_values=encoder_outputs.past_key_values,
  724. hidden_states=encoder_outputs.hidden_states,
  725. attentions=encoder_outputs.attentions,
  726. cross_attentions=encoder_outputs.cross_attentions,
  727. )
  728. def build(self, input_shape=None):
  729. if self.built:
  730. return
  731. self.built = True
  732. if getattr(self, "encoder", None) is not None:
  733. with tf.name_scope(self.encoder.name):
  734. self.encoder.build(None)
  735. if getattr(self, "pooler", None) is not None:
  736. with tf.name_scope(self.pooler.name):
  737. self.pooler.build(None)
  738. if getattr(self, "embeddings", None) is not None:
  739. with tf.name_scope(self.embeddings.name):
  740. self.embeddings.build(None)
  741. class TFRobertaPreTrainedModel(TFPreTrainedModel):
  742. """
  743. An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
  744. models.
  745. """
  746. config_class = RobertaConfig
  747. base_model_prefix = "roberta"
  748. ROBERTA_START_DOCSTRING = r"""
  749. This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
750. library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
  751. etc.)
  752. This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
753. as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
  754. behavior.
  755. <Tip>
  756. TensorFlow models and layers in `transformers` accept two formats as input:
  757. - having all inputs as keyword arguments (like PyTorch models), or
  758. - having all inputs as a list, tuple or dict in the first positional argument.
  759. The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
  760. and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
  761. pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
  762. format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
  763. the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
  764. positional argument:
  765. - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
  766. - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
  767. `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
  768. - a dictionary with one or several input Tensors associated to the input names given in the docstring:
  769. `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
  770. Note that when creating models and layers with
  771. [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
  772. about any of this, as you can just pass inputs like you would to any other Python function!
  773. </Tip>
  774. Parameters:
  775. config ([`RobertaConfig`]): Model configuration class with all the parameters of the
  776. model. Initializing with a config file does not load the weights associated with the model, only the
  777. configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
  778. """
  779. ROBERTA_INPUTS_DOCSTRING = r"""
  780. Args:
  781. input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
  782. Indices of input sequence tokens in the vocabulary.
  783. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
  784. [`PreTrainedTokenizer.encode`] for details.
  785. [What are input IDs?](../glossary#input-ids)
  786. attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
  787. Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
  788. - 1 for tokens that are **not masked**,
  789. - 0 for tokens that are **masked**.
  790. [What are attention masks?](../glossary#attention-mask)
  791. token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
  792. Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
  793. 1]`:
  794. - 0 corresponds to a *sentence A* token,
  795. - 1 corresponds to a *sentence B* token.
  796. [What are token type IDs?](../glossary#token-type-ids)
  797. position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
  798. Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
  799. config.max_position_embeddings - 1]`.
  800. [What are position IDs?](../glossary#position-ids)
  801. head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
  802. Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
  803. - 1 indicates the head is **not masked**,
  804. - 0 indicates the head is **masked**.
  805. inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
  806. Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
  807. is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
  808. model's internal embedding lookup matrix.
  809. output_attentions (`bool`, *optional*):
  810. Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
  811. tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
  812. config will be used instead.
  813. output_hidden_states (`bool`, *optional*):
  814. Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
  815. more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
  816. used instead.
  817. return_dict (`bool`, *optional*):
  818. Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
  819. eager mode, in graph mode the value will always be set to True.
  820. training (`bool`, *optional*, defaults to `False`):
  821. Whether or not to use the model in training mode (some modules like dropout modules have different
  822. behaviors between training and evaluation).
  823. """
  824. @add_start_docstrings(
  825. "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
  826. ROBERTA_START_DOCSTRING,
  827. )
  828. class TFRobertaModel(TFRobertaPreTrainedModel):
  829. def __init__(self, config, *inputs, **kwargs):
  830. super().__init__(config, *inputs, **kwargs)
  831. self.roberta = TFRobertaMainLayer(config, name="roberta")
  832. @unpack_inputs
  833. @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
  834. @add_code_sample_docstrings(
  835. checkpoint=_CHECKPOINT_FOR_DOC,
  836. output_type=TFBaseModelOutputWithPoolingAndCrossAttentions,
  837. config_class=_CONFIG_FOR_DOC,
  838. )
  839. def call(
  840. self,
  841. input_ids: TFModelInputType | None = None,
  842. attention_mask: np.ndarray | tf.Tensor | None = None,
  843. token_type_ids: np.ndarray | tf.Tensor | None = None,
  844. position_ids: np.ndarray | tf.Tensor | None = None,
  845. head_mask: np.ndarray | tf.Tensor | None = None,
  846. inputs_embeds: np.ndarray | tf.Tensor | None = None,
  847. encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
  848. encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
  849. past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
  850. use_cache: Optional[bool] = None,
  851. output_attentions: Optional[bool] = None,
  852. output_hidden_states: Optional[bool] = None,
  853. return_dict: Optional[bool] = None,
  854. training: Optional[bool] = False,
  855. ) -> Union[Tuple, TFBaseModelOutputWithPoolingAndCrossAttentions]:
  856. r"""
  857. encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
  858. Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
  859. the model is configured as a decoder.
  860. encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
  861. Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
  862. the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
  863. - 1 for tokens that are **not masked**,
  864. - 0 for tokens that are **masked**.
865. past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
866. Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
  867. If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
  868. don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
  869. `decoder_input_ids` of shape `(batch_size, sequence_length)`.
  870. use_cache (`bool`, *optional*, defaults to `True`):
  871. If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
872. `past_key_values`). Set to `False` during training, `True` during generation.
  873. """
  874. outputs = self.roberta(
  875. input_ids=input_ids,
  876. attention_mask=attention_mask,
  877. token_type_ids=token_type_ids,
  878. position_ids=position_ids,
  879. head_mask=head_mask,
  880. inputs_embeds=inputs_embeds,
  881. encoder_hidden_states=encoder_hidden_states,
  882. encoder_attention_mask=encoder_attention_mask,
  883. past_key_values=past_key_values,
  884. use_cache=use_cache,
  885. output_attentions=output_attentions,
  886. output_hidden_states=output_hidden_states,
  887. return_dict=return_dict,
  888. training=training,
  889. )
  890. return outputs
  891. def build(self, input_shape=None):
  892. if self.built:
  893. return
  894. self.built = True
  895. if getattr(self, "roberta", None) is not None:
  896. with tf.name_scope(self.roberta.name):
  897. self.roberta.build(None)
  898. class TFRobertaLMHead(keras.layers.Layer):
  899. """Roberta Head for masked language modeling."""
  900. def __init__(self, config, input_embeddings, **kwargs):
  901. super().__init__(**kwargs)
  902. self.config = config
  903. self.hidden_size = config.hidden_size
  904. self.dense = keras.layers.Dense(
  905. config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
  906. )
  907. self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
  908. self.act = get_tf_activation("gelu")
  909. # The output weights are the same as the input embeddings, but there is
  910. # an output-only bias for each token.
  911. self.decoder = input_embeddings
  912. def build(self, input_shape=None):
  913. self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
  914. if self.built:
  915. return
  916. self.built = True
  917. if getattr(self, "dense", None) is not None:
  918. with tf.name_scope(self.dense.name):
  919. self.dense.build([None, None, self.config.hidden_size])
  920. if getattr(self, "layer_norm", None) is not None:
  921. with tf.name_scope(self.layer_norm.name):
  922. self.layer_norm.build([None, None, self.config.hidden_size])
  923. def get_output_embeddings(self):
  924. return self.decoder
  925. def set_output_embeddings(self, value):
  926. self.decoder.weight = value
  927. self.decoder.vocab_size = shape_list(value)[0]
  928. def get_bias(self):
  929. return {"bias": self.bias}
  930. def set_bias(self, value):
  931. self.bias = value["bias"]
  932. self.config.vocab_size = shape_list(value["bias"])[0]
  933. def call(self, hidden_states):
  934. hidden_states = self.dense(hidden_states)
  935. hidden_states = self.act(hidden_states)
  936. hidden_states = self.layer_norm(hidden_states)
  937. # project back to size of vocabulary with bias
  938. seq_length = shape_list(tensor=hidden_states)[1]
  939. hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
  940. hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
  941. hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
  942. hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
  943. return hidden_states
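# Shape sketch (illustrative; vocab_size == 50265 and hidden_size == 768 as in roberta-base):
# the projection above reuses the tied input-embedding matrix as the decoder weight.
# >>> import tensorflow as tf
# >>> hidden = tf.random.normal((2 * 4, 768))        # flattened (batch_size * seq_length, hidden_size)
# >>> tied_weight = tf.random.normal((50265, 768))   # stands in for the shared word-embedding matrix
# >>> logits = tf.matmul(hidden, tied_weight, transpose_b=True)
# >>> tf.reshape(logits, (2, 4, 50265)).shape        # (batch_size, seq_length, vocab_size)
# TensorShape([2, 4, 50265])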
  944. @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top.""", ROBERTA_START_DOCSTRING)
  945. class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLoss):
946. # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
  947. _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"]
  948. def __init__(self, config, *inputs, **kwargs):
  949. super().__init__(config, *inputs, **kwargs)
  950. self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
  951. self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head")
  952. def get_lm_head(self):
  953. return self.lm_head
  954. def get_prefix_bias_name(self):
  955. warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
  956. return self.name + "/" + self.lm_head.name
  957. @unpack_inputs
  958. @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
  959. @add_code_sample_docstrings(
  960. checkpoint=_CHECKPOINT_FOR_DOC,
  961. output_type=TFMaskedLMOutput,
  962. config_class=_CONFIG_FOR_DOC,
  963. mask="<mask>",
  964. expected_output="' Paris'",
  965. expected_loss=0.1,
  966. )
  967. def call(
  968. self,
  969. input_ids: TFModelInputType | None = None,
  970. attention_mask: np.ndarray | tf.Tensor | None = None,
  971. token_type_ids: np.ndarray | tf.Tensor | None = None,
  972. position_ids: np.ndarray | tf.Tensor | None = None,
  973. head_mask: np.ndarray | tf.Tensor | None = None,
  974. inputs_embeds: np.ndarray | tf.Tensor | None = None,
  975. output_attentions: Optional[bool] = None,
  976. output_hidden_states: Optional[bool] = None,
  977. return_dict: Optional[bool] = None,
  978. labels: np.ndarray | tf.Tensor | None = None,
  979. training: Optional[bool] = False,
  980. ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
  981. r"""
  982. labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
  983. Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
  984. config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
  985. loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
  986. """
  987. outputs = self.roberta(
  988. input_ids,
  989. attention_mask=attention_mask,
  990. token_type_ids=token_type_ids,
  991. position_ids=position_ids,
  992. head_mask=head_mask,
  993. inputs_embeds=inputs_embeds,
  994. output_attentions=output_attentions,
  995. output_hidden_states=output_hidden_states,
  996. return_dict=return_dict,
  997. training=training,
  998. )
  999. sequence_output = outputs[0]
  1000. prediction_scores = self.lm_head(sequence_output)
  1001. loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores)
  1002. if not return_dict:
  1003. output = (prediction_scores,) + outputs[2:]
  1004. return ((loss,) + output) if loss is not None else output
  1005. return TFMaskedLMOutput(
  1006. loss=loss,
  1007. logits=prediction_scores,
  1008. hidden_states=outputs.hidden_states,
  1009. attentions=outputs.attentions,
  1010. )
  1011. def build(self, input_shape=None):
  1012. if self.built:
  1013. return
  1014. self.built = True
  1015. if getattr(self, "roberta", None) is not None:
  1016. with tf.name_scope(self.roberta.name):
  1017. self.roberta.build(None)
  1018. if getattr(self, "lm_head", None) is not None:
  1019. with tf.name_scope(self.lm_head.name):
  1020. self.lm_head.build(None)
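

# Illustrative usage sketch (not part of the library API): end-to-end masked-token prediction
# with `TFRobertaForMaskedLM`. The "roberta-base" checkpoint name and the `AutoTokenizer` import
# are assumptions for the example; adapt them to your setup.
def _masked_lm_usage_sketch():
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    model = TFRobertaForMaskedLM.from_pretrained("roberta-base")
    inputs = tokenizer("The capital of France is <mask>.", return_tensors="tf")
    logits = model(**inputs).logits                                   # (batch_size, seq_len, vocab_size)
    mask_index = int(tf.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0, 0])
    predicted_id = int(tf.argmax(logits[0, mask_index]))
    return tokenizer.decode([predicted_id])                           # e.g. " Paris" for this prompt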


class TFRobertaForCausalLM(TFRobertaPreTrainedModel, TFCausalLanguageModelingLoss):
    # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"]

    def __init__(self, config: RobertaConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        if not config.is_decoder:
            logger.warning("If you want to use `TFRobertaForCausalLM` as a standalone, add `is_decoder=True`.")

        self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
        self.lm_head = TFRobertaLMHead(config, input_embeddings=self.roberta.embeddings, name="lm_head")

    def get_lm_head(self):
        return self.lm_head

    def get_prefix_bias_name(self):
        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
        return self.name + "/" + self.lm_head.name

    # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMHeadModel.prepare_inputs_for_generation
    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
        input_shape = input_ids.shape
        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
        if attention_mask is None:
            attention_mask = tf.ones(input_shape)

        # cut decoder_input_ids if past is used
        if past_key_values is not None:
            input_ids = input_ids[:, -1:]

        return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}

    @unpack_inputs
    @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFCausalLMOutputWithCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
        past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFCausalLMOutputWithCrossAttentions, Tuple[tf.Tensor]]:
        r"""
        encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*, defaults to `True`):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`). Set to `False` during training, `True` during generation.
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
            config.vocab_size - 1]`.
        """
        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        sequence_output = outputs[0]
        logits = self.lm_head(hidden_states=sequence_output, training=training)
        loss = None

        if labels is not None:
            # shift labels to the left and cut last logit token
            shifted_logits = logits[:, :-1]
            labels = labels[:, 1:]
            loss = self.hf_compute_loss(labels=labels, logits=shifted_logits)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFCausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "roberta", None) is not None:
            with tf.name_scope(self.roberta.name):
                self.roberta.build(None)
        if getattr(self, "lm_head", None) is not None:
            with tf.name_scope(self.lm_head.name):
                self.lm_head.build(None)
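

# Illustrative usage sketch (not part of the library API): next-token scoring with
# `TFRobertaForCausalLM`. RoBERTa is pretrained as an encoder, so `is_decoder=True` must be set
# explicitly, and scores from the plain "roberta-base" checkpoint (an assumption here) are not
# expected to be meaningful without fine-tuning.
def _causal_lm_usage_sketch():
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    model = TFRobertaForCausalLM.from_pretrained("roberta-base", is_decoder=True)
    inputs = tokenizer("Hello, my dog is", return_tensors="tf")
    outputs = model(**inputs, use_cache=True)
    next_token_logits = outputs.logits[:, -1, :]      # scores for the token following the prompt
    past = outputs.past_key_values                    # cache that can be fed back to speed up decoding
    return next_token_logits, past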


class TFRobertaClassificationHead(keras.layers.Layer):
    """Head for sentence-level classification tasks."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        self.dense = keras.layers.Dense(
            config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            activation="tanh",
            name="dense",
        )
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = keras.layers.Dropout(classifier_dropout)
        self.out_proj = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
        )
        self.config = config

    def call(self, features, training=False):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x, training=training)
        x = self.dense(x)
        x = self.dropout(x, training=training)
        x = self.out_proj(x)
        return x

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.config.hidden_size])
        if getattr(self, "out_proj", None) is not None:
            with tf.name_scope(self.out_proj.name):
                self.out_proj.build([None, None, self.config.hidden_size])
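

# Illustrative sketch (not part of the library API): `TFRobertaClassificationHead` classifies a
# sequence from the hidden state of its first token (`<s>`, RoBERTa's equivalent of `[CLS]`).
# The toy config values below are assumptions chosen only to keep the example small.
def _classification_head_sketch():
    config = RobertaConfig(hidden_size=32, num_labels=3, classifier_dropout=None)
    head = TFRobertaClassificationHead(config, name="classifier")
    features = tf.random.normal((2, 10, config.hidden_size))   # (batch_size, seq_len, hidden_size)
    logits = head(features)                                     # (batch_size, num_labels)
    return logits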


@add_start_docstrings(
    """
    RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """,
    ROBERTA_START_DOCSTRING,
)
class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceClassificationLoss):
    # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
        self.classifier = TFRobertaClassificationHead(config, name="classifier")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="cardiffnlp/twitter-roberta-base-emotion",
        output_type=TFSequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="'optimism'",
        expected_loss=0.08,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output, training=training)

        loss = None if labels is None else self.hf_compute_loss(labels, logits)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "roberta", None) is not None:
            with tf.name_scope(self.roberta.name):
                self.roberta.build(None)
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build(None)
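

# Illustrative usage sketch (not part of the library API): sequence classification with the
# "cardiffnlp/twitter-roberta-base-emotion" checkpoint referenced in the code-sample docstring
# above; the input sentence is made up.
def _sequence_classification_usage_sketch():
    from transformers import AutoTokenizer

    name = "cardiffnlp/twitter-roberta-base-emotion"
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = TFRobertaForSequenceClassification.from_pretrained(name)
    inputs = tokenizer("I feel great about this release!", return_tensors="tf")
    logits = model(**inputs).logits                                   # (batch_size, num_labels)
    predicted_label = model.config.id2label[int(tf.argmax(logits, axis=-1)[0])]
    return predicted_label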


@add_start_docstrings(
    """
    RoBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    ROBERTA_START_DOCSTRING,
)
class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss):
    # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
    _keys_to_ignore_on_load_unexpected = [r"lm_head"]
    _keys_to_ignore_on_load_missing = [r"dropout"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.roberta = TFRobertaMainLayer(config, name="roberta")
        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
        self.classifier = keras.layers.Dense(
            1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFMultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices - 1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        if input_ids is not None:
            num_choices = shape_list(input_ids)[1]
            seq_length = shape_list(input_ids)[2]
        else:
            num_choices = shape_list(inputs_embeds)[1]
            seq_length = shape_list(inputs_embeds)[2]

        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
        outputs = self.roberta(
            flat_input_ids,
            flat_attention_mask,
            flat_token_type_ids,
            flat_position_ids,
            head_mask,
            inputs_embeds,
            output_attentions,
            output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(pooled_output)
        reshaped_logits = tf.reshape(logits, (-1, num_choices))

        loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFMultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "roberta", None) is not None:
            with tf.name_scope(self.roberta.name):
                self.roberta.build(None)
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build([None, None, self.config.hidden_size])
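

# Illustrative usage sketch (not part of the library API): multiple-choice scoring. Inputs must
# be shaped (batch_size, num_choices, seq_len); the checkpoint name and the toy prompt/choices
# are assumptions for the example.
def _multiple_choice_usage_sketch():
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    model = TFRobertaForMultipleChoice.from_pretrained("roberta-base")
    prompt = "The glass fell off the table,"
    choices = ["so it broke.", "so it started to sing."]
    encoding = tokenizer([prompt, prompt], choices, return_tensors="tf", padding=True)
    inputs = {k: tf.expand_dims(v, 0) for k, v in encoding.items()}   # add the num_choices axis
    logits = model(**inputs).logits                                    # (batch_size, num_choices)
    best_choice = int(tf.argmax(logits, axis=-1)[0])
    return choices[best_choice]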


@add_start_docstrings(
    """
    RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    ROBERTA_START_DOCSTRING,
)
class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassificationLoss):
    # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
    _keys_to_ignore_on_load_missing = [r"dropout"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = keras.layers.Dropout(classifier_dropout)
        self.classifier = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="ydshieh/roberta-large-ner-english",
        output_type=TFTokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
        expected_loss=0.01,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output, training=training)
        logits = self.classifier(sequence_output)

        loss = None if labels is None else self.hf_compute_loss(labels, logits)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFTokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "roberta", None) is not None:
            with tf.name_scope(self.roberta.name):
                self.roberta.build(None)
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build([None, None, self.config.hidden_size])
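

# Illustrative usage sketch (not part of the library API): per-token NER tagging with the
# "ydshieh/roberta-large-ner-english" checkpoint referenced in the code-sample docstring above;
# the input sentence is made up.
def _token_classification_usage_sketch():
    from transformers import AutoTokenizer

    name = "ydshieh/roberta-large-ner-english"
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = TFRobertaForTokenClassification.from_pretrained(name)
    inputs = tokenizer("HuggingFace is based in New York City", return_tensors="tf")
    logits = model(**inputs).logits                                   # (batch_size, seq_len, num_labels)
    predicted_ids = tf.argmax(logits, axis=-1)[0]
    return [model.config.id2label[int(i)] for i in predicted_ids]     # one tag string per input token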


@add_start_docstrings(
    """
    RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    ROBERTA_START_DOCSTRING,
)
class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnsweringLoss):
    # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
        self.qa_outputs = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="ydshieh/roberta-base-squad2",
        output_type=TFQuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="' puppet'",
        expected_loss=0.86,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        start_positions: np.ndarray | tf.Tensor | None = None,
        end_positions: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
        r"""
        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = tf.split(logits, 2, axis=-1)
        start_logits = tf.squeeze(start_logits, axis=-1)
        end_logits = tf.squeeze(end_logits, axis=-1)

        loss = None
        if start_positions is not None and end_positions is not None:
            labels = {"start_position": start_positions}
            labels["end_position"] = end_positions
            loss = self.hf_compute_loss(labels, (start_logits, end_logits))

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFQuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "roberta", None) is not None:
            with tf.name_scope(self.roberta.name):
                self.roberta.build(None)
        if getattr(self, "qa_outputs", None) is not None:
            with tf.name_scope(self.qa_outputs.name):
                self.qa_outputs.build([None, None, self.config.hidden_size])
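

# Illustrative usage sketch (not part of the library API): extractive question answering with
# the "ydshieh/roberta-base-squad2" checkpoint referenced in the code-sample docstring above;
# the question/context pair is made up.
def _question_answering_usage_sketch():
    from transformers import AutoTokenizer

    name = "ydshieh/roberta-base-squad2"
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = TFRobertaForQuestionAnswering.from_pretrained(name)
    question, context = "Who was Jim Henson?", "Jim Henson was a nice puppet"
    inputs = tokenizer(question, context, return_tensors="tf")
    outputs = model(**inputs)
    start = int(tf.argmax(outputs.start_logits, axis=-1)[0])          # most likely span start index
    end = int(tf.argmax(outputs.end_logits, axis=-1)[0])              # most likely span end index
    return tokenizer.decode(inputs["input_ids"][0, start : end + 1])  # decoded answer span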