modeling_tf_electra.py 76 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764
  1. # coding=utf-8
  2. # Copyright 2019 The Google AI Language Team Authors and The HuggingFace Inc. team.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """TF Electra model."""
  16. from __future__ import annotations
  17. import math
  18. import warnings
  19. from dataclasses import dataclass
  20. from typing import Optional, Tuple, Union
  21. import numpy as np
  22. import tensorflow as tf
  23. from ...activations_tf import get_tf_activation
  24. from ...modeling_tf_outputs import (
  25. TFBaseModelOutputWithPastAndCrossAttentions,
  26. TFMaskedLMOutput,
  27. TFMultipleChoiceModelOutput,
  28. TFQuestionAnsweringModelOutput,
  29. TFSequenceClassifierOutput,
  30. TFTokenClassifierOutput,
  31. )
  32. from ...modeling_tf_utils import (
  33. TFMaskedLanguageModelingLoss,
  34. TFModelInputType,
  35. TFMultipleChoiceLoss,
  36. TFPreTrainedModel,
  37. TFQuestionAnsweringLoss,
  38. TFSequenceClassificationLoss,
  39. TFSequenceSummary,
  40. TFTokenClassificationLoss,
  41. get_initializer,
  42. keras,
  43. keras_serializable,
  44. unpack_inputs,
  45. )
  46. from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
  47. from ...utils import (
  48. ModelOutput,
  49. add_code_sample_docstrings,
  50. add_start_docstrings,
  51. add_start_docstrings_to_model_forward,
  52. logging,
  53. replace_return_docstrings,
  54. )
  55. from .configuration_electra import ElectraConfig
  56. logger = logging.get_logger(__name__)
  57. _CHECKPOINT_FOR_DOC = "google/electra-small-discriminator"
  58. _CONFIG_FOR_DOC = "ElectraConfig"
  59. # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Electra
  60. class TFElectraSelfAttention(keras.layers.Layer):
  61. def __init__(self, config: ElectraConfig, **kwargs):
  62. super().__init__(**kwargs)
  63. if config.hidden_size % config.num_attention_heads != 0:
  64. raise ValueError(
  65. f"The hidden size ({config.hidden_size}) is not a multiple of the number "
  66. f"of attention heads ({config.num_attention_heads})"
  67. )
  68. self.num_attention_heads = config.num_attention_heads
  69. self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
  70. self.all_head_size = self.num_attention_heads * self.attention_head_size
  71. self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
  72. self.query = keras.layers.Dense(
  73. units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
  74. )
  75. self.key = keras.layers.Dense(
  76. units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
  77. )
  78. self.value = keras.layers.Dense(
  79. units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
  80. )
  81. self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
  82. self.is_decoder = config.is_decoder
  83. self.config = config
  84. def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
  85. # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
  86. tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
  87. # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
  88. return tf.transpose(tensor, perm=[0, 2, 1, 3])
  89. def call(
  90. self,
  91. hidden_states: tf.Tensor,
  92. attention_mask: tf.Tensor,
  93. head_mask: tf.Tensor,
  94. encoder_hidden_states: tf.Tensor,
  95. encoder_attention_mask: tf.Tensor,
  96. past_key_value: Tuple[tf.Tensor],
  97. output_attentions: bool,
  98. training: bool = False,
  99. ) -> Tuple[tf.Tensor]:
  100. batch_size = shape_list(hidden_states)[0]
  101. mixed_query_layer = self.query(inputs=hidden_states)
  102. # If this is instantiated as a cross-attention module, the keys
  103. # and values come from an encoder; the attention mask needs to be
  104. # such that the encoder's padding tokens are not attended to.
  105. is_cross_attention = encoder_hidden_states is not None
  106. if is_cross_attention and past_key_value is not None:
  107. # reuse k,v, cross_attentions
  108. key_layer = past_key_value[0]
  109. value_layer = past_key_value[1]
  110. attention_mask = encoder_attention_mask
  111. elif is_cross_attention:
  112. key_layer = self.transpose_for_scores(self.key(inputs=encoder_hidden_states), batch_size)
  113. value_layer = self.transpose_for_scores(self.value(inputs=encoder_hidden_states), batch_size)
  114. attention_mask = encoder_attention_mask
  115. elif past_key_value is not None:
  116. key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
  117. value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
  118. key_layer = tf.concat([past_key_value[0], key_layer], axis=2)
  119. value_layer = tf.concat([past_key_value[1], value_layer], axis=2)
  120. else:
  121. key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
  122. value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
  123. query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
  124. if self.is_decoder:
  125. # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states.
  126. # Further calls to cross_attention layer can then reuse all cross-attention
  127. # key/value_states (first "if" case)
  128. # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of
  129. # all previous decoder key/value_states. Further calls to uni-directional self-attention
  130. # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
  131. # if encoder bi-directional self-attention `past_key_value` is always `None`
  132. past_key_value = (key_layer, value_layer)
  133. # Take the dot product between "query" and "key" to get the raw attention scores.
  134. # (batch size, num_heads, seq_len_q, seq_len_k)
  135. attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
  136. dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
  137. attention_scores = tf.divide(attention_scores, dk)
  138. if attention_mask is not None:
  139. # Apply the attention mask is (precomputed for all layers in TFElectraModel call() function)
  140. attention_scores = tf.add(attention_scores, attention_mask)
  141. # Normalize the attention scores to probabilities.
  142. attention_probs = stable_softmax(logits=attention_scores, axis=-1)
  143. # This is actually dropping out entire tokens to attend to, which might
  144. # seem a bit unusual, but is taken from the original Transformer paper.
  145. attention_probs = self.dropout(inputs=attention_probs, training=training)
  146. # Mask heads if we want to
  147. if head_mask is not None:
  148. attention_probs = tf.multiply(attention_probs, head_mask)
  149. attention_output = tf.matmul(attention_probs, value_layer)
  150. attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
  151. # (batch_size, seq_len_q, all_head_size)
  152. attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size))
  153. outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
  154. if self.is_decoder:
  155. outputs = outputs + (past_key_value,)
  156. return outputs
  157. def build(self, input_shape=None):
  158. if self.built:
  159. return
  160. self.built = True
  161. if getattr(self, "query", None) is not None:
  162. with tf.name_scope(self.query.name):
  163. self.query.build([None, None, self.config.hidden_size])
  164. if getattr(self, "key", None) is not None:
  165. with tf.name_scope(self.key.name):
  166. self.key.build([None, None, self.config.hidden_size])
  167. if getattr(self, "value", None) is not None:
  168. with tf.name_scope(self.value.name):
  169. self.value.build([None, None, self.config.hidden_size])
  170. # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Electra
  171. class TFElectraSelfOutput(keras.layers.Layer):
  172. def __init__(self, config: ElectraConfig, **kwargs):
  173. super().__init__(**kwargs)
  174. self.dense = keras.layers.Dense(
  175. units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
  176. )
  177. self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
  178. self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
  179. self.config = config
  180. def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
  181. hidden_states = self.dense(inputs=hidden_states)
  182. hidden_states = self.dropout(inputs=hidden_states, training=training)
  183. hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
  184. return hidden_states
  185. def build(self, input_shape=None):
  186. if self.built:
  187. return
  188. self.built = True
  189. if getattr(self, "dense", None) is not None:
  190. with tf.name_scope(self.dense.name):
  191. self.dense.build([None, None, self.config.hidden_size])
  192. if getattr(self, "LayerNorm", None) is not None:
  193. with tf.name_scope(self.LayerNorm.name):
  194. self.LayerNorm.build([None, None, self.config.hidden_size])
  195. # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Electra
  196. class TFElectraAttention(keras.layers.Layer):
  197. def __init__(self, config: ElectraConfig, **kwargs):
  198. super().__init__(**kwargs)
  199. self.self_attention = TFElectraSelfAttention(config, name="self")
  200. self.dense_output = TFElectraSelfOutput(config, name="output")
  201. def prune_heads(self, heads):
  202. raise NotImplementedError
  203. def call(
  204. self,
  205. input_tensor: tf.Tensor,
  206. attention_mask: tf.Tensor,
  207. head_mask: tf.Tensor,
  208. encoder_hidden_states: tf.Tensor,
  209. encoder_attention_mask: tf.Tensor,
  210. past_key_value: Tuple[tf.Tensor],
  211. output_attentions: bool,
  212. training: bool = False,
  213. ) -> Tuple[tf.Tensor]:
  214. self_outputs = self.self_attention(
  215. hidden_states=input_tensor,
  216. attention_mask=attention_mask,
  217. head_mask=head_mask,
  218. encoder_hidden_states=encoder_hidden_states,
  219. encoder_attention_mask=encoder_attention_mask,
  220. past_key_value=past_key_value,
  221. output_attentions=output_attentions,
  222. training=training,
  223. )
  224. attention_output = self.dense_output(
  225. hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
  226. )
  227. # add attentions (possibly with past_key_value) if we output them
  228. outputs = (attention_output,) + self_outputs[1:]
  229. return outputs
  230. def build(self, input_shape=None):
  231. if self.built:
  232. return
  233. self.built = True
  234. if getattr(self, "self_attention", None) is not None:
  235. with tf.name_scope(self.self_attention.name):
  236. self.self_attention.build(None)
  237. if getattr(self, "dense_output", None) is not None:
  238. with tf.name_scope(self.dense_output.name):
  239. self.dense_output.build(None)
  240. # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Electra
  241. class TFElectraIntermediate(keras.layers.Layer):
  242. def __init__(self, config: ElectraConfig, **kwargs):
  243. super().__init__(**kwargs)
  244. self.dense = keras.layers.Dense(
  245. units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
  246. )
  247. if isinstance(config.hidden_act, str):
  248. self.intermediate_act_fn = get_tf_activation(config.hidden_act)
  249. else:
  250. self.intermediate_act_fn = config.hidden_act
  251. self.config = config
  252. def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
  253. hidden_states = self.dense(inputs=hidden_states)
  254. hidden_states = self.intermediate_act_fn(hidden_states)
  255. return hidden_states
  256. def build(self, input_shape=None):
  257. if self.built:
  258. return
  259. self.built = True
  260. if getattr(self, "dense", None) is not None:
  261. with tf.name_scope(self.dense.name):
  262. self.dense.build([None, None, self.config.hidden_size])
  263. # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Electra
  264. class TFElectraOutput(keras.layers.Layer):
  265. def __init__(self, config: ElectraConfig, **kwargs):
  266. super().__init__(**kwargs)
  267. self.dense = keras.layers.Dense(
  268. units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
  269. )
  270. self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
  271. self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
  272. self.config = config
  273. def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
  274. hidden_states = self.dense(inputs=hidden_states)
  275. hidden_states = self.dropout(inputs=hidden_states, training=training)
  276. hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
  277. return hidden_states
  278. def build(self, input_shape=None):
  279. if self.built:
  280. return
  281. self.built = True
  282. if getattr(self, "dense", None) is not None:
  283. with tf.name_scope(self.dense.name):
  284. self.dense.build([None, None, self.config.intermediate_size])
  285. if getattr(self, "LayerNorm", None) is not None:
  286. with tf.name_scope(self.LayerNorm.name):
  287. self.LayerNorm.build([None, None, self.config.hidden_size])
  288. # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Electra
  289. class TFElectraLayer(keras.layers.Layer):
  290. def __init__(self, config: ElectraConfig, **kwargs):
  291. super().__init__(**kwargs)
  292. self.attention = TFElectraAttention(config, name="attention")
  293. self.is_decoder = config.is_decoder
  294. self.add_cross_attention = config.add_cross_attention
  295. if self.add_cross_attention:
  296. if not self.is_decoder:
  297. raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
  298. self.crossattention = TFElectraAttention(config, name="crossattention")
  299. self.intermediate = TFElectraIntermediate(config, name="intermediate")
  300. self.bert_output = TFElectraOutput(config, name="output")
  301. def call(
  302. self,
  303. hidden_states: tf.Tensor,
  304. attention_mask: tf.Tensor,
  305. head_mask: tf.Tensor,
  306. encoder_hidden_states: tf.Tensor | None,
  307. encoder_attention_mask: tf.Tensor | None,
  308. past_key_value: Tuple[tf.Tensor] | None,
  309. output_attentions: bool,
  310. training: bool = False,
  311. ) -> Tuple[tf.Tensor]:
  312. # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
  313. self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
  314. self_attention_outputs = self.attention(
  315. input_tensor=hidden_states,
  316. attention_mask=attention_mask,
  317. head_mask=head_mask,
  318. encoder_hidden_states=None,
  319. encoder_attention_mask=None,
  320. past_key_value=self_attn_past_key_value,
  321. output_attentions=output_attentions,
  322. training=training,
  323. )
  324. attention_output = self_attention_outputs[0]
  325. # if decoder, the last output is tuple of self-attn cache
  326. if self.is_decoder:
  327. outputs = self_attention_outputs[1:-1]
  328. present_key_value = self_attention_outputs[-1]
  329. else:
  330. outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
  331. cross_attn_present_key_value = None
  332. if self.is_decoder and encoder_hidden_states is not None:
  333. if not hasattr(self, "crossattention"):
  334. raise ValueError(
  335. f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
  336. " by setting `config.add_cross_attention=True`"
  337. )
  338. # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
  339. cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
  340. cross_attention_outputs = self.crossattention(
  341. input_tensor=attention_output,
  342. attention_mask=attention_mask,
  343. head_mask=head_mask,
  344. encoder_hidden_states=encoder_hidden_states,
  345. encoder_attention_mask=encoder_attention_mask,
  346. past_key_value=cross_attn_past_key_value,
  347. output_attentions=output_attentions,
  348. training=training,
  349. )
  350. attention_output = cross_attention_outputs[0]
  351. outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
  352. # add cross-attn cache to positions 3,4 of present_key_value tuple
  353. cross_attn_present_key_value = cross_attention_outputs[-1]
  354. present_key_value = present_key_value + cross_attn_present_key_value
  355. intermediate_output = self.intermediate(hidden_states=attention_output)
  356. layer_output = self.bert_output(
  357. hidden_states=intermediate_output, input_tensor=attention_output, training=training
  358. )
  359. outputs = (layer_output,) + outputs # add attentions if we output them
  360. # if decoder, return the attn key/values as the last output
  361. if self.is_decoder:
  362. outputs = outputs + (present_key_value,)
  363. return outputs
  364. def build(self, input_shape=None):
  365. if self.built:
  366. return
  367. self.built = True
  368. if getattr(self, "attention", None) is not None:
  369. with tf.name_scope(self.attention.name):
  370. self.attention.build(None)
  371. if getattr(self, "intermediate", None) is not None:
  372. with tf.name_scope(self.intermediate.name):
  373. self.intermediate.build(None)
  374. if getattr(self, "bert_output", None) is not None:
  375. with tf.name_scope(self.bert_output.name):
  376. self.bert_output.build(None)
  377. if getattr(self, "crossattention", None) is not None:
  378. with tf.name_scope(self.crossattention.name):
  379. self.crossattention.build(None)
  380. # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Electra
  381. class TFElectraEncoder(keras.layers.Layer):
  382. def __init__(self, config: ElectraConfig, **kwargs):
  383. super().__init__(**kwargs)
  384. self.config = config
  385. self.layer = [TFElectraLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
  386. def call(
  387. self,
  388. hidden_states: tf.Tensor,
  389. attention_mask: tf.Tensor,
  390. head_mask: tf.Tensor,
  391. encoder_hidden_states: tf.Tensor | None,
  392. encoder_attention_mask: tf.Tensor | None,
  393. past_key_values: Tuple[Tuple[tf.Tensor]] | None,
  394. use_cache: Optional[bool],
  395. output_attentions: bool,
  396. output_hidden_states: bool,
  397. return_dict: bool,
  398. training: bool = False,
  399. ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
  400. all_hidden_states = () if output_hidden_states else None
  401. all_attentions = () if output_attentions else None
  402. all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
  403. next_decoder_cache = () if use_cache else None
  404. for i, layer_module in enumerate(self.layer):
  405. if output_hidden_states:
  406. all_hidden_states = all_hidden_states + (hidden_states,)
  407. past_key_value = past_key_values[i] if past_key_values is not None else None
  408. layer_outputs = layer_module(
  409. hidden_states=hidden_states,
  410. attention_mask=attention_mask,
  411. head_mask=head_mask[i],
  412. encoder_hidden_states=encoder_hidden_states,
  413. encoder_attention_mask=encoder_attention_mask,
  414. past_key_value=past_key_value,
  415. output_attentions=output_attentions,
  416. training=training,
  417. )
  418. hidden_states = layer_outputs[0]
  419. if use_cache:
  420. next_decoder_cache += (layer_outputs[-1],)
  421. if output_attentions:
  422. all_attentions = all_attentions + (layer_outputs[1],)
  423. if self.config.add_cross_attention and encoder_hidden_states is not None:
  424. all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
  425. # Add last layer
  426. if output_hidden_states:
  427. all_hidden_states = all_hidden_states + (hidden_states,)
  428. if not return_dict:
  429. return tuple(
  430. v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None
  431. )
  432. return TFBaseModelOutputWithPastAndCrossAttentions(
  433. last_hidden_state=hidden_states,
  434. past_key_values=next_decoder_cache,
  435. hidden_states=all_hidden_states,
  436. attentions=all_attentions,
  437. cross_attentions=all_cross_attentions,
  438. )
  439. def build(self, input_shape=None):
  440. if self.built:
  441. return
  442. self.built = True
  443. if getattr(self, "layer", None) is not None:
  444. for layer in self.layer:
  445. with tf.name_scope(layer.name):
  446. layer.build(None)
  447. # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Electra
  448. class TFElectraPooler(keras.layers.Layer):
  449. def __init__(self, config: ElectraConfig, **kwargs):
  450. super().__init__(**kwargs)
  451. self.dense = keras.layers.Dense(
  452. units=config.hidden_size,
  453. kernel_initializer=get_initializer(config.initializer_range),
  454. activation="tanh",
  455. name="dense",
  456. )
  457. self.config = config
  458. def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
  459. # We "pool" the model by simply taking the hidden state corresponding
  460. # to the first token.
  461. first_token_tensor = hidden_states[:, 0]
  462. pooled_output = self.dense(inputs=first_token_tensor)
  463. return pooled_output
  464. def build(self, input_shape=None):
  465. if self.built:
  466. return
  467. self.built = True
  468. if getattr(self, "dense", None) is not None:
  469. with tf.name_scope(self.dense.name):
  470. self.dense.build([None, None, self.config.hidden_size])
  471. # Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings with Albert->Electra
  472. class TFElectraEmbeddings(keras.layers.Layer):
  473. """Construct the embeddings from word, position and token_type embeddings."""
  474. def __init__(self, config: ElectraConfig, **kwargs):
  475. super().__init__(**kwargs)
  476. self.config = config
  477. self.embedding_size = config.embedding_size
  478. self.max_position_embeddings = config.max_position_embeddings
  479. self.initializer_range = config.initializer_range
  480. self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
  481. self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
  482. def build(self, input_shape=None):
  483. with tf.name_scope("word_embeddings"):
  484. self.weight = self.add_weight(
  485. name="weight",
  486. shape=[self.config.vocab_size, self.embedding_size],
  487. initializer=get_initializer(self.initializer_range),
  488. )
  489. with tf.name_scope("token_type_embeddings"):
  490. self.token_type_embeddings = self.add_weight(
  491. name="embeddings",
  492. shape=[self.config.type_vocab_size, self.embedding_size],
  493. initializer=get_initializer(self.initializer_range),
  494. )
  495. with tf.name_scope("position_embeddings"):
  496. self.position_embeddings = self.add_weight(
  497. name="embeddings",
  498. shape=[self.max_position_embeddings, self.embedding_size],
  499. initializer=get_initializer(self.initializer_range),
  500. )
  501. if self.built:
  502. return
  503. self.built = True
  504. if getattr(self, "LayerNorm", None) is not None:
  505. with tf.name_scope(self.LayerNorm.name):
  506. self.LayerNorm.build([None, None, self.config.embedding_size])
  507. # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
  def call(
      self,
      input_ids: tf.Tensor = None,
      position_ids: tf.Tensor = None,
      token_type_ids: tf.Tensor = None,
      inputs_embeds: tf.Tensor = None,
      past_key_values_length=0,
      training: bool = False,
  ) -> tf.Tensor:
      """
      Build the embedding output from token ids (or precomputed token embeddings).

      Token, position and token-type embeddings are summed, layer-normalized and passed through dropout.

      Returns:
          final_embeddings (`tf.Tensor`): output embedding tensor.
      """
      if input_ids is not None:
          # Validate the ids before using them as indices into the word embedding table.
          check_embeddings_within_bounds(input_ids, self.config.vocab_size)
          inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
      elif inputs_embeds is None:
          raise ValueError("Need to provide either `input_ids` or `input_embeds`.")

      # Every dimension except the trailing embedding dimension.
      token_dims = shape_list(inputs_embeds)[:-1]

      if token_type_ids is None:
          token_type_ids = tf.fill(dims=token_dims, value=0)

      if position_ids is None:
          # Positions continue after any cached (past) tokens.
          first_position = past_key_values_length
          end_position = token_dims[1] + past_key_values_length
          position_ids = tf.expand_dims(tf.range(start=first_position, limit=end_position), axis=0)

      position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
      token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)

      summed = inputs_embeds + position_embeds + token_type_embeds
      normalized = self.LayerNorm(inputs=summed)
      return self.dropout(inputs=normalized, training=training)
  class TFElectraDiscriminatorPredictions(keras.layers.Layer):
      """Per-token prediction head: a dense layer, the configured activation, then a single-unit projection."""

      def __init__(self, config, **kwargs):
          super().__init__(**kwargs)

          self.config = config
          self.dense = keras.layers.Dense(config.hidden_size, name="dense")
          self.dense_prediction = keras.layers.Dense(1, name="dense_prediction")

      def call(self, discriminator_hidden_states, training=False):
          """Return one logit per position; the trailing singleton dimension is squeezed away."""
          activation_fn = get_tf_activation(self.config.hidden_act)
          projected = activation_fn(self.dense(discriminator_hidden_states))
          return tf.squeeze(self.dense_prediction(projected), -1)

      def build(self, input_shape=None):
          """Build both dense sub-layers under their own name scopes (no-op once built)."""
          if self.built:
              return
          self.built = True

          # Both sub-layers consume features of width `hidden_size`.
          for attr_name in ("dense", "dense_prediction"):
              sublayer = getattr(self, attr_name, None)
              if sublayer is None:
                  continue
              with tf.name_scope(sublayer.name):
                  sublayer.build([None, None, self.config.hidden_size])
  class TFElectraGeneratorPredictions(keras.layers.Layer):
      """Generator head transform: dense projection to `embedding_size`, GELU, then layer normalization."""

      def __init__(self, config, **kwargs):
          super().__init__(**kwargs)

          self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
          self.dense = keras.layers.Dense(config.embedding_size, name="dense")
          self.config = config

      def call(self, generator_hidden_states, training=False):
          """Apply dense -> gelu -> LayerNorm to the generator hidden states."""
          projected = self.dense(generator_hidden_states)
          activated = get_tf_activation("gelu")(projected)
          return self.LayerNorm(activated)

      def build(self, input_shape=None):
          """Build the sub-layers under their own name scopes (no-op once built)."""
          if self.built:
              return
          self.built = True

          layer_norm = getattr(self, "LayerNorm", None)
          if layer_norm is not None:
              # LayerNorm runs on the dense output, which has width `embedding_size`.
              with tf.name_scope(layer_norm.name):
                  layer_norm.build([None, None, self.config.embedding_size])

          dense = getattr(self, "dense", None)
          if dense is not None:
              # The dense layer consumes encoder features of width `hidden_size`.
              with tf.name_scope(dense.name):
                  dense.build([None, None, self.config.hidden_size])
  class TFElectraPreTrainedModel(TFPreTrainedModel):
      """
      Abstract base class for the ELECTRA TensorFlow models: it handles weights initialization and offers a simple
      interface for downloading and loading pretrained models.
      """

      config_class = ElectraConfig
      base_model_prefix = "electra"

      # Key patterns to ignore when the weights are loaded from a PyTorch checkpoint.
      _keys_to_ignore_on_load_unexpected = [r"generator_lm_head.weight"]
      _keys_to_ignore_on_load_missing = [r"dropout"]
  @keras_serializable
  class TFElectraMainLayer(keras.layers.Layer):
      """
      Core ELECTRA stack: embeddings, an optional embedding projection, and the encoder.

      The `embeddings_project` dense layer only exists when `config.embedding_size` differs from
      `config.hidden_size`.
      """

      config_class = ElectraConfig

      def __init__(self, config, **kwargs):
          super().__init__(**kwargs)

          self.config = config
          self.is_decoder = config.is_decoder

          self.embeddings = TFElectraEmbeddings(config, name="embeddings")

          if config.embedding_size != config.hidden_size:
              self.embeddings_project = keras.layers.Dense(config.hidden_size, name="embeddings_project")

          self.encoder = TFElectraEncoder(config, name="encoder")

      def get_input_embeddings(self):
          """Return the embeddings layer."""
          return self.embeddings

      def set_input_embeddings(self, value):
          """Replace the word embedding matrix and update the recorded vocabulary size to its first dimension."""
          self.embeddings.weight = value
          self.embeddings.vocab_size = shape_list(value)[0]

      def _prune_heads(self, heads_to_prune):
          """
          Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
          class PreTrainedModel
          """
          raise NotImplementedError

      def get_extended_attention_mask(self, attention_mask, input_shape, dtype, past_key_values_length=0):
          """
          Turn a 2D padding mask into an additive 4D mask for the attention scores.

          Args:
              attention_mask: mask with 1 for positions to attend to and 0 for masked positions, or `None` to attend
                  to everything.
              input_shape: `(batch_size, seq_length)` of the current inputs.
              dtype: dtype of the returned mask.
              past_key_values_length: number of cached positions that precede the current inputs.

          Returns:
              A tensor holding 0.0 where attention is allowed and -10000.0 where it is masked. When the layer is a
              decoder, a causal mask is combined with the padding mask.
          """
          batch_size, seq_length = input_shape

          if attention_mask is None:
              attention_mask = tf.fill(dims=(batch_size, seq_length + past_key_values_length), value=1)

          # We create a 3D attention mask from a 2D tensor mask.
          # Sizes are [batch_size, 1, 1, to_seq_length]
          # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
          # this attention mask is more simple than the triangular masking of causal attention
          # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
          attention_mask_shape = shape_list(attention_mask)

          mask_seq_length = seq_length + past_key_values_length
          # Copied from `modeling_tf_t5.py`
          # Provided a padding mask of dimensions [batch_size, mask_seq_length]
          # - if the model is a decoder, apply a causal mask in addition to the padding mask
          # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length]
          if self.is_decoder:
              seq_ids = tf.range(mask_seq_length)
              causal_mask = tf.less_equal(
                  tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)),
                  seq_ids[None, :, None],
              )
              causal_mask = tf.cast(causal_mask, dtype=attention_mask.dtype)
              extended_attention_mask = causal_mask * attention_mask[:, None, :]
              attention_mask_shape = shape_list(extended_attention_mask)
              extended_attention_mask = tf.reshape(
                  extended_attention_mask, (attention_mask_shape[0], 1, attention_mask_shape[1], attention_mask_shape[2])
              )
              if past_key_values_length > 0:
                  # Only the rows belonging to the new (non-cached) query positions are needed.
                  extended_attention_mask = extended_attention_mask[:, :, -seq_length:, :]
          else:
              extended_attention_mask = tf.reshape(
                  attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1])
              )

          # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
          # masked positions, this operation will create a tensor which is 0.0 for
          # positions we want to attend and -10000.0 for masked positions.
          # Since we are adding it to the raw scores before the softmax, this is
          # effectively the same as removing these entirely.
          extended_attention_mask = tf.cast(extended_attention_mask, dtype=dtype)
          one_cst = tf.constant(1.0, dtype=dtype)
          ten_thousand_cst = tf.constant(-10000.0, dtype=dtype)
          extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst)

          return extended_attention_mask

      def get_head_mask(self, head_mask):
          """
          Return one `None` entry per hidden layer.

          Raises:
              NotImplementedError: if a `head_mask` is actually supplied.
          """
          if head_mask is not None:
              raise NotImplementedError
          else:
              head_mask = [None] * self.config.num_hidden_layers

          return head_mask

      @unpack_inputs
      def call(
          self,
          input_ids: TFModelInputType | None = None,
          attention_mask: np.ndarray | tf.Tensor | None = None,
          token_type_ids: np.ndarray | tf.Tensor | None = None,
          position_ids: np.ndarray | tf.Tensor | None = None,
          head_mask: np.ndarray | tf.Tensor | None = None,
          inputs_embeds: np.ndarray | tf.Tensor | None = None,
          encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
          encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
          past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
          use_cache: Optional[bool] = None,
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
          training: Optional[bool] = False,
      ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
          """
          Embed the inputs, prepare the attention masks and run the encoder.

          Returns:
              The output of `self.encoder`, unchanged.

          Raises:
              ValueError: if both or neither of `input_ids` / `inputs_embeds` are given, or if the layer is a decoder
                  and `encoder_attention_mask` is neither 2D nor 3D.
              NotImplementedError: if `head_mask` is not `None`.
          """
          if not self.config.is_decoder:
              use_cache = False

          if input_ids is not None and inputs_embeds is not None:
              raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
          elif input_ids is not None:
              input_shape = shape_list(input_ids)
          elif inputs_embeds is not None:
              input_shape = shape_list(inputs_embeds)[:-1]
          else:
              raise ValueError("You have to specify either input_ids or inputs_embeds")

          batch_size, seq_length = input_shape

          if past_key_values is None:
              past_key_values_length = 0
              past_key_values = [None] * len(self.encoder.layer)
          else:
              past_key_values_length = shape_list(past_key_values[0][0])[-2]

          if attention_mask is None:
              attention_mask = tf.fill(dims=(batch_size, seq_length + past_key_values_length), value=1)

          if token_type_ids is None:
              token_type_ids = tf.fill(dims=input_shape, value=0)

          hidden_states = self.embeddings(
              input_ids=input_ids,
              position_ids=position_ids,
              token_type_ids=token_type_ids,
              inputs_embeds=inputs_embeds,
              past_key_values_length=past_key_values_length,
              training=training,
          )
          extended_attention_mask = self.get_extended_attention_mask(
              attention_mask, input_shape, hidden_states.dtype, past_key_values_length
          )

          # Copied from `modeling_tf_t5.py` with -1e9 -> -10000
          if self.is_decoder and encoder_attention_mask is not None:
              # If a 2D or 3D attention mask is provided for the cross-attention
              # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
              encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=extended_attention_mask.dtype)
              num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
              if num_dims_encoder_attention_mask == 3:
                  encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
              elif num_dims_encoder_attention_mask == 2:
                  encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
              else:
                  # Fail with a clear message instead of an unbound-variable error further down.
                  raise ValueError(
                      "`encoder_attention_mask` must be a 2D or 3D tensor, "
                      f"but got {num_dims_encoder_attention_mask} dimensions."
                  )

              # T5 has a mask that can compare sequence ids; that comparison is not performed here. Cf.
              # https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270

              encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
          else:
              encoder_extended_attention_mask = None

          head_mask = self.get_head_mask(head_mask)

          if hasattr(self, "embeddings_project"):
              hidden_states = self.embeddings_project(hidden_states, training=training)

          hidden_states = self.encoder(
              hidden_states=hidden_states,
              attention_mask=extended_attention_mask,
              head_mask=head_mask,
              encoder_hidden_states=encoder_hidden_states,
              encoder_attention_mask=encoder_extended_attention_mask,
              past_key_values=past_key_values,
              use_cache=use_cache,
              output_attentions=output_attentions,
              output_hidden_states=output_hidden_states,
              return_dict=return_dict,
              training=training,
          )

          return hidden_states

      def build(self, input_shape=None):
          """Build the embeddings, the encoder and (when present) the projection, each under its own name scope."""
          if self.built:
              return
          self.built = True
          if getattr(self, "embeddings", None) is not None:
              with tf.name_scope(self.embeddings.name):
                  self.embeddings.build(None)
          if getattr(self, "encoder", None) is not None:
              with tf.name_scope(self.encoder.name):
                  self.encoder.build(None)
          if getattr(self, "embeddings_project", None) is not None:
              # The projection consumes embedding outputs of width `embedding_size`.
              with tf.name_scope(self.embeddings_project.name):
                  self.embeddings_project.build([None, None, self.config.embedding_size])
  @dataclass
  class TFElectraForPreTrainingOutput(ModelOutput):
      """
      Output type of [`TFElectraForPreTraining`].

      Args:
          logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
              Prediction scores of the head (scores for each token before SoftMax).
          hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
              Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
              `(batch_size, sequence_length, hidden_size)`.

              Hidden-states of the model at the output of each layer plus the initial embedding outputs.
          attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.

              Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
              heads.
      """

      logits: tf.Tensor = None
      hidden_states: Tuple[tf.Tensor] | None = None
      attentions: Tuple[tf.Tensor] | None = None
  ELECTRA_START_DOCSTRING = r"""

      This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
      library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
      etc.)

      This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
      as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
      behavior.

      <Tip>

      TensorFlow models and layers in `transformers` accept two formats as input:

      - having all inputs as keyword arguments (like PyTorch models), or
      - having all inputs as a list, tuple or dict in the first positional argument.

      The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
      and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
      pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
      format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
      the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
      positional argument:

      - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
      - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
      `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
      - a dictionary with one or several input Tensors associated to the input names given in the docstring:
      `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

      Note that when creating models and layers with
      [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
      about any of this, as you can just pass inputs like you would to any other Python function!

      </Tip>

      Parameters:
          config ([`ElectraConfig`]): Model configuration class with all the parameters of the model.
              Initializing with a config file does not load the weights associated with the model, only the
              configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
  """
  ELECTRA_INPUTS_DOCSTRING = r"""
      Args:
          input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
              Indices of input sequence tokens in the vocabulary.

              Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
              [`PreTrainedTokenizer.encode`] for details.

              [What are input IDs?](../glossary#input-ids)
          attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
              Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

              - 1 for tokens that are **not masked**,
              - 0 for tokens that are **masked**.

              [What are attention masks?](../glossary#attention-mask)
          token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
              Segment token indices used to look up the token type embeddings. Selected in the range `[0,
              config.type_vocab_size - 1]`. When omitted, every token is assigned segment `0`.
          position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
              Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
              config.max_position_embeddings - 1]`.

              [What are position IDs?](../glossary#position-ids)
          head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
              Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

              - 1 indicates the head is **not masked**,
              - 0 indicates the head is **masked**.

          inputs_embeds (`tf.Tensor` of shape `({0}, embedding_size)`, *optional*):
              Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
              is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
              model's internal embedding lookup matrix.
          output_attentions (`bool`, *optional*):
              Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
              tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
              config will be used instead.
          output_hidden_states (`bool`, *optional*):
              Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
              more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
              used instead.
          return_dict (`bool`, *optional*):
              Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
              eager mode, in graph mode the value will always be set to True.
          training (`bool`, *optional*, defaults to `False`):
              Whether or not to use the model in training mode (some modules like dropout modules have different
              behaviors between training and evaluation).
  """
  @add_start_docstrings(
      "The bare Electra Model transformer outputting raw hidden-states without any specific head on top. Identical to "
      "the BERT model except that it uses an additional linear layer between the embedding layer and the encoder if the "
      "hidden size and embedding size are different. "
      ""
      "Both the generator and discriminator checkpoints may be loaded into this model.",
      ELECTRA_START_DOCSTRING,
  )
  class TFElectraModel(TFElectraPreTrainedModel):
      def __init__(self, config, *inputs, **kwargs):
          super().__init__(config, *inputs, **kwargs)

          self.electra = TFElectraMainLayer(config, name="electra")

      @unpack_inputs
      @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
      @add_code_sample_docstrings(
          checkpoint=_CHECKPOINT_FOR_DOC,
          output_type=TFBaseModelOutputWithPastAndCrossAttentions,
          config_class=_CONFIG_FOR_DOC,
      )
      def call(
          self,
          input_ids: TFModelInputType | None = None,
          attention_mask: np.ndarray | tf.Tensor | None = None,
          token_type_ids: np.ndarray | tf.Tensor | None = None,
          position_ids: np.ndarray | tf.Tensor | None = None,
          head_mask: np.ndarray | tf.Tensor | None = None,
          inputs_embeds: np.ndarray | tf.Tensor | None = None,
          encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
          encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
          past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
          use_cache: Optional[bool] = None,
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
          training: Optional[bool] = False,
      ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
          r"""
          encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
              Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
              the model is configured as a decoder.
          encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
              Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
              the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

              - 1 for tokens that are **not masked**,
              - 0 for tokens that are **masked**.

          past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`)
              contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
              If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
              don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
              `decoder_input_ids` of shape `(batch_size, sequence_length)`.
          use_cache (`bool`, *optional*, defaults to `True`):
              If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
              `past_key_values`). Set to `False` during training, `True` during generation
          """
          # Pure delegation: the main layer does all the work and its output is returned untouched.
          return self.electra(
              input_ids=input_ids,
              attention_mask=attention_mask,
              token_type_ids=token_type_ids,
              position_ids=position_ids,
              head_mask=head_mask,
              inputs_embeds=inputs_embeds,
              encoder_hidden_states=encoder_hidden_states,
              encoder_attention_mask=encoder_attention_mask,
              past_key_values=past_key_values,
              use_cache=use_cache,
              output_attentions=output_attentions,
              output_hidden_states=output_hidden_states,
              return_dict=return_dict,
              training=training,
          )

      def build(self, input_shape=None):
          """Build the wrapped main layer under its own name scope (no-op once built)."""
          if self.built:
              return
          self.built = True

          main_layer = getattr(self, "electra", None)
          if main_layer is None:
              return
          with tf.name_scope(main_layer.name):
              main_layer.build(None)
  @add_start_docstrings(
      """
      Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.

      Even though both the discriminator and generator may be loaded into this model, the discriminator is the only model
      of the two to have the correct classification head to be used for this model.
      """,
      ELECTRA_START_DOCSTRING,
  )
  class TFElectraForPreTraining(TFElectraPreTrainedModel):
      def __init__(self, config, **kwargs):
          super().__init__(config, **kwargs)

          self.electra = TFElectraMainLayer(config, name="electra")
          self.discriminator_predictions = TFElectraDiscriminatorPredictions(config, name="discriminator_predictions")

      @unpack_inputs
      @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
      @replace_return_docstrings(output_type=TFElectraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
      def call(
          self,
          input_ids: TFModelInputType | None = None,
          attention_mask: np.ndarray | tf.Tensor | None = None,
          token_type_ids: np.ndarray | tf.Tensor | None = None,
          position_ids: np.ndarray | tf.Tensor | None = None,
          head_mask: np.ndarray | tf.Tensor | None = None,
          inputs_embeds: np.ndarray | tf.Tensor | None = None,
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
          training: Optional[bool] = False,
      ) -> Union[TFElectraForPreTrainingOutput, Tuple[tf.Tensor]]:
          r"""
          Returns:

          Examples:

          ```python
          >>> import tensorflow as tf
          >>> from transformers import AutoTokenizer, TFElectraForPreTraining

          >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")
          >>> model = TFElectraForPreTraining.from_pretrained("google/electra-small-discriminator")
          >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
          >>> outputs = model(input_ids)
          >>> scores = outputs[0]
          ```"""
          encoder_outputs = self.electra(
              input_ids=input_ids,
              attention_mask=attention_mask,
              token_type_ids=token_type_ids,
              position_ids=position_ids,
              head_mask=head_mask,
              inputs_embeds=inputs_embeds,
              output_attentions=output_attentions,
              output_hidden_states=output_hidden_states,
              return_dict=return_dict,
              training=training,
          )
          # The first element of the main-layer output is fed to the discriminator head.
          logits = self.discriminator_predictions(encoder_outputs[0])

          if return_dict:
              return TFElectraForPreTrainingOutput(
                  logits=logits,
                  hidden_states=encoder_outputs.hidden_states,
                  attentions=encoder_outputs.attentions,
              )

          # Tuple output: logits first, followed by whatever else the main layer returned.
          return (logits,) + encoder_outputs[1:]

      def build(self, input_shape=None):
          """Build the main layer and the discriminator head under their own name scopes (no-op once built)."""
          if self.built:
              return
          self.built = True

          for attr_name in ("electra", "discriminator_predictions"):
              sublayer = getattr(self, attr_name, None)
              if sublayer is None:
                  continue
              with tf.name_scope(sublayer.name):
                  sublayer.build(None)
  class TFElectraMaskedLMHead(keras.layers.Layer):
      """Language-modeling head that reuses the input embedding matrix as output projection and adds a bias."""

      def __init__(self, config, input_embeddings, **kwargs):
          super().__init__(**kwargs)

          self.config = config
          self.embedding_size = config.embedding_size
          self.input_embeddings = input_embeddings

      def build(self, input_shape):
          # One bias term per vocabulary entry, initialized to zero.
          vocab_size = self.config.vocab_size
          self.bias = self.add_weight(shape=(vocab_size,), initializer="zeros", trainable=True, name="bias")

          super().build(input_shape)

      def get_output_embeddings(self):
          """Return the shared embeddings layer used as output projection."""
          return self.input_embeddings

      def set_output_embeddings(self, value):
          """Replace the shared embedding matrix and update the recorded vocabulary size."""
          shared = self.input_embeddings
          shared.weight = value
          shared.vocab_size = shape_list(value)[0]

      def get_bias(self):
          """Return the bias wrapped in a dict keyed by `"bias"`."""
          return {"bias": self.bias}

      def set_bias(self, value):
          """Replace the bias from `value["bias"]` and keep `config.vocab_size` in sync with its length."""
          new_bias = value["bias"]
          self.bias = new_bias
          self.config.vocab_size = shape_list(new_bias)[0]

      def call(self, hidden_states):
          """Project hidden states onto the vocabulary through the transposed embedding matrix, then add the bias."""
          seq_length = shape_list(tensor=hidden_states)[1]

          # Flatten to 2D for the matmul against the embedding matrix, then restore the sequence axis.
          flat_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
          flat_logits = tf.matmul(a=flat_states, b=self.input_embeddings.weight, transpose_b=True)
          logits = tf.reshape(tensor=flat_logits, shape=[-1, seq_length, self.config.vocab_size])

          return tf.nn.bias_add(value=logits, bias=self.bias)
  @add_start_docstrings(
      """
      Electra model with a language modeling head on top.

      Even though both the discriminator and generator may be loaded into this model, the generator is the only model of
      the two to have been trained for the masked language modeling task.
      """,
      ELECTRA_START_DOCSTRING,
  )
  class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLoss):
      def __init__(self, config, **kwargs):
          super().__init__(config, **kwargs)

          self.config = config
          self.electra = TFElectraMainLayer(config, name="electra")
          self.generator_predictions = TFElectraGeneratorPredictions(config, name="generator_predictions")

          # `hidden_act` may be given either by name or directly as a callable.
          hidden_act = config.hidden_act
          self.activation = get_tf_activation(hidden_act) if isinstance(hidden_act, str) else hidden_act

          # The LM head shares its projection matrix with the input embeddings.
          self.generator_lm_head = TFElectraMaskedLMHead(config, self.electra.embeddings, name="generator_lm_head")

      def get_lm_head(self):
          """Return the language-modeling head layer."""
          return self.generator_lm_head

      def get_prefix_bias_name(self):
          """Deprecated: return the name prefix of the LM head, formed as `<model name>/<head name>`."""
          warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
          return f"{self.name}/{self.generator_lm_head.name}"

      @unpack_inputs
      @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
      @add_code_sample_docstrings(
          checkpoint="google/electra-small-generator",
          output_type=TFMaskedLMOutput,
          config_class=_CONFIG_FOR_DOC,
          mask="[MASK]",
          expected_output="'paris'",
          expected_loss=1.22,
      )
      def call(
          self,
          input_ids: TFModelInputType | None = None,
          attention_mask: np.ndarray | tf.Tensor | None = None,
          token_type_ids: np.ndarray | tf.Tensor | None = None,
          position_ids: np.ndarray | tf.Tensor | None = None,
          head_mask: np.ndarray | tf.Tensor | None = None,
          inputs_embeds: np.ndarray | tf.Tensor | None = None,
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
          labels: np.ndarray | tf.Tensor | None = None,
          training: Optional[bool] = False,
      ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
          r"""
          labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
              Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
              config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
              loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
          """
          encoder_outputs = self.electra(
              input_ids=input_ids,
              attention_mask=attention_mask,
              token_type_ids=token_type_ids,
              position_ids=position_ids,
              head_mask=head_mask,
              inputs_embeds=inputs_embeds,
              output_attentions=output_attentions,
              output_hidden_states=output_hidden_states,
              return_dict=return_dict,
              training=training,
          )
          sequence_output = encoder_outputs[0]

          # Head transform first, then projection onto the vocabulary.
          transformed = self.generator_predictions(sequence_output, training=training)
          prediction_scores = self.generator_lm_head(transformed, training=training)

          loss = None
          if labels is not None:
              loss = self.hf_compute_loss(labels, prediction_scores)

          if return_dict:
              return TFMaskedLMOutput(
                  loss=loss,
                  logits=prediction_scores,
                  hidden_states=encoder_outputs.hidden_states,
                  attentions=encoder_outputs.attentions,
              )

          # Tuple output: the loss is prepended only when it was computed.
          output = (prediction_scores,) + encoder_outputs[1:]
          if loss is None:
              return output
          return (loss,) + output

      def build(self, input_shape=None):
          """Build the main layer, the head transform and the LM head under their own name scopes (no-op once built)."""
          if self.built:
              return
          self.built = True

          for attr_name in ("electra", "generator_predictions", "generator_lm_head"):
              sublayer = getattr(self, attr_name, None)
              if sublayer is None:
                  continue
              with tf.name_scope(sublayer.name):
                  sublayer.build(None)
  class TFElectraClassificationHead(keras.layers.Layer):
      """Head for sentence-level classification tasks."""

      def __init__(self, config, **kwargs):
          super().__init__(**kwargs)

          self.dense = keras.layers.Dense(
              config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
          )
          # Use the classifier-specific dropout rate when one is configured; otherwise fall back to the
          # generic hidden dropout rate.
          classifier_dropout = (
              config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
          )
          self.dropout = keras.layers.Dropout(classifier_dropout)
          self.out_proj = keras.layers.Dense(
              config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
          )
          self.config = config

      def call(self, inputs, **kwargs):
          """
          Classify from the first token of each sequence.

          The first position is passed through dropout -> dense -> gelu -> dropout -> output projection, and the
          output of the projection (width `config.num_labels`) is returned.
          """
          x = inputs[:, 0, :]  # take <s> token (equiv. to [CLS])
          x = self.dropout(x)
          x = self.dense(x)
          x = get_tf_activation("gelu")(x)  # although BERT uses tanh here, it seems Electra authors used gelu here
          x = self.dropout(x)
          x = self.out_proj(x)

          return x

      def build(self, input_shape=None):
          """Build both dense layers under their own name scopes (no-op once built)."""
          if self.built:
              return
          self.built = True
          if getattr(self, "dense", None) is not None:
              with tf.name_scope(self.dense.name):
                  self.dense.build([None, None, self.config.hidden_size])
          if getattr(self, "out_proj", None) is not None:
              # `out_proj` consumes the output of `dense`, which has width `hidden_size`.
              with tf.name_scope(self.out_proj.name):
                  self.out_proj.build([None, None, self.config.hidden_size])
  @add_start_docstrings(
      """
      ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the
      pooled output) e.g. for GLUE tasks.
      """,
      ELECTRA_START_DOCSTRING,
  )
  class TFElectraForSequenceClassification(TFElectraPreTrainedModel, TFSequenceClassificationLoss):
      # Class-level documentation text is passed to `add_start_docstrings` above.

      def __init__(self, config, *inputs, **kwargs):
          """Create the ELECTRA main layer and the sentence-level classification head."""
          super().__init__(config, *inputs, **kwargs)
          self.num_labels = config.num_labels
          self.electra = TFElectraMainLayer(config, name="electra")
          self.classifier = TFElectraClassificationHead(config, name="classifier")

      @unpack_inputs
      @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
      @add_code_sample_docstrings(
          checkpoint="bhadresh-savani/electra-base-emotion",
          output_type=TFSequenceClassifierOutput,
          config_class=_CONFIG_FOR_DOC,
          expected_output="'joy'",
          expected_loss=0.06,
      )
      def call(
          self,
          input_ids: TFModelInputType | None = None,
          attention_mask: np.ndarray | tf.Tensor | None = None,
          token_type_ids: np.ndarray | tf.Tensor | None = None,
          position_ids: np.ndarray | tf.Tensor | None = None,
          head_mask: np.ndarray | tf.Tensor | None = None,
          inputs_embeds: np.ndarray | tf.Tensor | None = None,
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
          labels: np.ndarray | tf.Tensor | None = None,
          training: Optional[bool] = False,
      ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
          r"""
          labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
              Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
              config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
              `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
          """
          encoder_outputs = self.electra(
              input_ids=input_ids,
              attention_mask=attention_mask,
              token_type_ids=token_type_ids,
              position_ids=position_ids,
              head_mask=head_mask,
              inputs_embeds=inputs_embeds,
              output_attentions=output_attentions,
              output_hidden_states=output_hidden_states,
              return_dict=return_dict,
              training=training,
          )
          sequence_output = encoder_outputs[0]
          logits = self.classifier(sequence_output)

          # The loss is only computed when labels were supplied.
          loss = None
          if labels is not None:
              loss = self.hf_compute_loss(labels, logits)

          if return_dict:
              return TFSequenceClassifierOutput(
                  loss=loss,
                  logits=logits,
                  hidden_states=encoder_outputs.hidden_states,
                  attentions=encoder_outputs.attentions,
              )

          # Tuple form: logits, then everything the main layer returned after its
          # first element; the loss is prepended when there is one.
          tuple_output = (logits,) + encoder_outputs[1:]
          if loss is None:
              return tuple_output
          return (loss,) + tuple_output

      def build(self, input_shape=None):
          """Build `electra` and `classifier` under their own name scopes (no-op if already built)."""
          if self.built:
              return
          self.built = True
          for attr_name in ("electra", "classifier"):
              sublayer = getattr(self, attr_name, None)
              if sublayer is None:
                  continue
              with tf.name_scope(sublayer.name):
                  sublayer.build(None)
  @add_start_docstrings(
      """
      ELECTRA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
      softmax) e.g. for RocStories/SWAG tasks.
      """,
      ELECTRA_START_DOCSTRING,
  )
  class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss):
      # Class-level documentation text is passed to `add_start_docstrings` above.

      def __init__(self, config, *inputs, **kwargs):
          """Create the ELECTRA main layer, the sequence summary and the one-unit scoring layer."""
          super().__init__(config, *inputs, **kwargs)

          self.electra = TFElectraMainLayer(config, name="electra")
          self.sequence_summary = TFSequenceSummary(
              config, initializer_range=config.initializer_range, name="sequence_summary"
          )
          # A single output unit: one score per flattened (example, choice) row.
          self.classifier = keras.layers.Dense(
              1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
          )
          # Kept so that `build` can read `hidden_size` later.
          self.config = config

      @unpack_inputs
      @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
      @add_code_sample_docstrings(
          checkpoint=_CHECKPOINT_FOR_DOC,
          output_type=TFMultipleChoiceModelOutput,
          config_class=_CONFIG_FOR_DOC,
      )
      def call(
          self,
          input_ids: TFModelInputType | None = None,
          attention_mask: np.ndarray | tf.Tensor | None = None,
          token_type_ids: np.ndarray | tf.Tensor | None = None,
          position_ids: np.ndarray | tf.Tensor | None = None,
          head_mask: np.ndarray | tf.Tensor | None = None,
          inputs_embeds: np.ndarray | tf.Tensor | None = None,
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
          labels: np.ndarray | tf.Tensor | None = None,
          training: Optional[bool] = False,
      ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
          r"""
          labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
              Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
              num_choices - 1]` where `num_choices` is the size of the second dimension of the input tensors. (See
              `input_ids` above)
          """
          # Read the choice count and sequence length from whichever input was given;
          # `input_ids` takes precedence over `inputs_embeds`.
          if input_ids is not None:
              reference_shape = shape_list(input_ids)
          else:
              reference_shape = shape_list(inputs_embeds)
          num_choices = reference_shape[1]
          seq_length = reference_shape[2]

          # Merge the first two axes so each choice becomes its own row for the main layer.
          flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
          flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
          flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
          flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
          flat_inputs_embeds = (
              tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
              if inputs_embeds is not None
              else None
          )
          outputs = self.electra(
              input_ids=flat_input_ids,
              attention_mask=flat_attention_mask,
              token_type_ids=flat_token_type_ids,
              position_ids=flat_position_ids,
              head_mask=head_mask,
              inputs_embeds=flat_inputs_embeds,
              output_attentions=output_attentions,
              output_hidden_states=output_hidden_states,
              return_dict=return_dict,
              training=training,
          )
          logits = self.sequence_summary(outputs[0])
          logits = self.classifier(logits)
          # Regroup the per-row scores so that each row holds `num_choices` scores.
          reshaped_logits = tf.reshape(logits, (-1, num_choices))
          loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)

          if not return_dict:
              # Tuple form: logits, then the remaining main-layer outputs; loss first when present.
              output = (reshaped_logits,) + outputs[1:]
              return ((loss,) + output) if loss is not None else output

          return TFMultipleChoiceModelOutput(
              loss=loss,
              logits=reshaped_logits,
              hidden_states=outputs.hidden_states,
              attentions=outputs.attentions,
          )

      def build(self, input_shape=None):
          """Build `electra`, `sequence_summary` and `classifier` under their own name scopes.

          Does nothing if the model is already marked as built. `classifier` is built with
          `[None, None, config.hidden_size]`; the other two are built with `None`.
          `input_shape` is unused.
          """
          if self.built:
              return
          self.built = True
          if getattr(self, "electra", None) is not None:
              with tf.name_scope(self.electra.name):
                  self.electra.build(None)
          if getattr(self, "sequence_summary", None) is not None:
              with tf.name_scope(self.sequence_summary.name):
                  self.sequence_summary.build(None)
          if getattr(self, "classifier", None) is not None:
              with tf.name_scope(self.classifier.name):
                  self.classifier.build([None, None, self.config.hidden_size])
  @add_start_docstrings(
      """
      Electra model with a token classification head on top.

      Both the discriminator and generator may be loaded into this model.
      """,
      ELECTRA_START_DOCSTRING,
  )
  class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassificationLoss):
      # Class-level documentation text is passed to `add_start_docstrings` above.

      def __init__(self, config, **kwargs):
          """Create the ELECTRA main layer, the dropout layer and the per-token classifier."""
          super().__init__(config, **kwargs)

          self.electra = TFElectraMainLayer(config, name="electra")
          # Prefer the classifier-specific dropout rate; otherwise use the hidden dropout rate.
          if config.classifier_dropout is not None:
              dropout_rate = config.classifier_dropout
          else:
              dropout_rate = config.hidden_dropout_prob
          self.dropout = keras.layers.Dropout(dropout_rate)
          self.classifier = keras.layers.Dense(
              config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
          )
          # Kept so that `build` can read `hidden_size` later.
          self.config = config

      @unpack_inputs
      @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
      @add_code_sample_docstrings(
          checkpoint="bhadresh-savani/electra-base-discriminator-finetuned-conll03-english",
          output_type=TFTokenClassifierOutput,
          config_class=_CONFIG_FOR_DOC,
          expected_output="['B-LOC', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'I-LOC']",
          expected_loss=0.11,
      )
      def call(
          self,
          input_ids: TFModelInputType | None = None,
          attention_mask: np.ndarray | tf.Tensor | None = None,
          token_type_ids: np.ndarray | tf.Tensor | None = None,
          position_ids: np.ndarray | tf.Tensor | None = None,
          head_mask: np.ndarray | tf.Tensor | None = None,
          inputs_embeds: np.ndarray | tf.Tensor | None = None,
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
          labels: np.ndarray | tf.Tensor | None = None,
          training: Optional[bool] = False,
      ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
          r"""
          labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
              Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
          """
          encoder_outputs = self.electra(
              input_ids=input_ids,
              attention_mask=attention_mask,
              token_type_ids=token_type_ids,
              position_ids=position_ids,
              head_mask=head_mask,
              inputs_embeds=inputs_embeds,
              output_attentions=output_attentions,
              output_hidden_states=output_hidden_states,
              return_dict=return_dict,
              training=training,
          )
          # Dropout, then the dense classifier, applied to the first main-layer output.
          token_states = self.dropout(encoder_outputs[0])
          logits = self.classifier(token_states)

          # The loss is only computed when labels were supplied.
          loss = None
          if labels is not None:
              loss = self.hf_compute_loss(labels, logits)

          if return_dict:
              return TFTokenClassifierOutput(
                  loss=loss,
                  logits=logits,
                  hidden_states=encoder_outputs.hidden_states,
                  attentions=encoder_outputs.attentions,
              )

          # Tuple form: logits, then the remaining main-layer outputs; loss first when present.
          tuple_output = (logits,) + encoder_outputs[1:]
          if loss is None:
              return tuple_output
          return (loss,) + tuple_output

      def build(self, input_shape=None):
          """Build `electra` and `classifier` under their own name scopes (no-op if already built).

          `classifier` is built with `[None, None, config.hidden_size]`; `electra` with `None`.
          `input_shape` is unused.
          """
          if self.built:
              return
          self.built = True

          backbone = getattr(self, "electra", None)
          if backbone is not None:
              with tf.name_scope(backbone.name):
                  backbone.build(None)

          head = getattr(self, "classifier", None)
          if head is not None:
              with tf.name_scope(head.name):
                  head.build([None, None, self.config.hidden_size])
  @add_start_docstrings(
      """
      Electra Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
      layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
      """,
      ELECTRA_START_DOCSTRING,
  )
  class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnsweringLoss):
      # Class-level documentation text is passed to `add_start_docstrings` above.

      def __init__(self, config, *inputs, **kwargs):
          """Create the ELECTRA main layer and the dense layer that produces span logits."""
          super().__init__(config, *inputs, **kwargs)

          self.num_labels = config.num_labels
          self.electra = TFElectraMainLayer(config, name="electra")
          self.qa_outputs = keras.layers.Dense(
              config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
          )
          # Kept so that `build` can read `hidden_size` later.
          self.config = config

      @unpack_inputs
      @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
      @add_code_sample_docstrings(
          checkpoint="bhadresh-savani/electra-base-squad2",
          output_type=TFQuestionAnsweringModelOutput,
          config_class=_CONFIG_FOR_DOC,
          qa_target_start_index=11,
          qa_target_end_index=12,
          expected_output="'a nice puppet'",
          expected_loss=2.64,
      )
      def call(
          self,
          input_ids: TFModelInputType | None = None,
          attention_mask: np.ndarray | tf.Tensor | None = None,
          token_type_ids: np.ndarray | tf.Tensor | None = None,
          position_ids: np.ndarray | tf.Tensor | None = None,
          head_mask: np.ndarray | tf.Tensor | None = None,
          inputs_embeds: np.ndarray | tf.Tensor | None = None,
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
          start_positions: np.ndarray | tf.Tensor | None = None,
          end_positions: np.ndarray | tf.Tensor | None = None,
          training: Optional[bool] = False,
      ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
          r"""
          start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
              Labels for position (index) of the start of the labelled span for computing the token classification loss.
              Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
              are not taken into account for computing the loss.
          end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
              Labels for position (index) of the end of the labelled span for computing the token classification loss.
              Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
              are not taken into account for computing the loss.
          """
          encoder_outputs = self.electra(
              input_ids=input_ids,
              attention_mask=attention_mask,
              token_type_ids=token_type_ids,
              position_ids=position_ids,
              head_mask=head_mask,
              inputs_embeds=inputs_embeds,
              output_attentions=output_attentions,
              output_hidden_states=output_hidden_states,
              return_dict=return_dict,
              training=training,
          )
          span_logits = self.qa_outputs(encoder_outputs[0])
          # Split the last axis into two halves (start, end) and drop that axis from each.
          start_logits, end_logits = [
              tf.squeeze(half, axis=-1) for half in tf.split(span_logits, 2, axis=-1)
          ]

          # A loss is produced only when both span endpoints were supplied.
          loss = None
          if start_positions is not None and end_positions is not None:
              labels = {"start_position": start_positions, "end_position": end_positions}
              loss = self.hf_compute_loss(labels, (start_logits, end_logits))

          if return_dict:
              return TFQuestionAnsweringModelOutput(
                  loss=loss,
                  start_logits=start_logits,
                  end_logits=end_logits,
                  hidden_states=encoder_outputs.hidden_states,
                  attentions=encoder_outputs.attentions,
              )

          # Tuple form: both logits, then the remaining main-layer outputs; loss first when present.
          tuple_output = (start_logits, end_logits) + encoder_outputs[1:]
          if loss is None:
              return tuple_output
          return (loss,) + tuple_output

      def build(self, input_shape=None):
          """Build `electra` and `qa_outputs` under their own name scopes (no-op if already built).

          `qa_outputs` is built with `[None, None, config.hidden_size]`; `electra` with `None`.
          `input_shape` is unused.
          """
          if self.built:
              return
          self.built = True

          backbone = getattr(self, "electra", None)
          if backbone is not None:
              with tf.name_scope(backbone.name):
                  backbone.build(None)

          span_head = getattr(self, "qa_outputs", None)
          if span_head is not None:
              with tf.name_scope(span_head.name):
                  span_head.build([None, None, self.config.hidden_size])