
# coding=utf-8
# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TF 2.0 Bart model."""

from __future__ import annotations

import random
from typing import Optional, Tuple, Union

import numpy as np
import tensorflow as tf

from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
    TFBaseModelOutput,
    TFBaseModelOutputWithPastAndCrossAttentions,
    TFSeq2SeqLMOutput,
    TFSeq2SeqModelOutput,
    TFSeq2SeqSequenceClassifierOutput,
)

# Public API
from ...modeling_tf_utils import (
    TFCausalLanguageModelingLoss,
    TFModelInputType,
    TFPreTrainedModel,
    TFSequenceClassificationLoss,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
    add_code_sample_docstrings,
    add_end_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_bart import BartConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "facebook/bart-large"
_CONFIG_FOR_DOC = "BartConfig"

LARGE_NEGATIVE = -1e8


def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int):
    pad_token_id = tf.cast(pad_token_id, input_ids.dtype)
    decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype)
    start_tokens = tf.fill(
        (shape_list(input_ids)[0], 1), tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype)
    )
    shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1)
    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids = tf.where(
        shifted_input_ids == -100,
        tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)),
        shifted_input_ids,
    )
    # Verify that `labels` has only positive values and -100
    assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=input_ids.dtype))
    # Make sure the assertion op is called by wrapping the result in an identity no-op
    with tf.control_dependencies([assert_gte0]):
        shifted_input_ids = tf.identity(shifted_input_ids)
    return shifted_input_ids
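
# shift_tokens_right example: with pad_token_id=1 and decoder_start_token_id=2, labels
# [[5, 6, 7]] become decoder inputs [[2, 5, 6]]: the sequence is shifted right, the
# start token is prepended, and any -100 (ignored-label) positions become pad_token_id.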

def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0):
    """
    Make causal mask used for uni-directional (decoder) self-attention.
    """
    bsz = input_ids_shape[0]
    tgt_len = input_ids_shape[1]
    mask = tf.ones((tgt_len, tgt_len)) * LARGE_NEGATIVE
    mask_cond = tf.range(shape_list(mask)[-1])
    mask = tf.where(mask_cond < tf.reshape(mask_cond + 1, (shape_list(mask)[-1], 1)), 0.0, mask)

    if past_key_values_length > 0:
        mask = tf.concat([tf.zeros((tgt_len, past_key_values_length)), mask], axis=-1)

    return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1))
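
# _make_causal_mask example: for tgt_len=3 and no cached past, the (tgt_len, tgt_len)
# mask before tiling is
#   [[0, -1e8, -1e8],
#    [0,    0, -1e8],
#    [0,    0,    0]]
# so position i can only attend to positions <= i; any cached prefix of length
# past_key_values_length is always visible (all zeros).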

def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    src_len = shape_list(mask)[1]
    tgt_len = tgt_len if tgt_len is not None else src_len
    one_cst = tf.constant(1.0)
    mask = tf.cast(mask, dtype=one_cst.dtype)
    expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))

    return (one_cst - expanded_mask) * LARGE_NEGATIVE
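
# In the expanded mask, a 1 in the `[bsz, seq_len]` padding mask maps to 0.0 (position
# attended to) and a 0 maps to LARGE_NEGATIVE (position blocked), so the result can be
# added directly to the raw attention scores.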

class TFBartLearnedPositionalEmbedding(keras.layers.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
        # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2
        # and adjust num_embeddings appropriately. Other models don't have this hack
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim, **kwargs)

    def call(
        self,
        input_shape: Optional[tf.TensorShape] = None,
        past_key_values_length: int = 0,
        position_ids: tf.Tensor | None = None,
    ):
        """Input is expected to be of size [bsz x seqlen]."""
        if position_ids is None:
            seq_len = input_shape[1]
            position_ids = tf.range(seq_len, delta=1, name="range")
            position_ids += past_key_values_length

        offset_dtype = position_ids.dtype if isinstance(position_ids, tf.Tensor) else tf.int32
        return super().call(position_ids + tf.constant(self.offset, dtype=offset_dtype))
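
# Because of the offset, position i is looked up at row i + 2 of the embedding table,
# which is why the table is allocated with num_embeddings + 2 rows in __init__ above.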

class TFBartAttention(keras.layers.Layer):
    """Multi-headed attention from "Attention Is All You Need"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = keras.layers.Dropout(dropout)
        self.head_dim = embed_dim // num_heads
        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
        self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
        self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
        self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")

    def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
        return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
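
    # _shape reshapes a projected tensor from (bsz, seq_len, embed_dim) to the
    # per-head layout (bsz, num_heads, seq_len, head_dim) used throughout call().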

    def call(
        self,
        hidden_states: tf.Tensor,
        key_value_states: tf.Tensor | None = None,
        past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
        attention_mask: tf.Tensor | None = None,
        layer_head_mask: tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Tuple[tf.Tensor, tf.Tensor | None, Tuple[tf.Tensor, tf.Tensor] | None]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None
        bsz, tgt_len, embed_dim = shape_list(hidden_states)

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling
        # get key, value proj
        if is_cross_attention and past_key_value is not None:
            # reuse k, v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = tf.concat([past_key_value[0], key_states], axis=2)
            value_states = tf.concat([past_key_value[1], value_states], axis=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states.
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of
            # all previous decoder key/value_states. Further calls to uni-directional self-attention
            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
            past_key_value = (key_states, value_states)
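
        # Each cached tensor has shape (bsz, num_heads, seq_len, head_dim); during
        # incremental decoding the self-attention cache grows by one step along axis 2.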

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = tf.reshape(self._shape(query_states, tgt_len, bsz), proj_shape)
        key_states = tf.reshape(key_states, proj_shape)
        value_states = tf.reshape(value_states, proj_shape)

        src_len = shape_list(key_states)[1]
        attn_weights = tf.matmul(query_states, key_states, transpose_b=True)

        tf.debugging.assert_equal(
            shape_list(attn_weights),
            [bsz * self.num_heads, tgt_len, src_len],
            message=(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {shape_list(attn_weights)}"
            ),
        )

        if attention_mask is not None:
            tf.debugging.assert_equal(
                shape_list(attention_mask),
                [bsz, 1, tgt_len, src_len],
                message=(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                    f" {shape_list(attention_mask)}"
                ),
            )

            attention_mask = tf.cast(attention_mask, dtype=attn_weights.dtype)
            attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask
            attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))

        attn_weights = stable_softmax(attn_weights, axis=-1)

        if layer_head_mask is not None:
            tf.debugging.assert_equal(
                shape_list(layer_head_mask),
                [self.num_heads],
                message=(
                    f"Head mask for a single layer should be of size {(self.num_heads)}, but is"
                    f" {shape_list(layer_head_mask)}"
                ),
            )

            attn_weights = tf.reshape(layer_head_mask, (1, -1, 1, 1)) * tf.reshape(
                attn_weights, (bsz, self.num_heads, tgt_len, src_len)
            )
            attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))

        attn_probs = self.dropout(attn_weights, training=training)
        attn_output = tf.matmul(attn_probs, value_states)

        tf.debugging.assert_equal(
            shape_list(attn_output),
            [bsz * self.num_heads, tgt_len, self.head_dim],
            message=(
                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {shape_list(attn_output)}"
            ),
        )

        attn_output = tf.transpose(
            tf.reshape(attn_output, (bsz, self.num_heads, tgt_len, self.head_dim)), (0, 2, 1, 3)
        )
        attn_output = tf.reshape(attn_output, (bsz, tgt_len, embed_dim))

        attn_output = self.out_proj(attn_output)
        attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len))

        return attn_output, attn_weights, past_key_value

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "k_proj", None) is not None:
            with tf.name_scope(self.k_proj.name):
                self.k_proj.build([None, None, self.embed_dim])
        if getattr(self, "q_proj", None) is not None:
            with tf.name_scope(self.q_proj.name):
                self.q_proj.build([None, None, self.embed_dim])
        if getattr(self, "v_proj", None) is not None:
            with tf.name_scope(self.v_proj.name):
                self.v_proj.build([None, None, self.embed_dim])
        if getattr(self, "out_proj", None) is not None:
            with tf.name_scope(self.out_proj.name):
                self.out_proj.build([None, None, self.embed_dim])


class TFBartEncoderLayer(keras.layers.Layer):
    def __init__(self, config: BartConfig, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = config.d_model
        self.self_attn = TFBartAttention(
            self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
        )
        self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
        self.dropout = keras.layers.Dropout(config.dropout)
        self.activation_fn = get_tf_activation(config.activation_function)
        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
        self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
        self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
        self.config = config

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: np.ndarray | tf.Tensor | None,
        layer_head_mask: tf.Tensor | None,
        training: Optional[bool] = False,
    ) -> Tuple[tf.Tensor, tf.Tensor]:
        """
        Args:
            hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`tf.Tensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`
        """
        residual = hidden_states
        hidden_states, self_attn_weights, _ = self.self_attn(
            hidden_states=hidden_states, attention_mask=attention_mask, layer_head_mask=layer_head_mask
        )

        tf.debugging.assert_equal(
            shape_list(hidden_states),
            shape_list(residual),
            message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}",
        )

        hidden_states = self.dropout(hidden_states, training=training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)
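
        # Post-LN ordering: LayerNorm is applied after the residual addition, for
        # both the attention and the feed-forward sub-blocks.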
        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = self.activation_dropout(hidden_states, training=training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = self.dropout(hidden_states, training=training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        return hidden_states, self_attn_weights

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "self_attn", None) is not None:
            with tf.name_scope(self.self_attn.name):
                self.self_attn.build(None)
        if getattr(self, "self_attn_layer_norm", None) is not None:
            with tf.name_scope(self.self_attn_layer_norm.name):
                self.self_attn_layer_norm.build([None, None, self.embed_dim])
        if getattr(self, "fc1", None) is not None:
            with tf.name_scope(self.fc1.name):
                self.fc1.build([None, None, self.embed_dim])
        if getattr(self, "fc2", None) is not None:
            with tf.name_scope(self.fc2.name):
                self.fc2.build([None, None, self.config.encoder_ffn_dim])
        if getattr(self, "final_layer_norm", None) is not None:
            with tf.name_scope(self.final_layer_norm.name):
                self.final_layer_norm.build([None, None, self.embed_dim])


class TFBartDecoderLayer(keras.layers.Layer):
    def __init__(self, config: BartConfig, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = config.d_model
        self.self_attn = TFBartAttention(
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            name="self_attn",
            is_decoder=True,
        )
        self.dropout = keras.layers.Dropout(config.dropout)
        self.activation_fn = get_tf_activation(config.activation_function)
        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)

        self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
        self.encoder_attn = TFBartAttention(
            self.embed_dim,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            name="encoder_attn",
            is_decoder=True,
        )
        self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
        self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
        self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
        self.config = config

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
        layer_head_mask: tf.Tensor | None = None,
        cross_attn_layer_head_mask: tf.Tensor | None = None,
        past_key_value: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
        training: Optional[bool] = False,
    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]:
        """
        Args:
            hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`tf.Tensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`tf.Tensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`tf.Tensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
                `(decoder_attention_heads,)`
            cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module of size
                `(decoder_attention_heads,)`
            past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states
        """
        residual = hidden_states

        # Self Attention
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # add present self-attn cache to positions 1,2 of present_key_value tuple
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=self_attn_past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
        )
        hidden_states = self.dropout(hidden_states, training=training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Cross-Attention Block
        cross_attn_present_key_value = None
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states

            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=cross_attn_past_key_value,
            )
            hidden_states = self.dropout(hidden_states, training=training)
            hidden_states = residual + hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)

            # add cross-attn to positions 3,4 of present_key_value tuple
            present_key_value = present_key_value + cross_attn_present_key_value

        # Fully Connected
        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = self.activation_dropout(hidden_states, training=training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = self.dropout(hidden_states, training=training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        return (
            hidden_states,
            self_attn_weights,
            cross_attn_weights,
            present_key_value,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "self_attn", None) is not None:
            with tf.name_scope(self.self_attn.name):
                self.self_attn.build(None)
        if getattr(self, "self_attn_layer_norm", None) is not None:
            with tf.name_scope(self.self_attn_layer_norm.name):
                self.self_attn_layer_norm.build([None, None, self.embed_dim])
        if getattr(self, "encoder_attn", None) is not None:
            with tf.name_scope(self.encoder_attn.name):
                self.encoder_attn.build(None)
        if getattr(self, "encoder_attn_layer_norm", None) is not None:
            with tf.name_scope(self.encoder_attn_layer_norm.name):
                self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
        if getattr(self, "fc1", None) is not None:
            with tf.name_scope(self.fc1.name):
                self.fc1.build([None, None, self.embed_dim])
        if getattr(self, "fc2", None) is not None:
            with tf.name_scope(self.fc2.name):
                self.fc2.build([None, None, self.config.decoder_ffn_dim])
        if getattr(self, "final_layer_norm", None) is not None:
            with tf.name_scope(self.final_layer_norm.name):
                self.final_layer_norm.build([None, None, self.embed_dim])


class TFBartClassificationHead(keras.layers.Layer):
    """Head for sentence-level classification tasks."""

    def __init__(self, inner_dim: int, num_classes: int, pooler_dropout: float, name: str, **kwargs):
        super().__init__(name=name, **kwargs)
        self.dense = keras.layers.Dense(inner_dim, name="dense")
        self.dropout = keras.layers.Dropout(pooler_dropout)
        self.out_proj = keras.layers.Dense(num_classes, name="out_proj")
        self.input_dim = inner_dim
        self.inner_dim = inner_dim

    def call(self, inputs):
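        # dropout -> dense -> tanh -> dropout -> out_proj, mirroring the pooling
        # head used by fairseq's BART classification head.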
        hidden_states = self.dropout(inputs)
        hidden_states = self.dense(hidden_states)
        hidden_states = keras.activations.tanh(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.out_proj(hidden_states)
        return hidden_states

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.input_dim])
        if getattr(self, "out_proj", None) is not None:
            with tf.name_scope(self.out_proj.name):
                self.out_proj.build([None, None, self.inner_dim])


class TFBartPretrainedModel(TFPreTrainedModel):
    config_class = BartConfig
    base_model_prefix = "model"

    @property
    def dummy_inputs(self):
        dummy_inputs = super().dummy_inputs
        # Dummy inputs should not contain the default val of 1
        # as this is the padding token and some assertions check it
        dummy_inputs["input_ids"] = dummy_inputs["input_ids"] * 2
        if "decoder_input_ids" in dummy_inputs:
            dummy_inputs["decoder_input_ids"] = dummy_inputs["decoder_input_ids"] * 2
        return dummy_inputs

    def tf_to_pt_weight_rename(self, tf_weight):
        if tf_weight == "model.shared.weight":
            return tf_weight, "model.decoder.embed_tokens.weight"
        else:
            return (tf_weight,)
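
    # Note (assumed rationale): the encoder/decoder token embeddings are tied to
    # `model.shared`, and some PyTorch checkpoints serialize that weight only under
    # `model.decoder.embed_tokens.weight`, so both names are returned as candidates
    # when matching weights across frameworks.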


BART_START_DOCSTRING = r"""
    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
      `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
      `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Args:
        config ([`BartConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""


BART_GENERATION_EXAMPLE = r"""
    Summarization example:

    ```python
    >>> from transformers import AutoTokenizer, TFBartForConditionalGeneration

    >>> model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large")
    >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")

    >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
    >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="tf")

    >>> # Generate Summary
    >>> summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=5)
    >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
    ```

    Mask filling example:

    ```python
    >>> from transformers import AutoTokenizer, TFBartForConditionalGeneration

    >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")
    >>> TXT = "My friends are <mask> but they eat too many carbs."

    >>> model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large")
    >>> input_ids = tokenizer([TXT], return_tensors="tf")["input_ids"]
    >>> logits = model(input_ids).logits
    >>> probs = tf.nn.softmax(logits[0])
    >>> # probs[5] is associated with the mask token
    ```
"""


BART_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`tf.Tensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the
            right for denoising pre-training following the paper.
        decoder_attention_mask (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            If not provided, a mask will be created by default that ignores pad tokens. Setting this yourself is not
            recommended for most use cases.
        decoder_position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
            range `[0, config.max_position_embeddings - 1]`.
        head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        decoder_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        encoder_outputs (`tf.FloatTensor`, *optional*):
            Sequence of hidden states at the output of the last layer of the encoder, of shape `(batch_size,
            sequence_length, hidden_size)`. Used in the cross-attention of the decoder.
        past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up
            decoding. If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
            instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*, defaults to `True`):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
            (see `past_key_values`). Set to `False` during training, `True` during generation.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
            config will be used instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to `True`.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
"""


@keras_serializable
class TFBartEncoder(keras.layers.Layer):
    config_class = BartConfig
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`TFBartEncoderLayer`].

    Args:
        config: BartConfig
    """

    def __init__(self, config: BartConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.dropout = keras.layers.Dropout(config.dropout)
        self.layerdrop = config.encoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_position_embeddings
        self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0

        self.embed_tokens = embed_tokens
        self.embed_positions = TFBartLearnedPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
            name="embed_positions",
        )
        self.layers = [TFBartEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
        self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
        self.embed_dim = config.d_model

    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        """
        Args:
            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.
            inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
                representation. This is useful if you want more control over how to convert `input_ids` indices into
                associated vectors than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = shape_list(input_ids)
        elif inputs_embeds is not None:
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        embed_pos = self.embed_positions(input_shape)
        hidden_states = inputs_embeds + embed_pos
        hidden_states = self.layernorm_embedding(hidden_states)
        hidden_states = self.dropout(hidden_states, training=training)

        # check attention mask and invert
        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _expand_mask(attention_mask)
        else:
            attention_mask = None

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # check if head_mask has a correct number of layers specified if desired
        if head_mask is not None:
            tf.debugging.assert_equal(
                shape_list(head_mask)[0],
                len(self.layers),
                message=(
                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
                    f" {shape_list(head_mask)[0]}."
                ),
            )

        # encoder layers
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = random.uniform(0, 1)
            if training and (dropout_probability < self.layerdrop):  # skip the layer
                continue

            hidden_states, attn = encoder_layer(
                hidden_states,
                attention_mask,
                head_mask[idx] if head_mask is not None else None,
            )

            if output_attentions:
                all_attentions += (attn,)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return TFBaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "embed_positions", None) is not None:
            with tf.name_scope(self.embed_positions.name):
                self.embed_positions.build(None)
        if getattr(self, "layernorm_embedding", None) is not None:
            with tf.name_scope(self.layernorm_embedding.name):
                self.layernorm_embedding.build([None, None, self.embed_dim])
        if getattr(self, "layers", None) is not None:
            for layer in self.layers:
                with tf.name_scope(layer.name):
                    layer.build(None)


@keras_serializable
class TFBartDecoder(keras.layers.Layer):
    config_class = BartConfig
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBartDecoderLayer`]

    Args:
        config: BartConfig
        embed_tokens: output embedding
    """

    def __init__(self, config: BartConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.padding_idx = config.pad_token_id
        self.embed_tokens = embed_tokens
        self.layerdrop = config.decoder_layerdrop
        self.embed_positions = TFBartLearnedPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
            name="embed_positions",
        )
        self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
        self.layers = [TFBartDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
        self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")

        self.dropout = keras.layers.Dropout(config.dropout)

    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        cross_attn_head_mask: np.ndarray | tf.Tensor | None = None,
        past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
        r"""
        Args:
            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in
                the range `[0, config.max_position_embeddings - 1]`.
            encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the
                cross-attention of the decoder.
            encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.
            cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.
            past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
                Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
                decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
                (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
                instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
                representation. This is useful if you want more control over how to convert `input_ids` indices into
                associated vectors than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = shape_list(input_ids)
        elif inputs_embeds is not None:
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        past_key_values_length = shape_list(past_key_values[0][0])[2] if past_key_values is not None else 0
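        # past_key_values[0][0] is the first layer's cached self-attention key of shape
        # (bsz, num_heads, past_len, head_dim), so index 2 is the length already cached.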

        # embed positions
        if position_ids is None:
            positions = self.embed_positions(input_shape, past_key_values_length)
        else:
            positions = self.embed_positions(input_shape, position_ids=position_ids)

        if inputs_embeds is None:
            check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        hidden_states = inputs_embeds

        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        if input_shape[-1] > 1:
            combined_attention_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length)
        else:
            combined_attention_mask = _expand_mask(
                tf.ones((input_shape[0], input_shape[1] + past_key_values_length)), tgt_len=input_shape[-1]
            )

        if attention_mask is not None:
            combined_attention_mask = combined_attention_mask + _expand_mask(attention_mask, tgt_len=input_shape[-1])
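        # Both masks are additive (0.0 where attention is allowed, LARGE_NEGATIVE where
        # blocked), so summing them combines the causal and the padding constraints.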

        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _expand_mask(encoder_attention_mask, tgt_len=input_shape[-1])

        hidden_states = self.layernorm_embedding(hidden_states + positions)
        hidden_states = self.dropout(hidden_states, training=training)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attns = () if (output_attentions and encoder_hidden_states is not None) else None
        present_key_values = () if use_cache else None

        # check if head_mask and cross_attn_head_mask have a correct number of layers specified if desired
        for attn_mask_name, attn_mask in [("head_mask", head_mask), ("cross_attn_head_mask", cross_attn_head_mask)]:
            if attn_mask is not None:
                tf.debugging.assert_equal(
                    shape_list(attn_mask)[0],
                    len(self.layers),
                    message=(
                        f"The {attn_mask_name} should be specified for {len(self.layers)} layers, but it is for"
                        f" {shape_list(attn_mask)[0]}."
                    ),
                )

        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            dropout_probability = random.uniform(0, 1)
            if training and (dropout_probability < self.layerdrop):
                continue

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            hidden_states, layer_self_attn, layer_cross_attn, present_key_value = decoder_layer(
                hidden_states,
                attention_mask=combined_attention_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                layer_head_mask=head_mask[idx] if head_mask is not None else None,
                cross_attn_layer_head_mask=cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
                past_key_value=past_key_value,
            )

            if use_cache:
                present_key_values += (present_key_value,)

            if output_attentions:
                all_self_attns += (layer_self_attn,)

                if encoder_hidden_states is not None:
                    all_cross_attns += (layer_cross_attn,)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if not return_dict:
            return hidden_states, present_key_values, all_hidden_states, all_self_attns, all_cross_attns
        else:
            return TFBaseModelOutputWithPastAndCrossAttentions(
                last_hidden_state=hidden_states,
                past_key_values=present_key_values,
                hidden_states=all_hidden_states,
                attentions=all_self_attns,
                cross_attentions=all_cross_attns,
            )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "embed_positions", None) is not None:
            with tf.name_scope(self.embed_positions.name):
                self.embed_positions.build(None)
        if getattr(self, "layernorm_embedding", None) is not None:
            with tf.name_scope(self.layernorm_embedding.name):
                self.layernorm_embedding.build([None, None, self.config.d_model])
        if getattr(self, "layers", None) is not None:
            for layer in self.layers:
                with tf.name_scope(layer.name):
                    layer.build(None)

@keras_serializable
class TFBartMainLayer(keras.layers.Layer):
    config_class = BartConfig

    def __init__(self, config: BartConfig, load_weight_prefix=None, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.shared = keras.layers.Embedding(
            input_dim=config.vocab_size,
            output_dim=config.d_model,
            embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std),
            name="model.shared",
        )
        # Additional attribute to specify the expected name scope of the layer (for loading/storing weights)
        self.shared.load_weight_prefix = "model.shared" if load_weight_prefix is None else load_weight_prefix

        self.encoder = TFBartEncoder(config, self.shared, name="encoder")
        self.decoder = TFBartDecoder(config, self.shared, name="decoder")

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared
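
    # Sketch (illustrative, hypothetical sizes — not part of the original source): because the embedding table
    # is shared, swapping it in one place rewires both submodules:
    #
    #     new_embeddings = keras.layers.Embedding(input_dim=50265, output_dim=1024, name="model.shared")
    #     main_layer.set_input_embeddings(new_embeddings)
    #     assert main_layer.encoder.embed_tokens is main_layer.decoder.embed_tokens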

    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        decoder_input_ids: np.ndarray | tf.Tensor | None = None,
        decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
        decoder_position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        decoder_head_mask: np.ndarray | tf.Tensor | None = None,
        cross_attn_head_mask: np.ndarray | tf.Tensor | None = None,
        encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None,
        past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
        **kwargs,
    ) -> Union[TFSeq2SeqModelOutput, Tuple[tf.Tensor]]:
        # Unlike other models, BART automatically creates decoder_input_ids from
        # input_ids if no decoder_input_ids are provided
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            if input_ids is None:
                raise ValueError(
                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are "
                    "passed, `input_ids` cannot be `None`. Please pass either "
                    "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
                )

            decoder_input_ids = shift_tokens_right(
                input_ids, self.config.pad_token_id, self.config.decoder_start_token_id
            )
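
            # Sketch (illustrative, hypothetical token IDs — not part of the original source):
            # shift_tokens_right prepends decoder_start_token_id and drops the last token, e.g. with
            # decoder_start_token_id=2:
            #
            #     input_ids         = [[0, 31414, 232, 2]]
            #     decoder_input_ids = [[2, 0, 31414, 232]]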

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                training=training,
            )
        # If the user passed a tuple for encoder_outputs, we wrap it in a TFBaseModelOutput when return_dict=True
        elif return_dict and not isinstance(encoder_outputs, TFBaseModelOutput):
            encoder_outputs = TFBaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )
        # If the user passed a TFBaseModelOutput for encoder_outputs, we wrap it in a tuple when return_dict=False
        elif not return_dict and not isinstance(encoder_outputs, tuple):
            encoder_outputs = encoder_outputs.to_tuple()

        decoder_outputs = self.decoder(
            decoder_input_ids,
            attention_mask=decoder_attention_mask,
            position_ids=decoder_position_ids,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return TFSeq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # The shared/tied weights expect to be in the model base namespace.
        # Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
        # the current one.
        with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
            self.shared.build(None)
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
        if getattr(self, "decoder", None) is not None:
            with tf.name_scope(self.decoder.name):
                self.decoder.build(None)
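
    # Sketch (illustrative — not part of the original source): a trailing "/" makes tf.name_scope treat the
    # name as an absolute scope rather than nesting it, e.g.
    #
    #     with tf.name_scope("outer"):
    #         with tf.name_scope("model.shared/"):  # names land under "model.shared/...", not "outer/model.shared/..."
    #             tf.constant(0.0, name="zero")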

@add_start_docstrings(
    "The bare BART Model outputting raw hidden-states without any specific head on top.",
    BART_START_DOCSTRING,
)
class TFBartModel(TFBartPretrainedModel):
    _requires_load_weight_prefix = True

    def __init__(self, config: BartConfig, load_weight_prefix=None, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.model = TFBartMainLayer(config, load_weight_prefix=load_weight_prefix, name="model")

    def get_encoder(self):
        return self.model.encoder

    def get_decoder(self):
        return self.model.decoder

    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFSeq2SeqModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        decoder_input_ids: np.ndarray | tf.Tensor | None = None,
        decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
        decoder_position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        decoder_head_mask: np.ndarray | tf.Tensor | None = None,
        cross_attn_head_mask: np.ndarray | tf.Tensor | None = None,
        encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None,
        past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
        **kwargs,
    ) -> Union[TFSeq2SeqModelOutput, Tuple[tf.Tensor]]:
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            decoder_position_ids=decoder_position_ids,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        return outputs
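
    # Sketch (illustrative usage, assuming the public "facebook/bart-large" checkpoint — not part of the
    # original source):
    #
    #     from transformers import AutoTokenizer, TFBartModel
    #
    #     tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")
    #     model = TFBartModel.from_pretrained("facebook/bart-large")
    #     inputs = tokenizer("Hello, world!", return_tensors="tf")
    #     outputs = model(inputs)
    #     print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, d_model)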

    def serving_output(self, output):
        pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
        dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
        dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
        cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
        enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
        enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None

        return TFSeq2SeqModelOutput(
            last_hidden_state=output.last_hidden_state,
            past_key_values=pkv,
            decoder_hidden_states=dec_hs,
            decoder_attentions=dec_attns,
            cross_attentions=cross_attns,
            encoder_last_hidden_state=output.encoder_last_hidden_state,
            encoder_hidden_states=enc_hs,
            encoder_attentions=enc_attns,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "model", None) is not None:
            with tf.name_scope(self.model.name):
                self.model.build(None)

class BiasLayer(keras.layers.Layer):
    """
    Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
    so all weights have to be registered in a layer.
    """

    def __init__(self, shape, initializer, trainable, name, **kwargs):
        super().__init__(name=name, **kwargs)
        # Note: the name of this variable will NOT be scoped when serialized, i.e. it will not be in the format of
        # "outer_layer/inner_layer/.../name:0". Instead, it will be "name:0". For further details, see:
        # https://github.com/huggingface/transformers/pull/18833#issuecomment-1233090214
        self.bias = self.add_weight(name=name, shape=shape, initializer=initializer, trainable=trainable)

    def call(self, x):
        return x + self.bias
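
# Sketch (illustrative, hypothetical sizes — not part of the original source): BiasLayer simply broadcasts its
# stored vector onto the input:
#
#     bias_layer = BiasLayer(name="demo_bias", shape=[1, 4], initializer="zeros", trainable=False)
#     logits = tf.zeros((2, 7, 4))   # (batch_size, seq_len, vocab_size)
#     shifted = bias_layer(logits)   # same shape; the [1, 4] bias broadcasts over the leading dims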

@add_start_docstrings(
    "The BART Model with a language modeling head. Can be used for summarization.",
    BART_START_DOCSTRING,
)
class TFBartForConditionalGeneration(TFBartPretrainedModel, TFCausalLanguageModelingLoss):
    _keys_to_ignore_on_load_missing = [r"final_logits_bias"]
    _requires_load_weight_prefix = True

    def __init__(self, config, load_weight_prefix=None, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.model = TFBartMainLayer(config, load_weight_prefix=load_weight_prefix, name="model")
        self.use_cache = config.use_cache
        # final_logits_bias is registered as a buffer in pytorch, so it is not trainable here for consistency.
        self.bias_layer = BiasLayer(
            name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
        )

    def get_decoder(self):
        return self.model.decoder

    def get_encoder(self):
        return self.model.encoder

    def get_output_embeddings(self):
        return self.get_input_embeddings()

    def set_output_embeddings(self, value):
        self.set_input_embeddings(value)

    def get_bias(self):
        return {"final_logits_bias": self.bias_layer.bias}

    def set_bias(self, value):
        # Replaces the existing layer containing the bias for correct (de)serialization.
        vocab_size = value["final_logits_bias"].shape[-1]
        self.bias_layer = BiasLayer(
            name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=False
        )
        self.bias_layer.bias.assign(value["final_logits_bias"])

    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    @add_end_docstrings(BART_GENERATION_EXAMPLE)
    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        decoder_input_ids: np.ndarray | tf.Tensor | None = None,
        decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
        decoder_position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        decoder_head_mask: np.ndarray | tf.Tensor | None = None,
        cross_attn_head_mask: np.ndarray | tf.Tensor | None = None,
        encoder_outputs: Optional[TFBaseModelOutput] = None,
        past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFSeq2SeqLMOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:
        """

        if labels is not None:
            labels = tf.where(
                labels == self.config.pad_token_id,
                tf.cast(tf.fill(shape_list(labels), -100), labels.dtype),
                labels,
            )
            use_cache = False
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )
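
        # Sketch (illustrative, hypothetical IDs with pad_token_id=1 and decoder_start_token_id=2 — not part
        # of the original source): supplying only `labels` makes the model teacher-force itself. Pad positions
        # become -100 (ignored by the loss) and the decoder consumes the labels shifted one step to the right:
        #
        #     labels            = [[713, 16, 2, 1]]  ->  loss targets [[713, 16, 2, -100]]
        #     decoder_input_ids = [[2, 713, 16, 2]]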

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            decoder_position_ids=decoder_position_ids,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
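
        # Weight tying: the LM head reuses the shared input embedding matrix (transposed) to produce vocabulary
        # logits, then adds the non-trainable final_logits_bias.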
        lm_logits = tf.matmul(outputs[0], self.model.shared.weights, transpose_b=True)
        lm_logits = self.bias_layer(lm_logits)
        masked_lm_loss = None if labels is None else self.hf_compute_loss(labels, lm_logits)

        if not return_dict:
            output = (lm_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return TFSeq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,  # index 1 of decoder outputs
            decoder_hidden_states=outputs.decoder_hidden_states,  # index 2 of decoder outputs
            decoder_attentions=outputs.decoder_attentions,  # index 3 of decoder outputs
            cross_attentions=outputs.cross_attentions,  # index 4 of decoder outputs
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,  # index 0 of encoder outputs
            encoder_hidden_states=outputs.encoder_hidden_states,  # index 1 of encoder outputs
            encoder_attentions=outputs.encoder_attentions,  # index 2 of encoder outputs
        )

    def serving_output(self, output):
        pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
        dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
        dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
        cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
        enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
        enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None

        return TFSeq2SeqLMOutput(
            logits=output.logits,
            past_key_values=pkv,
            decoder_hidden_states=dec_hs,
            decoder_attentions=dec_attns,
            cross_attentions=cross_attns,
            encoder_last_hidden_state=output.encoder_last_hidden_state,
            encoder_hidden_states=enc_hs,
            encoder_attentions=enc_attns,
        )

    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        past_key_values=None,
        attention_mask=None,
        decoder_attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        # cut decoder_input_ids if past_key_values is used
        if past_key_values is not None:
            decoder_input_ids = decoder_input_ids[:, -1:]

        if decoder_attention_mask is not None:  # xla
            decoder_position_ids = tf.math.cumsum(decoder_attention_mask, axis=-1, exclusive=True)[:, -1:]
        elif past_key_values is not None:  # no xla + past_key_values
            decoder_position_ids = past_key_values[0][0].shape[2]
        else:  # no xla + no past_key_values
            decoder_position_ids = tf.range(decoder_input_ids.shape[1])

        return {
            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
            "encoder_outputs": encoder_outputs,
            "past_key_values": past_key_values,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "decoder_attention_mask": decoder_attention_mask,
            "decoder_position_ids": decoder_position_ids,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
        }
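
    # Sketch (illustrative usage, assuming the public "facebook/bart-large-cnn" summarization checkpoint —
    # not part of the original source):
    #
    #     from transformers import AutoTokenizer, TFBartForConditionalGeneration
    #
    #     tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    #     model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
    #     inputs = tokenizer(article_text, return_tensors="tf")  # article_text: any string to summarize
    #     summary_ids = model.generate(inputs["input_ids"], max_length=60)
    #     print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True))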

    def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):
        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "model", None) is not None:
            with tf.name_scope(self.model.name):
                self.model.build(None)
        if getattr(self, "bias_layer", None) is not None:
            with tf.name_scope(self.bias_layer.name):
                self.bias_layer.build(None)

@add_start_docstrings(
    """
    BART model with a sequence classification head on top (a linear layer on top of the pooled output), e.g. for
    GLUE tasks.
    """,
    BART_START_DOCSTRING,
)
class TFBartForSequenceClassification(TFBartPretrainedModel, TFSequenceClassificationLoss):
    def __init__(self, config: BartConfig, load_weight_prefix=None, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.model = TFBartMainLayer(config, load_weight_prefix=load_weight_prefix, name="model")
        self.classification_head = TFBartClassificationHead(
            config.d_model, config.num_labels, config.classifier_dropout, name="classification_head"
        )

    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFSeq2SeqSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        decoder_input_ids: np.ndarray | tf.Tensor | None = None,
        decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
        decoder_position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        decoder_head_mask: np.ndarray | tf.Tensor | None = None,
        cross_attn_head_mask: np.ndarray | tf.Tensor | None = None,
        encoder_outputs: Optional[TFBaseModelOutput] = None,
        past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFSeq2SeqSequenceClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        if input_ids is None and inputs_embeds is not None:
            raise NotImplementedError(
                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
            )

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            decoder_position_ids=decoder_position_ids,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        last_hidden_state = outputs[0]
        eos_mask = tf.equal(input_ids, self.config.eos_token_id)
        # Gather the True entries of eos_mask and regroup them per example; the reshape only succeeds when every
        # example contains the same number of <eos> tokens, which the assertion below makes explicit.
        self_masked = tf.reshape(tf.boolean_mask(eos_mask, eos_mask), (tf.shape(input_ids)[0], -1))
        tf.Assert(tf.reduce_all(self_masked[:, -1]), ["All examples must have the same number of <eos> tokens."])

        masked = tf.reshape(
            tf.boolean_mask(last_hidden_state, eos_mask),
            (tf.shape(input_ids)[0], tf.shape(self_masked)[1], tf.shape(last_hidden_state)[-1]),
        )

        sentence_representation = masked[:, -1, :]
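        # i.e. the hidden state at each example's final <eos> token serves as the pooled sentence
        # representation (BART's analogue of BERT's [CLS] pooling).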

        logits = self.classification_head(sentence_representation)
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TFSeq2SeqSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    def serving_output(self, output):
        logits = tf.convert_to_tensor(output.logits)
        pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
        dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
        dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
        cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
        enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
        enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None

        return TFSeq2SeqSequenceClassifierOutput(
            logits=logits,
            past_key_values=pkv,
            decoder_hidden_states=dec_hs,
            decoder_attentions=dec_attns,
            cross_attentions=cross_attns,
            encoder_last_hidden_state=output.encoder_last_hidden_state,
            encoder_hidden_states=enc_hs,
            encoder_attentions=enc_attns,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "model", None) is not None:
            with tf.name_scope(self.model.name):
                self.model.build(None)
        if getattr(self, "classification_head", None) is not None:
            with tf.name_scope(self.classification_head.name):
                self.classification_head.build(None)
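
# Sketch (illustrative usage — not part of the original source; the classification head is freshly initialized
# here, so predictions are meaningful only after fine-tuning):
#
#     import tensorflow as tf
#     from transformers import AutoTokenizer, TFBartForSequenceClassification
#
#     tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")
#     model = TFBartForSequenceClassification.from_pretrained("facebook/bart-large", num_labels=2)
#     inputs = tokenizer("This movie was great!", return_tensors="tf")
#     logits = model(inputs).logits
#     predicted_class = int(tf.argmax(logits, axis=-1)[0])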