modeling_tf_outputs.py

# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple

import tensorflow as tf

from .utils import ModelOutput


@dataclass
class TFBaseModelOutput(ModelOutput):
    """
    Base class for model's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    last_hidden_state: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None
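

# Usage sketch (illustrative only): `ModelOutput` subclasses behave like both
# dataclasses and ordered mappings, so the fields above can be read by attribute,
# by key, or by index, and `to_tuple()` drops the `None`-valued fields. The
# shapes below are arbitrary assumptions chosen for the demo.
def _example_base_model_output():
    hidden = tf.zeros((2, 5, 8))  # (batch_size, sequence_length, hidden_size)
    output = TFBaseModelOutput(last_hidden_state=hidden)
    assert output.last_hidden_state is output["last_hidden_state"]
    assert output[0] is output.last_hidden_state
    return output.to_tuple()  # -> (hidden,), because the optional fields are None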


@dataclass
class TFBaseModelOutputWithNoAttention(ModelOutput):
    """
    Base class for model's outputs, with potential hidden states.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
            the output of each layer) of shape `(batch_size, num_channels, height, width)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    """

    last_hidden_state: tf.Tensor = None
    hidden_states: Optional[Tuple[tf.Tensor, ...]] = None


@dataclass
class TFBaseModelOutputWithPooling(ModelOutput):
    """
    Base class for model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token) further processed by a
            Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
            prediction (classification) objective during pretraining.

            This output is usually *not* a good summary of the semantic content of the input; you're often better off
            averaging or pooling the sequence of hidden-states for the whole input sequence.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    last_hidden_state: tf.Tensor = None
    pooler_output: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None
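

# Usage sketch (illustrative only): as the docstring above notes, `pooler_output`
# is often a poor sentence summary, and masked mean pooling over
# `last_hidden_state` is a common alternative. This helper is an assumption for
# the demo, not part of the public API.
def _example_mean_pooling(output: TFBaseModelOutputWithPooling, attention_mask: tf.Tensor) -> tf.Tensor:
    # attention_mask: (batch_size, sequence_length), 1 for real tokens, 0 for padding
    mask = tf.cast(attention_mask[..., tf.newaxis], output.last_hidden_state.dtype)
    summed = tf.reduce_sum(output.last_hidden_state * mask, axis=1)
    counts = tf.maximum(tf.reduce_sum(mask, axis=1), 1e-9)  # avoid division by zero
    return summed / counts  # (batch_size, hidden_size)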


@dataclass
class TFBaseModelOutputWithPoolingAndNoAttention(ModelOutput):
    """
    Base class for model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
            Last layer hidden-state after a pooling operation on the spatial dimensions.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
            the output of each layer) of shape `(batch_size, num_channels, height, width)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    """

    last_hidden_state: tf.Tensor = None
    pooler_output: tf.Tensor = None
    hidden_states: Optional[Tuple[tf.Tensor, ...]] = None


@dataclass
class TFBaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):
    """
    Base class for model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token) further processed by a
            Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
            prediction (classification) objective during pretraining.

            This output is usually *not* a good summary of the semantic content of the input; you're often better off
            averaging or pooling the sequence of hidden-states for the whole input sequence.
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
    """

    last_hidden_state: tf.Tensor = None
    pooler_output: tf.Tensor = None
    past_key_values: List[tf.Tensor] | None = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None
    cross_attentions: Tuple[tf.Tensor] | None = None


@dataclass
class TFBaseModelOutputWithPast(ModelOutput):
    """
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.

            If `past_key_values` is used, only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    last_hidden_state: tf.Tensor = None
    past_key_values: List[tf.Tensor] | None = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None
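

# Usage sketch (illustrative only): each entry of `past_key_values` stacks the key
# and value states for one layer along a leading axis of size 2, so a cache can be
# unpacked per layer as below. This helper is a demo assumption, not library API.
def _example_cache_shapes(past_key_values: List[tf.Tensor]) -> List[tf.TensorShape]:
    shapes = []
    for layer_past in past_key_values:
        # layer_past: (2, batch_size, num_heads, sequence_length, embed_size_per_head)
        key_states, value_states = layer_past[0], layer_past[1]
        shapes.append(key_states.shape)  # value_states has the identical shape
    return shapes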


@dataclass
class TFBaseModelOutputWithCrossAttentions(ModelOutput):
    """
    Base class for model's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
    """

    last_hidden_state: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None
    cross_attentions: Tuple[tf.Tensor] | None = None


@dataclass
class TFBaseModelOutputWithPastAndCrossAttentions(ModelOutput):
    """
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.

            If `past_key_values` is used, only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
    """

    last_hidden_state: tf.Tensor = None
    past_key_values: List[tf.Tensor] | None = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None
    cross_attentions: Tuple[tf.Tensor] | None = None


@dataclass
class TFSeq2SeqModelOutput(ModelOutput):
    """
    Base class for model encoder's outputs that also contains: pre-computed hidden states that can speed up sequential
    decoding.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.

            If `past_key_values` is used, only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    last_hidden_state: tf.Tensor = None
    past_key_values: List[tf.Tensor] | None = None
    decoder_hidden_states: Tuple[tf.Tensor] | None = None
    decoder_attentions: Tuple[tf.Tensor] | None = None
    cross_attentions: Tuple[tf.Tensor] | None = None
    encoder_last_hidden_state: tf.Tensor | None = None
    encoder_hidden_states: Tuple[tf.Tensor] | None = None
    encoder_attentions: Tuple[tf.Tensor] | None = None
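

# Usage sketch (illustrative only): a seq2seq output carries separate encoder and
# decoder streams. Cross-attention weights map decoder positions (rows) to encoder
# positions (columns), which is handy for alignment-style inspection; it assumes
# the model was run with `output_attentions=True`.
def _example_strongest_alignment(output: TFSeq2SeqModelOutput) -> tf.Tensor:
    last_cross = output.cross_attentions[-1]  # (batch, num_heads, tgt_len, src_len)
    averaged = tf.reduce_mean(last_cross, axis=1)  # average over attention heads
    return tf.argmax(averaged, axis=-1)  # per decoder token, the strongest encoder token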


@dataclass
class TFCausalLMOutput(ModelOutput):
    """
    Base class for causal language model (or autoregressive) outputs.

    Args:
        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: tf.Tensor | None = None
    logits: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None
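

# Usage sketch (illustrative only): for next-token prediction, only the logits at
# the final sequence position matter; a greedy decoding step takes their argmax.
def _example_greedy_next_token(output: TFCausalLMOutput) -> tf.Tensor:
    next_token_logits = output.logits[:, -1, :]  # (batch_size, vocab_size)
    return tf.argmax(next_token_logits, axis=-1)  # (batch_size,) token ids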


@dataclass
class TFCausalLMOutputWithPast(ModelOutput):
    """
    Base class for causal language model (or autoregressive) outputs.

    Args:
        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: tf.Tensor | None = None
    logits: tf.Tensor = None
    past_key_values: List[tf.Tensor] | None = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None


@dataclass
class TFCausalLMOutputWithCrossAttentions(ModelOutput):
    """
    Base class for causal language model (or autoregressive) outputs.

    Args:
        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
    """

    loss: tf.Tensor | None = None
    logits: tf.Tensor = None
    past_key_values: List[tf.Tensor] | None = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None
    cross_attentions: Tuple[tf.Tensor] | None = None


@dataclass
class TFMaskedLMOutput(ModelOutput):
    """
    Base class for masked language model outputs.

    Args:
        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
            Masked language modeling (MLM) loss.
        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: tf.Tensor | None = None
    logits: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None


@dataclass
class TFSeq2SeqLMOutput(ModelOutput):
    """
    Base class for sequence-to-sequence language model outputs.

    Args:
        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
            Language modeling loss.
        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: tf.Tensor | None = None
    logits: tf.Tensor = None
    past_key_values: List[tf.Tensor] | None = None
    decoder_hidden_states: Tuple[tf.Tensor] | None = None
    decoder_attentions: Tuple[tf.Tensor] | None = None
    cross_attentions: Tuple[tf.Tensor] | None = None
    encoder_last_hidden_state: tf.Tensor | None = None
    encoder_hidden_states: Tuple[tf.Tensor] | None = None
    encoder_attentions: Tuple[tf.Tensor] | None = None


@dataclass
class TFNextSentencePredictorOutput(ModelOutput):
    """
    Base class for outputs of models predicting if two sentences are consecutive or not.

    Args:
        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `next_sentence_label` is provided):
            Next sentence prediction loss.
        logits (`tf.Tensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: tf.Tensor | None = None
    logits: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None


@dataclass
class TFSequenceClassifierOutput(ModelOutput):
    """
    Base class for outputs of sentence classification models.

    Args:
        loss (`tf.Tensor` of shape `(batch_size,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: tf.Tensor | None = None
    logits: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None
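

# Usage sketch (illustrative only): the logits above are pre-softmax scores, so
# class probabilities and hard predictions are derived as below.
def _example_classification_prediction(output: TFSequenceClassifierOutput):
    probabilities = tf.nn.softmax(output.logits, axis=-1)  # (batch_size, num_labels)
    predicted_class = tf.argmax(output.logits, axis=-1)  # (batch_size,)
    return probabilities, predicted_class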


@dataclass
class TFSeq2SeqSequenceClassifierOutput(ModelOutput):
    """
    Base class for outputs of sequence-to-sequence sentence classification models.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: tf.Tensor | None = None
    logits: tf.Tensor = None
    past_key_values: List[tf.Tensor] | None = None
    decoder_hidden_states: Tuple[tf.Tensor] | None = None
    decoder_attentions: Tuple[tf.Tensor] | None = None
    cross_attentions: Tuple[tf.Tensor] | None = None
    encoder_last_hidden_state: tf.Tensor | None = None
    encoder_hidden_states: Tuple[tf.Tensor] | None = None
    encoder_attentions: Tuple[tf.Tensor] | None = None


@dataclass
class TFSemanticSegmenterOutput(ModelOutput):
    """
    Base class for outputs of semantic segmentation models.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
            Classification scores for each pixel.

            <Tip warning={true}>

            The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
            to avoid doing two interpolations and losing some quality when a user needs to resize the logits to the
            original image size as post-processing. You should always check your logits shape and resize as needed.

            </Tip>

        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
            the output of each layer) of shape `(batch_size, patch_size, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: tf.Tensor | None = None
    logits: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None
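

# Usage sketch (illustrative only): per the tip in the docstring above, the logits
# may be smaller than the input image and should be resized once as
# post-processing. `tf.image.resize` expects channels-last, hence the transposes;
# the target `height`/`width` are whatever the original image used.
def _example_upsample_logits(output: TFSemanticSegmenterOutput, height: int, width: int) -> tf.Tensor:
    logits_hwc = tf.transpose(output.logits, perm=[0, 2, 3, 1])  # to (batch, H, W, num_labels)
    resized = tf.image.resize(logits_hwc, size=(height, width), method="bilinear")
    return tf.transpose(resized, perm=[0, 3, 1, 2])  # back to (batch, num_labels, H, W)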


@dataclass
class TFSemanticSegmenterOutputWithNoAttention(ModelOutput):
    """
    Base class for outputs of semantic segmentation models that do not output attention scores.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
            Classification scores for each pixel.

            <Tip warning={true}>

            The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
            to avoid doing two interpolations and losing some quality when a user needs to resize the logits to the
            original image size as post-processing. You should always check your logits shape and resize as needed.

            </Tip>

        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
            the output of each layer) of shape `(batch_size, patch_size, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    """

    loss: tf.Tensor | None = None
    logits: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor] | None = None


@dataclass
class TFImageClassifierOutput(ModelOutput):
    """
    Base class for outputs of image classification models.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
            the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called
            feature maps) of the model at the output of each stage.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: tf.Tensor | None = None
    logits: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None


@dataclass
class TFMultipleChoiceModelOutput(ModelOutput):
    """
    Base class for outputs of multiple choice models.

    Args:
        loss (`tf.Tensor` of shape `(batch_size,)`, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`tf.Tensor` of shape `(batch_size, num_choices)`):
            *num_choices* is the second dimension of the input tensors (see *input_ids* above).

            Classification scores (before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: tf.Tensor | None = None
    logits: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None
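

# Usage sketch (illustrative only): since `num_choices` mirrors the second
# dimension of the inputs, the argmax over the choice axis indexes back into the
# original candidate list.
def _example_pick_choice(output: TFMultipleChoiceModelOutput) -> tf.Tensor:
    # One score per candidate; values fall in [0, num_choices).
    return tf.argmax(output.logits, axis=-1)  # (batch_size,)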


@dataclass
class TFTokenClassifierOutput(ModelOutput):
    """
    Base class for outputs of token classification models.

    Args:
        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of unmasked labels, returned when `labels` is provided):
            Classification loss.
        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: tf.Tensor | None = None
    logits: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None


@dataclass
class TFQuestionAnsweringModelOutput(ModelOutput):
    """
    Base class for outputs of question answering models.

    Args:
        loss (`tf.Tensor` of shape `(batch_size,)`, *optional*, returned when `start_positions` and `end_positions` are provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
            Span-start scores (before SoftMax).
        end_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
            Span-end scores (before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: tf.Tensor | None = None
    start_logits: tf.Tensor = None
    end_logits: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None
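

# Usage sketch (illustrative only): a naive span decode takes the argmax of the
# start and end scores independently; production code usually also checks that
# start <= end and restricts the span to the context tokens.
def _example_extract_span(output: TFQuestionAnsweringModelOutput):
    start_index = tf.argmax(output.start_logits, axis=-1)  # (batch_size,)
    end_index = tf.argmax(output.end_logits, axis=-1)  # (batch_size,)
    return start_index, end_index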


@dataclass
class TFSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
    """
    Base class for outputs of sequence-to-sequence question answering models.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
            Span-start scores (before SoftMax).
        end_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
            Span-end scores (before SoftMax).
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: tf.Tensor | None = None
    start_logits: tf.Tensor = None
    end_logits: tf.Tensor = None
    past_key_values: List[tf.Tensor] | None = None
    decoder_hidden_states: Tuple[tf.Tensor] | None = None
    decoder_attentions: Tuple[tf.Tensor] | None = None
    encoder_last_hidden_state: tf.Tensor | None = None
    encoder_hidden_states: Tuple[tf.Tensor] | None = None
    encoder_attentions: Tuple[tf.Tensor] | None = None


@dataclass
class TFSequenceClassifierOutputWithPast(ModelOutput):
    """
    Base class for outputs of sentence classification models.

    Args:
        loss (`tf.Tensor` of shape `(batch_size,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: tf.Tensor | None = None
    logits: tf.Tensor = None
    past_key_values: List[tf.Tensor] | None = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None


@dataclass
class TFImageClassifierOutputWithNoAttention(ModelOutput):
    """
    Base class for outputs of image classification models.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
            the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also called
            feature maps) of the model at the output of each stage.
    """

    loss: tf.Tensor | None = None
    logits: tf.Tensor = None
    hidden_states: Optional[Tuple[tf.Tensor, ...]] = None


@dataclass
class TFMaskedImageModelingOutput(ModelOutput):
    """
    Base class for outputs of masked image completion / in-painting models.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
            Reconstruction loss.
        reconstruction (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Reconstructed / completed images.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
            the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called
            feature maps) of the model at the output of each stage.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: tf.Tensor | None = None
    reconstruction: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None

    @property
    def logits(self):
        warnings.warn(
            "logits attribute is deprecated and will be removed in version 5 of Transformers."
            " Please use the reconstruction attribute to retrieve the final output instead.",
            FutureWarning,
        )
        return self.reconstruction
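

# Usage sketch (illustrative only): the deprecated `logits` property defined above
# simply forwards to `reconstruction`, emitting a `FutureWarning` on access. The
# shapes below are arbitrary demo assumptions.
def _example_deprecated_logits_alias():
    images = tf.zeros((1, 3, 32, 32))  # (batch_size, num_channels, height, width)
    output = TFMaskedImageModelingOutput(reconstruction=images)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FutureWarning)  # silence the deprecation warning
        assert output.logits is output.reconstruction
    return output.reconstruction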