# modeling_outputs.py
  1. # Copyright 2020 The HuggingFace Team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import warnings
  15. from dataclasses import dataclass
  16. from typing import Optional, Tuple
  17. import torch
  18. from .utils import ModelOutput
  19. @dataclass
  20. class BaseModelOutput(ModelOutput):
  21. """
  22. Base class for model's outputs, with potential hidden states and attentions.
  23. Args:
  24. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  25. Sequence of hidden-states at the output of the last layer of the model.
  26. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  27. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  28. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  29. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  30. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  31. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  32. sequence_length)`.
  33. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  34. heads.
  35. """
  36. last_hidden_state: torch.FloatTensor = None
  37. hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
  38. attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  39. @dataclass
  40. class BaseModelOutputWithNoAttention(ModelOutput):
  41. """
  42. Base class for model's outputs, with potential hidden states.
  43. Args:
  44. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
  45. Sequence of hidden-states at the output of the last layer of the model.
  46. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  47. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  48. one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.
  49. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  50. """
  51. last_hidden_state: torch.FloatTensor = None
  52. hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
  53. @dataclass
  54. class BaseModelOutputWithPooling(ModelOutput):
  55. """
  56. Base class for model's outputs that also contains a pooling of the last hidden states.
  57. Args:
  58. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  59. Sequence of hidden-states at the output of the last layer of the model.
  60. pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
  61. Last layer hidden-state of the first token of the sequence (classification token) after further processing
  62. through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
  63. the classification token after processing through a linear layer and a tanh activation function. The linear
  64. layer weights are trained from the next sentence prediction (classification) objective during pretraining.
  65. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  66. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  67. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  68. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  69. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  70. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  71. sequence_length)`.
  72. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  73. heads.
  74. """
  75. last_hidden_state: torch.FloatTensor = None
  76. pooler_output: torch.FloatTensor = None
  77. hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
  78. attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  79. @dataclass
  80. class BaseModelOutputWithPoolingAndNoAttention(ModelOutput):
  81. """
  82. Base class for model's outputs that also contains a pooling of the last hidden states.
  83. Args:
  84. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
  85. Sequence of hidden-states at the output of the last layer of the model.
  86. pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
  87. Last layer hidden-state after a pooling operation on the spatial dimensions.
  88. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  89. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  90. one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.
  91. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  92. """
  93. last_hidden_state: torch.FloatTensor = None
  94. pooler_output: torch.FloatTensor = None
  95. hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
  96. @dataclass
  97. class BaseModelOutputWithPast(ModelOutput):
  98. """
  99. Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
  100. Args:
  101. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  102. Sequence of hidden-states at the output of the last layer of the model.
  103. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
  104. hidden_size)` is output.
  105. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  106. Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
  107. `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
  108. `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
  109. encoder_sequence_length, embed_size_per_head)`.
  110. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
  111. `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
  112. input) to speed up sequential decoding.
  113. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  114. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  115. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  116. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  117. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  118. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  119. sequence_length)`.
  120. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  121. heads.
  122. """
  123. last_hidden_state: torch.FloatTensor = None
  124. past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
  125. hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
  126. attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  127. @dataclass
  128. class BaseModelOutputWithCrossAttentions(ModelOutput):
  129. """
  130. Base class for model's outputs, with potential hidden states and attentions.
  131. Args:
  132. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  133. Sequence of hidden-states at the output of the last layer of the model.
  134. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  135. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  136. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  137. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  138. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  139. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  140. sequence_length)`.
  141. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  142. heads.
  143. cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
  144. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  145. sequence_length)`.
  146. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  147. weighted average in the cross-attention heads.
  148. """
  149. last_hidden_state: torch.FloatTensor = None
  150. hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
  151. attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  152. cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  153. @dataclass
  154. class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):
  155. """
  156. Base class for model's outputs that also contains a pooling of the last hidden states.
  157. Args:
  158. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  159. Sequence of hidden-states at the output of the last layer of the model.
  160. pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
  161. Last layer hidden-state of the first token of the sequence (classification token) after further processing
  162. through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
  163. the classification token after processing through a linear layer and a tanh activation function. The linear
  164. layer weights are trained from the next sentence prediction (classification) objective during pretraining.
  165. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  166. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  167. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  168. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  169. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  170. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  171. sequence_length)`.
  172. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  173. heads.
  174. cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
  175. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  176. sequence_length)`.
  177. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  178. weighted average in the cross-attention heads.
  179. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  180. Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
  181. `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
  182. `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
  183. encoder_sequence_length, embed_size_per_head)`.
  184. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
  185. `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
  186. input) to speed up sequential decoding.
  187. """
  188. last_hidden_state: torch.FloatTensor = None
  189. pooler_output: torch.FloatTensor = None
  190. hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
  191. past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
  192. attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  193. cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  194. @dataclass
  195. class BaseModelOutputWithPastAndCrossAttentions(ModelOutput):
  196. """
  197. Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
  198. Args:
  199. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  200. Sequence of hidden-states at the output of the last layer of the model.
  201. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
  202. hidden_size)` is output.
  203. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  204. Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
  205. `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
  206. `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
  207. encoder_sequence_length, embed_size_per_head)`.
  208. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
  209. `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
  210. input) to speed up sequential decoding.
  211. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  212. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  213. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  214. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  215. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  216. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  217. sequence_length)`.
  218. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  219. heads.
  220. cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
  221. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  222. sequence_length)`.
  223. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  224. weighted average in the cross-attention heads.
  225. """
  226. last_hidden_state: torch.FloatTensor = None
  227. past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
  228. hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
  229. attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  230. cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  231. @dataclass
  232. class MoECausalLMOutputWithPast(ModelOutput):
  233. """
  234. Base class for causal language model (or autoregressive) outputs as well as Mixture of Expert's router hidden
  235. states terms, to train a MoE model.
  236. Args:
  237. loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
  238. Language modeling loss (for next-token prediction).
  239. logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
  240. Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
  241. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  242. Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
  243. `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
  244. Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
  245. `past_key_values` input) to speed up sequential decoding.
  246. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  247. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  248. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  249. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  250. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  251. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  252. sequence_length)`.
  253. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  254. heads.
  255. z_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
  256. z_loss for the sparse modules.
  257. aux_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
  258. aux_loss for the sparse modules.
  259. router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`):
  260. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
  261. Router logits of the encoder model, useful to compute the auxiliary loss and the z_loss for the sparse
  262. modules.
  263. """
  264. loss: Optional[torch.FloatTensor] = None
  265. logits: torch.FloatTensor = None
  266. past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
  267. hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
  268. attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  269. z_loss: torch.FloatTensor = None
  270. aux_loss: torch.FloatTensor = None
  271. router_logits: Optional[Tuple[torch.FloatTensor]] = None
  272. @dataclass
  273. class MoEModelOutput(ModelOutput):
  274. """
  275. Base class for model's outputs, with potential hidden states and attentions.
  276. Args:
  277. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  278. Sequence of hidden-states at the output of the last layer of the model.
  279. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  280. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  281. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  282. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  283. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  284. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  285. sequence_length)`.
  286. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  287. heads.
  288. router_probs (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
  289. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
  290. Raw router probabilities that are computed by MoE routers, these terms are used to compute the auxiliary
  291. loss and the z_loss for Mixture of Experts models.
  292. """
  293. last_hidden_state: torch.FloatTensor = None
  294. hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
  295. attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  296. router_probs: Optional[Tuple[torch.FloatTensor]] = None
  297. @dataclass
  298. class MoeModelOutputWithPast(ModelOutput):
  299. """
  300. Base class for model's outputs, with potential hidden states and attentions.
  301. Args:
  302. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  303. Sequence of hidden-states at the output of the last layer of the model.
  304. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  305. Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
  306. `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
  307. `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
  308. encoder_sequence_length, embed_size_per_head)`.
  309. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
  310. `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
  311. input) to speed up sequential decoding.
  312. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  313. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  314. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  315. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  316. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  317. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  318. sequence_length)`.
  319. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  320. heads.
  321. router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
  322. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
  323. Raw router logtis (post-softmax) that are computed by MoE routers, these terms are used to compute the auxiliary
  324. loss for Mixture of Experts models.
  325. """
  326. last_hidden_state: torch.FloatTensor = None
  327. past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
  328. hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
  329. attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  330. router_logits: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class MoeCausalLMOutputWithPast(ModelOutput):
    """
    Base class for causal language model (or autoregressive) with mixture of experts outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        aux_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
            aux_loss for the sparse modules.
        router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
            Raw router logits (post-softmax) that are computed by MoE routers, these terms are used to compute the
            auxiliary loss for Mixture of Experts models.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    aux_loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    router_logits: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class MoEModelOutputWithPastAndCrossAttentions(ModelOutput):
    """
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding) as well as
    Mixture of Expert's router hidden states terms, to train a MoE model.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        router_probs (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

            Raw router probabilities that are computed by MoE routers, these terms are used to compute the auxiliary
            loss and the z_loss for Mixture of Experts models.
    """

    last_hidden_state: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    router_probs: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class Seq2SeqModelOutput(ModelOutput):
    """
    Base class for model encoder's outputs that also contains: pre-computed hidden states that can speed up sequential
    decoding.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    last_hidden_state: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class Seq2SeqMoEModelOutput(ModelOutput):
    """
    Base class for model encoder's outputs that also contains: pre-computed hidden states that can speed up sequential
    decoding.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        decoder_router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

            Router logits of the decoder model, useful to compute the auxiliary loss for Mixture of Experts models.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        encoder_router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

            Router logits of the encoder model, useful to compute the auxiliary loss and the z_loss for the sparse
            modules.
    """

    last_hidden_state: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    decoder_router_logits: Optional[Tuple[torch.FloatTensor, ...]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_router_logits: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class CausalLMOutput(ModelOutput):
    """
    Base class for causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class CausalLMOutputWithPast(ModelOutput):
    """
    Base class for causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class CausalLMOutputWithCrossAttentions(ModelOutput):
    """
    Base class for causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Cross attentions weights after the attention softmax, used to compute the weighted average in the
            cross-attention heads.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `torch.FloatTensor` tuples of length `config.n_layers`, with each tuple containing the cached key,
            value states of the self-attention and the cross-attention layers if model is used in encoder-decoder
            setting. Only relevant if `config.is_decoder = True`.

            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class SequenceClassifierOutputWithPast(ModelOutput):
    """
    Base class for outputs of sentence classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class MaskedLMOutput(ModelOutput):
    """
    Base class for masked language models outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Masked language modeling (MLM) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class Seq2SeqLMOutput(ModelOutput):
    """
    Base class for sequence-to-sequence language models outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  710. @dataclass
class Seq2SeqMoEOutput(ModelOutput):
    """
    Base class for sequence-to-sequence Mixture-of-Experts language model outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        encoder_z_loss (`torch.FloatTensor`, *optional*):
            z_loss of the encoder's Mixture of Experts modules.
        decoder_z_loss (`torch.FloatTensor`, *optional*):
            z_loss of the decoder's Mixture of Experts modules.
        encoder_aux_loss (`torch.FloatTensor`, *optional*):
            Auxiliary (load balancing) loss of the encoder's Mixture of Experts modules.
        decoder_aux_loss (`torch.FloatTensor`, *optional*):
            Auxiliary (load balancing) loss of the decoder's Mixture of Experts modules.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        decoder_router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

            Router logits of the decoder model, useful to compute the auxiliary loss for Mixture of Experts models.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        encoder_router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

            Router logits of the encoder model, useful to compute the auxiliary loss and z_loss for Mixture of Experts
            models.
    """

    # All fields default to None so that a model can populate only the outputs it
    # was asked to compute; annotations are Optional accordingly.
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    encoder_z_loss: Optional[torch.FloatTensor] = None
    decoder_z_loss: Optional[torch.FloatTensor] = None
    encoder_aux_loss: Optional[torch.FloatTensor] = None
    decoder_aux_loss: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    decoder_router_logits: Optional[Tuple[torch.FloatTensor, ...]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_router_logits: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class NextSentencePredictorOutput(ModelOutput):
    """
    Base class for outputs of models predicting if two sentences are consecutive or not.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `next_sentence_label` is provided):
            Next sequence prediction (classification) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class SequenceClassifierOutput(ModelOutput):
    """
    Base class for outputs of sentence classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class Seq2SeqSequenceClassifierOutput(ModelOutput):
    """
    Base class for outputs of sequence-to-sequence sentence classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `label` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class MultipleChoiceModelOutput(ModelOutput):
    """
    Base class for outputs of multiple choice models.

    Args:
        loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
            *num_choices* is the second dimension of the input tensors. (see *input_ids* above).

            Classification scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class TokenClassifierOutput(ModelOutput):
    """
    Base class for outputs of token classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class QuestionAnsweringModelOutput(ModelOutput):
    """
    Base class for outputs of question answering models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-start scores (before SoftMax).
        end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-end scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    start_logits: Optional[torch.FloatTensor] = None
    end_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class Seq2SeqQuestionAnsweringModelOutput(ModelOutput):
    """
    Base class for outputs of sequence-to-sequence question answering models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-start scores (before SoftMax).
        end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-end scores (before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    start_logits: Optional[torch.FloatTensor] = None
    end_logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class SemanticSegmenterOutput(ModelOutput):
    """
    Base class for outputs of semantic segmentation models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
            Classification scores for each pixel.

            <Tip warning={true}>

            The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
            to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
            original image size as post-processing. You should always check your logits shape and resize as needed.

            </Tip>
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, patch_size, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class ImageClassifierOutput(ModelOutput):
    """
    Base class for outputs of image classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
            (also called feature maps) of the model at the output of each stage.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class ImageClassifierOutputWithNoAttention(ModelOutput):
    """
    Base class for outputs of image classification models that do not expose attention weights.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also
            called feature maps) of the model at the output of each stage.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class DepthEstimatorOutput(ModelOutput):
    """
    Base class for outputs of depth estimation models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`):
            Predicted depth for each pixel.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    predicted_depth: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class ImageSuperResolutionOutput(ModelOutput):
    """
    Base class for outputs of image super resolution models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Reconstruction loss.
        reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Reconstructed images, possibly upscaled.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
            (also called feature maps) of the model at the output of each stage.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    reconstruction: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class Wav2Vec2BaseModelOutput(ModelOutput):
    """
    Base class for models that have been trained with the Wav2Vec2 loss objective.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        extract_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, conv_dim[-1])`):
            Sequence of extracted feature vectors of the last convolutional layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    extract_features: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class XVectorOutput(ModelOutput):
    """
    Output type of [`Wav2Vec2ForXVector`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`):
            Classification hidden states before AMSoftmax.
        embeddings (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`):
            Utterance embeddings used for vector similarity-based retrieval.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    embeddings: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class BackboneOutput(ModelOutput):
    """
    Base class for outputs of backbones.

    Args:
        feature_maps (`tuple(torch.FloatTensor)` of shape `(batch_size, num_channels, height, width)`):
            Feature maps of the stages.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, num_channels, height, width)`,
            depending on the backbone.

            Hidden-states of the model at the output of each stage plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Only applicable if the backbone uses attention.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    feature_maps: Optional[Tuple[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  1180. @dataclass
  1181. class BaseModelOutputWithPoolingAndProjection(ModelOutput):
  1182. """
  1183. Base class for model's outputs that also contains a pooling of the last hidden states.
  1184. Args:
  1185. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  1186. Sequence of hidden-states at the output of the last layer of the model.
  1187. pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
  1188. Last layer hidden-state of the first token of the sequence (classification token) after further processing
  1189. through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
  1190. the classification token after processing through a linear layer and a tanh activation function. The linear
  1191. layer weights are trained from the next sentence prediction (classification) objective during pretraining.
  1192. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  1193. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  1194. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  1195. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  1196. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  1197. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  1198. sequence_length)`.
  1199. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  1200. heads.
  1201. projection_state (`tuple(torch.FloatTensor)`, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  1202. Tuple of `torch.FloatTensor` of shape `(batch_size,config.project_dim)`.
  1203. Text embeddings before the projection layer, used to mimic the last hidden state of the teacher encoder.
  1204. """
  1205. last_hidden_state: torch.FloatTensor = None
  1206. pooler_output: torch.FloatTensor = None
  1207. hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
  1208. attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  1209. projection_state: Optional[Tuple[torch.FloatTensor]] = None
  1210. @dataclass
  1211. class Seq2SeqSpectrogramOutput(ModelOutput):
  1212. """
  1213. Base class for sequence-to-sequence spectrogram outputs.
  1214. Args:
  1215. loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
  1216. Spectrogram generation loss.
  1217. spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
  1218. The predicted spectrogram.
  1219. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  1220. Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
  1221. `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
  1222. `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
  1223. Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
  1224. blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
  1225. decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  1226. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  1227. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  1228. Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
  1229. decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  1230. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  1231. sequence_length)`.
  1232. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
  1233. self-attention heads.
  1234. cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  1235. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  1236. sequence_length)`.
  1237. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  1238. weighted average in the cross-attention heads.
  1239. encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
  1240. Sequence of hidden-states at the output of the last layer of the encoder of the model.
  1241. encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  1242. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  1243. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  1244. Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
  1245. encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  1246. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  1247. sequence_length)`.
  1248. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
  1249. self-attention heads.
  1250. """
  1251. loss: Optional[torch.FloatTensor] = None
  1252. spectrogram: torch.FloatTensor = None
  1253. past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
  1254. decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
  1255. decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  1256. cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  1257. encoder_last_hidden_state: Optional[torch.FloatTensor] = None
  1258. encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
  1259. encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  1260. @dataclass
  1261. class Seq2SeqTSModelOutput(ModelOutput):
  1262. """
  1263. Base class for time series model's encoder outputs that also contains pre-computed hidden states that can speed up
  1264. sequential decoding.
  1265. Args:
  1266. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  1267. Sequence of hidden-states at the output of the last layer of the decoder of the model.
  1268. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
  1269. hidden_size)` is output.
  1270. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  1271. Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
  1272. `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
  1273. `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
  1274. Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
  1275. blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
  1276. decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  1277. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  1278. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  1279. Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
  1280. decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  1281. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  1282. sequence_length)`.
  1283. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
  1284. self-attention heads.
  1285. cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  1286. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  1287. sequence_length)`.
  1288. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  1289. weighted average in the cross-attention heads.
  1290. encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
  1291. Sequence of hidden-states at the output of the last layer of the encoder of the model.
  1292. encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  1293. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  1294. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  1295. Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
  1296. encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  1297. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  1298. sequence_length)`.
  1299. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
  1300. self-attention heads.
  1301. loc (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
  1302. Shift values of each time series' context window which is used to give the model inputs of the same
  1303. magnitude and then used to shift back to the original magnitude.
  1304. scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
  1305. Scaling values of each time series' context window which is used to give the model inputs of the same
  1306. magnitude and then used to rescale back to the original magnitude.
  1307. static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
  1308. Static features of each time series' in a batch which are copied to the covariates at inference time.
  1309. """
  1310. last_hidden_state: torch.FloatTensor = None
  1311. past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
  1312. decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
  1313. decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  1314. cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  1315. encoder_last_hidden_state: Optional[torch.FloatTensor] = None
  1316. encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
  1317. encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  1318. loc: Optional[torch.FloatTensor] = None
  1319. scale: Optional[torch.FloatTensor] = None
  1320. static_features: Optional[torch.FloatTensor] = None
@dataclass
class Seq2SeqTSPredictionOutput(ModelOutput):
    """
    Base class for time series model's decoder outputs that also contain the loss as well as the parameters of the
    chosen distribution.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `future_values` is provided):
            Distributional loss.
        params (`torch.FloatTensor` of shape `(batch_size, num_samples, num_params)`, *optional*):
            Parameters of the chosen distribution.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        loc (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
            Shift values of each time series' context window which is used to give the model inputs of the same
            magnitude and then used to shift back to the original magnitude.
        scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
            Scaling values of each time series' context window which is used to give the model inputs of the same
            magnitude and then used to rescale back to the original magnitude.
        static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
            Static features of each time series' in a batch which are copied to the covariates at inference time.
    """

    loss: Optional[torch.FloatTensor] = None
    params: Optional[Tuple[torch.FloatTensor]] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    loc: Optional[torch.FloatTensor] = None
    scale: Optional[torch.FloatTensor] = None
    static_features: Optional[torch.FloatTensor] = None
  1383. @dataclass
  1384. class SampleTSPredictionOutput(ModelOutput):
  1385. """
  1386. Base class for time series model's predictions outputs that contains the sampled values from the chosen
  1387. distribution.
  1388. Args:
  1389. sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, num_samples, prediction_length, input_size)`):
  1390. Sampled values from the chosen distribution.
  1391. """
  1392. sequences: torch.FloatTensor = None
  1393. @dataclass
  1394. class MaskedImageModelingOutput(ModelOutput):
  1395. """
  1396. Base class for outputs of masked image completion / in-painting models.
  1397. Args:
  1398. loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
  1399. Reconstruction loss.
  1400. reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
  1401. Reconstructed / completed images.
  1402. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or
  1403. when `config.output_hidden_states=True`):
  1404. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  1405. one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
  1406. (also called feature maps) of the model at the output of each stage.
  1407. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when
  1408. `config.output_attentions=True`):
  1409. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
  1410. sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
  1411. the self-attention heads.
  1412. """
  1413. loss: Optional[torch.FloatTensor] = None
  1414. reconstruction: torch.FloatTensor = None
  1415. hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
  1416. attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
  1417. @property
  1418. def logits(self):
  1419. warnings.warn(
  1420. "logits attribute is deprecated and will be removed in version 5 of Transformers."
  1421. " Please use the reconstruction attribute to retrieve the final output instead.",
  1422. FutureWarning,
  1423. )
  1424. return self.reconstruction