modeling_idefics2.py 82 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755
  1. # coding=utf-8
  2. # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """PyTorch Idefics2 model."""
  16. import math
  17. from dataclasses import dataclass
  18. from typing import List, Optional, Tuple, Union
  19. import torch
  20. import torch.utils.checkpoint
  21. from torch import nn
  22. from torch.nn import CrossEntropyLoss
  23. from ...activations import ACT2FN
  24. from ...cache_utils import Cache, DynamicCache
  25. from ...generation import GenerationMixin
  26. from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
  27. from ...modeling_outputs import BaseModelOutput, ModelOutput
  28. from ...modeling_utils import PreTrainedModel
  29. from ...utils import (
  30. add_start_docstrings,
  31. add_start_docstrings_to_model_forward,
  32. is_flash_attn_2_available,
  33. is_flash_attn_greater_or_equal_2_10,
  34. logging,
  35. replace_return_docstrings,
  36. )
  37. from ..auto import AutoModel
  38. from .configuration_idefics2 import Idefics2Config, Idefics2PerceiverConfig, Idefics2VisionConfig
  39. if is_flash_attn_2_available():
  40. from ...modeling_flash_attention_utils import _flash_attention_forward
  41. logger = logging.get_logger(__name__)
  42. _CONFIG_FOR_DOC = "Idefics2Config"
  43. @dataclass
  44. class Idefics2BaseModelOutputWithPast(ModelOutput):
  45. """
  46. Base class for Idefics2 model's outputs that may also contain a past key/values (to speed up sequential decoding).
  47. Args:
  48. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  49. Sequence of hidden-states at the output of the last layer of the model.
  50. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
  51. hidden_size)` is output.
  52. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  53. Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
  54. `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
  55. `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
  56. encoder_sequence_length, embed_size_per_head)`.
  57. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
  58. `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
  59. input) to speed up sequential decoding.
  60. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  61. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  62. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  63. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  64. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  65. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  66. sequence_length)`.
  67. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  68. heads.
  69. image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
  70. Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
  71. sequence_length, hidden_size)`.
  72. image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
  73. """
  74. last_hidden_state: torch.FloatTensor = None
  75. past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
  76. hidden_states: Optional[Tuple[torch.FloatTensor]] = None
  77. attentions: Optional[Tuple[torch.FloatTensor]] = None
  78. image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
  79. @dataclass
  80. # Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->Idefics2
  81. class Idefics2CausalLMOutputWithPast(ModelOutput):
  82. """
  83. Base class for Idefics2 causal language model (or autoregressive) outputs.
  84. Args:
  85. loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
  86. Language modeling loss (for next-token prediction).
  87. logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
  88. Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
  89. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  90. Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
  91. `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
  92. Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
  93. `past_key_values` input) to speed up sequential decoding.
  94. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  95. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  96. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  97. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  98. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  99. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  100. sequence_length)`.
  101. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  102. heads.
  103. image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
  104. Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
  105. sequence_length, hidden_size)`.
  106. image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
  107. """
  108. loss: Optional[torch.FloatTensor] = None
  109. logits: torch.FloatTensor = None
  110. past_key_values: Optional[List[torch.FloatTensor]] = None
  111. hidden_states: Optional[Tuple[torch.FloatTensor]] = None
  112. attentions: Optional[Tuple[torch.FloatTensor]] = None
  113. image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
  114. class Idefics2VisionEmbeddings(nn.Module):
  115. """
  116. This is a modified version of `siglip.modelign_siglip.SiglipVisionEmbeddings` to enable images of variable
  117. resolution.
  118. The modifications are adapted from [Patch n' Pack: NaViT, a Vision Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304)
  119. which allows treating images in their native aspect ratio and without the need to resize them to the same
  120. fixed size. In particular, we start from the original pre-trained SigLIP model
  121. (which uses images of fixed-size square images) and adapt it by training on images of variable resolutions.
  122. """
  123. def __init__(self, config: Idefics2VisionConfig):
  124. super().__init__()
  125. self.embed_dim = config.hidden_size
  126. self.image_size = config.image_size
  127. self.patch_size = config.patch_size
  128. self.patch_embedding = nn.Conv2d(
  129. in_channels=config.num_channels,
  130. out_channels=self.embed_dim,
  131. kernel_size=self.patch_size,
  132. stride=self.patch_size,
  133. padding="valid",
  134. )
  135. self.num_patches_per_side = self.image_size // self.patch_size
  136. self.num_patches = self.num_patches_per_side**2
  137. self.num_positions = self.num_patches
  138. self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
  139. def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor) -> torch.Tensor:
  140. batch_size, _, max_im_h, max_im_w = pixel_values.shape
  141. patch_embeds = self.patch_embedding(pixel_values)
  142. embeddings = patch_embeds.flatten(2).transpose(1, 2)
  143. max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size
  144. boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side)
  145. position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0)
  146. for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
  147. nb_patches_h = p_attn_mask[:, 0].sum()
  148. nb_patches_w = p_attn_mask[0].sum()
  149. fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
  150. fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
  151. bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
  152. bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
  153. pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
  154. position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
  155. position_ids = position_ids.to(self.position_embedding.weight.device)
  156. embeddings = embeddings + self.position_embedding(position_ids)
  157. return embeddings
  158. # Copied from transformers.models.siglip.modeling_siglip.SiglipAttention with Siglip->Idefics2Vision
  159. class Idefics2VisionAttention(nn.Module):
  160. """Multi-headed attention from 'Attention Is All You Need' paper"""
  161. # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
  162. def __init__(self, config):
  163. super().__init__()
  164. self.config = config
  165. self.embed_dim = config.hidden_size
  166. self.num_heads = config.num_attention_heads
  167. self.head_dim = self.embed_dim // self.num_heads
  168. if self.head_dim * self.num_heads != self.embed_dim:
  169. raise ValueError(
  170. f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
  171. f" {self.num_heads})."
  172. )
  173. self.scale = self.head_dim**-0.5
  174. self.dropout = config.attention_dropout
  175. self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
  176. self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
  177. self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
  178. self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
  179. # Ignore copy
  180. self.is_causal = False
  181. def forward(
  182. self,
  183. hidden_states: torch.Tensor,
  184. attention_mask: Optional[torch.Tensor] = None,
  185. output_attentions: Optional[bool] = False,
  186. ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
  187. """Input shape: Batch x Time x Channel"""
  188. batch_size, q_len, _ = hidden_states.size()
  189. query_states = self.q_proj(hidden_states)
  190. key_states = self.k_proj(hidden_states)
  191. value_states = self.v_proj(hidden_states)
  192. query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
  193. key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
  194. value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
  195. k_v_seq_len = key_states.shape[-2]
  196. attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
  197. if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
  198. raise ValueError(
  199. f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
  200. f" {attn_weights.size()}"
  201. )
  202. if attention_mask is not None:
  203. if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
  204. raise ValueError(
  205. f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
  206. )
  207. attn_weights = attn_weights + attention_mask
  208. # upcast attention to fp32
  209. attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
  210. attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
  211. attn_output = torch.matmul(attn_weights, value_states)
  212. if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
  213. raise ValueError(
  214. f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
  215. f" {attn_output.size()}"
  216. )
  217. attn_output = attn_output.transpose(1, 2).contiguous()
  218. attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
  219. attn_output = self.out_proj(attn_output)
  220. return attn_output, attn_weights
  221. class Idefics2VisionFlashAttention2(Idefics2VisionAttention):
  222. """
  223. Idefics2Vision flash attention module. This module inherits from `Idefics2VisionAttention` as the weights of the module stays
  224. untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
  225. flash attention and deal with padding tokens in case the input contains any of them.
  226. """
  227. # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
  228. def __init__(self, *args, **kwargs):
  229. super().__init__(*args, **kwargs)
  230. # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
  231. # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
  232. # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
  233. self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
  234. def forward(
  235. self,
  236. hidden_states: torch.Tensor,
  237. attention_mask: Optional[torch.LongTensor] = None,
  238. position_ids: Optional[torch.LongTensor] = None,
  239. past_key_value: Optional[Cache] = None,
  240. output_attentions: bool = False,
  241. use_cache: bool = False,
  242. **kwargs,
  243. ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
  244. output_attentions = False
  245. bsz, q_len, _ = hidden_states.size()
  246. query_states = self.q_proj(hidden_states)
  247. key_states = self.k_proj(hidden_states)
  248. value_states = self.v_proj(hidden_states)
  249. # Flash attention requires the input to have the shape
  250. # batch_size x seq_length x head_dim x hidden_dim
  251. # therefore we just need to keep the original shape
  252. query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
  253. key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
  254. value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
  255. kv_seq_len = key_states.shape[-2]
  256. if past_key_value is not None:
  257. kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
  258. # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
  259. # to be able to avoid many of these transpose/reshape/view.
  260. key_states = key_states.transpose(1, 2)
  261. value_states = value_states.transpose(1, 2)
  262. dropout_rate = self.dropout if self.training else 0.0
  263. # In PEFT, usually we cast the layer norms in float32 for training stability reasons
  264. # therefore the input hidden states gets silently casted in float32. Hence, we need
  265. # cast them back in the correct dtype just to be sure everything works as expected.
  266. # This might slowdown training & inference so it is recommended to not cast the LayerNorms
  267. # in fp32. (Idefics2VisionRMSNorm handles it correctly)
  268. input_dtype = query_states.dtype
  269. if input_dtype == torch.float32:
  270. if torch.is_autocast_enabled():
  271. target_dtype = torch.get_autocast_gpu_dtype()
  272. # Handle the case where the model is quantized
  273. elif hasattr(self.config, "_pre_quantization_dtype"):
  274. target_dtype = self.config._pre_quantization_dtype
  275. else:
  276. target_dtype = self.q_proj.weight.dtype
  277. logger.warning_once(
  278. f"The input hidden states seems to be silently casted in float32, this might be related to"
  279. f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
  280. f" {target_dtype}."
  281. )
  282. query_states = query_states.to(target_dtype)
  283. key_states = key_states.to(target_dtype)
  284. value_states = value_states.to(target_dtype)
  285. attn_output = _flash_attention_forward(
  286. query_states,
  287. key_states,
  288. value_states,
  289. attention_mask,
  290. q_len,
  291. dropout=dropout_rate,
  292. is_causal=self.is_causal,
  293. use_top_left_mask=self._flash_attn_uses_top_left_mask,
  294. )
  295. attn_output = attn_output.reshape(bsz, q_len, self.embed_dim).contiguous()
  296. attn_output = self.out_proj(attn_output)
  297. if not output_attentions:
  298. attn_weights = None
  299. return attn_output, attn_weights
  300. IDEFICS_VISION_ATTENTION_CLASSES = {
  301. "eager": Idefics2VisionAttention,
  302. "flash_attention_2": Idefics2VisionFlashAttention2,
  303. }
  304. # Copied from transformers.models.siglip.modeling_siglip.SiglipMLP with Siglip->Idefics2Vision
  305. class Idefics2VisionMLP(nn.Module):
  306. def __init__(self, config):
  307. super().__init__()
  308. self.config = config
  309. self.activation_fn = ACT2FN[config.hidden_act]
  310. self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
  311. self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
  312. def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  313. hidden_states = self.fc1(hidden_states)
  314. hidden_states = self.activation_fn(hidden_states)
  315. hidden_states = self.fc2(hidden_states)
  316. return hidden_states
  317. class Idefics2MLP(nn.Module):
  318. def __init__(
  319. self,
  320. hidden_size: int,
  321. intermediate_size: int,
  322. output_size: int,
  323. hidden_act: str,
  324. ):
  325. super().__init__()
  326. self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
  327. self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
  328. self.down_proj = nn.Linear(intermediate_size, output_size, bias=False)
  329. self.act_fn = ACT2FN[hidden_act]
  330. def forward(self, x):
  331. return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
  332. # Copied from transformers.models.siglip.modeling_siglip.SiglipMultiheadAttentionPoolingHead with Siglip->Idefics2
  333. class Idefics2MultiheadAttentionPoolingHead(nn.Module):
  334. """Multihead Attention Pooling."""
  335. def __init__(self, config: Idefics2VisionConfig):
  336. super().__init__()
  337. self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
  338. self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
  339. self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
  340. # Ignore copy
  341. self.mlp = Idefics2MLP(
  342. hidden_size=config.hidden_size,
  343. intermediate_size=config.intermediate_size,
  344. hidden_act=config.hidden_act,
  345. output_size=config.hidden_size,
  346. )
  347. def forward(self, hidden_state):
  348. batch_size = hidden_state.shape[0]
  349. probe = self.probe.repeat(batch_size, 1, 1)
  350. hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
  351. residual = hidden_state
  352. hidden_state = self.layernorm(hidden_state)
  353. hidden_state = residual + self.mlp(hidden_state)
  354. return hidden_state[:, 0]
  355. class Idefics2EncoderLayer(nn.Module):
  356. def __init__(self, config: Idefics2VisionConfig):
  357. super().__init__()
  358. self.embed_dim = config.hidden_size
  359. self.self_attn = IDEFICS_VISION_ATTENTION_CLASSES[config._attn_implementation](config)
  360. self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
  361. self.mlp = Idefics2VisionMLP(config)
  362. self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
  363. # Copied from transformers.models.siglip.modeling_siglip.SiglipEncoderLayer.forward
  364. def forward(
  365. self,
  366. hidden_states: torch.Tensor,
  367. attention_mask: torch.Tensor,
  368. output_attentions: Optional[bool] = False,
  369. ) -> Tuple[torch.FloatTensor]:
  370. """
  371. Args:
  372. hidden_states (`torch.FloatTensor`):
  373. Input to the layer of shape `(batch, seq_len, embed_dim)`.
  374. attention_mask (`torch.FloatTensor`):
  375. Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
  376. output_attentions (`bool`, *optional*, defaults to `False`):
  377. Whether or not to return the attentions tensors of all attention layers. See `attentions` under
  378. returned tensors for more detail.
  379. """
  380. residual = hidden_states
  381. hidden_states = self.layer_norm1(hidden_states)
  382. hidden_states, attn_weights = self.self_attn(
  383. hidden_states=hidden_states,
  384. attention_mask=attention_mask,
  385. output_attentions=output_attentions,
  386. )
  387. hidden_states = residual + hidden_states
  388. residual = hidden_states
  389. hidden_states = self.layer_norm2(hidden_states)
  390. hidden_states = self.mlp(hidden_states)
  391. hidden_states = residual + hidden_states
  392. outputs = (hidden_states,)
  393. if output_attentions:
  394. outputs += (attn_weights,)
  395. return outputs
  396. # Copied from transformers.models.siglip.modeling_siglip.SiglipEncoder with Siglip->Idefics2
  397. class Idefics2Encoder(nn.Module):
  398. """
  399. Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
  400. [`Idefics2EncoderLayer`].
  401. Args:
  402. config: Idefics2Config
  403. """
  404. def __init__(self, config: Idefics2Config):
  405. super().__init__()
  406. self.config = config
  407. self.layers = nn.ModuleList([Idefics2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
  408. self.gradient_checkpointing = False
  409. # Ignore copy
  410. def forward(
  411. self,
  412. inputs_embeds,
  413. attention_mask: Optional[torch.Tensor] = None,
  414. output_attentions: Optional[bool] = None,
  415. output_hidden_states: Optional[bool] = None,
  416. return_dict: Optional[bool] = None,
  417. ) -> Union[Tuple, BaseModelOutput]:
  418. r"""
  419. Args:
  420. inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  421. Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
  422. This is useful if you want more control over how to convert `input_ids` indices into associated vectors
  423. than the model's internal embedding lookup matrix.
  424. attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
  425. Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
  426. - 1 for tokens that are **not masked**,
  427. - 0 for tokens that are **masked**.
  428. [What are attention masks?](../glossary#attention-mask)
  429. output_attentions (`bool`, *optional*):
  430. Whether or not to return the attentions tensors of all attention layers. See `attentions` under
  431. returned tensors for more detail.
  432. output_hidden_states (`bool`, *optional*):
  433. Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
  434. for more detail.
  435. return_dict (`bool`, *optional*):
  436. Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
  437. """
  438. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  439. output_hidden_states = (
  440. output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
  441. )
  442. return_dict = return_dict if return_dict is not None else self.config.use_return_dict
  443. encoder_states = () if output_hidden_states else None
  444. all_attentions = () if output_attentions else None
  445. hidden_states = inputs_embeds
  446. for encoder_layer in self.layers:
  447. if output_hidden_states:
  448. encoder_states = encoder_states + (hidden_states,)
  449. if self.gradient_checkpointing and self.training:
  450. layer_outputs = self._gradient_checkpointing_func(
  451. encoder_layer.__call__,
  452. hidden_states,
  453. attention_mask,
  454. output_attentions,
  455. )
  456. else:
  457. layer_outputs = encoder_layer(
  458. hidden_states,
  459. attention_mask,
  460. output_attentions=output_attentions,
  461. )
  462. hidden_states = layer_outputs[0]
  463. if output_attentions:
  464. all_attentions = all_attentions + (layer_outputs[1],)
  465. if output_hidden_states:
  466. encoder_states = encoder_states + (hidden_states,)
  467. if not return_dict:
  468. return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
  469. return BaseModelOutput(
  470. last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
  471. )
# Shared docstring preamble; passed to `add_start_docstrings` on the model classes defined in this file.
IDEFICS2_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)
    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.
    Parameters:
        config ([`Idefics2Config`] or [`Idefics2VisionConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
  485. @add_start_docstrings(
  486. "The bare Idefics2 Model outputting raw hidden-states without any specific head on top.",
  487. IDEFICS2_START_DOCSTRING,
  488. )
  489. class Idefics2PreTrainedModel(PreTrainedModel):
  490. config_class = Idefics2Config
  491. base_model_prefix = "model"
  492. supports_gradient_checkpointing = True
  493. _no_split_modules = ["Idefics2VisionAttention", "Idefics2MLP", "Idefics2PerceiverLayer", "Idefics2DecoderLayer"]
  494. _skip_keys_device_placement = "past_key_values"
  495. _supports_flash_attn_2 = True
  496. _supports_sdpa = True
  497. _supports_cache_class = True
  498. def _init_weights(self, module):
  499. std = (
  500. self.config.text_config.initializer_range
  501. if hasattr(self.config, "initializer_range")
  502. else self.config.text_config.initializer_range
  503. )
  504. if hasattr(module, "class_embedding"):
  505. module.class_embedding.data.normal_(mean=0.0, std=std)
  506. if isinstance(module, (nn.Linear, nn.Conv2d)):
  507. module.weight.data.normal_(mean=0.0, std=std)
  508. if module.bias is not None:
  509. module.bias.data.zero_()
  510. elif isinstance(module, nn.Embedding):
  511. module.weight.data.normal_(mean=0.0, std=std)
  512. if module.padding_idx is not None:
  513. module.weight.data[module.padding_idx].zero_()
  514. IDEFICS2_INPUTS_DOCSTRING = r"""
  515. Args:
  516. pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
  517. The tensors corresponding to the input images. Pixel values can be obtained using
  518. [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details ([]`LlavaProcessor`] uses
  519. [`CLIPImageProcessor`] for processing images).
  520. pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
  521. Mask to avoid performing attention on padding pixel indices.
  522. output_attentions (`bool`, *optional*):
  523. Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
  524. tensors for more detail.
  525. output_hidden_states (`bool`, *optional*):
  526. Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
  527. more detail.
  528. return_dict (`bool`, *optional*):
  529. Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
  530. """
  531. @add_start_docstrings(
  532. """Idefics2 vision encoder model that returnss raw image embeddings.""",
  533. IDEFICS2_START_DOCSTRING,
  534. )
  535. class Idefics2VisionTransformer(Idefics2PreTrainedModel):
  536. _supports_sdpa = False
  537. config_class = Idefics2VisionConfig
  538. def __init__(self, config: Idefics2VisionConfig):
  539. super().__init__(config)
  540. embed_dim = config.hidden_size
  541. self.config = config
  542. self.embeddings = Idefics2VisionEmbeddings(config)
  543. self.encoder = Idefics2Encoder(config)
  544. self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
  545. self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
  546. def get_input_embeddings(self):
  547. return self.embeddings
  548. def set_input_embeddings(self, value):
  549. self.embeddings = value
  550. def forward(
  551. self,
  552. pixel_values,
  553. patch_attention_mask: Optional[torch.BoolTensor] = None,
  554. output_attentions: Optional[bool] = None,
  555. output_hidden_states: Optional[bool] = None,
  556. return_dict: Optional[bool] = None,
  557. ) -> Union[Tuple, BaseModelOutput]:
  558. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  559. output_hidden_states = (
  560. output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
  561. )
  562. return_dict = return_dict if return_dict is not None else self.config.use_return_dict
  563. batch_size = pixel_values.size(0)
  564. if patch_attention_mask is None:
  565. patch_size = self.config.patch_size
  566. patch_attention_mask = torch.ones(
  567. (
  568. batch_size,
  569. pixel_values.size(2) // patch_size,
  570. pixel_values.size(3) // patch_size,
  571. )
  572. )
  573. patch_attention_mask = patch_attention_mask.to(dtype=torch.bool, device=pixel_values.device)
  574. hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
  575. patch_attention_mask = patch_attention_mask.view(batch_size, -1)
  576. # The call to `_upad_input` in `_flash_attention_forward` is expensive
  577. # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
  578. # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
  579. if not torch.any(~patch_attention_mask):
  580. patch_attention_mask = None
  581. elif not self._use_flash_attention_2:
  582. patch_attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)
  583. encoder_outputs = self.encoder(
  584. inputs_embeds=hidden_states,
  585. attention_mask=patch_attention_mask,
  586. output_attentions=output_attentions,
  587. output_hidden_states=output_hidden_states,
  588. return_dict=return_dict,
  589. )
  590. last_hidden_state = encoder_outputs[0]
  591. last_hidden_state = self.post_layernorm(last_hidden_state)
  592. if not return_dict:
  593. return (last_hidden_state,) + encoder_outputs[1:]
  594. return BaseModelOutput(
  595. last_hidden_state=last_hidden_state,
  596. hidden_states=encoder_outputs.hidden_states,
  597. attentions=encoder_outputs.attentions,
  598. )
  599. # Copied from transformers.models.llama.modeling_llama.repeat_kv
  600. def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
  601. """
  602. This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
  603. num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
  604. """
  605. batch, num_key_value_heads, slen, head_dim = hidden_states.shape
  606. if n_rep == 1:
  607. return hidden_states
  608. hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
  609. return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
  610. # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Idefics2
  611. class Idefics2RMSNorm(nn.Module):
  612. def __init__(self, hidden_size, eps=1e-6):
  613. """
  614. Idefics2RMSNorm is equivalent to T5LayerNorm
  615. """
  616. super().__init__()
  617. self.weight = nn.Parameter(torch.ones(hidden_size))
  618. self.variance_epsilon = eps
  619. def forward(self, hidden_states):
  620. input_dtype = hidden_states.dtype
  621. hidden_states = hidden_states.to(torch.float32)
  622. variance = hidden_states.pow(2).mean(-1, keepdim=True)
  623. hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
  624. return self.weight * hidden_states.to(input_dtype)
  625. def extra_repr(self):
  626. return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
  627. class Idefics2PerceiverAttention(nn.Module):
  628. def __init__(self, config, layer_idx: Optional[int] = None) -> None:
  629. """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
  630. super().__init__()
  631. self.layer_idx = None
  632. self.hidden_size = config.hidden_size
  633. self.num_heads = config.resampler_n_heads
  634. self.head_dim = config.resampler_head_dim
  635. self.num_key_value_heads = config.num_key_value_heads
  636. self.num_key_value_groups = self.num_heads // self.num_key_value_heads
  637. self.attention_dropout = config.attention_dropout
  638. self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
  639. self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
  640. self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
  641. self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
  642. self.is_causal = False
  643. def forward(
  644. self,
  645. latents: torch.Tensor,
  646. context: torch.Tensor,
  647. attention_mask: Optional[torch.Tensor] = None,
  648. position_ids: Optional[torch.LongTensor] = None,
  649. past_key_value: Optional[Tuple[torch.Tensor]] = None,
  650. output_attentions: bool = False,
  651. use_cache: bool = False,
  652. ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
  653. """
  654. Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!
  655. Args:
  656. latents (`torch.Tensor`): Tensor of shape [bsz, n_latents, embed_dim] representing fixed length latents to compress to.
  657. context (`torch.Tensor`): Tensor of shape [bsz, seq, embed_dim] representing long-form context to resample.
  658. attention_mask (`torch.Tensor`, *optional*): Tensor of shape [bsz, 1, seq, n_latents] representing attention mask.
  659. position_ids (`torch.LongTensor`, *optional*): Tensor of shape [bsz, seq] representing position indices of each input token.
  660. past_key_value (`Tuple[torch.Tensor]`, *optional*): Tuple of tensors containing cached key and value states.
  661. output_attentions (`bool`, *optional*, defaults to `False`): Whether to return attention weights.
  662. use_cache (`bool`, *optional*, defaults to `False`): Whether to use past_key_value for caching.
  663. """
  664. bsz, q_len, _ = latents.size()
  665. kv_seq_len = q_len + context.size()[1]
  666. hidden_states = torch.concat([context, latents], dim=-2)
  667. query_states = self.q_proj(latents)
  668. key_states = self.k_proj(hidden_states)
  669. value_states = self.v_proj(hidden_states)
  670. query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
  671. key_states = key_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
  672. value_states = value_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
  673. past_key_value = getattr(self, "past_key_value", past_key_value)
  674. if past_key_value is not None:
  675. key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)
  676. # repeat k/v heads if n_kv_heads < n_heads
  677. key_states = repeat_kv(key_states, self.num_key_value_groups)
  678. value_states = repeat_kv(value_states, self.num_key_value_groups)
  679. attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
  680. if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
  681. raise ValueError(
  682. f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
  683. f" {attn_weights.size()}"
  684. )
  685. if attention_mask is not None:
  686. if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
  687. raise ValueError(
  688. f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
  689. )
  690. attn_weights = attn_weights + attention_mask
  691. # upcast attention to fp32
  692. attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
  693. attn_output = torch.matmul(attn_weights, value_states)
  694. if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
  695. raise ValueError(
  696. f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
  697. f" {attn_output.size()}"
  698. )
  699. attn_output = attn_output.transpose(1, 2).contiguous()
  700. attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim)
  701. attn_output = self.o_proj(attn_output)
  702. if not output_attentions:
  703. attn_weights = None
  704. return attn_output, attn_weights, past_key_value
  705. # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with MistralAttention->Idefics2PerceiverAttention,MistralFlashAttention->Idefics2PerceiverFlashAttention,Mistral->Idefics2
  706. class Idefics2PerceiverFlashAttention2(Idefics2PerceiverAttention):
  707. """
  708. Idefics2 flash attention module. This module inherits from `Idefics2PerceiverAttention` as the weights of the module stays
  709. untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
  710. flash attention and deal with padding tokens in case the input contains any of them.
  711. """
  712. # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
  713. def __init__(self, *args, **kwargs):
  714. super().__init__(*args, **kwargs)
  715. # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
  716. # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
  717. # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
  718. self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
  719. # Ignore copy
  720. def forward(
  721. self,
  722. latents: torch.Tensor,
  723. context: torch.Tensor,
  724. attention_mask: Optional[torch.LongTensor] = None,
  725. position_ids: Optional[torch.LongTensor] = None,
  726. past_key_value: Optional[Cache] = None,
  727. output_attentions: bool = False,
  728. use_cache: bool = False,
  729. **kwargs,
  730. ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
  731. bsz, q_len, _ = latents.size()
  732. kv_seq_len = q_len + context.size()[1]
  733. # Query, Key, Value Projections --> Note that in Flamingo, latents are *concatenated* with context prior to attn!
  734. # Note: This results in queries w/ `seq = n_latents`, and keys, values with `seq = len(context) + n_latents`
  735. query_states = self.q_proj(latents)
  736. key_states = self.k_proj(torch.cat([context, latents], dim=-2))
  737. value_states = self.v_proj(torch.cat([context, latents], dim=-2))
  738. query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
  739. key_states = key_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
  740. value_states = value_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
  741. kv_seq_len = key_states.shape[-2]
  742. if past_key_value is not None:
  743. kv_seq_len += past_key_value[0].shape[-2]
  744. if past_key_value is not None:
  745. # Activate slicing cache only if the config has a value `sliding_windows` attribute
  746. if hasattr(self.config, "sliding_window") and kv_seq_len > self.config.sliding_window:
  747. slicing_tokens = kv_seq_len - self.config.sliding_window
  748. past_key = past_key_value[0]
  749. past_value = past_key_value[1]
  750. past_key = past_key[:, :, slicing_tokens:, :].contiguous()
  751. past_value = past_value[:, :, slicing_tokens:, :].contiguous()
  752. if past_key.shape[-2] != self.config.sliding_window - 1:
  753. raise ValueError(
  754. "past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1,"
  755. f" head_dim`), got {past_key.shape}"
  756. )
  757. past_key_value = (past_key, past_value)
  758. if attention_mask is not None:
  759. attention_mask = attention_mask[:, slicing_tokens:]
  760. attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
  761. key_states = torch.cat([past_key_value[0], key_states], dim=2)
  762. value_states = torch.cat([past_key_value[1], value_states], dim=2)
  763. past_key_value = (key_states, value_states) if use_cache else None
  764. # repeat k/v heads if n_kv_heads < n_heads
  765. key_states = repeat_kv(key_states, self.num_key_value_groups)
  766. value_states = repeat_kv(value_states, self.num_key_value_groups)
  767. dropout_rate = 0.0 if not self.training else self.attention_dropout
  768. # In PEFT, usually we cast the layer norms in float32 for training stability reasons
  769. # therefore the input hidden states gets silently casted in float32. Hence, we need
  770. # cast them back in float16 just to be sure everything works as expected.
  771. input_dtype = query_states.dtype
  772. if input_dtype == torch.float32:
  773. if torch.is_autocast_enabled():
  774. target_dtype = torch.get_autocast_gpu_dtype()
  775. # Handle the case where the model is quantized
  776. elif hasattr(self.config, "_pre_quantization_dtype"):
  777. target_dtype = self.config._pre_quantization_dtype
  778. else:
  779. target_dtype = self.q_proj.weight.dtype
  780. logger.warning_once(
  781. f"The input hidden states seems to be silently casted in float32, this might be related to"
  782. f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
  783. f" {target_dtype}."
  784. )
  785. query_states = query_states.to(target_dtype)
  786. key_states = key_states.to(target_dtype)
  787. value_states = value_states.to(target_dtype)
  788. # Reashape to the expected shape for Flash Attention
  789. key_states = key_states.transpose(1, 2)
  790. value_states = value_states.transpose(1, 2)
  791. attn_output = _flash_attention_forward(
  792. query_states,
  793. key_states,
  794. value_states,
  795. attention_mask,
  796. q_len,
  797. dropout=dropout_rate,
  798. sliding_window=None,
  799. is_causal=self.is_causal,
  800. use_top_left_mask=self._flash_attn_uses_top_left_mask,
  801. )
  802. attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim).contiguous()
  803. attn_output = self.o_proj(attn_output)
  804. if not output_attentions:
  805. attn_weights = None
  806. return attn_output, attn_weights, past_key_value
# Attention implementation registry for the perceiver layers: `Idefics2PerceiverLayer.__init__` indexes this dict
# with `config._attn_implementation` to pick the attention class it instantiates.
IDEFICS2_PERCEIVER_ATTENTION_CLASSES = {
    "eager": Idefics2PerceiverAttention,
    "flash_attention_2": Idefics2PerceiverFlashAttention2,
}
  811. class Idefics2PerceiverLayer(nn.Module):
  812. def __init__(self, config, layer_idx: int):
  813. super().__init__()
  814. self.hidden_size = config.hidden_size
  815. self.n_latents = config.resampler_n_latents
  816. self.depth = config.resampler_depth
  817. self.rms_norm_eps = config.rms_norm_eps
  818. self.input_latents_norm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps)
  819. self.input_context_norm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps)
  820. self.self_attn = IDEFICS2_PERCEIVER_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
  821. self.post_attention_layernorm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps)
  822. self.mlp = Idefics2MLP(
  823. hidden_size=config.hidden_size,
  824. intermediate_size=config.hidden_size * 4,
  825. output_size=config.hidden_size,
  826. hidden_act=config.hidden_act,
  827. )
  828. def forward(
  829. self,
  830. latents: torch.Tensor,
  831. context: torch.Tensor,
  832. attention_mask: Optional[torch.Tensor] = None,
  833. position_ids: Optional[torch.LongTensor] = None,
  834. past_key_value: Optional[Tuple[torch.Tensor]] = None,
  835. output_attentions: Optional[bool] = False,
  836. use_cache: Optional[bool] = False,
  837. **kwargs,
  838. ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
  839. """
  840. Args:
  841. latents (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
  842. context (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
  843. attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
  844. `(batch, sequence_length)` where padding elements are indicated by 0.
  845. output_attentions (`bool`, *optional*):
  846. Whether or not to return the attentions tensors of all attention layers. See `attentions` under
  847. returned tensors for more detail.
  848. use_cache (`bool`, *optional*):
  849. If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
  850. (see `past_key_values`).
  851. past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
  852. """
  853. residual = latents
  854. latents = self.input_latents_norm(latents)
  855. context = self.input_context_norm(context)
  856. latents, self_attn_weights, present_key_value = self.self_attn(
  857. latents=latents,
  858. context=context,
  859. attention_mask=attention_mask,
  860. )
  861. latents = residual + latents
  862. residual = latents
  863. latents = self.post_attention_layernorm(latents)
  864. latents = self.mlp(latents)
  865. latents = residual + latents
  866. outputs = (latents,)
  867. if output_attentions:
  868. outputs += (self_attn_weights,)
  869. if use_cache:
  870. outputs += (present_key_value,)
  871. return outputs
# Argument documentation for the perceiver resampler inputs. This rebinds the `IDEFICS2_INPUTS_DOCSTRING` name
# that is also assigned elsewhere in this file.
IDEFICS2_INPUTS_DOCSTRING = r"""
    Args:
        context (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`):
            The hidden states of the image after vision encoder and modality projection.
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
"""
  882. @add_start_docstrings(
  883. "Idefics2 perceiver resampler model that performs `depth` blocks of cross-attention with a fixed ",
  884. "`n_latents` inputs to decrease embedding sequence length. The Resampler acts as a form of learned pooling and ",
  885. "is derived from [Perceiver: General Perception with Iterative Attention](https://arxiv.org/abs/2103.03206)",
  886. IDEFICS2_START_DOCSTRING,
  887. )
  888. class Idefics2PerceiverResampler(Idefics2PreTrainedModel):
  889. _supports_sdpa = False
  890. config_class = Idefics2PerceiverConfig
  891. def __init__(self, config) -> None:
  892. super().__init__(config)
  893. self.hidden_size = config.hidden_size
  894. self.hidden_act = config.hidden_act
  895. self.n_latents = config.resampler_n_latents
  896. self.depth = config.resampler_depth
  897. self.rms_norm_eps = config.rms_norm_eps
  898. # Create Latents for Perceiver
  899. self.latents = nn.Parameter(torch.ones(self.n_latents, self.hidden_size))
  900. # Create Transformer Blocks
  901. self.layers = nn.ModuleList([Idefics2PerceiverLayer(config, idx) for idx in range(self.depth)])
  902. self.norm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps)
  903. self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
  904. def forward(
  905. self,
  906. context: torch.Tensor,
  907. attention_mask: torch.Tensor,
  908. ) -> torch.Tensor:
  909. # seq embed -> bsz seq embed
  910. latents = self.latents.unsqueeze(0).expand((context.shape[0], *self.latents.size()))
  911. latent_attention_mask = torch.ones(
  912. (attention_mask.size(0), latents.size(1)), dtype=attention_mask.dtype, device=attention_mask.device
  913. )
  914. attention_mask = torch.cat([attention_mask, latent_attention_mask], dim=-1)
  915. attention_mask = (
  916. _prepare_4d_attention_mask(attention_mask, latents.dtype, tgt_len=self.n_latents)
  917. if not self._use_flash_attention_2
  918. else attention_mask
  919. )
  920. compressed_context = latents
  921. for perceiver_layer in self.layers:
  922. layer_outputs = perceiver_layer(
  923. compressed_context,
  924. context,
  925. attention_mask=attention_mask,
  926. position_ids=None,
  927. past_key_value=None,
  928. output_attentions=False,
  929. use_cache=False,
  930. )
  931. compressed_context = layer_outputs[0]
  932. compressed_context = self.norm(compressed_context)
  933. return compressed_context
  934. class Idefics2Connector(nn.Module):
  935. def __init__(self, config):
  936. super().__init__()
  937. self.modality_projection = Idefics2MLP(
  938. hidden_size=config.vision_config.hidden_size,
  939. intermediate_size=config.text_config.intermediate_size,
  940. output_size=config.text_config.hidden_size,
  941. hidden_act=config.text_config.hidden_act,
  942. )
  943. self.perceiver_resampler = Idefics2PerceiverResampler._from_config(config.perceiver_config)
  944. def forward(self, image_hidden_states, attention_mask):
  945. image_hidden_states = self.modality_projection(image_hidden_states)
  946. image_hidden_states = self.perceiver_resampler(context=image_hidden_states, attention_mask=attention_mask)
  947. return image_hidden_states
  948. IDEFICS2_INPUTS_DOCSTRING = r"""
  949. Args:
  950. input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
  951. Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
  952. it.
  953. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
  954. [`PreTrainedTokenizer.__call__`] for details.
  955. [What are input IDs?](../glossary#input-ids)
  956. attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
  957. Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
  958. - 1 for tokens that are **not masked**,
  959. - 0 for tokens that are **masked**.
  960. [What are attention masks?](../glossary#attention-mask)
  961. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
  962. [`PreTrainedTokenizer.__call__`] for details.
  963. If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
  964. `past_key_values`).
  965. If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
  966. and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
  967. information on the default strategy.
  968. - 1 indicates the head is **not masked**,
  969. - 0 indicates the head is **masked**.
  970. position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
  971. Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
  972. config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
  973. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  974. Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
  975. `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
  976. `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
  977. Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
  978. blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
  979. If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
  980. don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
  981. `decoder_input_ids` of shape `(batch_size, sequence_length)`.
  982. inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
  983. Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
  984. is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
  985. model's internal embedding lookup matrix.
  986. pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
  987. The tensors corresponding to the input images. Pixel values can be obtained using
  988. [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details ([]`LlavaProcessor`] uses
  989. [`CLIPImageProcessor`] for processing images).
  990. pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
  991. Mask to avoid performing attention on padding pixel indices.
  992. image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
  993. The hidden states of the image encoder after modality projection and perceiver resampling.
  994. use_cache (`bool`, *optional*):
  995. If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
  996. `past_key_values`).
  997. output_attentions (`bool`, *optional*):
  998. Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
  999. tensors for more detail.
  1000. output_hidden_states (`bool`, *optional*):
  1001. Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
  1002. more detail.
  1003. return_dict (`bool`, *optional*):
  1004. Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
  1005. """
  1006. @add_start_docstrings(
  1007. """Idefics2 model consisting of a SIGLIP vision encoder and Mistral language decoder""",
  1008. IDEFICS2_START_DOCSTRING,
  1009. )
  1010. class Idefics2Model(Idefics2PreTrainedModel):
  1011. def __init__(self, config: Idefics2Config):
  1012. super().__init__(config)
  1013. self.padding_idx = self.config.text_config.pad_token_id
  1014. self.vocab_size = self.config.text_config.vocab_size
  1015. self.vision_model = Idefics2VisionTransformer._from_config(config.vision_config)
  1016. self.connector = Idefics2Connector(config)
  1017. self.text_model = AutoModel.from_config(config.text_config)
  1018. self.image_seq_len = config.perceiver_config.resampler_n_latents
  1019. self.image_token_id = self.config.image_token_id
  1020. self._use_flash_attention_2 = config.text_config._attn_implementation == "flash_attention_2"
  1021. self.post_init()
  1022. def enable_input_require_grads(self):
  1023. """
  1024. Enables the gradients for the input embeddings.
  1025. This is useful for lora when using gradient checkpointing.
  1026. c.f. https://github.com/huggingface/peft/issues/1402#issuecomment-1913675032
  1027. Override to set output.requires_grad = True for both the decoder's and vision model's embeddings.
  1028. """
  1029. def get_lowest_module(module):
  1030. if len(list(module.children())) == 0:
  1031. # If the module has no children, it is a leaf module (e.g., Linear, Conv2d, etc.)
  1032. return module
  1033. else:
  1034. # Recursively call the function on each child module
  1035. return get_lowest_module(list(module.children())[0])
  1036. def make_inputs_require_grads(module, input, output):
  1037. output.requires_grad_(True)
  1038. self._text_require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads)
  1039. self._vision_require_grads_hook = get_lowest_module(self.vision_model).register_forward_hook(
  1040. make_inputs_require_grads
  1041. )
  1042. def disable_input_require_grads(self):
  1043. self._text_require_grads_hook.remove()
  1044. self._vision_require_grads_hook.remove()
  1045. def get_input_embeddings(self):
  1046. return self.text_model.get_input_embeddings()
  1047. def set_input_embeddings(self, value):
  1048. self.text_model.set_input_embeddings(value)
  1049. def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
  1050. model_embeds = self.text_model.resize_token_embeddings(
  1051. new_num_tokens=new_num_tokens, pad_to_multiple_of=pad_to_multiple_of
  1052. )
  1053. self.config.text_config.vocab_size = model_embeds.num_embeddings
  1054. return model_embeds
  1055. def inputs_merger(
  1056. self,
  1057. input_ids: torch.LongTensor,
  1058. inputs_embeds: Optional[torch.Tensor],
  1059. image_hidden_states: Optional[torch.Tensor],
  1060. ):
  1061. """
  1062. This method aims at merging the token embeddings with the image hidden states into one single sequence of vectors that are fed to the transformer LM.
  1063. The merging happens as follows:
  1064. - The text token sequence is: `tok_1 tok_2 tok_3 <fake_token_around_image> <image> <image> ... <image> <fake_token_around_image> tok_4`.
  1065. - We get the image hidden states for the image through the vision encoder (and potentially the perceiver), and that hidden state is then projected into the text embedding space.
  1066. We thus have a sequence of image hidden states of size (1, image_seq_len, hidden_dim), where 1 is for batch_size of 1 image and hidden_dim is the hidden_dim of the LM transformer.
  1067. - The merging happens so that we obtain the following sequence: `vector_tok_1 vector_tok_2 vector_tok_3 vector_fake_tok_around_image {sequence of image_seq_len image hidden states} vector_fake_toke_around_image vector_tok_4`. That sequence is fed to the LM.
  1068. - To fit the format of that sequence, `input_ids`, `input_embeds`, `attention_mask` are all 3 adapted to insert the image hidden states.
  1069. """
  1070. num_images, _, vision_hidden_size = image_hidden_states.shape
  1071. special_image_token_mask = input_ids == self.image_token_id
  1072. new_inputs_embeds = inputs_embeds.clone()
  1073. reshaped_image_hidden_states = image_hidden_states.view(-1, vision_hidden_size)
  1074. new_inputs_embeds[special_image_token_mask] = reshaped_image_hidden_states
  1075. return new_inputs_embeds
  1076. @add_start_docstrings_to_model_forward(
  1077. """
  1078. Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to
  1079. the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where
  1080. max_num_images is the maximum number of images among the batch_size samples in the batch.
  1081. Padding images are not needed beyond padding the pixel_values at the entrance of the model.
  1082. For efficiency, we only pass through the vision_model's forward the real images by
  1083. discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where
  1084. image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3.
  1085. """,
  1086. IDEFICS2_INPUTS_DOCSTRING,
  1087. )
  1088. def forward(
  1089. self,
  1090. input_ids: torch.LongTensor = None,
  1091. attention_mask: Optional[torch.Tensor] = None,
  1092. position_ids: Optional[torch.LongTensor] = None,
  1093. past_key_values: Optional[List[torch.FloatTensor]] = None,
  1094. inputs_embeds: Optional[torch.FloatTensor] = None,
  1095. pixel_values: Optional[torch.FloatTensor] = None,
  1096. pixel_attention_mask: Optional[torch.BoolTensor] = None,
  1097. image_hidden_states: Optional[torch.FloatTensor] = None,
  1098. use_cache: Optional[bool] = None,
  1099. output_attentions: Optional[bool] = None,
  1100. output_hidden_states: Optional[bool] = None,
  1101. return_dict: Optional[bool] = None,
  1102. ) -> Union[Tuple, Idefics2BaseModelOutputWithPast]:
  1103. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  1104. output_hidden_states = (
  1105. output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
  1106. )
  1107. use_cache = use_cache if use_cache is not None else self.config.use_cache
  1108. return_dict = return_dict if return_dict is not None else self.config.use_return_dict
  1109. if self.training and self.text_model.gradient_checkpointing and use_cache:
  1110. logger.warning_once(
  1111. "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
  1112. )
  1113. use_cache = False
  1114. # retrieve input_ids and inputs_embeds
  1115. if input_ids is not None:
  1116. batch_size, seq_length = input_ids.shape
  1117. elif inputs_embeds is not None:
  1118. batch_size, seq_length, _ = inputs_embeds.shape
  1119. else:
  1120. raise ValueError("You have to specify either input_ids or inputs_embeds")
  1121. past_seen_tokens = 0
  1122. # kept for BC (non `Cache` `past_key_values` inputs)
  1123. return_legacy_cache = False
  1124. if use_cache:
  1125. if not isinstance(past_key_values, Cache):
  1126. return_legacy_cache = True
  1127. if past_key_values is None:
  1128. past_key_values = DynamicCache()
  1129. else:
  1130. past_key_values = DynamicCache.from_legacy_cache(past_key_values)
  1131. logger.warning_once(
  1132. "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
  1133. "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
  1134. "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
  1135. )
  1136. past_seen_tokens = past_key_values.get_seq_length()
  1137. if inputs_embeds is not None and input_ids is None and past_seen_tokens == 0:
  1138. raise ValueError("When first calling the model, if input_embeds are passed, input_ids should not be None.")
  1139. if inputs_embeds is None:
  1140. inputs_embeds = self.text_model.get_input_embeddings()(input_ids)
  1141. # START VISUAL INPUTS INTEGRATION
  1142. if pixel_values is not None and image_hidden_states is not None:
  1143. raise ValueError("You cannot specify both pixel_values and image_hidden_states at the same time")
  1144. elif pixel_values is not None:
  1145. batch_size, num_images, num_channels, height, width = pixel_values.shape
  1146. pixel_values = pixel_values.to(dtype=self.dtype) # fp16 compatibility
  1147. pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])
  1148. # Remove padding images - padding images are full 0.
  1149. nb_values_per_image = pixel_values.shape[1:].numel()
  1150. real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
  1151. pixel_values = pixel_values[real_images_inds].contiguous()
  1152. # Handle the vision attention mask
  1153. if pixel_attention_mask is None:
  1154. pixel_attention_mask = torch.ones(
  1155. size=(pixel_values.size(0), pixel_values.size(2), pixel_values.size(3)),
  1156. dtype=torch.bool,
  1157. device=pixel_values.device,
  1158. )
  1159. else:
  1160. # Remove padding images from the mask/pP p
  1161. pixel_attention_mask = pixel_attention_mask.view(
  1162. batch_size * num_images, *pixel_attention_mask.shape[2:]
  1163. )
  1164. pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()
  1165. patch_size = self.config.vision_config.patch_size
  1166. patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
  1167. patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
  1168. patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) == patch_size * patch_size).bool()
  1169. # Get sequence from the vision encoder
  1170. image_hidden_states = self.vision_model(
  1171. pixel_values=pixel_values,
  1172. patch_attention_mask=patch_attention_mask,
  1173. ).last_hidden_state
  1174. # Modality projection & resampling
  1175. image_hidden_states = self.connector(
  1176. image_hidden_states, attention_mask=patch_attention_mask.view(pixel_values.size(0), -1)
  1177. )
  1178. elif image_hidden_states is not None:
  1179. image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)
  1180. if past_seen_tokens == 0 and inputs_embeds is not None and image_hidden_states is not None:
  1181. # When we generate, we don't want to replace the potential image_token_id that we generated by images
  1182. # that simply don't exist
  1183. inputs_embeds = self.inputs_merger(
  1184. input_ids=input_ids,
  1185. inputs_embeds=inputs_embeds,
  1186. image_hidden_states=image_hidden_states,
  1187. )
  1188. outputs = self.text_model(
  1189. inputs_embeds=inputs_embeds,
  1190. attention_mask=attention_mask,
  1191. position_ids=position_ids,
  1192. past_key_values=past_key_values,
  1193. use_cache=use_cache,
  1194. output_attentions=output_attentions,
  1195. output_hidden_states=output_hidden_states,
  1196. return_dict=return_dict,
  1197. )
  1198. if return_legacy_cache and use_cache:
  1199. outputs.past_key_values = outputs.past_key_values.to_legacy_cache()
  1200. if not return_dict:
  1201. return tuple(v for v in [*outputs, image_hidden_states] if v is not None)
  1202. return Idefics2BaseModelOutputWithPast(
  1203. last_hidden_state=outputs.last_hidden_state,
  1204. past_key_values=outputs.past_key_values,
  1205. hidden_states=outputs.hidden_states,
  1206. attentions=outputs.attentions,
  1207. image_hidden_states=image_hidden_states,
  1208. )
  1209. @add_start_docstrings(
  1210. """The Idefics2 Model with a language modeling head. It is made up a SigLIP vision encoder, with a language modeling head on top. """,
  1211. IDEFICS2_START_DOCSTRING,
  1212. )
  1213. class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin):
  1214. _tied_weights_keys = ["lm_head.weight"]
  1215. def __init__(self, config):
  1216. super().__init__(config)
  1217. self.model = Idefics2Model(config)
  1218. self.image_token_id = self.config.image_token_id
  1219. self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
  1220. self.vocab_size = config.text_config.vocab_size
  1221. # Initialize weights and apply final processing
  1222. self.post_init()
  1223. def enable_input_require_grads(self):
  1224. """
  1225. Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping
  1226. the model weights fixed.
  1227. """
  1228. def make_inputs_require_grads(module, input, output):
  1229. output.requires_grad_(True)
  1230. self._text_require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads)
  1231. self._vision_require_grads_hook = self.model.vision_model.get_input_embeddings().register_forward_hook(
  1232. make_inputs_require_grads
  1233. )
  1234. def disable_input_require_grads(self):
  1235. self._text_require_grads_hook.remove()
  1236. self._vision_require_grads_hook.remove()
  1237. def get_input_embeddings(self):
  1238. return self.model.text_model.get_input_embeddings()
  1239. def set_input_embeddings(self, value):
  1240. self.model.text_model.set_input_embeddings(value)
  1241. def get_output_embeddings(self):
  1242. return self.lm_head
  1243. def set_output_embeddings(self, new_embeddings):
  1244. self.lm_head = new_embeddings
  1245. def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
  1246. # model_embeds = self.model.resize_token_embeddings(new_num_tokens=new_num_tokens, pad_to_multiple_of=pad_to_multiple_of)
  1247. model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
  1248. if new_num_tokens is None and pad_to_multiple_of is None:
  1249. return model_embeds
  1250. # Update base model and current model config
  1251. # Ignore copy
  1252. self.config.text_config.vocab_size = model_embeds.weight.shape[0]
  1253. self.vocab_size = self.config.text_config.vocab_size
  1254. # Tie weights again if needed
  1255. self.tie_weights()
  1256. return model_embeds
  1257. def tie_weights(self):
  1258. """
  1259. Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of DecoupledLinear and DecoupledEmbedding.
  1260. """
  1261. output_embeddings = self.get_output_embeddings()
  1262. input_embeddings = self.get_input_embeddings()
  1263. if getattr(self.config, "tie_word_embeddings", True):
  1264. output_embeddings.weight = input_embeddings.weight
  1265. @add_start_docstrings_to_model_forward(IDEFICS2_INPUTS_DOCSTRING)
  1266. @replace_return_docstrings(output_type=Idefics2CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
  1267. def forward(
  1268. self,
  1269. input_ids: torch.LongTensor = None,
  1270. attention_mask: Optional[torch.Tensor] = None,
  1271. position_ids: Optional[torch.LongTensor] = None,
  1272. past_key_values: Optional[List[torch.FloatTensor]] = None,
  1273. inputs_embeds: Optional[torch.FloatTensor] = None,
  1274. pixel_values: Optional[torch.FloatTensor] = None,
  1275. pixel_attention_mask: Optional[torch.BoolTensor] = None,
  1276. image_hidden_states: Optional[torch.FloatTensor] = None,
  1277. labels: Optional[torch.LongTensor] = None,
  1278. use_cache: Optional[bool] = None,
  1279. output_attentions: Optional[bool] = None,
  1280. output_hidden_states: Optional[bool] = None,
  1281. return_dict: Optional[bool] = None,
  1282. num_logits_to_keep: int = 0,
  1283. ) -> Union[Tuple, Idefics2CausalLMOutputWithPast]:
  1284. r"""
  1285. Args:
  1286. labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
  1287. Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
  1288. config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `Idefics2ForConditionalGeneration`).
  1289. Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only
  1290. computed for the tokens with labels in `[0, ..., config.vocab_size]`.
  1291. num_logits_to_keep (`int`, *optional*):
  1292. Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
  1293. `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
  1294. token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
  1295. Returns:
  1296. Example:
  1297. ```python
  1298. >>> import requests
  1299. >>> import torch
  1300. >>> from PIL import Image
  1301. >>> from io import BytesIO
  1302. >>> from transformers import AutoProcessor, AutoModelForVision2Seq
  1303. >>> from transformers.image_utils import load_image
  1304. >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
  1305. >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
  1306. >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
  1307. >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")
  1308. >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-base")
  1309. >>> model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/idefics2-8b-base", device_map="auto")
  1310. >>> BAD_WORDS_IDS = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
  1311. >>> EOS_WORDS_IDS = [processor.tokenizer.eos_token_id]
  1312. >>> # Create inputs
  1313. >>> prompts = [
  1314. ... "<image>In this image, we can see the city of New York, and more specifically the Statue of Liberty.<image>In this image,",
  1315. ... "In which city is that bridge located?<image>",
  1316. ... ]
  1317. >>> images = [[image1, image2], [image3]]
  1318. >>> inputs = processor(images=images, text=prompts, padding=True, return_tensors="pt").to("cuda")
  1319. >>> # Generate
  1320. >>> generated_ids = model.generate(**inputs, bad_words_ids=BAD_WORDS_IDS, max_new_tokens=20)
  1321. >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
  1322. >>> print(generated_texts)
  1323. ['In this image, we can see the city of New York, and more specifically the Statue of Liberty. In this image, we can see the city of New York, and more specifically the Statue of Liberty.\n\n', 'In which city is that bridge located?\n\nThe bridge is located in the city of Pittsburgh, Pennsylvania.\n\n\nThe bridge is']
  1324. ```"""
  1325. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  1326. output_hidden_states = (
  1327. output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
  1328. )
  1329. return_dict = return_dict if return_dict is not None else self.config.use_return_dict
  1330. # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
  1331. outputs = self.model(
  1332. input_ids=input_ids,
  1333. attention_mask=attention_mask,
  1334. position_ids=position_ids,
  1335. past_key_values=past_key_values,
  1336. inputs_embeds=inputs_embeds,
  1337. pixel_values=pixel_values,
  1338. pixel_attention_mask=pixel_attention_mask,
  1339. image_hidden_states=image_hidden_states,
  1340. use_cache=use_cache,
  1341. output_attentions=output_attentions,
  1342. output_hidden_states=output_hidden_states,
  1343. return_dict=return_dict,
  1344. )
  1345. hidden_states = outputs[0]
  1346. # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
  1347. logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
  1348. loss = None
  1349. if labels is not None:
  1350. # Upcast to float if we need to compute the loss to avoid potential precision issues
  1351. logits = logits.float()
  1352. labels = labels.to(logits.device)
  1353. # Shift so that tokens < n predict n
  1354. if attention_mask is not None:
  1355. # we use the input attention mask to shift the logits and labels, because it is 2D.
  1356. # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
  1357. shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device)
  1358. shift_logits = logits[..., :-1, :][shift_attention_mask != 0].contiguous()
  1359. shift_labels = labels[..., 1:][shift_attention_mask != 0].contiguous()
  1360. else:
  1361. shift_logits = logits[..., :-1, :].contiguous()
  1362. shift_labels = labels[..., 1:].contiguous()
  1363. # Flatten the tokens
  1364. loss_fct = CrossEntropyLoss()
  1365. loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
  1366. if not return_dict:
  1367. output = (logits,) + outputs[1:]
  1368. return (loss,) + output if loss is not None else output
  1369. return Idefics2CausalLMOutputWithPast(
  1370. loss=loss,
  1371. logits=logits,
  1372. past_key_values=outputs.past_key_values,
  1373. hidden_states=outputs.hidden_states,
  1374. attentions=outputs.attentions,
  1375. image_hidden_states=outputs.image_hidden_states,
  1376. )
  1377. def prepare_inputs_for_generation(
  1378. self,
  1379. input_ids,
  1380. past_key_values=None,
  1381. attention_mask=None,
  1382. inputs_embeds=None,
  1383. cache_position=None,
  1384. pixel_values=None,
  1385. pixel_attention_mask=None,
  1386. image_hidden_states=None,
  1387. num_logits_to_keep=None,
  1388. **kwargs,
  1389. ):
  1390. # Overwritten -- there are mutually exclusive inputs (if the logic to make `image_hidden_states` take
  1391. # precedence is moved to the model, we can remove this fn)
  1392. # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
  1393. if past_key_values is not None:
  1394. if inputs_embeds is not None: # Exception 1
  1395. input_ids = input_ids[:, -cache_position.shape[0] :]
  1396. elif input_ids.shape[1] != cache_position.shape[0]:
  1397. input_ids = input_ids[:, cache_position]
  1398. position_ids = kwargs.get("position_ids", None)
  1399. if attention_mask is not None and position_ids is None:
  1400. # create position_ids on the fly for batch generation
  1401. position_ids = attention_mask.long().cumsum(-1) - 1
  1402. position_ids.masked_fill_(attention_mask == 0, 1)
  1403. if past_key_values:
  1404. position_ids = position_ids[:, -input_ids.shape[1] :]
  1405. # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
  1406. # but IDEFICS requires noth ids and embeds to be present
  1407. if inputs_embeds is not None and cache_position[0] == 0:
  1408. model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": input_ids}
  1409. else:
  1410. # The clone here is for the same reason as for `position_ids`.
  1411. model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
  1412. if num_logits_to_keep is not None:
  1413. model_inputs["num_logits_to_keep"] = num_logits_to_keep
  1414. if image_hidden_states is not None:
  1415. pixel_values = None
  1416. pixel_attention_mask = None
  1417. else:
  1418. pixel_values = pixel_values
  1419. pixel_attention_mask = pixel_attention_mask
  1420. model_inputs.update(
  1421. {
  1422. "position_ids": position_ids,
  1423. "past_key_values": past_key_values,
  1424. "use_cache": kwargs.get("use_cache"),
  1425. "attention_mask": attention_mask,
  1426. "pixel_values": pixel_values,
  1427. "pixel_attention_mask": pixel_attention_mask,
  1428. "image_hidden_states": image_hidden_states,
  1429. }
  1430. )
  1431. return model_inputs
  1432. def _update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_decoder, **kwargs):
  1433. model_kwargs = super()._update_model_kwargs_for_generation(
  1434. outputs=outputs,
  1435. model_kwargs=model_kwargs,
  1436. is_encoder_decoder=is_encoder_decoder,
  1437. **kwargs,
  1438. )
  1439. # Get the precomputed image_hidden_states
  1440. model_kwargs["image_hidden_states"] = outputs.image_hidden_states
  1441. return model_kwargs
    # Reorders a cache given as an iterable of per-layer iterables of tensors: every tensor is re-indexed
    # along dimension 0 with `beam_idx` (moved to that tensor's device), and the result is returned as a
    # tuple of tuples.
    # NOTE(review): the body is marked as copied from OPT; presumably the repository's copy-consistency
    # tooling expects it to stay verbatim, hence no edits below the marker. Confirm before changing it.
    @staticmethod
    # Copied from transformers.models.opt.modeling_opt.OPTForCausalLM._reorder_cache
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past