# coding=utf-8
# Copyright 2022 Salesforce authors, The EleutherAI, and HuggingFace Teams. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch CodeGen model."""

from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_codegen import CodeGenConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "Salesforce/codegen-2B-mono"
_CONFIG_FOR_DOC = "CodeGenConfig"


# Copied from transformers.models.gptj.modeling_gptj.create_sinusoidal_positions
def create_sinusoidal_positions(num_pos: int, dim: int) -> torch.Tensor:
    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64) / dim))
    sinusoid_inp = torch.einsum("i , j -> i j", torch.arange(num_pos, dtype=torch.int64).float(), inv_freq).float()
    return torch.cat((torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1)


# Copied from transformers.models.gptj.modeling_gptj.rotate_every_two
def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
    x1 = x[:, :, :, ::2]
    x2 = x[:, :, :, 1::2]
    x = torch.stack((-x2, x1), dim=-1)
    return x.flatten(-2)  # in einsum notation: rearrange(x, '... d j -> ... (d j)')


# Copied from transformers.models.gptj.modeling_gptj.apply_rotary_pos_emb
def apply_rotary_pos_emb(tensor: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor:
    sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3)
    cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3)
    return (tensor * cos) + (rotate_every_two(tensor) * sin)
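

# The three helpers above implement the rotary position embedding used by CodeGen: a table of
# interleaved sin/cos values is indexed by position, then mixed pairwise into the query/key channels.
# The sketch below is illustrative only and is never called by the model; `_rotary_embedding_demo`
# and all tensor sizes in it are made up for this example.
def _rotary_embedding_demo():
    num_pos, rotary_dim, batch, seq, n_head = 32, 16, 1, 4, 2
    embed_positions = create_sinusoidal_positions(num_pos, rotary_dim)  # (num_pos, rotary_dim)
    position_ids = torch.arange(seq).unsqueeze(0)  # (batch, seq)
    sincos = embed_positions[position_ids]  # (batch, seq, rotary_dim)
    sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)  # each (batch, seq, rotary_dim // 2)
    query = torch.randn(batch, seq, n_head, rotary_dim)  # (batch, seq, heads, rotary_dim)
    rotated = apply_rotary_pos_emb(query, sin, cos)  # same shape as `query`
    return rotated.shape  # torch.Size([1, 4, 2, 16])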


class CodeGenAttention(nn.Module):
    def __init__(self, config, layer_idx=None):
        super().__init__()

        max_positions = config.max_position_embeddings

        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.embed_dim = config.hidden_size
        self.num_attention_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_attention_heads
        if self.head_dim * self.num_attention_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and"
                f" `num_attention_heads`: {self.num_attention_heads})."
            )
        self.scale_attn = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype())
        self.qkv_proj = nn.Linear(self.embed_dim, self.embed_dim * 3, bias=False)

        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.rotary_dim = config.rotary_dim
        pos_embd_dim = self.rotary_dim or self.embed_dim
        self.embed_positions = create_sinusoidal_positions(max_positions, pos_embd_dim)

    def _split_heads(self, x, n_head, dim_head, mp_num):
        reshaped = x.reshape(x.shape[:-1] + (n_head // mp_num, dim_head))
        reshaped = reshaped.reshape(x.shape[:-2] + (-1,) + reshaped.shape[-1:])
        return reshaped

    def _merge_heads(self, tensor, num_attention_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into n_ctx
        """
        if len(tensor.shape) == 5:
            tensor = tensor.permute(0, 1, 3, 2, 4).contiguous()
        elif len(tensor.shape) == 4:
            tensor = tensor.permute(0, 2, 1, 3).contiguous()
        else:
            raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
        new_shape = tensor.size()[:-2] + (num_attention_heads * attn_head_size,)
        return tensor.view(new_shape)

    def _attn(
        self,
        query,
        key,
        value,
        attention_mask=None,
        head_mask=None,
    ):
        # Keep the attention weights computation in fp32 to avoid overflow issues
        query = query.to(torch.float32)
        key = key.to(torch.float32)

        attn_weights = torch.matmul(query, key.transpose(-1, -2))

        if attention_mask is not None:
            causal_mask = attention_mask[:, :, :, : key.shape[-2]]
            attn_weights += causal_mask

        attn_weights = attn_weights / self.scale_attn
        attn_weights = nn.Softmax(dim=-1)(attn_weights)
        attn_weights = attn_weights.to(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        # Mask heads if we want to
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor],
        layer_past: Optional[Cache] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[
        Tuple[torch.Tensor, Tuple[torch.Tensor]],
        Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]],
    ]:
        qkv = self.qkv_proj(hidden_states)
        # TODO(enijkamp): factor out number of logical TPU-v4 cores or make forward pass agnostic
        mp_num = 4
        qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1))

        local_dim = self.head_dim * self.num_attention_heads // mp_num
        query, value, key = torch.split(qkv_split, local_dim, dim=-1)
        query = self._split_heads(query, self.num_attention_heads, self.head_dim, mp_num=mp_num)
        key = self._split_heads(key, self.num_attention_heads, self.head_dim, mp_num=mp_num)

        value = self._split_heads(value, self.num_attention_heads, self.head_dim, mp_num=mp_num)
        value = value.permute(0, 2, 1, 3)

        embed_positions = self.embed_positions
        if embed_positions.device != position_ids.device:
            embed_positions = embed_positions.to(position_ids.device)
            self.embed_positions = embed_positions

        sincos = embed_positions[position_ids]
        sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)

        if self.rotary_dim is not None:
            k_rot = key[:, :, :, : self.rotary_dim]
            k_pass = key[:, :, :, self.rotary_dim :]

            q_rot = query[:, :, :, : self.rotary_dim]
            q_pass = query[:, :, :, self.rotary_dim :]

            k_rot = apply_rotary_pos_emb(k_rot, sin, cos)
            q_rot = apply_rotary_pos_emb(q_rot, sin, cos)

            key = torch.cat([k_rot, k_pass], dim=-1)
            query = torch.cat([q_rot, q_pass], dim=-1)
        else:
            key = apply_rotary_pos_emb(key, sin, cos)
            query = apply_rotary_pos_emb(query, sin, cos)

        key = key.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)

        # Note that this cast is quite ugly, but is not implemented before ROPE as k_rot in the original codebase is always in fp32.
        # Reference: https://github.com/salesforce/CodeGen/blob/f210c3bb1216c975ad858cd4132c0fdeabf4bfc2/codegen1/jaxformer/hf/codegen/modeling_codegen.py#L38
        if layer_past is not None:
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "partial_rotation_size": self.rotary_dim,
                "cache_position": cache_position,
            }
            key, value = layer_past.update(key.to(hidden_states.dtype), value, self.layer_idx, cache_kwargs)

        # compute self-attention: V x Softmax(QK^T)
        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim)
        attn_output = self.out_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        outputs = (attn_output, layer_past)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs  # a, present, (attentions)
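

# The fused qkv projection in `CodeGenAttention.forward` is laid out in `mp_num` shards (the TODO above
# notes this comes from the number of logical TPU-v4 cores), so splitting into query/key/value and then
# into heads takes two reshape steps rather than one. The sketch below is illustrative only and is never
# called by the model; `_qkv_split_shape_demo` and its sizes (batch=2, seq=5, 8 heads of dim 16) are made
# up for this example.
def _qkv_split_shape_demo():
    batch, seq, n_head, head_dim, mp_num = 2, 5, 8, 16, 4
    embed_dim = n_head * head_dim
    qkv = torch.randn(batch, seq, 3 * embed_dim)  # stand-in for the qkv_proj output
    qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1))  # (batch, seq, mp_num, 3 * embed_dim // mp_num)
    local_dim = head_dim * n_head // mp_num
    query, value, key = torch.split(qkv_split, local_dim, dim=-1)  # each (batch, seq, mp_num, local_dim)
    # `_split_heads` then regroups each shard into its heads and flattens the shard axis away:
    reshaped = query.reshape(query.shape[:-1] + (n_head // mp_num, head_dim))
    reshaped = reshaped.reshape(query.shape[:-2] + (-1,) + reshaped.shape[-1:])
    return reshaped.shape  # torch.Size([2, 5, 8, 16]) == (batch, seq, n_head, head_dim)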


# Copied from transformers.models.gptj.modeling_gptj.GPTJMLP with GPTJ->CodeGen
class CodeGenMLP(nn.Module):
    def __init__(self, intermediate_size, config):  # in MLP: intermediate_size = 4 * embed_dim
        super().__init__()
        embed_dim = config.n_embd

        self.fc_in = nn.Linear(embed_dim, intermediate_size)
        self.fc_out = nn.Linear(intermediate_size, embed_dim)

        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: Optional[torch.FloatTensor]) -> torch.FloatTensor:
        hidden_states = self.fc_in(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.fc_out(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


# Copied from transformers.models.gptj.modeling_gptj.GPTJBlock with GPTJ->CodeGen
class CodeGenBlock(nn.Module):
    # Ignore copy
    def __init__(self, config, layer_idx=None):
        super().__init__()
        inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.attn = CodeGenAttention(config, layer_idx)
        self.mlp = CodeGenMLP(inner_dim, config)

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor],
        layer_past: Optional[Cache] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs = self.attn(
            hidden_states=hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
        outputs = attn_outputs[1:]

        feed_forward_hidden_states = self.mlp(hidden_states)
        hidden_states = attn_output + feed_forward_hidden_states + residual

        if use_cache:
            outputs = (hidden_states,) + outputs
        else:
            outputs = (hidden_states,) + outputs[1:]

        return outputs  # hidden_states, present, (attentions)


class CodeGenPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = CodeGenConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _no_split_modules = ["CodeGenBlock"]
    _skip_keys_device_placement = "past_key_values"
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear,)):
            # Slightly different from Mesh Transformer JAX which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


CODEGEN_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
    behavior.

    Parameters:
        config ([`CodeGenConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

CODEGEN_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_attention_heads,)` or `(n_layer, num_attention_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists of the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance, see our
              [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
              shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`. This is also known as the legacy
              cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""


@add_start_docstrings(
    "The bare CodeGen Model transformer outputting raw hidden-states without any specific head on top.",
    CODEGEN_START_DOCSTRING,
)
class CodeGenModel(CodeGenPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.embed_dim = config.n_embd
        self.vocab_size = config.vocab_size
        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([CodeGenBlock(config, layer_idx=i) for i in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
        self.rotary_dim = min(config.rotary_dim, config.n_ctx // config.num_attention_heads)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    @add_start_docstrings_to_model_forward(CODEGEN_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor]]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)

        # kept for BC (non `Cache` `past_key_values` inputs)
        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            return_legacy_cache = True
            if past_key_values is None:
                past_key_values = DynamicCache()
            else:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                logger.warning_once(
                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
                )

        seq_length = inputs_embeds.shape[1]
        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=inputs_embeds.device)

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x num_attention_heads x N x N
        # head_mask has shape n_layer x batch x num_attention_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        hidden_states = inputs_embeds

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, seq_length)
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)
        output_shape = (-1, seq_length, hidden_states.size(-1))

        next_decoder_cache = None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for i, block in enumerate(self.h):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                outputs = self._gradient_checkpointing_func(
                    block.__call__,
                    hidden_states,
                    None,
                    causal_mask,
                    position_ids,
                    head_mask[i],
                    use_cache,
                    output_attentions,
                    cache_position,
                )
            else:
                outputs = block(
                    hidden_states=hidden_states,
                    layer_past=past_key_values,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    head_mask=head_mask[i],
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                    cache_position=cache_position,
                )

            hidden_states = outputs[0]
            if use_cache is True:
                next_decoder_cache = outputs[1]

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

        hidden_states = self.ln_f(hidden_states)
        hidden_states = hidden_states.view(output_shape)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if return_legacy_cache:
            next_cache = next_cache.to_legacy_cache()

        if not return_dict:
            return tuple(
                v for v in [hidden_states, next_cache, all_hidden_states, all_self_attentions] if v is not None
            )

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask
    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            device=device,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    # Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        device: torch.device,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, i.e. the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            device (`torch.device`):
                The device to place the 4D attention mask on.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`int`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask
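

# The static method above is where the user-provided 2D padding mask becomes the additive 4D mask
# consumed by `CodeGenAttention._attn`. The sketch below is illustrative only and is never called by
# the model; `_causal_mask_demo` and its toy sizes (2 cached tokens, 3 new query tokens, batch 2) are
# made up for this example.
def _causal_mask_demo():
    attention_mask = torch.ones(2, 5)  # (batch, key_value_length); 1 = attend, 0 = padding
    attention_mask[1, 0] = 0  # second sequence is left-padded by one token
    mask = CodeGenModel._prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask,
        sequence_length=3,  # number of new query tokens
        target_length=5,  # cached tokens + new tokens
        dtype=torch.float32,
        device=attention_mask.device,
        cache_position=torch.arange(2, 5),  # positions of the new tokens
        batch_size=2,
    )
    return mask.shape  # torch.Size([2, 1, 3, 5]); masked entries hold torch.finfo(torch.float32).min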


@add_start_docstrings(
    """
    The CodeGen Model transformer with a language modeling head on top.
    """,
    CODEGEN_START_DOCSTRING,
)
class CodeGenForCausalLM(CodeGenPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.transformer = CodeGenModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)
        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    @add_start_docstrings_to_model_forward(CODEGEN_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor]]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
            are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = transformer_outputs[0]

        # make sure sampling in fp16 works correctly and
        # compute loss in fp32 to match with mesh-tf version
        # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
        lm_logits = self.lm_head(hidden_states).to(torch.float32)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(lm_logits.device)
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

            loss = loss.to(hidden_states.dtype)

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
        [`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        """
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )
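

# Minimal usage sketch (not part of the library API): load a pretrained checkpoint and generate a
# completion. The checkpoint name mirrors `_CHECKPOINT_FOR_DOC`; downloading it requires network access
# and several GB of memory, and the prompt below is made up for this example.
def _generation_example():
    from transformers import AutoTokenizer  # local import keeps module import-time dependencies unchanged

    tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-2B-mono")
    model = CodeGenForCausalLM.from_pretrained("Salesforce/codegen-2B-mono")

    inputs = tokenizer("def hello_world():", return_tensors="pt")
    output_ids = model.generate(**inputs, max_new_tokens=32)  # greedy decoding by default
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)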