# coding=utf-8
# Copyright 2023 Meta AI Team and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch X-MOD model."""

import math
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN, gelu
from ...generation import GenerationMixin
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_xmod import XmodConfig


logger = logging.get_logger(__name__)


# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->Xmod
class XmodEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        # End copy
        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        # Setting the token_type_ids to the registered buffer in the constructor, where it is all zeros (the usual
        # case when it's auto-generated). The registered buffer helps users trace the model without passing
        # token_type_ids; solves issue #5664.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)


# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfAttention with Roberta->Xmod
class XmodSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys
        # and values come from an encoder; the attention mask needs to be
        # such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention and past_key_value is not None:
            # reuse k,v, cross_attentions
            key_layer = past_key_value[0]
            value_layer = past_key_value[1]
            attention_mask = encoder_attention_mask
        elif is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        query_layer = self.transpose_for_scores(mixed_query_layer)

        use_cache = past_key_value is not None
        if self.is_decoder:
            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
            # all previous decoder key/value_states. Further calls to uni-directional self-attention
            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
            past_key_value = (key_layer, value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            if use_cache:
                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
                    -1, 1
                )
            else:
                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r

            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in XmodModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs


class XmodSelfOutput(nn.Module):
    # Copied from transformers.models.roberta.modeling_roberta.RobertaSelfOutput.__init__
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = hidden_states + input_tensor
        return hidden_states


class XmodAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        self.self = XmodSelfAttention(config, position_embedding_type=position_embedding_type)
        self.output = XmodSelfOutput(config)
        self.pruned_heads = set()
        self.pre_norm = config.pre_norm

    # Copied from transformers.models.roberta.modeling_roberta.RobertaAttention.prune_heads
    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        residual = hidden_states
        if self.pre_norm:
            hidden_states = self.output.LayerNorm(hidden_states)
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        attention_output = self.output(self_outputs[0], residual)
        if not self.pre_norm:
            attention_output = self.output.LayerNorm(attention_output)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


# Copied from transformers.models.roberta.modeling_roberta.RobertaIntermediate
class XmodIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class XmodAdapter(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.bottleneck_size = config.hidden_size // config.adapter_reduction_factor
        self.dense1 = nn.Linear(config.hidden_size, self.bottleneck_size)
        self.dense2 = nn.Linear(self.bottleneck_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.adapter_act_fn = ACT2FN[config.hidden_act]
        else:
            self.adapter_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense1(hidden_states)
        hidden_states = self.adapter_act_fn(hidden_states)
        hidden_states = self.dense2(hidden_states)
        return hidden_states


class XmodOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.ln_before_adapter = config.ln_before_adapter
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        if config.adapter_layer_norm:
            self.adapter_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        else:
            self.adapter_layer_norm = None
        self.adapter_reuse_layer_norm = config.adapter_reuse_layer_norm
        self.adapter_modules = nn.ModuleDict({})
        for language in config.languages:
            self.adapter_modules[str(language)] = XmodAdapter(config)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor, lang_ids: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = hidden_states + input_tensor
        hidden_states = self.lang_adapter(lang_ids, hidden_states)
        return hidden_states

    def lang_adapter(self, lang_ids: torch.Tensor, hidden_states: torch.Tensor):
        # Process subsequent samples with the same lang_id in parallel
        lang_ids, lang_lengths = torch.unique_consecutive(lang_ids, return_counts=True)

        if not self.ln_before_adapter:
            residual = hidden_states

        if self.adapter_layer_norm is not None:
            hidden_states = self.adapter_layer_norm(hidden_states)
        elif self.adapter_reuse_layer_norm:
            hidden_states = self.LayerNorm(hidden_states)

        if self.ln_before_adapter:
            residual = hidden_states

        split_hidden_states = torch.split(hidden_states, lang_lengths.tolist(), 0)
        lang_wise_outputs = []
        for i, (lang_id, split_hidden_state) in enumerate(zip(lang_ids, split_hidden_states)):
            lang = list(self.adapter_modules.keys())[int(lang_id.item())]
            lang_wise_outputs.append(self.adapter_modules[lang](split_hidden_state))
        hidden_states = torch.cat(lang_wise_outputs, 0)

        hidden_states = self.dropout(hidden_states)
        hidden_states += residual
        return hidden_states
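

# For illustration of the routing in `XmodOutput.lang_adapter` (the values below are hypothetical):
# consecutive samples that share a lang_id are grouped so that each group runs through its language
# adapter in a single batched call.
#
#     lang_ids = torch.tensor([0, 0, 1, 1, 1])  # one adapter index per sample in the batch
#     ids, lengths = torch.unique_consecutive(lang_ids, return_counts=True)
#     # ids == tensor([0, 1]), lengths == tensor([2, 3]); hidden_states is split into chunks of
#     # 2 and 3 samples along dim 0, and chunk i is passed through the adapter whose key sits at
#     # position ids[i] in adapter_modules.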


class XmodLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = XmodAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            self.crossattention = XmodAttention(config, position_embedding_type="absolute")
        self.intermediate = XmodIntermediate(config)
        self.output = XmodOutput(config)
        self.pre_norm = config.pre_norm

    def forward(
        self,
        hidden_states: torch.Tensor,
        lang_ids: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        attention_output = self_attention_outputs[0]

        # if decoder, the last output is tuple of self-attn cache
        if self.is_decoder:
            outputs = self_attention_outputs[1:-1]
            present_key_value = self_attention_outputs[-1]
        else:
            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        cross_attn_present_key_value = None
        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_past_key_value,
                output_attentions,
            )
            attention_output = cross_attention_outputs[0]
            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights

            # add cross-attn cache to positions 3,4 of present_key_value tuple
            cross_attn_present_key_value = cross_attention_outputs[-1]
            present_key_value = present_key_value + cross_attn_present_key_value

        residual = attention_output
        if self.pre_norm:
            attention_output = self.output.LayerNorm(attention_output)
        intermediate_output = apply_chunking_to_forward(
            self.feed_forward_chunk,
            self.chunk_size_feed_forward,
            self.seq_len_dim,
            attention_output,
        )
        layer_output = self.output(intermediate_output, residual, lang_ids)
        if not self.pre_norm:
            layer_output = self.output.LayerNorm(layer_output)
        outputs = (layer_output,) + outputs

        # if decoder, return the attn key/values as the last output
        if self.is_decoder:
            outputs = outputs + (present_key_value,)

        return outputs

    def feed_forward_chunk(self, attention_output):
        return self.intermediate(attention_output)
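

# Note on normalization order: with `config.pre_norm=True`, XmodAttention and XmodLayer normalize the
# residual branch *before* the self-attention and feed-forward sublayers (and XmodEncoder applies a
# final LayerNorm), whereas with `pre_norm=False` each sublayer output is normalized after the
# residual addition, as in the original RoBERTa layout.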


class XmodEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([XmodLayer(config) for _ in range(config.num_hidden_layers)])
        self.is_pre_norm = config.pre_norm
        if self.is_pre_norm:
            self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        lang_ids: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        next_decoder_cache = () if use_cache else None
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    lang_ids,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    lang_ids,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if self.is_pre_norm:
            hidden_states = self.LayerNorm(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


# Copied from transformers.models.roberta.modeling_roberta.RobertaPooler
class XmodPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class XmodPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = XmodConfig
    base_model_prefix = "roberta"
    supports_gradient_checkpointing = True

    # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def set_default_language(self, language: str):
        """
        Set the default language code for the model. This is used when the language is not specified in the input.

        Args:
            language (`str`): The language code, such as `"en_XX"` or `"de_DE"`.
        """
        if language not in self.config.languages:
            raise ValueError(
                f"{self} does not have an adapter for {language}. Supported languages: {list(self.config.languages)}"
            )
        self.config.default_language = language

    def freeze_embeddings_and_language_adapters(self):
        """
        Freeze the embeddings and language adapters of the model. Usually, this is applied before the model is
        fine-tuned on a downstream task.
        """
        logger.info("Freezing embeddings")
        for parameter in self.roberta.embeddings.parameters():
            parameter.requires_grad = False
        logger.info("Freezing adapters")
        for layer in self.roberta.encoder.layer:
            if layer.output.adapter_layer_norm is not None:
                for parameter in layer.output.adapter_layer_norm.parameters():
                    parameter.requires_grad = False
            for parameter in layer.output.adapter_modules.parameters():
                parameter.requires_grad = False
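

# For illustration, a typical call sequence before downstream fine-tuning (the checkpoint and
# language code are just examples taken from the docstrings in this file):
#
#     model = XmodForMaskedLM.from_pretrained("facebook/xmod-base")
#     model.set_default_language("en_XX")
#     model.freeze_embeddings_and_language_adapters()
#     # only the remaining (non-embedding, non-adapter) weights stay trainable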


XMOD_START_DOCSTRING = r"""

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
    and behavior.

    Parameters:
        config ([`XmodConfig`]): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

XMOD_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        lang_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare X-MOD Model transformer outputting raw hidden-states without any specific head on top.",
    XMOD_START_DOCSTRING,
)
class XmodModel(XmodPreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
    """

    # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->Xmod
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        self.embeddings = XmodEmbeddings(config)
        self.encoder = XmodEncoder(config)

        self.pooler = XmodPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    # Copied from transformers.models.roberta.modeling_roberta.RobertaModel.get_input_embeddings
    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    # Copied from transformers.models.roberta.modeling_roberta.RobertaModel.set_input_embeddings
    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    # Copied from transformers.models.roberta.modeling_roberta.RobertaModel._prune_heads
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        lang_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors
            of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # past_key_values_length
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if lang_ids is None:
            if self.config.default_language is None:
                raise ValueError("Input language unknown. Please call `XmodPreTrainedModel.set_default_language()`")
            adapter_languages = list(self.encoder.layer[0].output.adapter_modules.keys())
            default_lang_id = adapter_languages.index(self.config.default_language)
            lang_ids = default_lang_id * torch.ones(batch_size, device=device)

        if attention_mask is None:
            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            lang_ids=lang_ids,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
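

# For illustration, encoder-style usage of the bare model (mirroring the example checkpoints used
# in the docstrings below):
#
#     tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
#     model = XmodModel.from_pretrained("facebook/xmod-base")
#     model.set_default_language("en_XX")
#     inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
#     last_hidden_state = model(**inputs).last_hidden_state  # (batch_size, seq_length, hidden_size)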


@add_start_docstrings(
    "X-MOD Model with a `language modeling` head on top for CLM fine-tuning.",
    XMOD_START_DOCSTRING,
)
class XmodForCausalLM(XmodPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

    # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM.__init__ with Roberta->Xmod
    def __init__(self, config):
        super().__init__(config)

        if not config.is_decoder:
            logger.warning("If you want to use `XmodLMHeadModel` as a standalone, add `is_decoder=True`.")

        self.roberta = XmodModel(config, add_pooling_layer=False)
        self.lm_head = XmodLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM.get_output_embeddings
    def get_output_embeddings(self):
        return self.lm_head.decoder

    # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM.set_output_embeddings
    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        lang_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).

        Returns: `transformers.modeling_outputs.CausalLMOutputWithCrossAttentions` or `tuple(torch.FloatTensor)`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, XmodForCausalLM, AutoConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
        >>> config = AutoConfig.from_pretrained("facebook/xmod-base")
        >>> config.is_decoder = True
        >>> model = XmodForCausalLM.from_pretrained("facebook/xmod-base", config=config)
        >>> model.set_default_language("en_XX")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        outputs = self.roberta(
            input_ids,
            lang_ids=lang_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        lm_loss = None
        if labels is not None:
            # we are doing next-token prediction; shift prediction scores and input ids by one
            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM._reorder_cache
    def _reorder_cache(self, past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


@add_start_docstrings(
    """X-MOD Model with a `language modeling` head on top.""",
    XMOD_START_DOCSTRING,
)
class XmodForMaskedLM(XmodPreTrainedModel):
    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

    # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM.__init__ with Roberta->Xmod
    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `XmodForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.roberta = XmodModel(config, add_pooling_layer=False)
        self.lm_head = XmodLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM.get_output_embeddings
    def get_output_embeddings(self):
        return self.lm_head.decoder

    # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM.set_output_embeddings
    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        lang_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        kwargs (`Dict[str, any]`, *optional*, defaults to *{}*):
            Used to hide legacy arguments that have been deprecated.
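
        Example (a minimal usage sketch, assuming the `facebook/xmod-base` checkpoint and its `AutoTokenizer`;
        `set_default_language` selects the language adapter, as in the causal-LM example above):

        ```python
        >>> from transformers import AutoTokenizer, XmodForMaskedLM
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/xmod-base")
        >>> model = XmodForMaskedLM.from_pretrained("facebook/xmod-base")
        >>> model.set_default_language("en_XX")

        >>> inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> # pick the highest-scoring token at the <mask> position
        >>> mask_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
        >>> predicted_token_id = logits[0, mask_index].argmax(dim=-1)
        ```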
  981. """
  982. return_dict = return_dict if return_dict is not None else self.config.use_return_dict
  983. outputs = self.roberta(
  984. input_ids,
  985. lang_ids=lang_ids,
  986. attention_mask=attention_mask,
  987. token_type_ids=token_type_ids,
  988. position_ids=position_ids,
  989. head_mask=head_mask,
  990. inputs_embeds=inputs_embeds,
  991. encoder_hidden_states=encoder_hidden_states,
  992. encoder_attention_mask=encoder_attention_mask,
  993. output_attentions=output_attentions,
  994. output_hidden_states=output_hidden_states,
  995. return_dict=return_dict,
  996. )
  997. sequence_output = outputs[0]
  998. prediction_scores = self.lm_head(sequence_output)
  999. masked_lm_loss = None
  1000. if labels is not None:
  1001. loss_fct = CrossEntropyLoss()
  1002. masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
  1003. if not return_dict:
  1004. output = (prediction_scores,) + outputs[2:]
  1005. return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
  1006. return MaskedLMOutput(
  1007. loss=masked_lm_loss,
  1008. logits=prediction_scores,
  1009. hidden_states=outputs.hidden_states,
  1010. attentions=outputs.attentions,
  1011. )


# Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead
class XmodLMHead(nn.Module):
    """Roberta Head for masked language modeling."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.decoder.bias = self.bias

    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = gelu(x)
        x = self.layer_norm(x)

        # project back to size of vocabulary with bias
        x = self.decoder(x)

        return x

    def _tie_weights(self):
        # To tie those two weights if they get disconnected (on TPU or when the bias is resized)
        # For accelerate compatibility and to not break backward compatibility
        if self.decoder.bias.device.type == "meta":
            self.decoder.bias = self.bias
        else:
            self.bias = self.decoder.bias


@add_start_docstrings(
    """
    X-MOD Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    XMOD_START_DOCSTRING,
)
class XmodForSequenceClassification(XmodPreTrainedModel):
    # Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification.__init__ with Roberta->Xmod
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.roberta = XmodModel(config, add_pooling_layer=False)
        self.classifier = XmodClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        lang_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
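
        Example (a minimal sketch; the `facebook/xmod-base` checkpoint ships no fine-tuned classification head, so
        the head below is randomly initialized and the prediction is only illustrative):

        ```python
        >>> from transformers import AutoTokenizer, XmodForSequenceClassification
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/xmod-base")
        >>> model = XmodForSequenceClassification.from_pretrained("facebook/xmod-base", num_labels=2)
        >>> model.set_default_language("en_XX")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits
        >>> predicted_class_id = logits.argmax(dim=-1).item()
        ```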
  1073. """
  1074. return_dict = return_dict if return_dict is not None else self.config.use_return_dict
  1075. outputs = self.roberta(
  1076. input_ids,
  1077. lang_ids=lang_ids,
  1078. attention_mask=attention_mask,
  1079. token_type_ids=token_type_ids,
  1080. position_ids=position_ids,
  1081. head_mask=head_mask,
  1082. inputs_embeds=inputs_embeds,
  1083. output_attentions=output_attentions,
  1084. output_hidden_states=output_hidden_states,
  1085. return_dict=return_dict,
  1086. )
  1087. sequence_output = outputs[0]
  1088. logits = self.classifier(sequence_output)
  1089. loss = None
  1090. if labels is not None:
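            # infer the problem type (regression vs. single-/multi-label classification) from `num_labels`
            # and the label dtype if `config.problem_type` was not set explicitly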
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    X-MOD Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    XMOD_START_DOCSTRING,
)
class XmodForMultipleChoice(XmodPreTrainedModel):
    # Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice.__init__ with Roberta->Xmod
    def __init__(self, config):
        super().__init__(config)

        self.roberta = XmodModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        lang_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
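
        Example (a minimal sketch assuming the `facebook/xmod-base` checkpoint; the choice-scoring head is randomly
        initialized, so the logits are only illustrative):

        ```python
        >>> from transformers import AutoTokenizer, XmodForMultipleChoice
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/xmod-base")
        >>> model = XmodForMultipleChoice.from_pretrained("facebook/xmod-base")
        >>> model.set_default_language("en_XX")

        >>> prompt = "The dog ran to the door because"
        >>> choice0 = "it heard its owner arrive."
        >>> choice1 = "the moon is made of cheese."

        >>> # encode each (prompt, choice) pair, then add the num_choices dimension expected by the model
        >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="pt", padding=True)
        >>> outputs = model(**{k: v.unsqueeze(0) for k, v in encoding.items()})
        >>> logits = outputs.logits  # shape (1, num_choices)
        ```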
  1155. """
  1156. return_dict = return_dict if return_dict is not None else self.config.use_return_dict
  1157. num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
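        # flatten the (batch_size, num_choices, ...) inputs to (batch_size * num_choices, ...) so that every
        # choice is encoded as an independent sequence; lang_ids is repeated once per flattened sequence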
        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_lang_ids = lang_ids.repeat(input_ids.size(0) * input_ids.size(1)) if lang_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.roberta(
            flat_input_ids,
            lang_ids=flat_lang_ids,
            position_ids=flat_position_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    X-MOD Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    XMOD_START_DOCSTRING,
)
class XmodForTokenClassification(XmodPreTrainedModel):
    # Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification.__init__ with Roberta->Xmod
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = XmodModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        lang_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
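
        Example (a minimal sketch; `facebook/xmod-base` ships no fine-tuned token-classification head, so the
        predicted tags below come from a randomly initialized classifier):

        ```python
        >>> from transformers import AutoTokenizer, XmodForTokenClassification
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/xmod-base")
        >>> model = XmodForTokenClassification.from_pretrained("facebook/xmod-base", num_labels=5)
        >>> model.set_default_language("en_XX")

        >>> inputs = tokenizer("HuggingFace is based in New York City", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits
        >>> predicted_token_classes = logits.argmax(dim=-1)  # shape (1, sequence_length)
        ```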
  1235. """
  1236. return_dict = return_dict if return_dict is not None else self.config.use_return_dict
  1237. outputs = self.roberta(
  1238. input_ids,
  1239. lang_ids=lang_ids,
  1240. attention_mask=attention_mask,
  1241. token_type_ids=token_type_ids,
  1242. position_ids=position_ids,
  1243. head_mask=head_mask,
  1244. inputs_embeds=inputs_embeds,
  1245. output_attentions=output_attentions,
  1246. output_hidden_states=output_hidden_states,
  1247. return_dict=return_dict,
  1248. )
  1249. sequence_output = outputs[0]
  1250. sequence_output = self.dropout(sequence_output)
  1251. logits = self.classifier(sequence_output)
  1252. loss = None
  1253. if labels is not None:
  1254. loss_fct = CrossEntropyLoss()
  1255. loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
  1256. if not return_dict:
  1257. output = (logits,) + outputs[2:]
  1258. return ((loss,) + output) if loss is not None else output
  1259. return TokenClassifierOutput(
  1260. loss=loss,
  1261. logits=logits,
  1262. hidden_states=outputs.hidden_states,
  1263. attentions=outputs.attentions,
  1264. )


# Copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead
class XmodClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


@add_start_docstrings(
    """
    X-MOD Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    XMOD_START_DOCSTRING,
)
class XmodForQuestionAnswering(XmodPreTrainedModel):
    # Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering.__init__ with Roberta->Xmod
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = XmodModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        lang_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
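
        Example (a minimal sketch; `facebook/xmod-base` has no fine-tuned span-prediction head, so the extracted
        span is only meaningful after fine-tuning):

        ```python
        >>> from transformers import AutoTokenizer, XmodForQuestionAnswering
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/xmod-base")
        >>> model = XmodForQuestionAnswering.from_pretrained("facebook/xmod-base")
        >>> model.set_default_language("en_XX")

        >>> question, context = "Who was Jim Henson?", "Jim Henson was a nice puppet"
        >>> inputs = tokenizer(question, context, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> start_index = outputs.start_logits.argmax()
        >>> end_index = outputs.end_logits.argmax()
        >>> answer = tokenizer.decode(inputs.input_ids[0, start_index : end_index + 1])
        ```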
  1325. """
  1326. return_dict = return_dict if return_dict is not None else self.config.use_return_dict
  1327. outputs = self.roberta(
  1328. input_ids,
  1329. lang_ids=lang_ids,
  1330. attention_mask=attention_mask,
  1331. token_type_ids=token_type_ids,
  1332. position_ids=position_ids,
  1333. head_mask=head_mask,
  1334. inputs_embeds=inputs_embeds,
  1335. output_attentions=output_attentions,
  1336. output_hidden_states=output_hidden_states,
  1337. return_dict=return_dict,
  1338. )
  1339. sequence_output = outputs[0]
  1340. logits = self.qa_outputs(sequence_output)
  1341. start_logits, end_logits = logits.split(1, dim=-1)
  1342. start_logits = start_logits.squeeze(-1).contiguous()
  1343. end_logits = end_logits.squeeze(-1).contiguous()
  1344. total_loss = None
  1345. if start_positions is not None and end_positions is not None:
  1346. # If we are on multi-GPU, split add a dimension
  1347. if len(start_positions.size()) > 1:
  1348. start_positions = start_positions.squeeze(-1)
  1349. if len(end_positions.size()) > 1:
  1350. end_positions = end_positions.squeeze(-1)
  1351. # sometimes the start/end positions are outside our model inputs, we ignore these terms
  1352. ignored_index = start_logits.size(1)
  1353. start_positions = start_positions.clamp(0, ignored_index)
  1354. end_positions = end_positions.clamp(0, ignored_index)
  1355. loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
  1356. start_loss = loss_fct(start_logits, start_positions)
  1357. end_loss = loss_fct(end_logits, end_positions)
  1358. total_loss = (start_loss + end_loss) / 2
  1359. if not return_dict:
  1360. output = (start_logits, end_logits) + outputs[2:]
  1361. return ((total_loss,) + output) if total_loss is not None else output
  1362. return QuestionAnsweringModelOutput(
  1363. loss=total_loss,
  1364. start_logits=start_logits,
  1365. end_logits=end_logits,
  1366. hidden_states=outputs.hidden_states,
  1367. attentions=outputs.attentions,
  1368. )


# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids (`torch.Tensor`): Tensor of input token ids.
        padding_idx (`int`): Id of the padding token.
        past_key_values_length (`int`, *optional*, defaults to 0): Offset added to the position numbers.

    Returns:
        torch.Tensor
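
    Example (a small worked case; `padding_idx=1` is assumed, as in RoBERTa-style vocabularies):

    ```python
    >>> import torch
    >>> input_ids = torch.tensor([[5, 6, 7, 1, 1]])  # 1 is the padding index
    >>> create_position_ids_from_input_ids(input_ids, padding_idx=1)
    tensor([[2, 3, 4, 1, 1]])
    ```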
  1377. """
  1378. # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
  1379. mask = input_ids.ne(padding_idx).int()
  1380. incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
  1381. return incremental_indices.long() + padding_idx