modeling_tf_layoutlm.py 71 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681
  1. # coding=utf-8
  2. # Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors and the HuggingFace Inc. team.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """TF 2.0 LayoutLM model."""
  16. from __future__ import annotations
  17. import math
  18. import warnings
  19. from typing import Dict, Optional, Tuple, Union
  20. import numpy as np
  21. import tensorflow as tf
  22. from ...activations_tf import get_tf_activation
  23. from ...modeling_tf_outputs import (
  24. TFBaseModelOutputWithPastAndCrossAttentions,
  25. TFBaseModelOutputWithPoolingAndCrossAttentions,
  26. TFMaskedLMOutput,
  27. TFQuestionAnsweringModelOutput,
  28. TFSequenceClassifierOutput,
  29. TFTokenClassifierOutput,
  30. )
  31. from ...modeling_tf_utils import (
  32. TFMaskedLanguageModelingLoss,
  33. TFModelInputType,
  34. TFPreTrainedModel,
  35. TFQuestionAnsweringLoss,
  36. TFSequenceClassificationLoss,
  37. TFTokenClassificationLoss,
  38. get_initializer,
  39. keras,
  40. keras_serializable,
  41. unpack_inputs,
  42. )
  43. from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
  44. from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
  45. from .configuration_layoutlm import LayoutLMConfig
# Module-level logger obtained from the library's logging helper, keyed by this module's name.
logger = logging.get_logger(__name__)
# Name of the configuration class as a string.
# NOTE(review): presumably consumed by the docstring decorators imported above — usage is outside this view.
_CONFIG_FOR_DOC = "LayoutLMConfig"
  48. class TFLayoutLMEmbeddings(keras.layers.Layer):
  49. """Construct the embeddings from word, position and token_type embeddings."""
  50. def __init__(self, config: LayoutLMConfig, **kwargs):
  51. super().__init__(**kwargs)
  52. self.config = config
  53. self.hidden_size = config.hidden_size
  54. self.max_position_embeddings = config.max_position_embeddings
  55. self.max_2d_position_embeddings = config.max_2d_position_embeddings
  56. self.initializer_range = config.initializer_range
  57. self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
  58. self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
  59. def build(self, input_shape=None):
  60. with tf.name_scope("word_embeddings"):
  61. self.weight = self.add_weight(
  62. name="weight",
  63. shape=[self.config.vocab_size, self.hidden_size],
  64. initializer=get_initializer(self.initializer_range),
  65. )
  66. with tf.name_scope("token_type_embeddings"):
  67. self.token_type_embeddings = self.add_weight(
  68. name="embeddings",
  69. shape=[self.config.type_vocab_size, self.hidden_size],
  70. initializer=get_initializer(self.initializer_range),
  71. )
  72. with tf.name_scope("position_embeddings"):
  73. self.position_embeddings = self.add_weight(
  74. name="embeddings",
  75. shape=[self.max_position_embeddings, self.hidden_size],
  76. initializer=get_initializer(self.initializer_range),
  77. )
  78. with tf.name_scope("x_position_embeddings"):
  79. self.x_position_embeddings = self.add_weight(
  80. name="embeddings",
  81. shape=[self.max_2d_position_embeddings, self.hidden_size],
  82. initializer=get_initializer(self.initializer_range),
  83. )
  84. with tf.name_scope("y_position_embeddings"):
  85. self.y_position_embeddings = self.add_weight(
  86. name="embeddings",
  87. shape=[self.max_2d_position_embeddings, self.hidden_size],
  88. initializer=get_initializer(self.initializer_range),
  89. )
  90. with tf.name_scope("h_position_embeddings"):
  91. self.h_position_embeddings = self.add_weight(
  92. name="embeddings",
  93. shape=[self.max_2d_position_embeddings, self.hidden_size],
  94. initializer=get_initializer(self.initializer_range),
  95. )
  96. with tf.name_scope("w_position_embeddings"):
  97. self.w_position_embeddings = self.add_weight(
  98. name="embeddings",
  99. shape=[self.max_2d_position_embeddings, self.hidden_size],
  100. initializer=get_initializer(self.initializer_range),
  101. )
  102. if self.built:
  103. return
  104. self.built = True
  105. if getattr(self, "LayerNorm", None) is not None:
  106. with tf.name_scope(self.LayerNorm.name):
  107. self.LayerNorm.build([None, None, self.config.hidden_size])
  108. def call(
  109. self,
  110. input_ids: tf.Tensor = None,
  111. bbox: tf.Tensor = None,
  112. position_ids: tf.Tensor = None,
  113. token_type_ids: tf.Tensor = None,
  114. inputs_embeds: tf.Tensor = None,
  115. training: bool = False,
  116. ) -> tf.Tensor:
  117. """
  118. Applies embedding based on inputs tensor.
  119. Returns:
  120. final_embeddings (`tf.Tensor`): output embedding tensor.
  121. """
  122. assert not (input_ids is None and inputs_embeds is None)
  123. if input_ids is not None:
  124. check_embeddings_within_bounds(input_ids, self.config.vocab_size)
  125. inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
  126. input_shape = shape_list(inputs_embeds)[:-1]
  127. if token_type_ids is None:
  128. token_type_ids = tf.fill(dims=input_shape, value=0)
  129. if position_ids is None:
  130. position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
  131. if position_ids is None:
  132. position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
  133. if bbox is None:
  134. bbox = bbox = tf.fill(input_shape + [4], value=0)
  135. try:
  136. left_position_embeddings = tf.gather(self.x_position_embeddings, bbox[:, :, 0])
  137. upper_position_embeddings = tf.gather(self.y_position_embeddings, bbox[:, :, 1])
  138. right_position_embeddings = tf.gather(self.x_position_embeddings, bbox[:, :, 2])
  139. lower_position_embeddings = tf.gather(self.y_position_embeddings, bbox[:, :, 3])
  140. except IndexError as e:
  141. raise IndexError("The `bbox`coordinate values should be within 0-1000 range.") from e
  142. h_position_embeddings = tf.gather(self.h_position_embeddings, bbox[:, :, 3] - bbox[:, :, 1])
  143. w_position_embeddings = tf.gather(self.w_position_embeddings, bbox[:, :, 2] - bbox[:, :, 0])
  144. position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
  145. token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
  146. final_embeddings = (
  147. inputs_embeds
  148. + position_embeds
  149. + token_type_embeds
  150. + left_position_embeddings
  151. + upper_position_embeddings
  152. + right_position_embeddings
  153. + lower_position_embeddings
  154. + h_position_embeddings
  155. + w_position_embeddings
  156. )
  157. final_embeddings = self.LayerNorm(inputs=final_embeddings)
  158. final_embeddings = self.dropout(inputs=final_embeddings, training=training)
  159. return final_embeddings
  160. # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->LayoutLM
  161. class TFLayoutLMSelfAttention(keras.layers.Layer):
  162. def __init__(self, config: LayoutLMConfig, **kwargs):
  163. super().__init__(**kwargs)
  164. if config.hidden_size % config.num_attention_heads != 0:
  165. raise ValueError(
  166. f"The hidden size ({config.hidden_size}) is not a multiple of the number "
  167. f"of attention heads ({config.num_attention_heads})"
  168. )
  169. self.num_attention_heads = config.num_attention_heads
  170. self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
  171. self.all_head_size = self.num_attention_heads * self.attention_head_size
  172. self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
  173. self.query = keras.layers.Dense(
  174. units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
  175. )
  176. self.key = keras.layers.Dense(
  177. units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
  178. )
  179. self.value = keras.layers.Dense(
  180. units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
  181. )
  182. self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
  183. self.is_decoder = config.is_decoder
  184. self.config = config
  185. def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
  186. # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
  187. tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
  188. # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
  189. return tf.transpose(tensor, perm=[0, 2, 1, 3])
  190. def call(
  191. self,
  192. hidden_states: tf.Tensor,
  193. attention_mask: tf.Tensor,
  194. head_mask: tf.Tensor,
  195. encoder_hidden_states: tf.Tensor,
  196. encoder_attention_mask: tf.Tensor,
  197. past_key_value: Tuple[tf.Tensor],
  198. output_attentions: bool,
  199. training: bool = False,
  200. ) -> Tuple[tf.Tensor]:
  201. batch_size = shape_list(hidden_states)[0]
  202. mixed_query_layer = self.query(inputs=hidden_states)
  203. # If this is instantiated as a cross-attention module, the keys
  204. # and values come from an encoder; the attention mask needs to be
  205. # such that the encoder's padding tokens are not attended to.
  206. is_cross_attention = encoder_hidden_states is not None
  207. if is_cross_attention and past_key_value is not None:
  208. # reuse k,v, cross_attentions
  209. key_layer = past_key_value[0]
  210. value_layer = past_key_value[1]
  211. attention_mask = encoder_attention_mask
  212. elif is_cross_attention:
  213. key_layer = self.transpose_for_scores(self.key(inputs=encoder_hidden_states), batch_size)
  214. value_layer = self.transpose_for_scores(self.value(inputs=encoder_hidden_states), batch_size)
  215. attention_mask = encoder_attention_mask
  216. elif past_key_value is not None:
  217. key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
  218. value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
  219. key_layer = tf.concat([past_key_value[0], key_layer], axis=2)
  220. value_layer = tf.concat([past_key_value[1], value_layer], axis=2)
  221. else:
  222. key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
  223. value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
  224. query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
  225. if self.is_decoder:
  226. # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states.
  227. # Further calls to cross_attention layer can then reuse all cross-attention
  228. # key/value_states (first "if" case)
  229. # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of
  230. # all previous decoder key/value_states. Further calls to uni-directional self-attention
  231. # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
  232. # if encoder bi-directional self-attention `past_key_value` is always `None`
  233. past_key_value = (key_layer, value_layer)
  234. # Take the dot product between "query" and "key" to get the raw attention scores.
  235. # (batch size, num_heads, seq_len_q, seq_len_k)
  236. attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
  237. dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
  238. attention_scores = tf.divide(attention_scores, dk)
  239. if attention_mask is not None:
  240. # Apply the attention mask is (precomputed for all layers in TFLayoutLMModel call() function)
  241. attention_scores = tf.add(attention_scores, attention_mask)
  242. # Normalize the attention scores to probabilities.
  243. attention_probs = stable_softmax(logits=attention_scores, axis=-1)
  244. # This is actually dropping out entire tokens to attend to, which might
  245. # seem a bit unusual, but is taken from the original Transformer paper.
  246. attention_probs = self.dropout(inputs=attention_probs, training=training)
  247. # Mask heads if we want to
  248. if head_mask is not None:
  249. attention_probs = tf.multiply(attention_probs, head_mask)
  250. attention_output = tf.matmul(attention_probs, value_layer)
  251. attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
  252. # (batch_size, seq_len_q, all_head_size)
  253. attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size))
  254. outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
  255. if self.is_decoder:
  256. outputs = outputs + (past_key_value,)
  257. return outputs
  258. def build(self, input_shape=None):
  259. if self.built:
  260. return
  261. self.built = True
  262. if getattr(self, "query", None) is not None:
  263. with tf.name_scope(self.query.name):
  264. self.query.build([None, None, self.config.hidden_size])
  265. if getattr(self, "key", None) is not None:
  266. with tf.name_scope(self.key.name):
  267. self.key.build([None, None, self.config.hidden_size])
  268. if getattr(self, "value", None) is not None:
  269. with tf.name_scope(self.value.name):
  270. self.value.build([None, None, self.config.hidden_size])
  271. # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->LayoutLM
  272. class TFLayoutLMSelfOutput(keras.layers.Layer):
  273. def __init__(self, config: LayoutLMConfig, **kwargs):
  274. super().__init__(**kwargs)
  275. self.dense = keras.layers.Dense(
  276. units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
  277. )
  278. self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
  279. self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
  280. self.config = config
  281. def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
  282. hidden_states = self.dense(inputs=hidden_states)
  283. hidden_states = self.dropout(inputs=hidden_states, training=training)
  284. hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
  285. return hidden_states
  286. def build(self, input_shape=None):
  287. if self.built:
  288. return
  289. self.built = True
  290. if getattr(self, "dense", None) is not None:
  291. with tf.name_scope(self.dense.name):
  292. self.dense.build([None, None, self.config.hidden_size])
  293. if getattr(self, "LayerNorm", None) is not None:
  294. with tf.name_scope(self.LayerNorm.name):
  295. self.LayerNorm.build([None, None, self.config.hidden_size])
  296. # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->LayoutLM
  297. class TFLayoutLMAttention(keras.layers.Layer):
  298. def __init__(self, config: LayoutLMConfig, **kwargs):
  299. super().__init__(**kwargs)
  300. self.self_attention = TFLayoutLMSelfAttention(config, name="self")
  301. self.dense_output = TFLayoutLMSelfOutput(config, name="output")
  302. def prune_heads(self, heads):
  303. raise NotImplementedError
  304. def call(
  305. self,
  306. input_tensor: tf.Tensor,
  307. attention_mask: tf.Tensor,
  308. head_mask: tf.Tensor,
  309. encoder_hidden_states: tf.Tensor,
  310. encoder_attention_mask: tf.Tensor,
  311. past_key_value: Tuple[tf.Tensor],
  312. output_attentions: bool,
  313. training: bool = False,
  314. ) -> Tuple[tf.Tensor]:
  315. self_outputs = self.self_attention(
  316. hidden_states=input_tensor,
  317. attention_mask=attention_mask,
  318. head_mask=head_mask,
  319. encoder_hidden_states=encoder_hidden_states,
  320. encoder_attention_mask=encoder_attention_mask,
  321. past_key_value=past_key_value,
  322. output_attentions=output_attentions,
  323. training=training,
  324. )
  325. attention_output = self.dense_output(
  326. hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
  327. )
  328. # add attentions (possibly with past_key_value) if we output them
  329. outputs = (attention_output,) + self_outputs[1:]
  330. return outputs
  331. def build(self, input_shape=None):
  332. if self.built:
  333. return
  334. self.built = True
  335. if getattr(self, "self_attention", None) is not None:
  336. with tf.name_scope(self.self_attention.name):
  337. self.self_attention.build(None)
  338. if getattr(self, "dense_output", None) is not None:
  339. with tf.name_scope(self.dense_output.name):
  340. self.dense_output.build(None)
  341. # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->LayoutLM
  342. class TFLayoutLMIntermediate(keras.layers.Layer):
  343. def __init__(self, config: LayoutLMConfig, **kwargs):
  344. super().__init__(**kwargs)
  345. self.dense = keras.layers.Dense(
  346. units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
  347. )
  348. if isinstance(config.hidden_act, str):
  349. self.intermediate_act_fn = get_tf_activation(config.hidden_act)
  350. else:
  351. self.intermediate_act_fn = config.hidden_act
  352. self.config = config
  353. def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
  354. hidden_states = self.dense(inputs=hidden_states)
  355. hidden_states = self.intermediate_act_fn(hidden_states)
  356. return hidden_states
  357. def build(self, input_shape=None):
  358. if self.built:
  359. return
  360. self.built = True
  361. if getattr(self, "dense", None) is not None:
  362. with tf.name_scope(self.dense.name):
  363. self.dense.build([None, None, self.config.hidden_size])
  364. # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->LayoutLM
  365. class TFLayoutLMOutput(keras.layers.Layer):
  366. def __init__(self, config: LayoutLMConfig, **kwargs):
  367. super().__init__(**kwargs)
  368. self.dense = keras.layers.Dense(
  369. units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
  370. )
  371. self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
  372. self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
  373. self.config = config
  374. def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
  375. hidden_states = self.dense(inputs=hidden_states)
  376. hidden_states = self.dropout(inputs=hidden_states, training=training)
  377. hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
  378. return hidden_states
  379. def build(self, input_shape=None):
  380. if self.built:
  381. return
  382. self.built = True
  383. if getattr(self, "dense", None) is not None:
  384. with tf.name_scope(self.dense.name):
  385. self.dense.build([None, None, self.config.intermediate_size])
  386. if getattr(self, "LayerNorm", None) is not None:
  387. with tf.name_scope(self.LayerNorm.name):
  388. self.LayerNorm.build([None, None, self.config.hidden_size])
  389. # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->LayoutLM
  390. class TFLayoutLMLayer(keras.layers.Layer):
  391. def __init__(self, config: LayoutLMConfig, **kwargs):
  392. super().__init__(**kwargs)
  393. self.attention = TFLayoutLMAttention(config, name="attention")
  394. self.is_decoder = config.is_decoder
  395. self.add_cross_attention = config.add_cross_attention
  396. if self.add_cross_attention:
  397. if not self.is_decoder:
  398. raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
  399. self.crossattention = TFLayoutLMAttention(config, name="crossattention")
  400. self.intermediate = TFLayoutLMIntermediate(config, name="intermediate")
  401. self.bert_output = TFLayoutLMOutput(config, name="output")
  402. def call(
  403. self,
  404. hidden_states: tf.Tensor,
  405. attention_mask: tf.Tensor,
  406. head_mask: tf.Tensor,
  407. encoder_hidden_states: tf.Tensor | None,
  408. encoder_attention_mask: tf.Tensor | None,
  409. past_key_value: Tuple[tf.Tensor] | None,
  410. output_attentions: bool,
  411. training: bool = False,
  412. ) -> Tuple[tf.Tensor]:
  413. # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
  414. self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
  415. self_attention_outputs = self.attention(
  416. input_tensor=hidden_states,
  417. attention_mask=attention_mask,
  418. head_mask=head_mask,
  419. encoder_hidden_states=None,
  420. encoder_attention_mask=None,
  421. past_key_value=self_attn_past_key_value,
  422. output_attentions=output_attentions,
  423. training=training,
  424. )
  425. attention_output = self_attention_outputs[0]
  426. # if decoder, the last output is tuple of self-attn cache
  427. if self.is_decoder:
  428. outputs = self_attention_outputs[1:-1]
  429. present_key_value = self_attention_outputs[-1]
  430. else:
  431. outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
  432. cross_attn_present_key_value = None
  433. if self.is_decoder and encoder_hidden_states is not None:
  434. if not hasattr(self, "crossattention"):
  435. raise ValueError(
  436. f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
  437. " by setting `config.add_cross_attention=True`"
  438. )
  439. # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
  440. cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
  441. cross_attention_outputs = self.crossattention(
  442. input_tensor=attention_output,
  443. attention_mask=attention_mask,
  444. head_mask=head_mask,
  445. encoder_hidden_states=encoder_hidden_states,
  446. encoder_attention_mask=encoder_attention_mask,
  447. past_key_value=cross_attn_past_key_value,
  448. output_attentions=output_attentions,
  449. training=training,
  450. )
  451. attention_output = cross_attention_outputs[0]
  452. outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
  453. # add cross-attn cache to positions 3,4 of present_key_value tuple
  454. cross_attn_present_key_value = cross_attention_outputs[-1]
  455. present_key_value = present_key_value + cross_attn_present_key_value
  456. intermediate_output = self.intermediate(hidden_states=attention_output)
  457. layer_output = self.bert_output(
  458. hidden_states=intermediate_output, input_tensor=attention_output, training=training
  459. )
  460. outputs = (layer_output,) + outputs # add attentions if we output them
  461. # if decoder, return the attn key/values as the last output
  462. if self.is_decoder:
  463. outputs = outputs + (present_key_value,)
  464. return outputs
  465. def build(self, input_shape=None):
  466. if self.built:
  467. return
  468. self.built = True
  469. if getattr(self, "attention", None) is not None:
  470. with tf.name_scope(self.attention.name):
  471. self.attention.build(None)
  472. if getattr(self, "intermediate", None) is not None:
  473. with tf.name_scope(self.intermediate.name):
  474. self.intermediate.build(None)
  475. if getattr(self, "bert_output", None) is not None:
  476. with tf.name_scope(self.bert_output.name):
  477. self.bert_output.build(None)
  478. if getattr(self, "crossattention", None) is not None:
  479. with tf.name_scope(self.crossattention.name):
  480. self.crossattention.build(None)
  481. # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->LayoutLM
  482. class TFLayoutLMEncoder(keras.layers.Layer):
  483. def __init__(self, config: LayoutLMConfig, **kwargs):
  484. super().__init__(**kwargs)
  485. self.config = config
  486. self.layer = [TFLayoutLMLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
  487. def call(
  488. self,
  489. hidden_states: tf.Tensor,
  490. attention_mask: tf.Tensor,
  491. head_mask: tf.Tensor,
  492. encoder_hidden_states: tf.Tensor | None,
  493. encoder_attention_mask: tf.Tensor | None,
  494. past_key_values: Tuple[Tuple[tf.Tensor]] | None,
  495. use_cache: Optional[bool],
  496. output_attentions: bool,
  497. output_hidden_states: bool,
  498. return_dict: bool,
  499. training: bool = False,
  500. ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
  501. all_hidden_states = () if output_hidden_states else None
  502. all_attentions = () if output_attentions else None
  503. all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
  504. next_decoder_cache = () if use_cache else None
  505. for i, layer_module in enumerate(self.layer):
  506. if output_hidden_states:
  507. all_hidden_states = all_hidden_states + (hidden_states,)
  508. past_key_value = past_key_values[i] if past_key_values is not None else None
  509. layer_outputs = layer_module(
  510. hidden_states=hidden_states,
  511. attention_mask=attention_mask,
  512. head_mask=head_mask[i],
  513. encoder_hidden_states=encoder_hidden_states,
  514. encoder_attention_mask=encoder_attention_mask,
  515. past_key_value=past_key_value,
  516. output_attentions=output_attentions,
  517. training=training,
  518. )
  519. hidden_states = layer_outputs[0]
  520. if use_cache:
  521. next_decoder_cache += (layer_outputs[-1],)
  522. if output_attentions:
  523. all_attentions = all_attentions + (layer_outputs[1],)
  524. if self.config.add_cross_attention and encoder_hidden_states is not None:
  525. all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
  526. # Add last layer
  527. if output_hidden_states:
  528. all_hidden_states = all_hidden_states + (hidden_states,)
  529. if not return_dict:
  530. return tuple(
  531. v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None
  532. )
  533. return TFBaseModelOutputWithPastAndCrossAttentions(
  534. last_hidden_state=hidden_states,
  535. past_key_values=next_decoder_cache,
  536. hidden_states=all_hidden_states,
  537. attentions=all_attentions,
  538. cross_attentions=all_cross_attentions,
  539. )
  540. def build(self, input_shape=None):
  541. if self.built:
  542. return
  543. self.built = True
  544. if getattr(self, "layer", None) is not None:
  545. for layer in self.layer:
  546. with tf.name_scope(layer.name):
  547. layer.build(None)
  548. # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->LayoutLM
  549. class TFLayoutLMPooler(keras.layers.Layer):
  550. def __init__(self, config: LayoutLMConfig, **kwargs):
  551. super().__init__(**kwargs)
  552. self.dense = keras.layers.Dense(
  553. units=config.hidden_size,
  554. kernel_initializer=get_initializer(config.initializer_range),
  555. activation="tanh",
  556. name="dense",
  557. )
  558. self.config = config
  559. def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
  560. # We "pool" the model by simply taking the hidden state corresponding
  561. # to the first token.
  562. first_token_tensor = hidden_states[:, 0]
  563. pooled_output = self.dense(inputs=first_token_tensor)
  564. return pooled_output
  565. def build(self, input_shape=None):
  566. if self.built:
  567. return
  568. self.built = True
  569. if getattr(self, "dense", None) is not None:
  570. with tf.name_scope(self.dense.name):
  571. self.dense.build([None, None, self.config.hidden_size])
  572. # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->LayoutLM
  573. class TFLayoutLMPredictionHeadTransform(keras.layers.Layer):
  574. def __init__(self, config: LayoutLMConfig, **kwargs):
  575. super().__init__(**kwargs)
  576. self.dense = keras.layers.Dense(
  577. units=config.hidden_size,
  578. kernel_initializer=get_initializer(config.initializer_range),
  579. name="dense",
  580. )
  581. if isinstance(config.hidden_act, str):
  582. self.transform_act_fn = get_tf_activation(config.hidden_act)
  583. else:
  584. self.transform_act_fn = config.hidden_act
  585. self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
  586. self.config = config
  587. def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
  588. hidden_states = self.dense(inputs=hidden_states)
  589. hidden_states = self.transform_act_fn(hidden_states)
  590. hidden_states = self.LayerNorm(inputs=hidden_states)
  591. return hidden_states
  592. def build(self, input_shape=None):
  593. if self.built:
  594. return
  595. self.built = True
  596. if getattr(self, "dense", None) is not None:
  597. with tf.name_scope(self.dense.name):
  598. self.dense.build([None, None, self.config.hidden_size])
  599. if getattr(self, "LayerNorm", None) is not None:
  600. with tf.name_scope(self.LayerNorm.name):
  601. self.LayerNorm.build([None, None, self.config.hidden_size])
  602. # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->LayoutLM
  603. class TFLayoutLMLMPredictionHead(keras.layers.Layer):
  604. def __init__(self, config: LayoutLMConfig, input_embeddings: keras.layers.Layer, **kwargs):
  605. super().__init__(**kwargs)
  606. self.config = config
  607. self.hidden_size = config.hidden_size
  608. self.transform = TFLayoutLMPredictionHeadTransform(config, name="transform")
  609. # The output weights are the same as the input embeddings, but there is
  610. # an output-only bias for each token.
  611. self.input_embeddings = input_embeddings
  612. def build(self, input_shape=None):
  613. self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
  614. if self.built:
  615. return
  616. self.built = True
  617. if getattr(self, "transform", None) is not None:
  618. with tf.name_scope(self.transform.name):
  619. self.transform.build(None)
  620. def get_output_embeddings(self) -> keras.layers.Layer:
  621. return self.input_embeddings
  622. def set_output_embeddings(self, value: tf.Variable):
  623. self.input_embeddings.weight = value
  624. self.input_embeddings.vocab_size = shape_list(value)[0]
  625. def get_bias(self) -> Dict[str, tf.Variable]:
  626. return {"bias": self.bias}
  627. def set_bias(self, value: tf.Variable):
  628. self.bias = value["bias"]
  629. self.config.vocab_size = shape_list(value["bias"])[0]
  630. def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
  631. hidden_states = self.transform(hidden_states=hidden_states)
  632. seq_length = shape_list(hidden_states)[1]
  633. hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
  634. hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
  635. hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
  636. hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
  637. return hidden_states
  638. # Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->LayoutLM
  639. class TFLayoutLMMLMHead(keras.layers.Layer):
  640. def __init__(self, config: LayoutLMConfig, input_embeddings: keras.layers.Layer, **kwargs):
  641. super().__init__(**kwargs)
  642. self.predictions = TFLayoutLMLMPredictionHead(config, input_embeddings, name="predictions")
  643. def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
  644. prediction_scores = self.predictions(hidden_states=sequence_output)
  645. return prediction_scores
  646. def build(self, input_shape=None):
  647. if self.built:
  648. return
  649. self.built = True
  650. if getattr(self, "predictions", None) is not None:
  651. with tf.name_scope(self.predictions.name):
  652. self.predictions.build(None)
  653. @keras_serializable
  654. class TFLayoutLMMainLayer(keras.layers.Layer):
  655. config_class = LayoutLMConfig
  656. def __init__(self, config: LayoutLMConfig, add_pooling_layer: bool = True, **kwargs):
  657. super().__init__(**kwargs)
  658. self.config = config
  659. self.embeddings = TFLayoutLMEmbeddings(config, name="embeddings")
  660. self.encoder = TFLayoutLMEncoder(config, name="encoder")
  661. self.pooler = TFLayoutLMPooler(config, name="pooler") if add_pooling_layer else None
  662. def get_input_embeddings(self) -> keras.layers.Layer:
  663. return self.embeddings
  664. def set_input_embeddings(self, value: tf.Variable):
  665. self.embeddings.weight = value
  666. self.embeddings.vocab_size = shape_list(value)[0]
  667. def _prune_heads(self, heads_to_prune):
  668. """
  669. Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
  670. class PreTrainedModel
  671. """
  672. raise NotImplementedError
  673. @unpack_inputs
  674. def call(
  675. self,
  676. input_ids: TFModelInputType | None = None,
  677. bbox: np.ndarray | tf.Tensor | None = None,
  678. attention_mask: np.ndarray | tf.Tensor | None = None,
  679. token_type_ids: np.ndarray | tf.Tensor | None = None,
  680. position_ids: np.ndarray | tf.Tensor | None = None,
  681. head_mask: np.ndarray | tf.Tensor | None = None,
  682. inputs_embeds: np.ndarray | tf.Tensor | None = None,
  683. encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
  684. encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
  685. output_attentions: Optional[bool] = None,
  686. output_hidden_states: Optional[bool] = None,
  687. return_dict: Optional[bool] = None,
  688. training: bool = False,
  689. ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]:
  690. if input_ids is not None and inputs_embeds is not None:
  691. raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
  692. elif input_ids is not None:
  693. input_shape = shape_list(input_ids)
  694. elif inputs_embeds is not None:
  695. input_shape = shape_list(inputs_embeds)[:-1]
  696. else:
  697. raise ValueError("You have to specify either input_ids or inputs_embeds")
  698. if attention_mask is None:
  699. attention_mask = tf.fill(dims=input_shape, value=1)
  700. if token_type_ids is None:
  701. token_type_ids = tf.fill(dims=input_shape, value=0)
  702. if bbox is None:
  703. bbox = tf.fill(dims=input_shape + [4], value=0)
  704. embedding_output = self.embeddings(
  705. input_ids=input_ids,
  706. bbox=bbox,
  707. position_ids=position_ids,
  708. token_type_ids=token_type_ids,
  709. inputs_embeds=inputs_embeds,
  710. training=training,
  711. )
  712. # We create a 3D attention mask from a 2D tensor mask.
  713. # Sizes are [batch_size, 1, 1, to_seq_length]
  714. # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
  715. # this attention mask is more simple than the triangular masking of causal attention
  716. # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
  717. extended_attention_mask = tf.reshape(attention_mask, (input_shape[0], 1, 1, input_shape[1]))
  718. # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
  719. # masked positions, this operation will create a tensor which is 0.0 for
  720. # positions we want to attend and -10000.0 for masked positions.
  721. # Since we are adding it to the raw scores before the softmax, this is
  722. # effectively the same as removing these entirely.
  723. extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype)
  724. one_cst = tf.constant(1.0, dtype=embedding_output.dtype)
  725. ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype)
  726. extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst)
  727. # Prepare head mask if needed
  728. # 1.0 in head_mask indicate we keep the head
  729. # attention_probs has shape bsz x n_heads x N x N
  730. # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
  731. # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
  732. if head_mask is not None:
  733. raise NotImplementedError
  734. else:
  735. head_mask = [None] * self.config.num_hidden_layers
  736. encoder_outputs = self.encoder(
  737. hidden_states=embedding_output,
  738. attention_mask=extended_attention_mask,
  739. head_mask=head_mask,
  740. # Need to pass these required positional arguments to `Encoder`
  741. encoder_hidden_states=encoder_hidden_states,
  742. encoder_attention_mask=None,
  743. past_key_values=None,
  744. use_cache=False,
  745. output_attentions=output_attentions,
  746. output_hidden_states=output_hidden_states,
  747. return_dict=return_dict,
  748. training=training,
  749. )
  750. sequence_output = encoder_outputs[0]
  751. pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None
  752. if not return_dict:
  753. return (
  754. sequence_output,
  755. pooled_output,
  756. ) + encoder_outputs[1:]
  757. return TFBaseModelOutputWithPoolingAndCrossAttentions(
  758. last_hidden_state=sequence_output,
  759. pooler_output=pooled_output,
  760. hidden_states=encoder_outputs.hidden_states,
  761. attentions=encoder_outputs.attentions,
  762. cross_attentions=encoder_outputs.cross_attentions,
  763. )
  764. def build(self, input_shape=None):
  765. if self.built:
  766. return
  767. self.built = True
  768. if getattr(self, "embeddings", None) is not None:
  769. with tf.name_scope(self.embeddings.name):
  770. self.embeddings.build(None)
  771. if getattr(self, "encoder", None) is not None:
  772. with tf.name_scope(self.encoder.name):
  773. self.encoder.build(None)
  774. if getattr(self, "pooler", None) is not None:
  775. with tf.name_scope(self.pooler.name):
  776. self.pooler.build(None)
  777. class TFLayoutLMPreTrainedModel(TFPreTrainedModel):
  778. """
  779. An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
  780. models.
  781. """
  782. config_class = LayoutLMConfig
  783. base_model_prefix = "layoutlm"
  784. @property
  785. def input_signature(self):
  786. signature = super().input_signature
  787. signature["bbox"] = tf.TensorSpec(shape=(None, None, 4), dtype=tf.int32, name="bbox")
  788. return signature
# Class-level documentation shared by the LayoutLM model classes in this file; it is passed to
# `add_start_docstrings` when those classes are declared. The text is a runtime string.
LAYOUTLM_START_DOCSTRING = r"""

    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Args:
        config ([`LayoutLMConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Documentation of the `call` arguments shared by the LayoutLM models in this file. The `{0}`
# placeholders are filled through `str.format` with the input shape description (e.g.
# "batch_size, sequence_length") before the text is handed to `add_start_docstrings_to_model_forward`,
# so no other literal braces may appear in this string.
LAYOUTLM_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        bbox (`Numpy array` or `tf.Tensor` of shape `({0}, 4)`, *optional*):
            Bounding Boxes of each input sequence tokens. Selected in the range `[0, config.max_2d_position_embeddings-
            1]`.
        attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
"""
  865. @add_start_docstrings(
  866. "The bare LayoutLM Model transformer outputting raw hidden-states without any specific head on top.",
  867. LAYOUTLM_START_DOCSTRING,
  868. )
  869. class TFLayoutLMModel(TFLayoutLMPreTrainedModel):
  870. def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
  871. super().__init__(config, *inputs, **kwargs)
  872. self.layoutlm = TFLayoutLMMainLayer(config, name="layoutlm")
  873. @unpack_inputs
  874. @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
  875. @replace_return_docstrings(
  876. output_type=TFBaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC
  877. )
  878. def call(
  879. self,
  880. input_ids: TFModelInputType | None = None,
  881. bbox: np.ndarray | tf.Tensor | None = None,
  882. attention_mask: np.ndarray | tf.Tensor | None = None,
  883. token_type_ids: np.ndarray | tf.Tensor | None = None,
  884. position_ids: np.ndarray | tf.Tensor | None = None,
  885. head_mask: np.ndarray | tf.Tensor | None = None,
  886. inputs_embeds: np.ndarray | tf.Tensor | None = None,
  887. encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
  888. encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
  889. output_attentions: Optional[bool] = None,
  890. output_hidden_states: Optional[bool] = None,
  891. return_dict: Optional[bool] = None,
  892. training: Optional[bool] = False,
  893. ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]:
  894. r"""
  895. Returns:
  896. Examples:
  897. ```python
  898. >>> from transformers import AutoTokenizer, TFLayoutLMModel
  899. >>> import tensorflow as tf
  900. >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
  901. >>> model = TFLayoutLMModel.from_pretrained("microsoft/layoutlm-base-uncased")
  902. >>> words = ["Hello", "world"]
  903. >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
  904. >>> token_boxes = []
  905. >>> for word, box in zip(words, normalized_word_boxes):
  906. ... word_tokens = tokenizer.tokenize(word)
  907. ... token_boxes.extend([box] * len(word_tokens))
  908. >>> # add bounding boxes of cls + sep tokens
  909. >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
  910. >>> encoding = tokenizer(" ".join(words), return_tensors="tf")
  911. >>> input_ids = encoding["input_ids"]
  912. >>> attention_mask = encoding["attention_mask"]
  913. >>> token_type_ids = encoding["token_type_ids"]
  914. >>> bbox = tf.convert_to_tensor([token_boxes])
  915. >>> outputs = model(
  916. ... input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids
  917. ... )
  918. >>> last_hidden_states = outputs.last_hidden_state
  919. ```"""
  920. outputs = self.layoutlm(
  921. input_ids=input_ids,
  922. bbox=bbox,
  923. attention_mask=attention_mask,
  924. token_type_ids=token_type_ids,
  925. position_ids=position_ids,
  926. head_mask=head_mask,
  927. inputs_embeds=inputs_embeds,
  928. output_attentions=output_attentions,
  929. output_hidden_states=output_hidden_states,
  930. return_dict=return_dict,
  931. training=training,
  932. )
  933. return outputs
  934. def build(self, input_shape=None):
  935. if self.built:
  936. return
  937. self.built = True
  938. if getattr(self, "layoutlm", None) is not None:
  939. with tf.name_scope(self.layoutlm.name):
  940. self.layoutlm.build(None)
  941. @add_start_docstrings("""LayoutLM Model with a `language modeling` head on top.""", LAYOUTLM_START_DOCSTRING)
  942. class TFLayoutLMForMaskedLM(TFLayoutLMPreTrainedModel, TFMaskedLanguageModelingLoss):
  943. # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
  944. _keys_to_ignore_on_load_unexpected = [
  945. r"pooler",
  946. r"cls.seq_relationship",
  947. r"cls.predictions.decoder.weight",
  948. r"nsp___cls",
  949. ]
  950. def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
  951. super().__init__(config, *inputs, **kwargs)
  952. if config.is_decoder:
  953. logger.warning(
  954. "If you want to use `TFLayoutLMForMaskedLM` make sure `config.is_decoder=False` for "
  955. "bi-directional self-attention."
  956. )
  957. self.layoutlm = TFLayoutLMMainLayer(config, add_pooling_layer=True, name="layoutlm")
  958. self.mlm = TFLayoutLMMLMHead(config, input_embeddings=self.layoutlm.embeddings, name="mlm___cls")
  959. def get_lm_head(self) -> keras.layers.Layer:
  960. return self.mlm.predictions
  961. def get_prefix_bias_name(self) -> str:
  962. warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
  963. return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name
  964. @unpack_inputs
  965. @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
  966. @replace_return_docstrings(output_type=TFMaskedLMOutput, config_class=_CONFIG_FOR_DOC)
  967. def call(
  968. self,
  969. input_ids: TFModelInputType | None = None,
  970. bbox: np.ndarray | tf.Tensor | None = None,
  971. attention_mask: np.ndarray | tf.Tensor | None = None,
  972. token_type_ids: np.ndarray | tf.Tensor | None = None,
  973. position_ids: np.ndarray | tf.Tensor | None = None,
  974. head_mask: np.ndarray | tf.Tensor | None = None,
  975. inputs_embeds: np.ndarray | tf.Tensor | None = None,
  976. output_attentions: Optional[bool] = None,
  977. output_hidden_states: Optional[bool] = None,
  978. return_dict: Optional[bool] = None,
  979. labels: np.ndarray | tf.Tensor | None = None,
  980. training: Optional[bool] = False,
  981. ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
  982. r"""
  983. labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
  984. Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
  985. config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
  986. loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
  987. Returns:
  988. Examples:
  989. ```python
  990. >>> from transformers import AutoTokenizer, TFLayoutLMForMaskedLM
  991. >>> import tensorflow as tf
  992. >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
  993. >>> model = TFLayoutLMForMaskedLM.from_pretrained("microsoft/layoutlm-base-uncased")
  994. >>> words = ["Hello", "[MASK]"]
  995. >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
  996. >>> token_boxes = []
  997. >>> for word, box in zip(words, normalized_word_boxes):
  998. ... word_tokens = tokenizer.tokenize(word)
  999. ... token_boxes.extend([box] * len(word_tokens))
  1000. >>> # add bounding boxes of cls + sep tokens
  1001. >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
  1002. >>> encoding = tokenizer(" ".join(words), return_tensors="tf")
  1003. >>> input_ids = encoding["input_ids"]
  1004. >>> attention_mask = encoding["attention_mask"]
  1005. >>> token_type_ids = encoding["token_type_ids"]
  1006. >>> bbox = tf.convert_to_tensor([token_boxes])
  1007. >>> labels = tokenizer("Hello world", return_tensors="tf")["input_ids"]
  1008. >>> outputs = model(
  1009. ... input_ids=input_ids,
  1010. ... bbox=bbox,
  1011. ... attention_mask=attention_mask,
  1012. ... token_type_ids=token_type_ids,
  1013. ... labels=labels,
  1014. ... )
  1015. >>> loss = outputs.loss
  1016. ```"""
  1017. outputs = self.layoutlm(
  1018. input_ids=input_ids,
  1019. bbox=bbox,
  1020. attention_mask=attention_mask,
  1021. token_type_ids=token_type_ids,
  1022. position_ids=position_ids,
  1023. head_mask=head_mask,
  1024. inputs_embeds=inputs_embeds,
  1025. output_attentions=output_attentions,
  1026. output_hidden_states=output_hidden_states,
  1027. return_dict=return_dict,
  1028. training=training,
  1029. )
  1030. sequence_output = outputs[0]
  1031. prediction_scores = self.mlm(sequence_output=sequence_output, training=training)
  1032. loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores)
  1033. if not return_dict:
  1034. output = (prediction_scores,) + outputs[2:]
  1035. return ((loss,) + output) if loss is not None else output
  1036. return TFMaskedLMOutput(
  1037. loss=loss,
  1038. logits=prediction_scores,
  1039. hidden_states=outputs.hidden_states,
  1040. attentions=outputs.attentions,
  1041. )
  1042. def build(self, input_shape=None):
  1043. if self.built:
  1044. return
  1045. self.built = True
  1046. if getattr(self, "layoutlm", None) is not None:
  1047. with tf.name_scope(self.layoutlm.name):
  1048. self.layoutlm.build(None)
  1049. if getattr(self, "mlm", None) is not None:
  1050. with tf.name_scope(self.mlm.name):
  1051. self.mlm.build(None)
  1052. @add_start_docstrings(
  1053. """
  1054. LayoutLM Model transformer with a sequence classification/regression head on top (a linear layer on top of the
  1055. pooled output) e.g. for GLUE tasks.
  1056. """,
  1057. LAYOUTLM_START_DOCSTRING,
  1058. )
  1059. class TFLayoutLMForSequenceClassification(TFLayoutLMPreTrainedModel, TFSequenceClassificationLoss):
  1060. # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
  1061. _keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"nsp___cls", r"cls.predictions", r"cls.seq_relationship"]
  1062. _keys_to_ignore_on_load_missing = [r"dropout"]
  1063. def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
  1064. super().__init__(config, *inputs, **kwargs)
  1065. self.num_labels = config.num_labels
  1066. self.layoutlm = TFLayoutLMMainLayer(config, name="layoutlm")
  1067. self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
  1068. self.classifier = keras.layers.Dense(
  1069. units=config.num_labels,
  1070. kernel_initializer=get_initializer(config.initializer_range),
  1071. name="classifier",
  1072. )
  1073. self.config = config
  1074. @unpack_inputs
  1075. @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
  1076. @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
  1077. def call(
  1078. self,
  1079. input_ids: TFModelInputType | None = None,
  1080. bbox: np.ndarray | tf.Tensor | None = None,
  1081. attention_mask: np.ndarray | tf.Tensor | None = None,
  1082. token_type_ids: np.ndarray | tf.Tensor | None = None,
  1083. position_ids: np.ndarray | tf.Tensor | None = None,
  1084. head_mask: np.ndarray | tf.Tensor | None = None,
  1085. inputs_embeds: np.ndarray | tf.Tensor | None = None,
  1086. output_attentions: Optional[bool] = None,
  1087. output_hidden_states: Optional[bool] = None,
  1088. return_dict: Optional[bool] = None,
  1089. labels: np.ndarray | tf.Tensor | None = None,
  1090. training: Optional[bool] = False,
  1091. ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
  1092. r"""
  1093. labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
  1094. Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
  1095. config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
  1096. `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
  1097. Returns:
  1098. Examples:
  1099. ```python
  1100. >>> from transformers import AutoTokenizer, TFLayoutLMForSequenceClassification
  1101. >>> import tensorflow as tf
  1102. >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
  1103. >>> model = TFLayoutLMForSequenceClassification.from_pretrained("microsoft/layoutlm-base-uncased")
  1104. >>> words = ["Hello", "world"]
  1105. >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
  1106. >>> token_boxes = []
  1107. >>> for word, box in zip(words, normalized_word_boxes):
  1108. ... word_tokens = tokenizer.tokenize(word)
  1109. ... token_boxes.extend([box] * len(word_tokens))
  1110. >>> # add bounding boxes of cls + sep tokens
  1111. >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
  1112. >>> encoding = tokenizer(" ".join(words), return_tensors="tf")
  1113. >>> input_ids = encoding["input_ids"]
  1114. >>> attention_mask = encoding["attention_mask"]
  1115. >>> token_type_ids = encoding["token_type_ids"]
  1116. >>> bbox = tf.convert_to_tensor([token_boxes])
  1117. >>> sequence_label = tf.convert_to_tensor([1])
  1118. >>> outputs = model(
  1119. ... input_ids=input_ids,
  1120. ... bbox=bbox,
  1121. ... attention_mask=attention_mask,
  1122. ... token_type_ids=token_type_ids,
  1123. ... labels=sequence_label,
  1124. ... )
  1125. >>> loss = outputs.loss
  1126. >>> logits = outputs.logits
  1127. ```"""
  1128. outputs = self.layoutlm(
  1129. input_ids=input_ids,
  1130. bbox=bbox,
  1131. attention_mask=attention_mask,
  1132. token_type_ids=token_type_ids,
  1133. position_ids=position_ids,
  1134. head_mask=head_mask,
  1135. inputs_embeds=inputs_embeds,
  1136. output_attentions=output_attentions,
  1137. output_hidden_states=output_hidden_states,
  1138. return_dict=return_dict,
  1139. training=training,
  1140. )
  1141. pooled_output = outputs[1]
  1142. pooled_output = self.dropout(inputs=pooled_output, training=training)
  1143. logits = self.classifier(inputs=pooled_output)
  1144. loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
  1145. if not return_dict:
  1146. output = (logits,) + outputs[2:]
  1147. return ((loss,) + output) if loss is not None else output
  1148. return TFSequenceClassifierOutput(
  1149. loss=loss,
  1150. logits=logits,
  1151. hidden_states=outputs.hidden_states,
  1152. attentions=outputs.attentions,
  1153. )
  def build(self, input_shape=None):
      """Build the `layoutlm` backbone and the `classifier` head, each under its own name scope.

      Returns immediately when `self.built` is already set. `input_shape` is accepted but never read here.
      Sub-layers that are absent (or `None`) on the instance are skipped.
      """
      if not self.built:
          # The flag is raised before any sub-layer is built.
          self.built = True

          backbone = getattr(self, "layoutlm", None)
          if backbone is not None:
              with tf.name_scope(backbone.name):
                  backbone.build(None)

          head = getattr(self, "classifier", None)
          if head is not None:
              with tf.name_scope(head.name):
                  # Only the last dimension is fixed, to `config.hidden_size`.
                  head.build([None, None, self.config.hidden_size])
  @add_start_docstrings(
      """
      LayoutLM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
      Named-Entity-Recognition (NER) tasks.
      """,
      LAYOUTLM_START_DOCSTRING,
  )
  class TFLayoutLMForTokenClassification(TFLayoutLMPreTrainedModel, TFTokenClassificationLoss):
      # Entries containing a '.' name the layers that are allowed to be unexpected/missing when a TF model is
      # loaded from a PT model.
      _keys_to_ignore_on_load_unexpected = [
          r"pooler",
          r"mlm___cls",
          r"nsp___cls",
          r"cls.predictions",
          r"cls.seq_relationship",
      ]
      _keys_to_ignore_on_load_missing = [r"dropout"]

      def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
          super().__init__(config, *inputs, **kwargs)

          self.num_labels = config.num_labels

          # Backbone first, then dropout, then a dense projection with `config.num_labels` units.
          self.layoutlm = TFLayoutLMMainLayer(config, add_pooling_layer=True, name="layoutlm")
          self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
          head_initializer = get_initializer(config.initializer_range)
          self.classifier = keras.layers.Dense(
              units=config.num_labels,
              kernel_initializer=head_initializer,
              name="classifier",
          )
          self.config = config

      @unpack_inputs
      @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
      @replace_return_docstrings(output_type=TFTokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
      def call(
          self,
          input_ids: TFModelInputType | None = None,
          bbox: np.ndarray | tf.Tensor | None = None,
          attention_mask: np.ndarray | tf.Tensor | None = None,
          token_type_ids: np.ndarray | tf.Tensor | None = None,
          position_ids: np.ndarray | tf.Tensor | None = None,
          head_mask: np.ndarray | tf.Tensor | None = None,
          inputs_embeds: np.ndarray | tf.Tensor | None = None,
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
          labels: np.ndarray | tf.Tensor | None = None,
          training: Optional[bool] = False,
      ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
          r"""
          labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
              Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.

          Returns:

          Examples:

          ```python
          >>> import tensorflow as tf
          >>> from transformers import AutoTokenizer, TFLayoutLMForTokenClassification

          >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
          >>> model = TFLayoutLMForTokenClassification.from_pretrained("microsoft/layoutlm-base-uncased")

          >>> words = ["Hello", "world"]
          >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]

          >>> token_boxes = []
          >>> for word, box in zip(words, normalized_word_boxes):
          ...     word_tokens = tokenizer.tokenize(word)
          ...     token_boxes.extend([box] * len(word_tokens))
          >>> # add bounding boxes of cls + sep tokens
          >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

          >>> encoding = tokenizer(" ".join(words), return_tensors="tf")
          >>> input_ids = encoding["input_ids"]
          >>> attention_mask = encoding["attention_mask"]
          >>> token_type_ids = encoding["token_type_ids"]
          >>> bbox = tf.convert_to_tensor([token_boxes])
          >>> token_labels = tf.convert_to_tensor([1, 1, 0, 0])

          >>> outputs = model(
          ...     input_ids=input_ids,
          ...     bbox=bbox,
          ...     attention_mask=attention_mask,
          ...     token_type_ids=token_type_ids,
          ...     labels=token_labels,
          ... )

          >>> loss = outputs.loss
          >>> logits = outputs.logits
          ```"""
          encoder_outputs = self.layoutlm(
              input_ids=input_ids,
              bbox=bbox,
              attention_mask=attention_mask,
              token_type_ids=token_type_ids,
              position_ids=position_ids,
              head_mask=head_mask,
              inputs_embeds=inputs_embeds,
              output_attentions=output_attentions,
              output_hidden_states=output_hidden_states,
              return_dict=return_dict,
              training=training,
          )

          # Element 0 of the backbone output goes through dropout and then the classifier.
          token_states = self.dropout(inputs=encoder_outputs[0], training=training)
          logits = self.classifier(inputs=token_states)

          # The loss is only computed when labels were supplied.
          loss = None
          if labels is not None:
              loss = self.hf_compute_loss(labels=labels, logits=logits)

          if return_dict:
              return TFTokenClassifierOutput(
                  loss=loss,
                  logits=logits,
                  hidden_states=encoder_outputs.hidden_states,
                  attentions=encoder_outputs.attentions,
              )

          # Tuple form: logits followed by the backbone outputs from index 2 on; the loss is prepended when present.
          tuple_output = (logits,) + encoder_outputs[2:]
          if loss is None:
              return tuple_output
          return (loss,) + tuple_output

      def build(self, input_shape=None):
          """Build the `layoutlm` backbone and the `classifier` head, each under its own name scope.

          Returns immediately when `self.built` is already set. `input_shape` is accepted but never read here.
          """
          if not self.built:
              # The flag is raised before any sub-layer is built.
              self.built = True

              backbone = getattr(self, "layoutlm", None)
              if backbone is not None:
                  with tf.name_scope(backbone.name):
                      backbone.build(None)

              head = getattr(self, "classifier", None)
              if head is not None:
                  with tf.name_scope(head.name):
                      # Only the last dimension is fixed, to `config.hidden_size`.
                      head.build([None, None, self.config.hidden_size])
  @add_start_docstrings(
      """
      LayoutLM Model with a span classification head on top for extractive question-answering tasks such as
      [DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the final hidden-states output to compute `span
      start logits` and `span end logits`).
      """,
      LAYOUTLM_START_DOCSTRING,
  )
  class TFLayoutLMForQuestionAnswering(TFLayoutLMPreTrainedModel, TFQuestionAnsweringLoss):
      # Entries containing a '.' name the layers that are allowed to be unexpected/missing when a TF model is
      # loaded from a PT model.
      _keys_to_ignore_on_load_unexpected = [
          r"pooler",
          r"mlm___cls",
          r"nsp___cls",
          r"cls.predictions",
          r"cls.seq_relationship",
      ]

      def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
          super().__init__(config, *inputs, **kwargs)

          self.num_labels = config.num_labels

          # Backbone first, then a dense projection with `config.num_labels` units.
          self.layoutlm = TFLayoutLMMainLayer(config, add_pooling_layer=True, name="layoutlm")
          head_initializer = get_initializer(config.initializer_range)
          self.qa_outputs = keras.layers.Dense(
              units=config.num_labels,
              kernel_initializer=head_initializer,
              name="qa_outputs",
          )
          self.config = config

      @unpack_inputs
      @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
      @replace_return_docstrings(output_type=TFQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
      def call(
          self,
          input_ids: TFModelInputType | None = None,
          bbox: np.ndarray | tf.Tensor | None = None,
          attention_mask: np.ndarray | tf.Tensor | None = None,
          token_type_ids: np.ndarray | tf.Tensor | None = None,
          position_ids: np.ndarray | tf.Tensor | None = None,
          head_mask: np.ndarray | tf.Tensor | None = None,
          inputs_embeds: np.ndarray | tf.Tensor | None = None,
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
          start_positions: np.ndarray | tf.Tensor | None = None,
          end_positions: np.ndarray | tf.Tensor | None = None,
          training: Optional[bool] = False,
      ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
          r"""
          start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
              Labels for position (index) of the start of the labelled span for computing the token classification loss.
              Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
              are not taken into account for computing the loss.
          end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
              Labels for position (index) of the end of the labelled span for computing the token classification loss.
              Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
              are not taken into account for computing the loss.

          Returns:

          Examples:

          ```python
          >>> import tensorflow as tf
          >>> from transformers import AutoTokenizer, TFLayoutLMForQuestionAnswering
          >>> from datasets import load_dataset

          >>> tokenizer = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa", add_prefix_space=True)
          >>> model = TFLayoutLMForQuestionAnswering.from_pretrained("impira/layoutlm-document-qa", revision="1e3ebac")

          >>> dataset = load_dataset("nielsr/funsd", split="train", trust_remote_code=True)
          >>> example = dataset[0]
          >>> question = "what's his name?"
          >>> words = example["words"]
          >>> boxes = example["bboxes"]

          >>> encoding = tokenizer(
          ...     question.split(), words, is_split_into_words=True, return_token_type_ids=True, return_tensors="tf"
          ... )
          >>> bbox = []
          >>> for i, s, w in zip(encoding.input_ids[0], encoding.sequence_ids(0), encoding.word_ids(0)):
          ...     if s == 1:
          ...         bbox.append(boxes[w])
          ...     elif i == tokenizer.sep_token_id:
          ...         bbox.append([1000] * 4)
          ...     else:
          ...         bbox.append([0] * 4)
          >>> encoding["bbox"] = tf.convert_to_tensor([bbox])

          >>> word_ids = encoding.word_ids(0)
          >>> outputs = model(**encoding)
          >>> loss = outputs.loss
          >>> start_scores = outputs.start_logits
          >>> end_scores = outputs.end_logits
          >>> start, end = word_ids[tf.math.argmax(start_scores, -1)[0]], word_ids[tf.math.argmax(end_scores, -1)[0]]
          >>> print(" ".join(words[start : end + 1]))
          M. Hamann P. Harper, P. Martinez
          ```"""
          encoder_outputs = self.layoutlm(
              input_ids=input_ids,
              bbox=bbox,
              attention_mask=attention_mask,
              token_type_ids=token_type_ids,
              position_ids=position_ids,
              head_mask=head_mask,
              inputs_embeds=inputs_embeds,
              output_attentions=output_attentions,
              output_hidden_states=output_hidden_states,
              return_dict=return_dict,
              training=training,
          )

          # Project element 0 of the backbone output, split the last axis into two equal parts, and drop that
          # axis from each part (start logits first, end logits second).
          span_logits = self.qa_outputs(inputs=encoder_outputs[0])
          start_logits, end_logits = [
              tf.squeeze(input=part, axis=-1) for part in tf.split(value=span_logits, num_or_size_splits=2, axis=-1)
          ]

          # The loss needs both span boundaries; with either one missing it stays None.
          loss = None
          if start_positions is not None and end_positions is not None:
              span_labels = {"start_position": start_positions, "end_position": end_positions}
              loss = self.hf_compute_loss(labels=span_labels, logits=(start_logits, end_logits))

          if return_dict:
              return TFQuestionAnsweringModelOutput(
                  loss=loss,
                  start_logits=start_logits,
                  end_logits=end_logits,
                  hidden_states=encoder_outputs.hidden_states,
                  attentions=encoder_outputs.attentions,
              )

          # Tuple form: both logits followed by the backbone outputs from index 2 on; the loss is prepended when present.
          tuple_output = (start_logits, end_logits) + encoder_outputs[2:]
          if loss is None:
              return tuple_output
          return (loss,) + tuple_output

      def build(self, input_shape=None):
          """Build the `layoutlm` backbone and the `qa_outputs` head, each under its own name scope.

          Returns immediately when `self.built` is already set. `input_shape` is accepted but never read here.
          """
          if not self.built:
              # The flag is raised before any sub-layer is built.
              self.built = True

              backbone = getattr(self, "layoutlm", None)
              if backbone is not None:
                  with tf.name_scope(backbone.name):
                      backbone.build(None)

              head = getattr(self, "qa_outputs", None)
              if head is not None:
                  with tf.name_scope(head.name):
                      # Only the last dimension is fixed, to `config.hidden_size`.
                      head.build([None, None, self.config.hidden_size])