configuration_bloom.py
# coding=utf-8
# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Bloom configuration"""
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, List, Mapping, Optional

from packaging import version


if TYPE_CHECKING:
    from ... import PreTrainedTokenizer, TensorType

from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfigWithPast, PatchingSpec
from ...utils import is_torch_available, logging


logger = logging.get_logger(__name__)


class BloomConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`BloomModel`]. It is used to instantiate a Bloom
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with
    the defaults will yield a similar configuration to the Bloom architecture
    [bigscience/bloom](https://huggingface.co/bigscience/bloom).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 250880):
            Vocabulary size of the Bloom model. Defines the maximum number of different tokens that can be represented
            by the `inputs_ids` passed when calling [`BloomModel`]. Check [this
            discussion](https://huggingface.co/bigscience/bloom/discussions/120#633d28389addb8530b406c2a) on how the
            `vocab_size` has been defined.
        hidden_size (`int`, *optional*, defaults to 64):
            Dimensionality of the embeddings and hidden states.
        n_layer (`int`, *optional*, defaults to 2):
            Number of hidden layers in the Transformer encoder.
        n_head (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        apply_residual_connection_post_layernorm (`bool`, *optional*, defaults to `False`):
            If enabled, use the layer norm of the hidden states as the residual in the transformer blocks.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            Dropout rate applied to the hidden states in the bias-dropout-add operations (after the attention and MLP
            blocks).
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout rate applied to the attention probabilities.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        pretraining_tp (`int`, *optional*, defaults to `1`):
            Experimental feature. Tensor parallelism rank used during pretraining with Megatron. Please refer to [this
            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232). Note also that this is enabled only when
            `slow_but_exact=True`.
        slow_but_exact (`bool`, *optional*, defaults to `False`):
            Experimental feature. Whether to use a slow but exact implementation of the attention mechanism. While
            merging the TP rank tensors, due to slicing operations the results may be slightly different between the
            model trained on Megatron and our model. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232). A solution to obtain more accurate results is to
            enable this feature, at the cost of slower inference. This will probably be resolved in the future once
            the main model has been fine-tuned with TP_rank=1.

    Example:

    ```python
    >>> from transformers import BloomConfig, BloomModel

    >>> # Initializing a Bloom configuration
    >>> configuration = BloomConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = BloomModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
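
    >>> # Editor's sketch (not part of the original example): overriding a few defaults
    >>> # with hypothetical values to build a small custom configuration
    >>> small_configuration = BloomConfig(hidden_size=128, n_layer=4, n_head=4)
    >>> small_model = BloomModel(small_configuration)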
    ```"""

    model_type = "bloom"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
    }
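    # Through `attribute_map`, the generic `config.num_hidden_layers` and
    # `config.num_attention_heads` transparently alias the BLOOM-specific
    # `n_layer` and `n_head` attributes.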

    def __init__(
        self,
        vocab_size=250880,
        hidden_size=64,
        n_layer=2,
        n_head=8,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        apply_residual_connection_post_layernorm=False,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        pretraining_tp=1,  # TP rank used when training with megatron
        slow_but_exact=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        # Backward compatibility with n_embed kwarg
        n_embed = kwargs.pop("n_embed", None)
        self.hidden_size = hidden_size if n_embed is None else n_embed
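        # e.g. a legacy call like BloomConfig(n_embed=1024) (hypothetical value) still works:
        # the legacy `n_embed` kwarg simply overrides `hidden_size`.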
        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.pretraining_tp = pretraining_tp
        self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.slow_but_exact = slow_but_exact

        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)


class BloomOnnxConfig(OnnxConfigWithPast):
    torch_onnx_minimum_version = version.parse("1.12")

    def __init__(
        self,
        config: PretrainedConfig,
        task: str = "default",
        patching_specs: List[PatchingSpec] = None,
        use_past: bool = False,
    ):
        super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)
        if not getattr(self._config, "pad_token_id", None):
            # TODO: how to do that better?
            self._config.pad_token_id = 0

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
        if self.use_past:
            # BLOOM stores values on dynamic axis 2. For more details see: https://github.com/huggingface/transformers/pull/18344
            self.fill_with_past_key_values_(common_inputs, direction="inputs", inverted_values_shape=True)
            common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
        else:
            common_inputs["attention_mask"] = {0: "batch", 1: "sequence"}

        return common_inputs
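
    # With use_past=True, fill_with_past_key_values_ adds one "past_key_values.{i}.key" /
    # "past_key_values.{i}.value" entry per layer to the mapping above; the concrete BLOOM
    # tensor layout these correspond to is spelled out in generate_dummy_inputs below.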

    @property
    def num_layers(self) -> int:
        return self._config.n_layer

    @property
    def num_attention_heads(self) -> int:
        return self._config.n_head

    @property
    def atol_for_validation(self) -> float:
        return 1e-3

    def generate_dummy_inputs(
        self,
        tokenizer: "PreTrainedTokenizer",
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional["TensorType"] = None,
    ) -> Mapping[str, Any]:
        common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
            tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
        )

        # We need to order the inputs in the way they appear in the forward()
        ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})

        # Need to add the past_keys
        if self.use_past:
            if not is_torch_available():
                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
            else:
                import torch

                batch, seqlen = common_inputs["input_ids"].shape
                # Not using the same length for past_key_values
                past_key_values_length = seqlen + 2
                head_dim = self._config.hidden_size // self.num_attention_heads
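                # BLOOM fuses the batch and head dimensions: keys are laid out as
                # (batch * n_head, head_dim, past_length) and values as
                # (batch * n_head, past_length, head_dim), matching the shapes built below.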
                past_key_shape = (
                    batch * self.num_attention_heads,
                    head_dim,
                    past_key_values_length,
                )
                past_value_shape = (
                    batch * self.num_attention_heads,
                    past_key_values_length,
                    head_dim,
                )
                ordered_inputs["past_key_values"] = [
                    (torch.zeros(past_key_shape), torch.zeros(past_value_shape)) for _ in range(self.num_layers)
                ]

        ordered_inputs["attention_mask"] = common_inputs["attention_mask"]
        if self.use_past:
            mask_dtype = ordered_inputs["attention_mask"].dtype
            ordered_inputs["attention_mask"] = torch.cat(
                [ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
            )

        return ordered_inputs

    @property
    def default_onnx_opset(self) -> int:
        return 13
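

# A rough usage sketch (not part of the original file; names and call signature are assumptions
# based on the transformers.onnx export utilities of the same era) showing how BloomOnnxConfig
# is typically consumed when exporting a BLOOM checkpoint to ONNX:
#
#     from pathlib import Path
#     from transformers import AutoTokenizer, BloomModel
#     from transformers.onnx import export
#
#     tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
#     model = BloomModel.from_pretrained("bigscience/bloom-560m")
#     onnx_config = BloomOnnxConfig(model.config, task="default", use_past=False)
#     export(tokenizer, model, onnx_config, onnx_config.default_onnx_opset, Path("bloom.onnx"))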