# activations_tf.py

# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

import tensorflow as tf
from packaging.version import parse


try:
    import tf_keras as keras
except (ModuleNotFoundError, ImportError):
    import keras

    if parse(keras.__version__).major > 2:
        raise ValueError(
            "Your currently installed version of Keras is Keras 3, but this is not yet supported in "
            "Transformers. Please install the backwards-compatible tf-keras package with "
            "`pip install tf-keras`."
        )

def _gelu(x):
    """
    Gaussian Error Linear Unit. Original implementation of the gelu activation function in the Google BERT repo when
    it was initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different
    results): 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). Also see
    https://arxiv.org/abs/1606.08415
    """
    x = tf.convert_to_tensor(x)
    cdf = 0.5 * (1.0 + tf.math.erf(x / tf.cast(tf.sqrt(2.0), x.dtype)))

    return x * cdf

def _gelu_new(x):
    """
    Gaussian Error Linear Unit. This is a smoother version of the GELU. Original paper: https://arxiv.org/abs/1606.08415

    Args:
        x: float Tensor to perform activation

    Returns:
        `x` with the GELU activation applied.
    """
    x = tf.convert_to_tensor(x)
    pi = tf.cast(math.pi, x.dtype)
    coeff = tf.cast(0.044715, x.dtype)
    cdf = 0.5 * (1.0 + tf.tanh(tf.sqrt(2.0 / pi) * (x + coeff * tf.pow(x, 3))))

    return x * cdf

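# Illustrative sanity check (not part of the original module): the tanh-based approximation
# in `_gelu_new` closely tracks the exact erf-based `_gelu`; the absolute difference stays
# well below 1e-2 for typical activation values.
#
#     >>> x = tf.constant([-2.0, -1.0, 0.0, 1.0, 2.0])
#     >>> float(tf.reduce_max(tf.abs(_gelu(x) - _gelu_new(x))))  # a small value, roughly 1e-4
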
def mish(x):
    """Mish activation: x * tanh(softplus(x))."""
    x = tf.convert_to_tensor(x)

    return x * tf.tanh(tf.math.softplus(x))


def gelu_fast(x):
    """Fast tanh-based GELU approximation with a precomputed sqrt(2 / pi) constant."""
    x = tf.convert_to_tensor(x)
    coeff1 = tf.cast(0.044715, x.dtype)
    coeff2 = tf.cast(0.7978845608, x.dtype)  # ~= sqrt(2 / pi)

    return 0.5 * x * (1.0 + tf.tanh(x * coeff2 * (1.0 + coeff1 * x * x)))


def quick_gelu(x):
    """Sigmoid-based GELU approximation: x * sigmoid(1.702 * x)."""
    x = tf.convert_to_tensor(x)
    coeff = tf.cast(1.702, x.dtype)

    return x * tf.math.sigmoid(coeff * x)

def gelu_10(x):
    """
    Clip the range of possible GeLU outputs to [-10, 10]. This is especially useful for quantization purposes, as it
    allows mapping two negative values in the GeLU spectrum. For more information on this trick, please refer to
    https://arxiv.org/abs/2004.09602

    Gaussian Error Linear Unit. Original implementation of the gelu activation function in the Google BERT repo when
    it was initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different
    results): 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). Also see
    https://arxiv.org/abs/1606.08415

    Args:
        x: float Tensor to perform activation

    Returns:
        `x` with the GELU activation applied, clipped to the range [-10, 10].
    """
    return tf.clip_by_value(_gelu(x), -10, 10)

def glu(x, axis=-1):
    """
    Gated Linear Unit. Implementation as defined in the original paper (see https://arxiv.org/abs/1612.08083), where
    the input `x` is split into two halves, A and B, across a dimension (`axis`), returning A * sigmoid(B).

    Args:
        `x`: float Tensor to perform activation
        `axis`: dimension across which `x` is split in half

    Returns:
        `x` with the GLU activation applied (with its size halved across the dimension `axis`).
    """
    a, b = tf.split(x, 2, axis=axis)

    return a * tf.math.sigmoid(b)

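# Illustrative usage (not part of the original module): `glu` halves the size of the split
# dimension, so a (batch, 2 * hidden) input yields a (batch, hidden) output.
#
#     >>> x = tf.random.normal((4, 16))
#     >>> glu(x, axis=-1).shape
#     TensorShape([4, 8])
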
if parse(tf.version.VERSION) >= parse("2.4"):
    # keras.activations.gelu (with its `approximate` argument) is only available from TF 2.4
    # onwards; older versions fall back to the pure-TF implementations above.

    def approximate_gelu_wrap(x):
        return keras.activations.gelu(x, approximate=True)

    gelu = keras.activations.gelu
    gelu_new = approximate_gelu_wrap
else:
    gelu = _gelu
    gelu_new = _gelu_new

ACT2FN = {
    "gelu": gelu,
    "gelu_10": gelu_10,
    "gelu_fast": gelu_fast,
    "gelu_new": gelu_new,
    "glu": glu,
    "mish": mish,
    "quick_gelu": quick_gelu,
    "relu": keras.activations.relu,
    "sigmoid": keras.activations.sigmoid,
    "silu": keras.activations.swish,
    "swish": keras.activations.swish,
    "tanh": keras.activations.tanh,
}

def get_tf_activation(activation_string):
    """Return the TF/Keras activation function registered under `activation_string` in `ACT2FN`."""
    if activation_string in ACT2FN:
        return ACT2FN[activation_string]
    else:
        raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")
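
# Example usage (a minimal sketch, not part of the original module): look up an activation by
# its string name, e.g. a config's `hidden_act` value, and apply it to a tensor.
#
#     >>> act_fn = get_tf_activation("gelu_new")
#     >>> act_fn(tf.constant([-1.0, 0.0, 1.0]))  # approx. [-0.159, 0.0, 0.841]
#     >>> get_tf_activation("selu")  # raises KeyError: "selu" is not in ACT2FN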