# Copyright 2019 The TensorFlow Authors, The Hugging Face Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions and classes related to optimization (weight updates)."""

import re
from typing import Callable, List, Optional, Union

import tensorflow as tf

try:
    from tf_keras.optimizers.legacy import Adam
except (ImportError, ModuleNotFoundError):
    from tensorflow.keras.optimizers.legacy import Adam

from .modeling_tf_utils import keras


# This block exists because Keras keeps moving this module around - its location changed somewhere between 2.10 and 2.15.
if hasattr(keras.optimizers.schedules, "learning_rate_schedule"):
    schedules = keras.optimizers.schedules.learning_rate_schedule
else:
    schedules = keras.optimizers.schedules


class WarmUp(schedules.LearningRateSchedule):
    """
    Applies a warmup schedule on a given learning rate decay schedule.

    Args:
        initial_learning_rate (`float`):
            The initial learning rate for the schedule after the warmup (so this will be the learning rate at the end
            of the warmup).
        decay_schedule_fn (`Callable`):
            The schedule function to apply after the warmup for the rest of training.
        warmup_steps (`int`):
            The number of steps for the warmup part of training.
        power (`float`, *optional*, defaults to 1.0):
            The power to use for the polynomial warmup (the default of 1.0 gives a linear warmup).
        name (`str`, *optional*):
            Optional name prefix for the returned tensors during the schedule.
    """

    def __init__(
        self,
        initial_learning_rate: float,
        decay_schedule_fn: Callable,
        warmup_steps: int,
        power: float = 1.0,
        name: Optional[str] = None,
    ):
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.warmup_steps = warmup_steps
        self.power = power
        self.decay_schedule_fn = decay_schedule_fn
        self.name = name

    def __call__(self, step):
        with tf.name_scope(self.name or "WarmUp") as name:
            # Implements polynomial warmup: if global_step < warmup_steps, the learning rate is
            # `initial_learning_rate * (global_step / warmup_steps) ** power`.
            global_step_float = tf.cast(step, tf.float32)
            warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
            warmup_percent_done = global_step_float / warmup_steps_float
            warmup_learning_rate = self.initial_learning_rate * tf.math.pow(warmup_percent_done, self.power)
            return tf.cond(
                global_step_float < warmup_steps_float,
                lambda: warmup_learning_rate,
                lambda: self.decay_schedule_fn(step - self.warmup_steps),
                name=name,
            )

    def get_config(self):
        return {
            "initial_learning_rate": self.initial_learning_rate,
            "decay_schedule_fn": self.decay_schedule_fn,
            "warmup_steps": self.warmup_steps,
            "power": self.power,
            "name": self.name,
        }
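
# A minimal, illustrative usage sketch for `WarmUp` (the schedule and hyperparameter values below are
# arbitrary examples, not defaults of this module): wrap a Keras decay schedule so the learning rate
# ramps up for `warmup_steps` steps and then hands over to the wrapped schedule.
#
#     decay_fn = schedules.PolynomialDecay(
#         initial_learning_rate=5e-5, decay_steps=9_000, end_learning_rate=0.0
#     )
#     warmup_schedule = WarmUp(
#         initial_learning_rate=5e-5, decay_schedule_fn=decay_fn, warmup_steps=1_000
#     )
#     optimizer = keras.optimizers.Adam(learning_rate=warmup_schedule)
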
def create_optimizer(
    init_lr: float,
    num_train_steps: int,
    num_warmup_steps: int,
    min_lr_ratio: float = 0.0,
    adam_beta1: float = 0.9,
    adam_beta2: float = 0.999,
    adam_epsilon: float = 1e-8,
    adam_clipnorm: Optional[float] = None,
    adam_global_clipnorm: Optional[float] = None,
    weight_decay_rate: float = 0.0,
    power: float = 1.0,
    include_in_weight_decay: Optional[List[str]] = None,
):
    """
    Creates an optimizer with a learning rate schedule using a warmup phase followed by a linear decay.

    Args:
        init_lr (`float`):
            The desired learning rate at the end of the warmup phase.
        num_train_steps (`int`):
            The total number of training steps.
        num_warmup_steps (`int`):
            The number of warmup steps.
        min_lr_ratio (`float`, *optional*, defaults to 0):
            The final learning rate at the end of the linear decay will be `init_lr * min_lr_ratio`.
        adam_beta1 (`float`, *optional*, defaults to 0.9):
            The beta1 to use in Adam.
        adam_beta2 (`float`, *optional*, defaults to 0.999):
            The beta2 to use in Adam.
        adam_epsilon (`float`, *optional*, defaults to 1e-8):
            The epsilon to use in Adam.
        adam_clipnorm (`float`, *optional*, defaults to `None`):
            If not `None`, clip the gradient norm for each weight tensor to this value.
        adam_global_clipnorm (`float`, *optional*, defaults to `None`):
            If not `None`, clip the gradient norm to this value. When using this argument, the norm is computed over
            all weight tensors, as if they were concatenated into a single vector.
        weight_decay_rate (`float`, *optional*, defaults to 0):
            The weight decay to use.
        power (`float`, *optional*, defaults to 1.0):
            The power to use for PolynomialDecay.
        include_in_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
            applied to all parameters except bias and layer norm parameters.

    Returns:
        A `(optimizer, lr_schedule)` tuple.
    """
    # Implements linear decay of the learning rate after the warmup phase.
    lr_schedule = schedules.PolynomialDecay(
        initial_learning_rate=init_lr,
        decay_steps=num_train_steps - num_warmup_steps,
        end_learning_rate=init_lr * min_lr_ratio,
        power=power,
    )
    if num_warmup_steps:
        lr_schedule = WarmUp(
            initial_learning_rate=init_lr,
            decay_schedule_fn=lr_schedule,
            warmup_steps=num_warmup_steps,
        )
    if weight_decay_rate > 0.0:
        optimizer = AdamWeightDecay(
            learning_rate=lr_schedule,
            weight_decay_rate=weight_decay_rate,
            beta_1=adam_beta1,
            beta_2=adam_beta2,
            epsilon=adam_epsilon,
            clipnorm=adam_clipnorm,
            global_clipnorm=adam_global_clipnorm,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
            include_in_weight_decay=include_in_weight_decay,
        )
    else:
        optimizer = keras.optimizers.Adam(
            learning_rate=lr_schedule,
            beta_1=adam_beta1,
            beta_2=adam_beta2,
            epsilon=adam_epsilon,
            clipnorm=adam_clipnorm,
            global_clipnorm=adam_global_clipnorm,
        )
    # We return both the optimizer and the LR scheduler so that callers can track the
    # evolution of the LR independently of the optimizer.
    return optimizer, lr_schedule
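
# A minimal, illustrative usage sketch for `create_optimizer` (the hyperparameter values are
# placeholders, and `model` stands for any compiled Keras model, not something defined here):
#
#     optimizer, lr_schedule = create_optimizer(
#         init_lr=5e-5,
#         num_train_steps=10_000,
#         num_warmup_steps=1_000,
#         weight_decay_rate=0.01,
#     )
#     model.compile(optimizer=optimizer)
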
class AdamWeightDecay(Adam):
    """
    Adam with decoupled weight decay and support for clipping gradients by (global) norm. Just adding the square of
    the weights to the loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since
    that will interact with the m and v parameters in strange ways, as shown in [Decoupled Weight Decay
    Regularization](https://arxiv.org/abs/1711.05101).

    Instead we want to decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent
    to adding the square of the weights to the loss with plain (non-momentum) SGD.

    Args:
        learning_rate (`Union[float, LearningRateSchedule]`, *optional*, defaults to 0.001):
            The learning rate to use or a schedule.
        beta_1 (`float`, *optional*, defaults to 0.9):
            The beta1 parameter in Adam, which is the exponential decay rate for the 1st momentum estimates.
        beta_2 (`float`, *optional*, defaults to 0.999):
            The beta2 parameter in Adam, which is the exponential decay rate for the 2nd momentum estimates.
        epsilon (`float`, *optional*, defaults to 1e-07):
            The epsilon parameter in Adam, which is a small constant for numerical stability.
        amsgrad (`bool`, *optional*, defaults to `False`):
            Whether to apply the AMSGrad variant of this algorithm or not, see [On the Convergence of Adam and
            Beyond](https://arxiv.org/abs/1904.09237).
        weight_decay_rate (`float`, *optional*, defaults to 0.0):
            The weight decay to apply.
        include_in_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
            applied to all parameters by default (unless they are in `exclude_from_weight_decay`).
        exclude_from_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to exclude from applying weight decay to. If
            `include_in_weight_decay` is passed, the names in it will supersede this list.
        name (`str`, *optional*, defaults to `"AdamWeightDecay"`):
            Optional name for the operations created when applying gradients.
        kwargs (`Dict[str, Any]`, *optional*):
            Keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, `decay`}. `clipnorm` clips gradients by
            norm; `clipvalue` clips gradients by value; `decay` is included for backward compatibility to allow time
            inverse decay of the learning rate; `lr` is included for backward compatibility, but it is recommended to
            use `learning_rate` instead.
    """

    def __init__(
        self,
        learning_rate: Union[float, schedules.LearningRateSchedule] = 0.001,
        beta_1: float = 0.9,
        beta_2: float = 0.999,
        epsilon: float = 1e-7,
        amsgrad: bool = False,
        weight_decay_rate: float = 0.0,
        include_in_weight_decay: Optional[List[str]] = None,
        exclude_from_weight_decay: Optional[List[str]] = None,
        name: str = "AdamWeightDecay",
        **kwargs,
    ):
        super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
        self.weight_decay_rate = weight_decay_rate
        self._include_in_weight_decay = include_in_weight_decay
        self._exclude_from_weight_decay = exclude_from_weight_decay

    @classmethod
    def from_config(cls, config):
        """Creates an optimizer from its config with WarmUp custom object."""
        custom_objects = {"WarmUp": WarmUp}
        return super(AdamWeightDecay, cls).from_config(config, custom_objects=custom_objects)

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, apply_state)
        # Stash the weight decay rate as a constant so the decay op can pick it up per device/dtype.
        apply_state[(var_device, var_dtype)]["weight_decay_rate"] = tf.constant(
            self.weight_decay_rate, name="adam_weight_decay_rate"
        )

    def _decay_weights_op(self, var, learning_rate, apply_state):
        # Apply decoupled weight decay directly to the variable, scaled by the current learning rate.
        do_decay = self._do_use_weight_decay(var.name)
        if do_decay:
            return var.assign_sub(
                learning_rate * var * apply_state[(var.device, var.dtype.base_dtype)]["weight_decay_rate"],
                use_locking=self._use_locking,
            )
        return tf.no_op()

    def apply_gradients(self, grads_and_vars, name=None, **kwargs):
        grads, tvars = list(zip(*grads_and_vars))
        return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars), name=name, **kwargs)

    def _get_lr(self, var_device, var_dtype, apply_state):
        """Retrieves the learning rate with the given state."""
        if apply_state is None:
            return self._decayed_lr_t[var_dtype], {}

        apply_state = apply_state or {}
        coefficients = apply_state.get((var_device, var_dtype))
        if coefficients is None:
            coefficients = self._fallback_apply_state(var_device, var_dtype)
            apply_state[(var_device, var_dtype)] = coefficients

        return coefficients["lr_t"], {"apply_state": apply_state}

    def _resource_apply_dense(self, grad, var, apply_state=None):
        # Decay the weights before the Adam update so the decay does not flow through the m/v estimates.
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_dense(grad, var, **kwargs)

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_sparse(grad, var, indices, **kwargs)

    def get_config(self):
        config = super().get_config()
        config.update({"weight_decay_rate": self.weight_decay_rate})
        return config

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if self.weight_decay_rate == 0:
            return False

        if self._include_in_weight_decay:
            for r in self._include_in_weight_decay:
                if re.search(r, param_name) is not None:
                    return True

        if self._exclude_from_weight_decay:
            for r in self._exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True
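
# A minimal, illustrative usage sketch for `AdamWeightDecay` (the learning rate, decay rate, and
# pattern strings are examples only, mirroring what `create_optimizer` above passes): decay every
# parameter except biases and layer-norm weights.
#
#     optimizer = AdamWeightDecay(
#         learning_rate=3e-5,
#         weight_decay_rate=0.01,
#         exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
#     )
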
# Extracted from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py
class GradientAccumulator:
    """
    Gradient accumulation utility. When used with a distribution strategy, the accumulator should be called in a
    replica context. Gradients will be accumulated locally on each replica and without synchronization. Users should
    then call `.gradients`, scale the gradients if required, and pass the result to `apply_gradients`.
    """

    # We use the ON_READ synchronization policy so that no synchronization is
    # performed on assignment. To get the value, we call .value() which returns the
    # value on the current replica without synchronization.

    def __init__(self):
        """Initializes the accumulator."""
        self._gradients = []
        self._accum_steps = None

    @property
    def step(self):
        """Number of accumulated steps."""
        if self._accum_steps is None:
            self._accum_steps = tf.Variable(
                tf.constant(0, dtype=tf.int64),
                trainable=False,
                synchronization=tf.VariableSynchronization.ON_READ,
                aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
            )

        return self._accum_steps.value()

    @property
    def gradients(self):
        """The accumulated gradients on the current replica."""
        if not self._gradients:
            raise ValueError("The accumulator should be called first to initialize the gradients")
        return [gradient.value() if gradient is not None else gradient for gradient in self._gradients]

    def __call__(self, gradients):
        """Accumulates `gradients` on the current replica."""
        if not self._gradients:
            _ = self.step  # Create the step variable.
            self._gradients.extend(
                [
                    tf.Variable(
                        tf.zeros_like(gradient),
                        trainable=False,
                        synchronization=tf.VariableSynchronization.ON_READ,
                        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
                    )
                    if gradient is not None
                    else gradient
                    for gradient in gradients
                ]
            )
        if len(gradients) != len(self._gradients):
            raise ValueError(f"Expected {len(self._gradients)} gradients, but got {len(gradients)}")

        for accum_gradient, gradient in zip(self._gradients, gradients):
            if accum_gradient is not None and gradient is not None:
                accum_gradient.assign_add(gradient)

        self._accum_steps.assign_add(1)

    def reset(self):
        """Resets the accumulated gradients on the current replica."""
        if not self._gradients:
            return
        self._accum_steps.assign(0)
        for gradient in self._gradients:
            if gradient is not None:
                gradient.assign(tf.zeros_like(gradient))
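
# A minimal, illustrative usage sketch for `GradientAccumulator` (the model, loss function, dataset,
# and `accumulation_steps` are placeholders, and averaging by `step` is one possible scaling choice):
# accumulate gradients over several micro-batches, then apply them once.
#
#     accumulator = GradientAccumulator()
#     for step, (features, labels) in enumerate(dataset):
#         with tf.GradientTape() as tape:
#             loss = loss_fn(labels, model(features, training=True))
#         accumulator(tape.gradient(loss, model.trainable_variables))
#         if (step + 1) % accumulation_steps == 0:
#             grads = [g / tf.cast(accumulator.step, g.dtype) for g in accumulator.gradients]
#             optimizer.apply_gradients(zip(grads, model.trainable_variables))
#             accumulator.reset()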