- # coding=utf-8
- # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """PyTorch optimization for BERT model."""
- import math
- import warnings
- from functools import partial
- from typing import Callable, Iterable, Optional, Tuple, Union
- import torch
- from torch import nn
- from torch.optim import Optimizer
- from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau
- from .trainer_pt_utils import LayerWiseDummyOptimizer, LayerWiseDummyScheduler
- from .trainer_utils import SchedulerType
- from .utils import logging
- from .utils.versions import require_version
- logger = logging.get_logger(__name__)
- def _get_constant_lambda(_=None):
- return 1
- def get_constant_schedule(optimizer: Optimizer, last_epoch: int = -1):
- """
- Create a schedule with a constant learning rate, using the learning rate set in optimizer.
- Args:
- optimizer ([`~torch.optim.Optimizer`]):
- The optimizer for which to schedule the learning rate.
- last_epoch (`int`, *optional*, defaults to -1):
- The index of the last epoch when resuming training.
- Return:
- `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
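- Example (a minimal sketch; the tiny `torch.nn.Linear` model and the learning rate are placeholders):
- ```python
- import torch
- model = torch.nn.Linear(8, 2)  # placeholder model
- optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
- scheduler = get_constant_schedule(optimizer)
- # The multiplier is always 1.0, so the learning rate stays at 1e-3 for every step.
- ```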
- """
- return LambdaLR(optimizer, _get_constant_lambda, last_epoch=last_epoch)
- def get_reduce_on_plateau_schedule(optimizer: Optimizer, **kwargs):
- """
- Create a schedule that keeps the learning rate constant until a metric has stopped improving, then reduces it.
- Args:
- optimizer ([`~torch.optim.Optimizer`]):
- The optimizer for which to schedule the learning rate.
- kwargs (`dict`, *optional*):
- Extra parameters to be passed to the scheduler. See `torch.optim.lr_scheduler.ReduceLROnPlateau`
- for possible parameters.
- Return:
- `torch.optim.lr_scheduler.ReduceLROnPlateau` with the appropriate schedule.
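- Example (a minimal sketch; the model, the `patience`/`factor` kwargs and the stand-in validation metric are placeholders):
- ```python
- import torch
- model = torch.nn.Linear(8, 2)  # placeholder model
- optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
- scheduler = get_reduce_on_plateau_schedule(optimizer, patience=2, factor=0.5)
- for epoch in range(10):
-     val_loss = 1.0 / (epoch + 1)  # stand-in for a real validation metric
-     scheduler.step(val_loss)  # unlike the other schedulers, step() takes the monitored metric
- ```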
- """
- return ReduceLROnPlateau(optimizer, **kwargs)
- def _get_constant_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int):
- if current_step < num_warmup_steps:
- return float(current_step) / float(max(1.0, num_warmup_steps))
- return 1.0
- def get_constant_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: int, last_epoch: int = -1):
- """
- Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate
- increases linearly between 0 and the initial lr set in the optimizer.
- Args:
- optimizer ([`~torch.optim.Optimizer`]):
- The optimizer for which to schedule the learning rate.
- num_warmup_steps (`int`):
- The number of steps for the warmup phase.
- last_epoch (`int`, *optional*, defaults to -1):
- The index of the last epoch when resuming training.
- Return:
- `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
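- Example (a minimal sketch; the model and the warmup length are placeholders):
- ```python
- import torch
- model = torch.nn.Linear(8, 2)  # placeholder model
- optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
- scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=100)
- # Multiplier: 0.0 at step 0, 0.5 at step 50, and 1.0 from step 100 onwards.
- ```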
- """
- lr_lambda = partial(_get_constant_schedule_with_warmup_lr_lambda, num_warmup_steps=num_warmup_steps)
- return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
- def _get_linear_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int, num_training_steps: int):
- if current_step < num_warmup_steps:
- return float(current_step) / float(max(1, num_warmup_steps))
- return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)))
- def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
- """
- Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
- a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
- Args:
- optimizer ([`~torch.optim.Optimizer`]):
- The optimizer for which to schedule the learning rate.
- num_warmup_steps (`int`):
- The number of steps for the warmup phase.
- num_training_steps (`int`):
- The total number of training steps.
- last_epoch (`int`, *optional*, defaults to -1):
- The index of the last epoch when resuming training.
- Return:
- `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
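- Example (a minimal sketch of the usual per-step usage; the model and step counts are placeholders):
- ```python
- import torch
- model = torch.nn.Linear(8, 2)  # placeholder model
- optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
- scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=1000)
- for step in range(1000):
-     # forward / backward / optimizer.step() would go here
-     scheduler.step()  # called once per training step, after optimizer.step()
- # The lr rises linearly to 1e-3 over the first 100 steps, then decays linearly to 0 at step 1000.
- ```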
- """
- lr_lambda = partial(
- _get_linear_schedule_with_warmup_lr_lambda,
- num_warmup_steps=num_warmup_steps,
- num_training_steps=num_training_steps,
- )
- return LambdaLR(optimizer, lr_lambda, last_epoch)
- def _get_cosine_schedule_with_warmup_lr_lambda(
- current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float
- ):
- if current_step < num_warmup_steps:
- return float(current_step) / float(max(1, num_warmup_steps))
- progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
- return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
- def get_cosine_schedule_with_warmup(
- optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, last_epoch: int = -1
- ):
- """
- Create a schedule with a learning rate that decreases following the values of the cosine function from the
- initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
- initial lr set in the optimizer.
- Args:
- optimizer ([`~torch.optim.Optimizer`]):
- The optimizer for which to schedule the learning rate.
- num_warmup_steps (`int`):
- The number of steps for the warmup phase.
- num_training_steps (`int`):
- The total number of training steps.
- num_cycles (`float`, *optional*, defaults to 0.5):
- The number of waves in the cosine schedule (the default is to just decrease from the max value to 0
- following a half-cosine).
- last_epoch (`int`, *optional*, defaults to -1):
- The index of the last epoch when resuming training.
- Return:
- `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
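- Example (a minimal sketch; the model and step counts are placeholders):
- ```python
- import torch
- model = torch.nn.Linear(8, 2)  # placeholder model
- optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
- scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=1000)
- # With the default num_cycles=0.5 the multiplier follows half a cosine wave after warmup:
- # 1.0 at step 100, roughly 0.5 at step 550, and 0.0 at step 1000.
- ```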
- """
- lr_lambda = partial(
- _get_cosine_schedule_with_warmup_lr_lambda,
- num_warmup_steps=num_warmup_steps,
- num_training_steps=num_training_steps,
- num_cycles=num_cycles,
- )
- return LambdaLR(optimizer, lr_lambda, last_epoch)
- def _get_cosine_with_hard_restarts_schedule_with_warmup_lr_lambda(
- current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: int
- ):
- if current_step < num_warmup_steps:
- return float(current_step) / float(max(1, num_warmup_steps))
- progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
- if progress >= 1.0:
- return 0.0
- return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))))
- def get_cosine_with_hard_restarts_schedule_with_warmup(
- optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: int = 1, last_epoch: int = -1
- ):
- """
- Create a schedule with a learning rate that decreases following the values of the cosine function from the
- initial lr set in the optimizer to 0, with several hard restarts, after a warmup period during which it increases
- linearly between 0 and the initial lr set in the optimizer.
- Args:
- optimizer ([`~torch.optim.Optimizer`]):
- The optimizer for which to schedule the learning rate.
- num_warmup_steps (`int`):
- The number of steps for the warmup phase.
- num_training_steps (`int`):
- The total number of training steps.
- num_cycles (`int`, *optional*, defaults to 1):
- The number of hard restarts to use.
- last_epoch (`int`, *optional*, defaults to -1):
- The index of the last epoch when resuming training.
- Return:
- `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
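- Example (a minimal sketch; the model and step counts are placeholders):
- ```python
- import torch
- model = torch.nn.Linear(8, 2)  # placeholder model
- optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
- scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
-     optimizer, num_warmup_steps=100, num_training_steps=1000, num_cycles=3
- )
- # After warmup the multiplier decays from 1.0 towards 0.0 three times, jumping back to 1.0
- # at the start of each cycle (around steps 100, 400 and 700).
- ```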
- """
- lr_lambda = partial(
- _get_cosine_with_hard_restarts_schedule_with_warmup_lr_lambda,
- num_warmup_steps=num_warmup_steps,
- num_training_steps=num_training_steps,
- num_cycles=num_cycles,
- )
- return LambdaLR(optimizer, lr_lambda, last_epoch)
- def _get_polynomial_decay_schedule_with_warmup_lr_lambda(
- current_step: int,
- *,
- num_warmup_steps: int,
- num_training_steps: int,
- lr_end: float,
- power: float,
- lr_init: int,
- ):
- if current_step < num_warmup_steps:
- return float(current_step) / float(max(1, num_warmup_steps))
- elif current_step > num_training_steps:
- return lr_end / lr_init # as LambdaLR multiplies by lr_init
- else:
- lr_range = lr_init - lr_end
- decay_steps = num_training_steps - num_warmup_steps
- pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps
- decay = lr_range * pct_remaining**power + lr_end
- return decay / lr_init # as LambdaLR multiplies by lr_init
- def get_polynomial_decay_schedule_with_warmup(
- optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1
- ):
- """
- Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the
- optimizer to the end lr defined by *lr_end*, after a warmup period during which it increases linearly from 0 to the
- initial lr set in the optimizer.
- Args:
- optimizer ([`~torch.optim.Optimizer`]):
- The optimizer for which to schedule the learning rate.
- num_warmup_steps (`int`):
- The number of steps for the warmup phase.
- num_training_steps (`int`):
- The total number of training steps.
- lr_end (`float`, *optional*, defaults to 1e-7):
- The final learning rate at the end of the polynomial decay.
- power (`float`, *optional*, defaults to 1.0):
- The power of the polynomial decay.
- last_epoch (`int`, *optional*, defaults to -1):
- The index of the last epoch when resuming training.
- Note: *power* defaults to 1.0 as in the fairseq implementation, which in turn is based on the original BERT
- implementation at
- https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37
- Return:
- `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
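- Example (a minimal sketch; the model, step counts and decay settings are placeholders):
- ```python
- import torch
- model = torch.nn.Linear(8, 2)  # placeholder model
- optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
- scheduler = get_polynomial_decay_schedule_with_warmup(
-     optimizer, num_warmup_steps=100, num_training_steps=1000, lr_end=1e-5, power=2.0
- )
- # After warmup the lr decays from 1e-3 down to lr_end=1e-5, the gap above lr_end shrinking
- # as (1 - progress) ** 2, and stays at lr_end once step 1000 has been passed.
- ```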
- """
- lr_init = optimizer.defaults["lr"]
- if not (lr_init > lr_end):
- raise ValueError(f"lr_end ({lr_end}) must be smaller than initial lr ({lr_init})")
- lr_lambda = partial(
- _get_polynomial_decay_schedule_with_warmup_lr_lambda,
- num_warmup_steps=num_warmup_steps,
- num_training_steps=num_training_steps,
- lr_end=lr_end,
- power=power,
- lr_init=lr_init,
- )
- return LambdaLR(optimizer, lr_lambda, last_epoch)
- def _get_inverse_sqrt_schedule_lr_lambda(current_step: int, *, num_warmup_steps: int, timescale: Optional[int] = None):
- if current_step < num_warmup_steps:
- return float(current_step) / float(max(1, num_warmup_steps))
- shift = timescale - num_warmup_steps
- decay = 1.0 / math.sqrt((current_step + shift) / timescale)
- return decay
- def get_inverse_sqrt_schedule(
- optimizer: Optimizer, num_warmup_steps: int, timescale: Optional[int] = None, last_epoch: int = -1
- ):
- """
- Create a schedule with an inverse square-root learning rate, from the initial lr set in the optimizer, after a
- warmup period during which the lr increases linearly from 0 to the initial lr set in the optimizer.
- Args:
- optimizer ([`~torch.optim.Optimizer`]):
- The optimizer for which to schedule the learning rate.
- num_warmup_steps (`int`):
- The number of steps for the warmup phase.
- timescale (`int`, *optional*, defaults to `num_warmup_steps`, or 10,000 if `num_warmup_steps` is 0):
- The time scale that controls how quickly the learning rate decays after warmup.
- last_epoch (`int`, *optional*, defaults to -1):
- The index of the last epoch when resuming training.
- Return:
- `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
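- Example (a minimal sketch; the model and warmup length are placeholders):
- ```python
- import torch
- model = torch.nn.Linear(8, 2)  # placeholder model
- optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
- scheduler = get_inverse_sqrt_schedule(optimizer, num_warmup_steps=100)
- # timescale defaults to num_warmup_steps (100 here), so after warmup the multiplier is
- # sqrt(timescale / step): 1.0 at step 100, 0.5 at step 400, about 0.32 at step 1000.
- ```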
- """
- # Note: this implementation is adapted from
- # https://github.com/google-research/big_vision/blob/f071ce68852d56099437004fd70057597a95f6ef/big_vision/utils.py#L930
- if timescale is None:
- timescale = num_warmup_steps or 10_000
- lr_lambda = partial(_get_inverse_sqrt_schedule_lr_lambda, num_warmup_steps=num_warmup_steps, timescale=timescale)
- return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
- def _get_cosine_schedule_with_warmup_lr_lambda(
- current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float, min_lr_rate: float = 0.0
- ):
- if current_step < num_warmup_steps:
- return float(current_step) / float(max(1, num_warmup_steps))
- progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
- factor = 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
- factor = factor * (1 - min_lr_rate) + min_lr_rate
- return max(0, factor)
- def get_cosine_with_min_lr_schedule_with_warmup(
- optimizer: Optimizer,
- num_warmup_steps: int,
- num_training_steps: int,
- num_cycles: float = 0.5,
- last_epoch: int = -1,
- min_lr: Optional[float] = None,
- min_lr_rate: Optional[float] = None,
- ):
- """
- Create a schedule with a learning rate that decreases following the values of the cosine function from the
- initial lr set in the optimizer to min_lr, after a warmup period during which it increases linearly between 0 and the
- initial lr set in the optimizer.
- Args:
- optimizer ([`~torch.optim.Optimizer`]):
- The optimizer for which to schedule the learning rate.
- num_warmup_steps (`int`):
- The number of steps for the warmup phase.
- num_training_steps (`int`):
- The total number of training steps.
- num_cycles (`float`, *optional*, defaults to 0.5):
- The number of waves in the cosine schedule (the default is to just decrease from the max value to 0
- following a half-cosine).
- last_epoch (`int`, *optional*, defaults to -1):
- The index of the last epoch when resuming training.
- min_lr (`float`, *optional*):
- The minimum learning rate to reach after the cosine schedule.
- min_lr_rate (`float`, *optional*):
- The minimum learning rate as a ratio of the initial learning rate. If set, `min_lr` should not be set.
- Return:
- `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
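- Example (a minimal sketch; the model, step counts and min_lr value are placeholders):
- ```python
- import torch
- model = torch.nn.Linear(8, 2)  # placeholder model
- optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
- scheduler = get_cosine_with_min_lr_schedule_with_warmup(
-     optimizer, num_warmup_steps=100, num_training_steps=1000, min_lr=1e-4
- )
- # min_lr=1e-4 with an initial lr of 1e-3 is equivalent to min_lr_rate=0.1: the cosine decay
- # bottoms out at 1e-4 instead of 0. Passing both min_lr and min_lr_rate raises a ValueError.
- ```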
- """
- if min_lr is not None and min_lr_rate is not None:
- raise ValueError("Only one of min_lr or min_lr_rate should be set")
- elif min_lr is not None:
- min_lr_rate = min_lr / optimizer.defaults["lr"]
- elif min_lr_rate is None:
- raise ValueError("One of min_lr or min_lr_rate should be set through the `lr_scheduler_kwargs`")
- lr_lambda = partial(
- _get_cosine_schedule_with_warmup_lr_lambda,
- num_warmup_steps=num_warmup_steps,
- num_training_steps=num_training_steps,
- num_cycles=num_cycles,
- min_lr_rate=min_lr_rate,
- )
- return LambdaLR(optimizer, lr_lambda, last_epoch)
- def _get_wsd_scheduler_lambda(
- current_step: int,
- *,
- num_warmup_steps: int,
- num_stable_steps: int,
- num_decay_steps: int,
- num_cycles: float,
- min_lr_ratio: float,
- ):
- if current_step < num_warmup_steps:
- return float(current_step) / float(max(1, num_warmup_steps))
- if current_step < num_warmup_steps + num_stable_steps:
- return 1.0
- if current_step < num_warmup_steps + num_stable_steps + num_decay_steps:
- progress = float(current_step - num_warmup_steps - num_stable_steps) / float(max(1, num_decay_steps))
- value = max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
- return (1.0 - min_lr_ratio) * value + min_lr_ratio
- return min_lr_ratio
- def get_wsd_schedule(
- optimizer: Optimizer,
- num_warmup_steps: int,
- num_stable_steps: int,
- num_decay_steps: int,
- min_lr_ratio: float = 0,
- num_cycles: float = 0.5,
- last_epoch: int = -1,
- ):
- """
- Create a schedule with a learning rate that has three stages:
- 1. linear increase from 0 to initial lr.
- 2. constant lr (equal to initial lr).
- 3. decrease following the values of the cosine function from the initial lr set in the optimizer to
- a fraction of the initial lr.
- Args:
- optimizer ([`~torch.optim.Optimizer`]):
- The optimizer for which to schedule the learning rate.
- num_warmup_steps (`int`):
- The number of steps for the warmup phase.
- num_stable_steps (`int`):
- The number of steps for the stable phase.
- num_decay_steps (`int`):
- The number of steps for the cosine annealing phase.
- min_lr_ratio (`float`, *optional*, defaults to 0):
- The minimum learning rate as a ratio of the initial learning rate.
- num_cycles (`float`, *optional*, defaults to 0.5):
- The number of waves in the cosine schedule (the default is to just decrease from the max value to 0
- following a half-cosine).
- last_epoch (`int`, *optional*, defaults to -1):
- The index of the last epoch when resuming training.
- Return:
- `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
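- Example (a minimal sketch; the model and the phase lengths are placeholders):
- ```python
- import torch
- model = torch.nn.Linear(8, 2)  # placeholder model
- optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
- scheduler = get_wsd_schedule(
-     optimizer, num_warmup_steps=100, num_stable_steps=800, num_decay_steps=100, min_lr_ratio=0.1
- )
- # Steps 0-99: linear warmup towards 1e-3; steps 100-899: constant at 1e-3;
- # steps 900-999: cosine decay down to min_lr_ratio * 1e-3 = 1e-4, where the lr then stays.
- ```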
- """
- lr_lambda = partial(
- _get_wsd_scheduler_lambda,
- num_warmup_steps=num_warmup_steps,
- num_stable_steps=num_stable_steps,
- num_decay_steps=num_decay_steps,
- min_lr_ratio=min_lr_ratio,
- num_cycles=num_cycles,
- )
- return LambdaLR(optimizer, lr_lambda, last_epoch)
- TYPE_TO_SCHEDULER_FUNCTION = {
- SchedulerType.LINEAR: get_linear_schedule_with_warmup,
- SchedulerType.COSINE: get_cosine_schedule_with_warmup,
- SchedulerType.COSINE_WITH_RESTARTS: get_cosine_with_hard_restarts_schedule_with_warmup,
- SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup,
- SchedulerType.CONSTANT: get_constant_schedule,
- SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup,
- SchedulerType.INVERSE_SQRT: get_inverse_sqrt_schedule,
- SchedulerType.REDUCE_ON_PLATEAU: get_reduce_on_plateau_schedule,
- SchedulerType.COSINE_WITH_MIN_LR: get_cosine_with_min_lr_schedule_with_warmup,
- SchedulerType.WARMUP_STABLE_DECAY: get_wsd_schedule,
- }
- def get_scheduler(
- name: Union[str, SchedulerType],
- optimizer: Optimizer,
- num_warmup_steps: Optional[int] = None,
- num_training_steps: Optional[int] = None,
- scheduler_specific_kwargs: Optional[dict] = None,
- ):
- """
- Unified API to get any scheduler from its name.
- Args:
- name (`str` or `SchedulerType`):
- The name of the scheduler to use.
- optimizer (`torch.optim.Optimizer`):
- The optimizer that will be used during training.
- num_warmup_steps (`int`, *optional*):
- The number of warmup steps to do. This is not required by all schedulers (hence the argument being
- optional); the function will raise an error if it's unset and the scheduler type requires it.
- num_training_steps (`int`, *optional*):
- The number of training steps to do. This is not required by all schedulers (hence the argument being
- optional); the function will raise an error if it's unset and the scheduler type requires it.
- scheduler_specific_kwargs (`dict`, *optional*):
- Extra parameters for schedulers such as cosine with restarts. Mismatched scheduler types and scheduler
- parameters will cause the scheduler function to raise a TypeError.
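- Example (a minimal sketch; the model, the step counts and the extra kwargs are placeholders):
- ```python
- import torch
- model = torch.nn.Linear(8, 2)  # placeholder model
- optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
- scheduler = get_scheduler(
-     "cosine",  # or the corresponding `SchedulerType` member
-     optimizer,
-     num_warmup_steps=100,
-     num_training_steps=1000,
-     scheduler_specific_kwargs={"num_cycles": 0.5},
- )
- ```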
- """
- name = SchedulerType(name)
- schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name]
- # If a `LayerWiseDummyOptimizer` is passed we extract the optimizer dict and
- # recursively call `get_scheduler` to get the proper schedulers on each parameter
- if optimizer is not None and isinstance(optimizer, LayerWiseDummyOptimizer):
- optimizer_dict = optimizer.optimizer_dict
- scheduler_dict = {}
- for param in optimizer_dict.keys():
- scheduler_dict[param] = get_scheduler(
- name,
- optimizer=optimizer_dict[param],
- num_warmup_steps=num_warmup_steps,
- num_training_steps=num_training_steps,
- )
- def scheduler_hook(param):
- # Since the optimizer hook has already been attached we only need to
- # attach the scheduler hook; the gradients have been zeroed here
- scheduler_dict[param].step()
- for param in optimizer_dict.keys():
- if param.requires_grad:
- param.register_post_accumulate_grad_hook(scheduler_hook)
- return LayerWiseDummyScheduler(optimizer_dict=optimizer_dict, lr=optimizer.defaults["lr"])
- if name == SchedulerType.CONSTANT:
- return schedule_func(optimizer)
- if scheduler_specific_kwargs is None:
- scheduler_specific_kwargs = {}
- if name == SchedulerType.REDUCE_ON_PLATEAU:
- return schedule_func(optimizer, **scheduler_specific_kwargs)
- # All other schedulers require `num_warmup_steps`
- if num_warmup_steps is None:
- raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
- if name == SchedulerType.CONSTANT_WITH_WARMUP:
- return schedule_func(optimizer, num_warmup_steps=num_warmup_steps)
- if name == SchedulerType.INVERSE_SQRT:
- return schedule_func(optimizer, num_warmup_steps=num_warmup_steps)
- if name == SchedulerType.WARMUP_STABLE_DECAY:
- return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, **scheduler_specific_kwargs)
- # All other schedulers require `num_training_steps`
- if num_training_steps is None:
- raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")
- return schedule_func(
- optimizer,
- num_warmup_steps=num_warmup_steps,
- num_training_steps=num_training_steps,
- **scheduler_specific_kwargs,
- )
- class AdamW(Optimizer):
- """
- Implements the Adam algorithm with the weight decay fix as introduced in [Decoupled Weight Decay
- Regularization](https://arxiv.org/abs/1711.05101).
- Parameters:
- params (`Iterable[nn.parameter.Parameter]`):
- Iterable of parameters to optimize or dictionaries defining parameter groups.
- lr (`float`, *optional*, defaults to 0.001):
- The learning rate to use.
- betas (`Tuple[float,float]`, *optional*, defaults to `(0.9, 0.999)`):
- Adam's betas parameters (b1, b2).
- eps (`float`, *optional*, defaults to 1e-06):
- Adam's epsilon for numerical stability.
- weight_decay (`float`, *optional*, defaults to 0.0):
- Decoupled weight decay to apply.
- correct_bias (`bool`, *optional*, defaults to `True`):
- Whether or not to correct bias in Adam (for instance, in the BERT TF repository they use `False`).
- no_deprecation_warning (`bool`, *optional*, defaults to `False`):
- A flag used to disable the deprecation warning (set to `True` to disable the warning).
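- Example (a minimal sketch; the model and hyperparameters are placeholders, and the warning is
- silenced explicitly because `torch.optim.AdamW` is the recommended replacement):
- ```python
- import torch
- model = torch.nn.Linear(8, 2)  # placeholder model
- optimizer = AdamW(model.parameters(), lr=1e-3, weight_decay=0.01, no_deprecation_warning=True)
- loss = model(torch.randn(4, 8)).pow(2).mean()  # dummy loss
- loss.backward()
- optimizer.step()
- optimizer.zero_grad()
- ```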
- """
- def __init__(
- self,
- params: Iterable[nn.parameter.Parameter],
- lr: float = 1e-3,
- betas: Tuple[float, float] = (0.9, 0.999),
- eps: float = 1e-6,
- weight_decay: float = 0.0,
- correct_bias: bool = True,
- no_deprecation_warning: bool = False,
- ):
- if not no_deprecation_warning:
- warnings.warn(
- "This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch"
- " implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this"
- " warning",
- FutureWarning,
- )
- require_version("torch>=1.5.0") # add_ with alpha
- if lr < 0.0:
- raise ValueError(f"Invalid learning rate: {lr} - should be >= 0.0")
- if not 0.0 <= betas[0] < 1.0:
- raise ValueError(f"Invalid beta parameter: {betas[0]} - should be in [0.0, 1.0)")
- if not 0.0 <= betas[1] < 1.0:
- raise ValueError(f"Invalid beta parameter: {betas[1]} - should be in [0.0, 1.0)")
- if not 0.0 <= eps:
- raise ValueError(f"Invalid epsilon value: {eps} - should be >= 0.0")
- defaults = {"lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay, "correct_bias": correct_bias}
- super().__init__(params, defaults)
- @torch.no_grad()
- def step(self, closure: Optional[Callable] = None):
- """
- Performs a single optimization step.
- Arguments:
- closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss.
- """
- loss = None
- if closure is not None:
- loss = closure()
- for group in self.param_groups:
- for p in group["params"]:
- if p.grad is None:
- continue
- grad = p.grad
- if grad.is_sparse:
- raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead")
- state = self.state[p]
- # State initialization
- if len(state) == 0:
- state["step"] = 0
- # Exponential moving average of gradient values
- state["exp_avg"] = torch.zeros_like(p)
- # Exponential moving average of squared gradient values
- state["exp_avg_sq"] = torch.zeros_like(p)
- exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
- beta1, beta2 = group["betas"]
- state["step"] += 1
- # Decay the first and second moment running average coefficient
- # In-place operations to update the averages at the same time
- exp_avg.mul_(beta1).add_(grad, alpha=(1.0 - beta1))
- exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)
- denom = exp_avg_sq.sqrt().add_(group["eps"])
- step_size = group["lr"]
- if group["correct_bias"]: # No bias correction for Bert
- bias_correction1 = 1.0 - beta1 ** state["step"]
- bias_correction2 = 1.0 - beta2 ** state["step"]
- step_size = step_size * math.sqrt(bias_correction2) / bias_correction1
- p.addcdiv_(exp_avg, denom, value=-step_size)
- # Just adding the square of the weights to the loss function is *not*
- # the correct way of using L2 regularization/weight decay with Adam,
- # since that will interact with the m and v parameters in strange ways.
- #
- # Instead we want to decay the weights in a manner that doesn't interact
- # with the m/v parameters. This is equivalent to adding the square
- # of the weights to the loss with plain (non-momentum) SGD.
- # Add weight decay at the end (fixed version)
- if group["weight_decay"] > 0.0:
- p.add_(p, alpha=(-group["lr"] * group["weight_decay"]))
- return loss
- class Adafactor(Optimizer):
- """
- AdaFactor PyTorch implementation that can be used as a drop-in replacement for Adam. Original fairseq code:
- https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py
- Paper: *Adafactor: Adaptive Learning Rates with Sublinear Memory Cost* (https://arxiv.org/abs/1804.04235). Note that
- this optimizer internally adjusts the learning rate depending on the `scale_parameter`, `relative_step` and
- `warmup_init` options. To use a manual (external) learning rate schedule you should set `scale_parameter=False` and
- `relative_step=False`.
- Arguments:
- params (`Iterable[nn.parameter.Parameter]`):
- Iterable of parameters to optimize or dictionaries defining parameter groups.
- lr (`float`, *optional*):
- The external learning rate.
- eps (`Tuple[float, float]`, *optional*, defaults to `(1e-30, 0.001)`):
- Regularization constants for square gradient and parameter scale respectively
- clip_threshold (`float`, *optional*, defaults to 1.0):
- Threshold of root mean square of final gradient update
- decay_rate (`float`, *optional*, defaults to -0.8):
- Coefficient used to compute running averages of square
- beta1 (`float`, *optional*):
- Coefficient used for computing running averages of gradient
- weight_decay (`float`, *optional*, defaults to 0.0):
- Weight decay (L2 penalty)
- scale_parameter (`bool`, *optional*, defaults to `True`):
- If True, learning rate is scaled by root mean square
- relative_step (`bool`, *optional*, defaults to `True`):
- If True, time-dependent learning rate is computed instead of external learning rate
- warmup_init (`bool`, *optional*, defaults to `False`):
- Time-dependent learning rate computation depends on whether warm-up initialization is being used
- This implementation handles low-precision (FP16, bfloat16) values, but it has not been thoroughly tested.
- Recommended T5 finetuning settings (https://discuss.huggingface.co/t/t5-finetuning-tips/684/3):
- - Training without LR warmup or clip_threshold is not recommended.
- use a scheduled LR warm-up to a fixed LR
- - use clip_threshold=1.0 (https://arxiv.org/abs/1804.04235)
- - Disable relative updates
- - Use scale_parameter=False
- - Additional optimizer operations like gradient clipping should not be used alongside Adafactor
- Example:
- ```python
- Adafactor(model.parameters(), scale_parameter=False, relative_step=False, warmup_init=False, lr=1e-3)
- ```
- Others reported the following combination to work well:
- ```python
- Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
- ```
- When using `lr=None` with [`Trainer`] you will most likely need to use the [`~optimization.AdafactorSchedule`]
- scheduler as follows:
- ```python
- from transformers.optimization import Adafactor, AdafactorSchedule
- optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
- lr_scheduler = AdafactorSchedule(optimizer)
- trainer = Trainer(..., optimizers=(optimizer, lr_scheduler))
- ```
- Usage:
- ```python
- # replace AdamW with Adafactor
- optimizer = Adafactor(
- model.parameters(),
- lr=1e-3,
- eps=(1e-30, 1e-3),
- clip_threshold=1.0,
- decay_rate=-0.8,
- beta1=None,
- weight_decay=0.0,
- relative_step=False,
- scale_parameter=False,
- warmup_init=False,
- )
- ```"""
- def __init__(
- self,
- params,
- lr=None,
- eps=(1e-30, 1e-3),
- clip_threshold=1.0,
- decay_rate=-0.8,
- beta1=None,
- weight_decay=0.0,
- scale_parameter=True,
- relative_step=True,
- warmup_init=False,
- ):
- require_version("torch>=1.5.0") # add_ with alpha
- if lr is not None and relative_step:
- raise ValueError("Cannot combine manual `lr` and `relative_step=True` options")
- if warmup_init and not relative_step:
- raise ValueError("`warmup_init=True` requires `relative_step=True`")
- defaults = {
- "lr": lr,
- "eps": eps,
- "clip_threshold": clip_threshold,
- "decay_rate": decay_rate,
- "beta1": beta1,
- "weight_decay": weight_decay,
- "scale_parameter": scale_parameter,
- "relative_step": relative_step,
- "warmup_init": warmup_init,
- }
- super().__init__(params, defaults)
- @staticmethod
- def _get_lr(param_group, param_state):
- rel_step_sz = param_group["lr"]
- if param_group["relative_step"]:
- min_step = 1e-6 * param_state["step"] if param_group["warmup_init"] else 1e-2
- rel_step_sz = min(min_step, 1.0 / math.sqrt(param_state["step"]))
- param_scale = 1.0
- if param_group["scale_parameter"]:
- param_scale = max(param_group["eps"][1], param_state["RMS"])
- return param_scale * rel_step_sz
- @staticmethod
- def _get_options(param_group, param_shape):
- factored = len(param_shape) >= 2
- use_first_moment = param_group["beta1"] is not None
- return factored, use_first_moment
- @staticmethod
- def _rms(tensor):
- return tensor.norm(2) / (tensor.numel() ** 0.5)
- @staticmethod
- def _approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col):
- # Copied from fairseq's Adafactor implementation:
- # https://github.com/huggingface/transformers/blob/8395f14de6068012787d83989c3627c3df6a252b/src/transformers/optimization.py#L505
- r_factor = (exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1, keepdim=True)).rsqrt_().unsqueeze(-1)
- c_factor = exp_avg_sq_col.unsqueeze(-2).rsqrt()
- return torch.mul(r_factor, c_factor)
- @torch.no_grad()
- def step(self, closure=None):
- """
- Performs a single optimization step.
- Arguments:
- closure (callable, optional): A closure that reevaluates the model
- and returns the loss.
- """
- loss = None
- if closure is not None:
- loss = closure()
- for group in self.param_groups:
- for p in group["params"]:
- if p.grad is None:
- continue
- grad = p.grad
- if grad.dtype in {torch.float16, torch.bfloat16}:
- grad = grad.float()
- if grad.is_sparse:
- raise RuntimeError("Adafactor does not support sparse gradients.")
- state = self.state[p]
- grad_shape = grad.shape
- factored, use_first_moment = self._get_options(group, grad_shape)
- # State Initialization
- if len(state) == 0:
- state["step"] = 0
- if use_first_moment:
- # Exponential moving average of gradient values
- state["exp_avg"] = torch.zeros_like(grad)
- if factored:
- state["exp_avg_sq_row"] = torch.zeros(grad_shape[:-1]).to(grad)
- state["exp_avg_sq_col"] = torch.zeros(grad_shape[:-2] + grad_shape[-1:]).to(grad)
- else:
- state["exp_avg_sq"] = torch.zeros_like(grad)
- state["RMS"] = 0
- else:
- if use_first_moment:
- state["exp_avg"] = state["exp_avg"].to(grad)
- if factored:
- state["exp_avg_sq_row"] = state["exp_avg_sq_row"].to(grad)
- state["exp_avg_sq_col"] = state["exp_avg_sq_col"].to(grad)
- else:
- state["exp_avg_sq"] = state["exp_avg_sq"].to(grad)
- p_data_fp32 = p
- if p.dtype in {torch.float16, torch.bfloat16}:
- p_data_fp32 = p_data_fp32.float()
- state["step"] += 1
- state["RMS"] = self._rms(p_data_fp32)
- lr = self._get_lr(group, state)
- beta2t = 1.0 - math.pow(state["step"], group["decay_rate"])
- update = (grad**2) + group["eps"][0]
- if factored:
- exp_avg_sq_row = state["exp_avg_sq_row"]
- exp_avg_sq_col = state["exp_avg_sq_col"]
- exp_avg_sq_row.mul_(beta2t).add_(update.mean(dim=-1), alpha=(1.0 - beta2t))
- exp_avg_sq_col.mul_(beta2t).add_(update.mean(dim=-2), alpha=(1.0 - beta2t))
- # Approximation of exponential moving average of square of gradient
- update = self._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col)
- update.mul_(grad)
- else:
- exp_avg_sq = state["exp_avg_sq"]
- exp_avg_sq.mul_(beta2t).add_(update, alpha=(1.0 - beta2t))
- update = exp_avg_sq.rsqrt().mul_(grad)
- update.div_((self._rms(update) / group["clip_threshold"]).clamp_(min=1.0))
- update.mul_(lr)
- if use_first_moment:
- exp_avg = state["exp_avg"]
- exp_avg.mul_(group["beta1"]).add_(update, alpha=(1 - group["beta1"]))
- update = exp_avg
- if group["weight_decay"] != 0:
- p_data_fp32.add_(p_data_fp32, alpha=(-group["weight_decay"] * lr))
- p_data_fp32.add_(-update)
- if p.dtype in {torch.float16, torch.bfloat16}:
- p.copy_(p_data_fp32)
- return loss
- class AdafactorSchedule(LambdaLR):
- """
- Since [`~optimization.Adafactor`] performs its own scheduling, if the training loop relies on a scheduler (e.g.,
- for logging), this class creates a proxy object that retrieves the current lr values from the optimizer.
- It returns `initial_lr` during startup and the actual `lr` during stepping.
- """
- def __init__(self, optimizer, initial_lr=0.0):
- def lr_lambda(_):
- return initial_lr
- for group in optimizer.param_groups:
- group["initial_lr"] = initial_lr
- super().__init__(optimizer, lr_lambda)
- for group in optimizer.param_groups:
- del group["initial_lr"]
- def get_lr(self):
- opt = self.optimizer
- lrs = [
- opt._get_lr(group, opt.state[group["params"][0]])
- for group in opt.param_groups
- if group["params"][0].grad is not None
- ]
- if len(lrs) == 0:
- lrs = self.base_lrs # if called before stepping
- return lrs
- def get_adafactor_schedule(optimizer, initial_lr=0.0):
- """
- Get a proxy schedule for [`~optimization.Adafactor`]
- Args:
- optimizer ([`~torch.optim.Optimizer`]):
- The optimizer for which to schedule the learning rate.
- initial_lr (`float`, *optional*, defaults to 0.0):
- The initial learning rate.
- Return:
- [`~optimization.Adafactor`] proxy schedule object.
- """
- return AdafactorSchedule(optimizer, initial_lr)