checkpoint.py 59 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430
  1. # mypy: allow-untyped-defs
  2. import contextlib
  3. import platform
  4. import uuid
  5. import warnings
  6. import weakref
  7. from collections import defaultdict
  8. from itertools import count
  9. from typing import (
  10. Any,
  11. Callable,
  12. ContextManager,
  13. DefaultDict,
  14. Dict,
  15. Iterable,
  16. List,
  17. Optional,
  18. Tuple,
  19. )
  20. from weakref import ReferenceType
  21. import torch
  22. import torch.fx.traceback as fx_traceback
  23. from torch._functorch._aot_autograd.functional_utils import is_fun
  24. from torch.utils._pytree import tree_map
  25. from torch.testing._internal.logging_tensor import capture_logs, LoggingTensorMode
  26. from torch.utils._python_dispatch import TorchDispatchMode
# Names exported by ``from <this module> import *``.
__all__ = [
    "checkpoint",
    "checkpoint_sequential",
    "CheckpointError",
    "CheckpointFunction",
    "check_backward_validity",
    "detach_variable",
    "get_device_states",
    "set_device_states",
    "noop_context_fn",
    "set_checkpoint_early_stop",
    "DefaultDeviceType",
    "set_checkpoint_debug_enabled",
]

# Default value of the ``determinism_check`` argument of ``checkpoint``.
_DEFAULT_DETERMINISM_MODE = "default"

# Module-wide debug override, written by ``set_checkpoint_debug_enabled``.
# ``None`` means no override is active.
_checkpoint_debug_enabled: Optional[bool] = None
  43. @contextlib.contextmanager
  44. def set_checkpoint_debug_enabled(enabled: Optional[bool]):
  45. """
  46. Context manager that sets whether checkpoint should print additional debug
  47. information when running. See the ``debug`` flag for
  48. :func:`~torch.utils.checkpoint.checkpoint` for more information. Note that
  49. when set, this context manager overrides the value of ``debug`` passed to
  50. checkpoint. To defer to the local setting, pass ``None`` to this context.
  51. Args:
  52. enabled (bool): Whether checkpoint should print debug information.
  53. Default is 'None'.
  54. """
  55. global _checkpoint_debug_enabled
  56. try:
  57. prev = _checkpoint_debug_enabled
  58. _checkpoint_debug_enabled = enabled
  59. yield
  60. finally:
  61. _checkpoint_debug_enabled = prev
  62. def detach_variable(inputs: Tuple[Any, ...]) -> Tuple[torch.Tensor, ...]:
  63. if isinstance(inputs, tuple):
  64. out = []
  65. for inp in inputs:
  66. if not isinstance(inp, torch.Tensor):
  67. out.append(inp)
  68. continue
  69. x = inp.detach()
  70. x.requires_grad = inp.requires_grad
  71. out.append(x)
  72. return tuple(out)
  73. else:
  74. raise RuntimeError(
  75. "Only tuple of tensors is supported. Got Unsupported input type: ",
  76. type(inputs).__name__,
  77. )
  78. def check_backward_validity(inputs: Iterable[Any]) -> None:
  79. if not any(inp.requires_grad for inp in inputs if isinstance(inp, torch.Tensor)):
  80. warnings.warn(
  81. "None of the inputs have requires_grad=True. Gradients will be None"
  82. )
  83. def _get_device_module(device="cuda"):
  84. device_module = getattr(torch, device)
  85. return device_module
  86. class DefaultDeviceType:
  87. r"""
  88. A class that manages the default device type for checkpointing.
  89. If no non-CPU tensors are present, the default device type will
  90. be used. The default value is 'cuda'. The device type is used in
  91. the checkpointing process when determining which device states
  92. to save and restore for recomputation.
  93. """
  94. _default_device_type = "cuda"
  95. @staticmethod
  96. def set_device_type(device: str = "cuda"):
  97. """
  98. Set the default device type for checkpointing.
  99. Args:
  100. device (str): The device type to be set as default. Default is 'cuda'.
  101. """
  102. DefaultDeviceType._default_device_type = device
  103. @staticmethod
  104. def get_device_type() -> str:
  105. """
  106. Get the current default device type for checkpointing.
  107. Returns:
  108. str: The current default device type.
  109. """
  110. return DefaultDeviceType._default_device_type
  111. def _infer_device_type(*args):
  112. device_types = []
  113. def add_device_types(arg):
  114. nonlocal device_types
  115. if isinstance(arg, torch.Tensor) and not arg.device.type == "cpu":
  116. device_types.append(arg.device.type)
  117. tree_map(add_device_types, args)
  118. device_types_set = set(device_types)
  119. if len(device_types_set) > 1:
  120. warnings.warn(
  121. "Tensor arguments, excluding CPU tensors, are detected on at least two types of devices. "
  122. "Device state will only be saved for devices of a single device type, and the remaining "
  123. "devices will be ignored. Consequently, if any checkpointed functions involve randomness, "
  124. "this may result in incorrect gradients. (Note that if CUDA devices are among the devices "
  125. "detected, it will be prioritized; otherwise, the first device encountered will be selected.)"
  126. f"\nDevice types: {sorted(device_types_set)} first device type: {device_types[0]}"
  127. )
  128. if len(device_types) == 0:
  129. return DefaultDeviceType.get_device_type()
  130. elif "cuda" in device_types_set:
  131. return "cuda"
  132. else:
  133. return device_types[0]
  134. # We can't know if the run_fn will internally move some args to different devices,
  135. # which would require logic to preserve rng states for those devices as well.
  136. # We could paranoically stash and restore ALL the rng states for all visible devices,
  137. # but that seems very wasteful for most cases. Compromise: Stash the RNG state for
  138. # the device of all Tensor args.
  139. #
  140. # To consider: maybe get_device_states and set_device_states should reside in torch/random.py?
  141. def get_device_states(*args) -> Tuple[List[int], List[torch.Tensor]]:
  142. # This will not error out if "arg" is a CPU tensor or a non-tensor type because
  143. # the conditionals short-circuit.
  144. fwd_device_ids = []
  145. def add_device_ids(arg):
  146. nonlocal fwd_device_ids
  147. if isinstance(arg, torch.Tensor) and not arg.device.type == "cpu":
  148. fwd_device_ids.append(arg.get_device())
  149. tree_map(add_device_ids, args)
  150. fwd_device_states = []
  151. device_module = _get_device_module(_infer_device_type(*args))
  152. for device_id in fwd_device_ids:
  153. with device_module.device(device_id):
  154. fwd_device_states.append(device_module.get_rng_state())
  155. return fwd_device_ids, fwd_device_states
  156. def set_device_states(devices, states) -> None:
  157. device_module = _get_device_module(_infer_device_type(*states))
  158. for device, state in zip(devices, states):
  159. with device_module.device(device):
  160. device_module.set_rng_state(state)
  161. def _get_autocast_kwargs(device="cuda"):
  162. if torch.amp.is_autocast_available(device):
  163. device_autocast_kwargs = {
  164. "enabled": torch.is_autocast_enabled(device),
  165. "dtype": torch.get_autocast_dtype(device),
  166. "cache_enabled": torch.is_autocast_cache_enabled(),
  167. }
  168. else:
  169. device_autocast_kwargs = None
  170. cpu_autocast_kwargs = {
  171. "enabled": torch.is_autocast_enabled('cpu'),
  172. "dtype": torch.get_autocast_dtype('cpu'),
  173. "cache_enabled": torch.is_autocast_cache_enabled(),
  174. }
  175. return device_autocast_kwargs, cpu_autocast_kwargs
  176. class CheckpointFunction(torch.autograd.Function):
  177. @staticmethod
  178. def forward(ctx, run_function, preserve_rng_state, *args):
  179. check_backward_validity(args)
  180. ctx.run_function = run_function
  181. ctx.preserve_rng_state = preserve_rng_state
  182. # Accommodates the (remote) possibility that autocast is enabled for cpu AND gpu.
  183. ctx.device = _infer_device_type(*args)
  184. ctx.device_autocast_kwargs, ctx.cpu_autocast_kwargs = _get_autocast_kwargs(
  185. ctx.device
  186. )
  187. if preserve_rng_state:
  188. ctx.fwd_cpu_state = torch.get_rng_state()
  189. # Don't eagerly initialize the cuda context by accident.
  190. # (If the user intends that the context is initialized later, within their
  191. # run_function, we SHOULD actually stash the cuda state here. Unfortunately,
  192. # we have no way to anticipate this will happen before we run the function.)
  193. ctx.had_device_in_fwd = False
  194. device_module = _get_device_module(ctx.device)
  195. if getattr(device_module, "_initialized", False):
  196. ctx.had_device_in_fwd = True
  197. ctx.fwd_devices, ctx.fwd_device_states = get_device_states(*args)
  198. # Save non-tensor inputs in ctx, keep a placeholder None for tensors
  199. # to be filled out during the backward.
  200. ctx.inputs = []
  201. ctx.tensor_indices = []
  202. tensor_inputs = []
  203. for i, arg in enumerate(args):
  204. if torch.is_tensor(arg):
  205. tensor_inputs.append(arg)
  206. ctx.tensor_indices.append(i)
  207. ctx.inputs.append(None)
  208. else:
  209. ctx.inputs.append(arg)
  210. ctx.save_for_backward(*tensor_inputs)
  211. with torch.no_grad():
  212. outputs = run_function(*args)
  213. return outputs
  214. @staticmethod
  215. def backward(ctx, *args):
  216. if not torch.autograd._is_checkpoint_valid():
  217. raise RuntimeError(
  218. "When use_reentrant=True, torch.utils.checkpoint is incompatible"
  219. " with .grad() or passing an `inputs` parameter to .backward()."
  220. " To resolve this error, you can either set use_reentrant=False,"
  221. " or call .backward() without passing the `inputs` argument."
  222. )
  223. # Copy the list to avoid modifying original list.
  224. inputs = list(ctx.inputs)
  225. tensor_indices = ctx.tensor_indices
  226. tensors = ctx.saved_tensors
  227. device_module = _get_device_module(ctx.device)
  228. # Fill in inputs with appropriate saved tensors.
  229. for i, idx in enumerate(tensor_indices):
  230. inputs[idx] = tensors[i]
  231. # Stash the surrounding rng state, and mimic the state that was
  232. # present at this time during forward. Restore the surrounding state
  233. # when we're done.
  234. rng_devices = []
  235. if ctx.preserve_rng_state and ctx.had_device_in_fwd:
  236. rng_devices = ctx.fwd_devices
  237. with torch.random.fork_rng(
  238. devices=rng_devices, enabled=ctx.preserve_rng_state, device_type=ctx.device
  239. ):
  240. if ctx.preserve_rng_state:
  241. torch.set_rng_state(ctx.fwd_cpu_state)
  242. if ctx.had_device_in_fwd:
  243. set_device_states(ctx.fwd_devices, ctx.fwd_device_states)
  244. detached_inputs = detach_variable(tuple(inputs))
  245. device_autocast_ctx = torch.amp.autocast(
  246. device_type=ctx.device, **ctx.device_autocast_kwargs
  247. ) if torch.amp.is_autocast_available(ctx.device) else contextlib.nullcontext()
  248. with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]
  249. outputs = ctx.run_function(*detached_inputs)
  250. if isinstance(outputs, torch.Tensor):
  251. outputs = (outputs,)
  252. # run backward() with only tensor that requires grad
  253. outputs_with_grad = []
  254. args_with_grad = []
  255. for i in range(len(outputs)):
  256. if torch.is_tensor(outputs[i]) and outputs[i].requires_grad:
  257. outputs_with_grad.append(outputs[i])
  258. args_with_grad.append(args[i])
  259. if len(outputs_with_grad) == 0:
  260. raise RuntimeError(
  261. "none of output has requires_grad=True,"
  262. " this checkpoint() is not necessary"
  263. )
  264. torch.autograd.backward(outputs_with_grad, args_with_grad)
  265. grads = tuple(
  266. inp.grad if isinstance(inp, torch.Tensor) else None
  267. for inp in detached_inputs
  268. )
  269. return (None, None) + grads
  270. def noop_context_fn():
  271. return contextlib.nullcontext(), contextlib.nullcontext()
  272. # TorchDynamo does not step inside utils.checkpoint function. The flow
# looks like this
  274. # 1) TorchDynamo tries to wrap utils.checkpoint in a HigherOrderOp by
  275. # speculatively checking if the forward function is safe to trace.
  276. # 2) If yes, then Dynamo-generated Fx graph has the wrapped higher
  277. # order op. As a result, TorchDynamo does not look inside utils.checkpoint.
  278. # 3) If not, then TorchDynamo falls back to eager by performing a graph
  279. # break. And here, the following disable wrapper ensures that
  280. # TorchDynamo does not trigger again on the frames created by
  281. # utils.checkpoint innards.
  282. @torch._disable_dynamo
  283. def checkpoint(
  284. function,
  285. *args,
  286. use_reentrant: Optional[bool] = None,
  287. context_fn: Callable[[], Tuple[ContextManager, ContextManager]] = noop_context_fn,
  288. determinism_check: str = _DEFAULT_DETERMINISM_MODE,
  289. debug: bool = False,
  290. **kwargs
  291. ):
  292. r"""Checkpoint a model or part of the model.
  293. Activation checkpointing is a technique that trades compute for memory.
  294. Instead of keeping tensors needed for backward alive until they are used in
  295. gradient computation during backward, forward computation in checkpointed
  296. regions omits saving tensors for backward and recomputes them during the
  297. backward pass. Activation checkpointing can be applied to any part of a
  298. model.
  299. There are currently two checkpointing implementations available, determined
  300. by the :attr:`use_reentrant` parameter. It is recommended that you use
  301. ``use_reentrant=False``. Please refer the note below for a discussion of
  302. their differences.
  303. .. warning::
  304. If the :attr:`function` invocation during the backward pass differs
  305. from the forward pass, e.g., due to a global variable, the checkpointed
  306. version may not be equivalent, potentially causing an
  307. error being raised or leading to silently incorrect gradients.
  308. .. warning::
  309. The ``use_reentrant`` parameter should be passed explicitly. In version
  310. 2.4 we will raise an exception if ``use_reentrant`` is not passed.
  311. If you are using the ``use_reentrant=True`` variant, please refer to the
  312. note below for important considerations and potential limitations.
  313. .. note::
  314. The reentrant variant of checkpoint (``use_reentrant=True``) and
  315. the non-reentrant variant of checkpoint (``use_reentrant=False``)
  316. differ in the following ways:
  317. * Non-reentrant checkpoint stops recomputation as soon as all needed
  318. intermediate activations have been recomputed. This feature is enabled
  319. by default, but can be disabled with :func:`set_checkpoint_early_stop`.
  320. Reentrant checkpoint always recomputes :attr:`function` in its
  321. entirety during the backward pass.
  322. * The reentrant variant does not record the autograd graph during the
  323. forward pass, as it runs with the forward pass under
  324. :func:`torch.no_grad`. The non-reentrant version does record the
  325. autograd graph, allowing one to perform backward on the graph within
  326. checkpointed regions.
  327. * The reentrant checkpoint only supports the
  328. :func:`torch.autograd.backward` API for the backward pass without its
  329. `inputs` argument, while the non-reentrant version supports all ways
  330. of performing the backward pass.
  331. * At least one input and output must have ``requires_grad=True`` for the
  332. reentrant variant. If this condition is unmet, the checkpointed part
  333. of the model will not have gradients. The non-reentrant version does
  334. not have this requirement.
  335. * The reentrant version does not consider tensors in nested structures
  336. (e.g., custom objects, lists, dicts, etc) as participating in
  337. autograd, while the non-reentrant version does.
  338. * The reentrant checkpoint does not support checkpointed regions with
  339. detached tensors from the computational graph, whereas the
  340. non-reentrant version does. For the reentrant variant, if the
  341. checkpointed segment contains tensors detached using ``detach()`` or
  342. with :func:`torch.no_grad`, the backward pass will raise an error.
  343. This is because ``checkpoint`` makes all the outputs require gradients
  344. and this causes issues when a tensor is defined to have no gradient in
  345. the model. To avoid this, detach the tensors outside of the
  346. ``checkpoint`` function.
  347. Args:
  348. function: describes what to run in the forward pass of the model or
  349. part of the model. It should also know how to handle the inputs
  350. passed as the tuple. For example, in LSTM, if user passes
  351. ``(activation, hidden)``, :attr:`function` should correctly use the
  352. first input as ``activation`` and the second input as ``hidden``
  353. preserve_rng_state(bool, optional): Omit stashing and restoring
  354. the RNG state during each checkpoint. Note that under torch.compile,
  355. this flag doesn't take effect and we always preserve RNG state.
  356. Default: ``True``
  357. use_reentrant(bool):
  358. specify whether to use the activation checkpoint variant that
  359. requires reentrant autograd. This parameter should be passed
  360. explicitly. In version 2.4 we will raise an exception if
  361. ``use_reentrant`` is not passed. If ``use_reentrant=False``,
  362. ``checkpoint`` will use an implementation that does not require
  363. reentrant autograd. This allows ``checkpoint`` to support additional
  364. functionality, such as working as expected with
  365. ``torch.autograd.grad`` and support for keyword arguments input into
  366. the checkpointed function.
  367. context_fn(Callable, optional): A callable returning a tuple of two
  368. context managers. The function and its recomputation will be run
  369. under the first and second context managers respectively.
  370. This argument is only supported if ``use_reentrant=False``.
  371. determinism_check(str, optional): A string specifying the determinism
  372. check to perform. By default it is set to ``"default"`` which
  373. compares the shapes, dtypes, and devices of the recomputed tensors
  374. against those the saved tensors. To turn off this check, specify
  375. ``"none"``. Currently these are the only two supported values.
  376. Please open an issue if you would like to see more determinism
  377. checks. This argument is only supported if ``use_reentrant=False``,
  378. if ``use_reentrant=True``, the determinism check is always disabled.
  379. debug(bool, optional): If ``True``, error messages will also include
  380. a trace of the operators ran during the original forward computation
  381. as well as the recomputation. This argument is only supported if
  382. ``use_reentrant=False``.
  383. args: tuple containing inputs to the :attr:`function`
  384. Returns:
  385. Output of running :attr:`function` on :attr:`*args`
  386. """
  387. if use_reentrant is None:
  388. warnings.warn(
  389. "torch.utils.checkpoint: the use_reentrant parameter should be "
  390. "passed explicitly. In version 2.4 we will raise an exception "
  391. "if use_reentrant is not passed. use_reentrant=False is "
  392. "recommended, but if you need to preserve the current default "
  393. "behavior, you can pass use_reentrant=True. Refer to docs for more "
  394. "details on the differences between the two variants.",
  395. stacklevel=2
  396. )
  397. use_reentrant = True
  398. # Hack to mix *args with **kwargs in a python 2.7-compliant way
  399. preserve = kwargs.pop("preserve_rng_state", True)
  400. if kwargs and use_reentrant:
  401. raise ValueError(
  402. "Unexpected keyword arguments: " + ",".join(arg for arg in kwargs)
  403. )
  404. if use_reentrant:
  405. if context_fn is not noop_context_fn or debug is not False:
  406. raise ValueError(
  407. "Passing `context_fn` or `debug` is only supported when "
  408. "use_reentrant=False."
  409. )
  410. return CheckpointFunction.apply(function, preserve, *args)
  411. else:
  412. gen = _checkpoint_without_reentrant_generator(
  413. function, preserve, context_fn, determinism_check, debug, *args, **kwargs
  414. )
  415. # Runs pre-forward logic
  416. next(gen)
  417. ret = function(*args, **kwargs)
  418. # Runs post-forward logic
  419. try:
  420. next(gen)
  421. except StopIteration:
  422. return ret
  423. def checkpoint_sequential(functions, segments, input, use_reentrant=None, **kwargs):
  424. r"""Checkpoint a sequential model to save memory.
  425. Sequential models execute a list of modules/functions in order
  426. (sequentially). Therefore, we can divide such a model in various segments
  427. and checkpoint each segment. All segments except the last will not store
  428. the intermediate activations. The inputs of each checkpointed segment will
  429. be saved for re-running the segment in the backward pass.
  430. .. warning::
  431. The ``use_reentrant`` parameter should be passed explicitly. In version
  432. 2.4 we will raise an exception if ``use_reentrant`` is not passed.
  433. If you are using the ``use_reentrant=True` variant, please see
  434. :func:`~torch.utils.checkpoint.checkpoint` for
  435. the important considerations and limitations of this variant. It is
  436. recommended that you use ``use_reentrant=False``.
  437. .. warning:
  438. Since PyTorch 1.4, it allows only one Tensor as the input and
  439. intermediate outputs, just like :class:`torch.nn.Sequential`.
  440. Args:
  441. functions: A :class:`torch.nn.Sequential` or the list of modules or
  442. functions (comprising the model) to run sequentially.
  443. segments: Number of chunks to create in the model
  444. input: A Tensor that is input to :attr:`functions`
  445. preserve_rng_state(bool, optional): Omit stashing and restoring
  446. the RNG state during each checkpoint.
  447. Default: ``True``
  448. use_reentrant(bool):
  449. specify whether to use the activation checkpoint variant that
  450. requires reentrant autograd. This parameter should be passed
  451. explicitly. In version 2.4 we will raise an exception if
  452. ``use_reentrant`` is not passed. If ``use_reentrant=False``,
  453. ``checkpoint`` will use an implementation that does not require
  454. reentrant autograd. This allows ``checkpoint`` to support additional
  455. functionality, such as working as expected with
  456. ``torch.autograd.grad`` and support for keyword arguments input into
  457. the checkpointed function.
  458. Returns:
  459. Output of running :attr:`functions` sequentially on :attr:`*inputs`
  460. Example:
  461. >>> # xdoctest: +SKIP("stub")
  462. >>> model = nn.Sequential(...)
  463. >>> input_var = checkpoint_sequential(model, chunks, input_var)
  464. """
  465. if use_reentrant is None:
  466. warnings.warn(
  467. "torch.utils.checkpoint.checkpoint_sequential: the use_reentrant "
  468. "parameter should be passed explicitly. "
  469. "In version 2.4 we will raise an exception if use_reentrant "
  470. "is not passed. use_reentrant=False is "
  471. "recommended, but if you need to preserve the current default "
  472. "behavior, you can pass use_reentrant=True. Refer to docs for more "
  473. "details on the differences between the two variants."
  474. )
  475. use_reentrant = True
  476. # Hack for keyword-only parameter in a python 2.7-compliant way
  477. preserve = kwargs.pop("preserve_rng_state", True)
  478. if kwargs:
  479. raise ValueError(
  480. "Unexpected keyword arguments: " + ",".join(arg for arg in kwargs)
  481. )
  482. def run_function(start, end, functions):
  483. def forward(input):
  484. for j in range(start, end + 1):
  485. input = functions[j](input)
  486. return input
  487. return forward
  488. if isinstance(functions, torch.nn.Sequential):
  489. functions = list(functions.children())
  490. segment_size = len(functions) // segments
  491. # the last chunk has to be non-volatile
  492. end = -1
  493. for start in range(0, segment_size * (segments - 1), segment_size):
  494. end = start + segment_size - 1
  495. input = checkpoint(
  496. run_function(start, end, functions),
  497. input,
  498. use_reentrant=use_reentrant,
  499. preserve_rng_state=preserve,
  500. )
  501. return run_function(end + 1, len(functions) - 1, functions)(input)
  502. def _internal_assert(cond):
  503. if not cond:
  504. raise AssertionError(
  505. "Something went unexpectedly wrong in activation checkpoint. "
  506. "Please report this bug by filing an issue to PyTorch."
  507. )
  508. # NOTE [ Nestable Checkpoint ]
  509. #
  510. # The semantics of nested checkpoint can be defined by two basic rules.
  511. # Following the two rules leads to an important implication that is central
  512. # to motivating the design.
  513. #
  514. # Rule 1. Saved tensors are managed by inner-most checkpoint only and hidden
  515. # from any outer layers of checkpoint.
  516. #
  517. # Rule 2. The inputs of inner checkpoints are treated as tensors saved to its
  518. # parent checkpoint.
  519. #
  520. # Implication: To recompute any given saved tensor, we need to recompute all of
  521. # the checkpoints wrapping it.
  522. #
  523. # Why is this implied? To unpack a saved tensor X during backward we need to
  524. # recompute the inner-most checkpoint (#1), and in order to recompute that
  525. # checkpoint I need to have its inputs, which are managed by that checkpoint's
  526. # parent (#2), which thus also needs to be recomputed first. Continue this line
  527. # of reasoning and we realize that in order to unpack X, all checkpoints that
  528. # were active at the time X was saved need to be recomputed. (unless we have
  529. # already done so in that backward for some other saved tensor).
  530. #
  531. # In practice, we use a noop autograd Function to save inputs as saved tensors.
# During unpack, calling ctx.saved_tensors triggers the parent checkpoint to
# recompute.
  534. #
  535. # Rule 3. We should start recomputation as if there are no checkpoints currently
  536. # active. Checkpoints encountered during recomputation are still
  537. # respected.
  538. #
  539. # When we start recomputation, we push the saved variable hook meant for
  540. # recomputation on the stack. See examples in Rule 6 for more context.
  541. #
  542. # * * * *
  543. #
  544. # Beyond the basic semantics specific to nested checkpoint, we impose several
  545. # more constraints that may apply to checkpointing in general.
  546. #
  547. # Rule 4. Lifetime of recomputed tensors
  548. #
  549. # Recomputed tensors are considered specific to particular invocations
  550. # of backward and are always cleared immediately as they are unpacked
  551. # Particularly, we require this to happen even if retain_graph=True.
  552. #
  553. # [ Implementation details of Rule 4 ]
  554. #
  555. # If we were okay with recomputed tensors staying alive after backward is run
  556. # with retain_graph=True, we would store recomputed variables as the values of a
  557. # WeakKeyDictionary and pack strong references to the keys, so that as we
  558. # backward, those packed keys would be cleared as long as retain_graph=False.
  559. # Clearing the packed key clears the corresponding entry in the WKD.
  560. #
  561. # If we wish recomputed variables to be immediately cleared as we unpack them in
  562. # the retain_graph=True case, we cannot rely on the packed keys to be cleared by
  563. # backward automatically. Instead of packing the strong reference to the key
  564. # directly, we pack a container object, which we manually clear as we unpack.
  565. #
  566. # An important detail is that if a second backward happens, the second
  567. # recomputation needs to reset the container with a newly created key.
  568. #
  569. # Rule 5. Stop recomputation as soon as we've recomputed the saved tensors we
  570. # know we need.
  571. #
  572. # [ Implementation details of Rule 5 ]
  573. #
  574. # During recomputation, raise an exception if the number of recomputed tensors
  575. # matches the number of tensors that we expected to recompute. We wrap the
  576. # recomputation call with a try-catch to catch this specific exception. See
  577. # Rule #6 below for some examples.
  578. #
  579. # Rule 6. We support doing backward inside checkpoint context
  580. #
  581. # [ retain_graph is True]
  582. #
# def fn(x):
#   y = x.sin()
#   z = y.cos()
#   gx, = torch.autograd.grad(z, x, retain_graph=True)
#   return gx, z
  588. #
  589. # out = checkpoint(fn)(inp)
  590. # out.backward()
  591. #
# Because y is saved by cos while checkpoint is enabled, it would not be
# actually saved, and so the .grad() call inside must trigger a recomputation.
  594. #
  595. # During recomputation the "inner pack hook" has two responsibilities:
  596. #
  597. # 1) As usual, populating the WeakKeyDictionary storing recomputed tensors
  598. # 2) Pack the actual tensor (detached) so that one may perform backward on the
  599. # recomputed graph. The tensors saved to this graph will live until the end
  600. # of recomputation, or die earlier if someone performs backward with
  601. # retain_graph=False.
  602. #
  603. # More generally performing backward on the recomputed graph occurs in the
  604. # following cases:
  605. # - If backward is performed inside forward,
  606. # - During the original forward IF early-stop is disabled
  607. # - During the original backward
  608. # - If there are multiple .grad()/.backward() calls, we would perform backward
  609. # on the recomputed graph even if early-stop is enabled (see the example below)
  610. #
  611. # [ retain_graph is False ]
  612. #
  613. # The example below shows what happens if during recomputation we find that some
  614. # of the tensors we are trying to recompute have already been cleared.
  615. #
  616. # Spoiler: we don't do anything special, we just skip over them!
  617. #
  618. # def fn(x):
  619. # y = x.sin() # (1)
  620. # z = y.cos() # (2)
  621. # gx, = torch.autograd.grad(z, x) # (3)
  622. # return x.cos() * gx # (4)
  623. #
  624. # out = checkpoint(fn)(inp)
  625. # out.backward() # (5)
  626. #
  627. # 1, 2. Don't save x and y since we are inside a checkpoint.
  628. # 3. Trigger a recompute of fn since x and y weren't saved.
  629. # And depending on whether early stop is enabled, either stop at (2) or
  630. # continue running the function.
  631. # Because we are running backward with retain_graph=False, we clear x and y's
  632. # holders.
  633. # 4. Don't save x since we are inside a checkpoint.
  634. # 5. Calling backward triggers another recompute of fn. During recompute, we see
  635. # that x and y have already been cleared in the original graph as indicated
  636. # by holder=None. We skip over them. We still save x at (4) (since its holder
  637. # is still alive.)
  638. _enable_checkpoint_early_stop = True
  639. @contextlib.contextmanager
  640. def set_checkpoint_early_stop(enable: bool):
  641. """Context manager that sets whether checkpoint should stop recomputation early.
  642. By default, non-reentrant checkpoint stops recomputation as soon as it
  643. has computed all needed Tensors. This context manager can be used to disable
  644. that feature if it is problematic for your specific application.
  645. This context manager only needs to be active when forward is run. It does
  646. not need to be active during backward.
  647. Example::
  648. >>> # xdoctest: +SKIP(failing)
  649. >>> message = "saved tensors default hooks are disabled"
  650. >>> with set_checkpoint_early_stop(False):
  651. ... # Any checkpoint under this context manager will respect this
  652. ... # context manager, even if its backward is performed outside.
  653. ... out = checkpoint(fn, inputs)
  654. ...
  655. >>> out.backward()
  656. """
  657. global _enable_checkpoint_early_stop
  658. try:
  659. prev = _enable_checkpoint_early_stop
  660. _enable_checkpoint_early_stop = enable
  661. yield
  662. finally:
  663. _enable_checkpoint_early_stop = prev
  664. class _Handle:
  665. pass
  666. class _Holder:
  667. def __init__(self):
  668. self.handles: Dict[int, Optional[_Handle]] = dict()
  669. class _NoopSaveInputs(torch.autograd.Function):
  670. @staticmethod
  671. def forward(*args):
  672. return torch.empty((0,))
  673. @staticmethod
  674. def setup_context(ctx: Any, inputs: Tuple[Any, ...], output: Any) -> None:
  675. # Only tensors can be saved with ctx.save_for_backward, everything else
  676. # is captured by get_args, which is saved directly on ctx
  677. tensor_indices, tensors = zip(
  678. *[(i, o) for i, o in enumerate(inputs) if isinstance(o, torch.Tensor)]
  679. )
  680. idx2saved_idx = {b: a for a, b in enumerate(tensor_indices)}
  681. # args but with tensors replaced with None as placeholders
  682. args = [None if isinstance(o, torch.Tensor) else o for o in inputs]
  683. def get_args(saved_tensors):
  684. # restore the placeholders with the original tensors grabbed from
  685. # ctx.saved_tensors (which may be saved on a parent checkpoint if
  686. # this checkpoint is nested, and that would trigger a recursive
  687. # unpack!)
  688. ret = [
  689. saved_tensors[idx2saved_idx[i]] if i in tensor_indices else o
  690. for i, o in enumerate(args)
  691. ]
  692. # grab the tail since we also saved the dummy to avoid having to explicitly
  693. # handle the case where there are no tensor inputs
  694. return ret[1:]
  695. ctx.get_args = get_args
  696. ctx.save_for_backward(*tensors)
  697. @staticmethod
  698. def backward(ctx, *grad_outputs):
  699. raise AssertionError("Did not expect to backward on this graph")
  700. class _CheckpointFrame:
  701. def __init__(self, recompute_fn, early_stop, unpack_error_cb, metadata_fn):
  702. self.recompute_fn = recompute_fn
  703. self.input_saver = None
  704. self.weak_holders: List[ReferenceType] = []
  705. # We store this as a weakkeydictionary so that in the case of a partial
  706. # backward, the entries in the dict are cleared alongside the Holder
  707. # which will be removed when the SavedVariable is cleared.
  708. self.recomputed: DefaultDict[
  709. int, weakref.WeakKeyDictionary[_Handle, torch.Tensor]
  710. ] = defaultdict(weakref.WeakKeyDictionary)
  711. # We need both recomp_counter and recomputed since they can diverge
  712. # https://github.com/pytorch/pytorch/pull/90105#discussion_r1135889885
  713. self.recomp_counter: DefaultDict[int, int] = defaultdict(int)
  714. self.is_recomputed: DefaultDict[int, bool] = defaultdict(bool)
  715. # See Rule 5
  716. self.early_stop = early_stop
  717. # Debugging
  718. self.metadata_fn = metadata_fn
  719. self.unpack_error_cb = unpack_error_cb
  720. self.x_metadatas = []
  721. self.forward_completed = False
  722. self.ignore_saved_mismatch = False
  723. def check_recomputed_tensors_match(self, gid):
  724. if self.ignore_saved_mismatch:
  725. # TODO: we can probably make this check stricter by checking that
  726. # the metadata of the first tensors still match.
  727. return
  728. # NOTE [ Error handling for checkpoint ]
  729. #
  730. # At a high level, we need to check that the tensors saved
  731. # during original forward matches tensors saved during recompute
  732. # This means handling 3 cases:
  733. #
  734. # 1. During recompute, more tensors were saved.
  735. #
  736. # Usually this is hidden due to the StopRecomputationError
  737. # but if early stop is not enabled, or we would have errored
  738. # anyway because there aren't enough weak_holders. But we
  739. # do want to have a nice error. See the _recomputation_hook
  740. # for details.
  741. if not len(self.weak_holders) == self.recomp_counter[gid]:
  742. # 2. During recompute, fewer tensors were saved
  743. #
  744. # We know that everytime we save something do original forward
  745. # we append to weak_holder, and every time we save a tensor
  746. # during recompute we increment recompute_counter.
  747. raise CheckpointError(
  748. "torch.utils.checkpoint: A different number of tensors was saved "
  749. "during the original forward and recomputation.\n"
  750. f"Number of tensors saved during forward: {len(self.weak_holders)}\n"
  751. f"Number of tensors saved during recomputation: {self.recomp_counter[gid]}"
  752. )
  753. # 3. During recompute, the same tensors were saved, but they
  754. # have different metadata
  755. nb_meta_different = []
  756. for idx, weak_holder in enumerate(self.weak_holders):
  757. holder = weak_holder()
  758. if holder is None:
  759. continue
  760. # We've seen all holders since we iterate over them in order
  761. # For every holder that is still alive now, it must've been
  762. # alive when we saw it during recompute, therefore, the
  763. # gid must be set.
  764. _internal_assert(gid in holder.handles)
  765. # We know this is the first unpack, so it couldn't have been set
  766. # to None yet.
  767. _internal_assert(holder.handles[gid] is not None)
  768. # We always set these together in the recomputation hook
  769. _internal_assert(holder.handles[gid] in self.recomputed[gid])
  770. # see pack hook, x_metadata is 1:1 with weak_holders.
  771. x_meta = self.x_metadatas[idx]
  772. recomputed_x = self.recomputed[gid][holder.handles[gid]]
  773. if x_meta != self.metadata_fn(recomputed_x):
  774. nb_meta_different.append((idx, x_meta, self.metadata_fn(recomputed_x)))
  775. if len(nb_meta_different) > 0:
  776. mismatched_tensors = ""
  777. for idx, x_meta, recomputed_meta in nb_meta_different:
  778. mismatched_tensors += (
  779. f"tensor at position {idx}:\n"
  780. f"saved metadata: {x_meta}\n"
  781. f"recomputed metadata: {recomputed_meta}\n"
  782. )
  783. raise CheckpointError(
  784. "torch.utils.checkpoint: Recomputed values for the following tensors "
  785. "have different metadata than during the forward pass.\n"
  786. f"{mismatched_tensors}"
  787. )
# Message template for the debug-mode CheckpointError. It is filled in with
# str.format using four named fields: forward_traces, recompute_traces,
# forward_ops and recompute_ops.
_checkpoint_error_template = """ \
An error happened while unpacking tensors; dumping logs of latest computation
because you passed `debug=True` to `torch.utils.checkpoint.checkpoint()`.
Scroll all the way down for guidance on how to navigate these logs.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~+
| 1. Stack traces of the operators that ran in the original forward |
+------------------------------------------------------------------------------+
{forward_traces}
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~+
| 2. Stack traces of the operators that ran during recomputation |
+------------------------------------------------------------------------------+
{recompute_traces}
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~+
| 3. Log of operators in the original forward and recomputation |
+------------------------------------------------------------------------------+
(Scroll up to correlate stack traces with each operation listed below. This
helps identify their source in the code.)
IMPORTANT: Differences in "detach" calls between the original forward and the
recomputation are expected. They are introduced by the checkpointing
mechanism and can be ignored.
Operations executed during the original forward:
{forward_ops}
Operations executed during recomputation:
{recompute_ops}
+------------------------------------------------------------------------------+
ERROR: Detected non-determinism while running activation checkpointing
You are seeing this error because you passed `debug=True` to checkpoint and
tensors to be saved during the original forward and differ between those saved
during recomputation. This can happen if different operators were ran in the
original forward and in the recomputation.
To identify where the mismatch may be coming from, you can do the following:
1) Compare the operators ran during original forward and recomputation to
see where they differ. These operators are printed above in the order they
were executed.
2) Review the stack trace for each operator to locate its invocation source.
Each operator's stack trace is printed in their execution order.
Note that the logs can be quite long. Here's how they are structured:
(Tip: you can Ctrl-f for these headers)
1. Stack traces of the operators that ran in the original forward
2. Stack traces of the operators that ran during recomputation
3. Log of operators in the original forward and recomputation
4. Error message <--- You are here
--------------------------------------------------------------------------------
"""
  832. class CheckpointError(RuntimeError):
  833. pass
  834. def _get_debug_context_and_cb() -> Tuple[Callable[[], Any], Callable[[CheckpointError], None]]:
  835. # This function returns the context_fn and error_cb to be used by the
  836. # checkpointing mechanism. error_cb is invoked when an error is detected
  837. # during unpack.
  838. # record_context_cpp is not support on non-linux non-x86_64 platforms
  839. cpp_tb = platform.machine() == 'x86_64' and platform.system() == 'Linux'
  840. class CaptureLogs:
  841. def __init__(self):
  842. self.logs = None
  843. self.tbs = None
  844. def get_context_manager(self):
  845. @contextlib.contextmanager
  846. def logging_mode():
  847. with LoggingTensorMode(), \
  848. capture_logs(True, python_tb=True, script_tb=True, cpp_tb=cpp_tb) as logs_and_tb:
  849. self.logs, self.tbs = logs_and_tb
  850. yield logs_and_tb
  851. return logging_mode()
  852. capture_logs_fwd = CaptureLogs()
  853. capture_logs_recompute = CaptureLogs()
  854. def unpack_error_cb(e: CheckpointError):
  855. def get_str_tb(label, capture_logs):
  856. out = ""
  857. total_len = len(capture_logs.logs)
  858. for i, (log, tb) in enumerate(zip(capture_logs.logs, capture_logs.tbs)):
  859. out += f"{log} ({i + 1} of {total_len} in {label})\n\n"
  860. found_torch_dispatch = False
  861. for line in tb:
  862. # Start printing stack trace only after __torch_dispatch__ is found
  863. is_torch_dispatch = line['name'] == '__torch_dispatch__'
  864. if not found_torch_dispatch and not is_torch_dispatch:
  865. continue
  866. elif is_torch_dispatch:
  867. found_torch_dispatch = True
  868. continue
  869. out += f"{line['filename']}:{line['line']}:{line['name']}\n"
  870. out += "\n\n"
  871. return out
  872. assert capture_logs_fwd.logs is not None
  873. assert capture_logs_recompute.logs is not None
  874. raise CheckpointError(
  875. _checkpoint_error_template.format(
  876. forward_traces=get_str_tb("original", capture_logs_fwd),
  877. recompute_traces=get_str_tb("recompute", capture_logs_recompute),
  878. forward_ops="\n".join(capture_logs_fwd.logs),
  879. recompute_ops="\n".join(capture_logs_recompute.logs)
  880. )
  881. ) from e
  882. def context_fn():
  883. return capture_logs_fwd.get_context_manager(), capture_logs_recompute.get_context_manager()
  884. return context_fn, unpack_error_cb
  885. def _default_meta_extractor(x: torch.Tensor) -> Dict[str, Any]:
  886. # These properties are fast to check, easy to understand
  887. return {
  888. "shape": x.shape,
  889. "dtype": x.dtype,
  890. "device": x.device
  891. }
# Maps each accepted ``determinism_check`` value to the function that extracts
# the metadata compared between the original forward and recomputation.
# "none" maps every tensor to None, so the comparison can never differ.
_allowed_determinism_checks_to_fns: Dict[str, Callable[[torch.Tensor], Any]] = {
    _DEFAULT_DETERMINISM_MODE: _default_meta_extractor,
    "none": lambda _: None,
}
  896. # See Rule 5
  897. class _StopRecomputationError(Exception):
  898. pass
class _recomputation_hook(torch.autograd.graph.saved_tensors_hooks):
    """Saved-tensor hooks installed while a checkpoint frame is recomputed.

    The pack hook records each tensor saved during recomputation on the target
    frame, under the given graph-task id; the unpack hook is the identity.

    Args:
        target_frame_ref: weak reference to the frame being recomputed.
        gid: integer id under which recomputed tensors are recorded.
    """

    def __init__(self, target_frame_ref: ReferenceType, gid: int):
        def pack_hook(x):
            target_frame = target_frame_ref()
            assert target_frame is not None  # appease mypy
            # Position of this save within the recomputation; the counter is
            # advanced before any of the early exits below.
            recomp_idx = target_frame.recomp_counter[gid]
            target_frame.recomp_counter[gid] += 1

            if recomp_idx >= len(target_frame.weak_holders):
                # More saves than were recorded during the original forward.
                assert not target_frame.early_stop
                if not target_frame.forward_completed:
                    # We run into this case when early stop is not enabled and do
                    # grad within checkpoint.
                    # We need to set this flag, so we don't error out later when
                    # we check if the number of tensors saved during forward and
                    # recomputation match.
                    target_frame.ignore_saved_mismatch = True
                    return x.detach()
                raise CheckpointError(
                    "torch.utils.checkpoint: trying to save more tensors during "
                    "recomputation than during the original forward pass."
                )

            holder = target_frame.weak_holders[recomp_idx]()

            # This holder may have been cleared because someone may have called
            # backward within forward. If so, we don't need to save.
            if holder is not None:
                _internal_assert(holder.handles.get(gid, None) is None)
                # A fresh handle keys the recomputed tensor in the frame's
                # weak dictionary for this gid.
                holder.handles[gid] = _Handle()
                target_frame.recomputed[gid][holder.handles[gid]] = x.detach()

            if target_frame.early_stop and target_frame.recomp_counter[gid] == len(
                target_frame.weak_holders
            ):
                # Everything expected has been recomputed (see Rule 5).
                raise _StopRecomputationError
            # See Rule 6: [ retain_graph is True ] above
            return x.detach()

        def unpack_hook(x):
            # See Rule 6: [ retain_graph is True ] above for an example of when
            # the graph created during recomputation could be backwarded.
            return x

        super().__init__(pack_hook, unpack_hook)
class _checkpoint_hook(torch.autograd.graph.saved_tensors_hooks):
    """Saved-tensor hooks active during the checkpointed forward.

    Packing replaces each saved tensor with a ``_Holder``. Unpacking triggers
    recomputation of the frame (once per graph-task id) and returns the
    recomputed tensor.

    Args:
        frame: the checkpoint frame these hooks record into and read from.
    """

    def __init__(self, frame):
        def pack_hook(x):
            # See Rule 4 above
            holder = _Holder()
            # The frame keeps only a weak reference to the holder.
            frame.weak_holders.append(weakref.ref(holder))
            # Save metadata to detect non-determinism
            if frame.metadata_fn is not None:
                with torch.no_grad():
                    frame.x_metadatas.append(frame.metadata_fn(x))
            return holder

        def unpack_hook(holder):
            gid = torch._C._current_graph_task_id()
            if gid == -1:
                # generate a temporary id if we trigger unpack outside of a backward call
                gid = int(uuid.uuid4())

            if not frame.is_recomputed[gid]:
                # First unpack for this gid: rebuild the inputs and rerun the
                # function under the recomputation hooks.
                ctx = frame.input_saver.grad_fn
                args = ctx.get_args(ctx.saved_tensors)

                try:
                    with _recomputation_hook(
                        weakref.ref(frame), gid
                    ), torch.autograd.enable_grad():
                        frame.recompute_fn(*args)
                except _StopRecomputationError:
                    # Expected when early stop ends recomputation (Rule 5).
                    pass
                frame.is_recomputed[gid] = True
                frame.check_recomputed_tensors_match(gid)

            _internal_assert(gid in holder.handles)

            if holder.handles[gid] is None:
                raise CheckpointError(
                    "torch.utils.checkpoint: Unpack is being triggered for a tensor that was already "
                    "unpacked once. If you are calling ctx.saved_tensors in backward, make sure to do "
                    "so only once. Otherwise please open an issue with details on your use case."
                )
            _internal_assert(holder.handles[gid] in frame.recomputed[gid])
            ret = frame.recomputed[gid][holder.handles[gid]]
            # Reset the handle so a second unpack for this gid is detected above.
            holder.handles[gid] = None
            return ret

        if frame.unpack_error_cb is not None:
            def unpack_hook_with_error_cb(holder):
                # Route CheckpointError through the frame's callback. If the
                # callback returns normally, this wrapper returns None.
                try:
                    return unpack_hook(holder)
                except CheckpointError as e:
                    frame.unpack_error_cb(e)
            super().__init__(pack_hook, unpack_hook_with_error_cb)
        else:
            super().__init__(pack_hook, unpack_hook)
  986. def _is_compiling(func, args, kwargs):
  987. # Check if we are under AOTAutograd tracing
  988. # There should probably be a better way to do this...
  989. # TODO: unify _is_compiling across all compile stacks
  990. for arg in args:
  991. if isinstance(arg, torch.Tensor) and is_fun(arg):
  992. return True
  993. return False
  994. def _detach(x):
  995. if isinstance(x, torch.Tensor):
  996. return x.detach()
  997. return x
# Module-level counter starting at 1 (``count`` is presumably
# ``itertools.count`` -- confirm against the file's imports).
uid = count(1)
# NOTE: torch.utils.checkpoint internal logic may call these ops an unknown
# number of times (i.e. there could be _CachedTorchDispatchMode calls that do
# not map to a _CachingTorchDispatchMode call), so we ignore these ops and
# always recompute them.
_ignored_ops = {
    torch.ops.prim.device.default,
    torch.ops.aten.detach.default,
} | set(torch._subclasses.functional_tensor.FunctionalTensor.metadata_fns)
  1006. class _CachingTorchDispatchMode(TorchDispatchMode):
  1007. r"""
  1008. A :class:`TorchDispatchMode` to implement selective activation checkpointing
  1009. that's compatible with torch.compile. Used together with _CachedTorchDispatchMode.
  1010. """
  1011. def __init__(self, policy_fn, storage):
  1012. self.policy_fn = policy_fn
  1013. self.storage = storage
  1014. def push_into_storage(self, out, func, args, kwargs):
  1015. out_detached = tree_map(_detach, out)
  1016. self.storage[func].append(out_detached)
  1017. def _handle_compile_in_forward_ctx(self, should_not_recompute, func, args, kwargs):
  1018. if should_not_recompute:
  1019. fx_traceback.current_meta["recompute"] = 0
  1020. # NOTE: Here we just store and reuse output of all ops, since in torch.compile mode
  1021. # we decide and handle recomputation in the partitioner.
  1022. out = func(*args, **kwargs)
  1023. self.push_into_storage(out, func, args, kwargs)
  1024. return out
  1025. def __torch_dispatch__(self, func, types, args=(), kwargs=None):
  1026. if kwargs is None:
  1027. kwargs = {}
  1028. if func in _ignored_ops:
  1029. return func(*args, **kwargs)
  1030. should_not_recompute = self.policy_fn("forward", func, *args, **kwargs)
  1031. if _is_compiling(func, args, kwargs):
  1032. return self._handle_compile_in_forward_ctx(should_not_recompute, func, args, kwargs)
  1033. else:
  1034. if should_not_recompute:
  1035. out = func(*args, **kwargs)
  1036. self.push_into_storage(out, func, args, kwargs)
  1037. else:
  1038. out = func(*args, **kwargs)
  1039. return out
  1040. class _CachedTorchDispatchMode(TorchDispatchMode):
  1041. r"""
  1042. A :class:`TorchDispatchMode` to implement selective activation checkpointing
  1043. that's compatible with torch.compile. Used together with _CachingTorchDispatchMode.
  1044. """
  1045. def __init__(self, policy_fn, storage):
  1046. self.policy_fn = policy_fn
  1047. self.storage = storage
  1048. def pop_from_storage(self, func, args, kwargs):
  1049. assert func in self.storage
  1050. out = self.storage[func].pop(0)
  1051. return out
  1052. def _handle_compile_in_recompute_ctx(self, should_not_recompute, func, args, kwargs):
  1053. out = self.pop_from_storage(func, args, kwargs)
  1054. return out
  1055. def __torch_dispatch__(self, func, types, args=(), kwargs=None):
  1056. if kwargs is None:
  1057. kwargs = {}
  1058. if func in _ignored_ops:
  1059. return func(*args, **kwargs)
  1060. should_not_recompute = self.policy_fn("recompute", func, *args, **kwargs)
  1061. if _is_compiling(func, args, kwargs):
  1062. return self._handle_compile_in_recompute_ctx(should_not_recompute, func, args, kwargs)
  1063. else:
  1064. if should_not_recompute:
  1065. out = self.pop_from_storage(func, args, kwargs)
  1066. else:
  1067. out = func(*args, **kwargs)
  1068. return out
  1069. def _pt2_selective_checkpoint_context_fn_gen(policy_fn):
  1070. """
  1071. A helper function that generates a pair of contexts to be later passed into
  1072. `torch.utils.checkpoint` API to implment selective checkpointing.
  1073. .. warning::
  1074. This is context_fn is intended for use with torch.compile only.
  1075. Args:
  1076. policy_fn (Callable[[Callable, List[Any], Dict[str, Any]], bool]): Policy function
  1077. to decide whether a particular op should be recomputed in backward pass or not.
  1078. In eager mode:
  1079. If policy_fn(...) returns True, the op is guaranteed to NOT be recomputed.
  1080. If policy_fn(...) returns False, the op is guaranteed to be recomputed.
  1081. In torch.compile mode:
  1082. If policy_fn(...) returns True, the op is guaranteed to NOT be recomputed.
  1083. If policy_fn(...) returns False, the op may or may not be recomputed
  1084. (it's up to the partitioner to decide).
  1085. Returns:
  1086. A pair of generated contexts.
  1087. Example:
  1088. >>> # xdoctest: +REQUIRES(LINUX)
  1089. >>>
  1090. >>> def get_custom_policy():
  1091. >>> no_recompute_list = [
  1092. >>> torch.ops.aten.mm.default,
  1093. >>> ]
  1094. >>> def custom_policy(mode, func, *args, **kwargs):
  1095. >>> return func in no_recompute_list
  1096. >>> return custom_policy
  1097. >>>
  1098. >>> def selective_checkpointing_context_fn():
  1099. >>> return _pt2_selective_checkpoint_context_fn_gen(get_custom_policy())
  1100. >>>
  1101. >>> def gn(x, y):
  1102. >>> return torch.sigmoid(torch.matmul(torch.matmul(x, y), y)) * y
  1103. >>>
  1104. >>> def fn(x, y):
  1105. >>> return torch.utils.checkpoint.checkpoint(
  1106. >>> gn, x, y,
  1107. >>> use_reentrant=False,
  1108. >>> context_fn=selective_checkpointing_context_fn,
  1109. >>> )
  1110. >>>
  1111. >>> x = torch.randn(4, 4, requires_grad=True)
  1112. >>> y = torch.randn(4, 4, requires_grad=True)
  1113. >>>
  1114. >>> compiled_fn = torch.compile(fn)
  1115. """
  1116. storage: Dict[Any, List[Any]] = defaultdict(list)
  1117. return _CachingTorchDispatchMode(policy_fn, storage), _CachedTorchDispatchMode(policy_fn, storage)
# NB: this helper wraps fn before calling checkpoint_impl. kwargs and
#     saving/restoring of global state is handled here.
def _checkpoint_without_reentrant_generator(
    fn,
    preserve_rng_state=True,
    context_fn: Callable[[], Tuple[ContextManager, ContextManager]] = noop_context_fn,
    determinism_check: str = _DEFAULT_DETERMINISM_MODE,
    debug: bool = False,
    *args,
    **kwargs
):
    """Checkpointing without reentrant autograd.

    This is a generator that yields exactly once. ``fn`` is not called here
    for the forward pass; it is only called inside the recomputation closure.
    When autograd is recording, the single ``yield`` happens with the
    checkpoint saved-tensor hooks and the forward context active; otherwise
    it happens with neither.

    Args:
        fn: describes what to run in the forward pass of the model or
            part of the model. It should also know how to handle the inputs
            passed as the tuple. For example, in LSTM, if user passes
            ``(activation, hidden)``, :attr:`fn` should correctly use the
            first input as ``activation`` and the second input as ``hidden``
        preserve_rng_state(bool, optional): If ``True``, stash the RNG state
            before the forward pass and restore it during recomputation.
            Default: ``True``
        context_fn(Callable, optional): A callable returning a tuple of two
            context managers. The function and its recomputation will be run
            under the first and second context managers respectively.
        determinism_check(str, optional): A string specifying the determinism
            check to perform. By default it is set to ``"default"`` which
            compares the shapes, dtypes, and devices of the recomputed tensors
            against those of the saved tensors. To turn off this check, specify
            ``"none"``. Currently these are the only two supported values.
            Please open an issue if you would like to see more determinism
            checks.
        debug(bool, optional): If ``True``, error messages will also include
            a trace of the operators run during the original forward computation
            as well as the recomputation. Overridden by the module-level
            ``_checkpoint_debug_enabled`` when that is not ``None``.
        *args: Arguments to pass in to the given ``fn``.
        **kwargs: Keyword arguments to pass into the given ``fn``.

    Raises:
        ValueError: if debug mode is combined with a non-default
            ``context_fn``, or if ``determinism_check`` is not a supported
            value.
        RuntimeError: if the device state was first initialized during the
            forward pass while ``preserve_rng_state`` is ``True``.
    """
    unpack_error_cb = None

    # The module-level override, when set, takes precedence over ``debug``.
    if _checkpoint_debug_enabled if _checkpoint_debug_enabled is not None else debug:
        if context_fn != noop_context_fn:
            raise ValueError(
                "debug=True is incompatible with non-default context_fn"
            )
        context_fn, unpack_error_cb = _get_debug_context_and_cb()

    if determinism_check in _allowed_determinism_checks_to_fns:
        metadata_fn = _allowed_determinism_checks_to_fns[determinism_check]
    else:
        raise ValueError(
            f"determinism_check should be one of {list(_allowed_determinism_checks_to_fns.keys())}, "
            f"but got {determinism_check}"
        )

    device = _infer_device_type(*args)
    device_module = _get_device_module(device)
    forward_context, recompute_context = context_fn()
    if _is_compiling(fn, args, kwargs) and context_fn != noop_context_fn:
        assert (
            isinstance(forward_context, TorchDispatchMode) and
            isinstance(recompute_context, TorchDispatchMode)
        ), \
            "In torch.compile mode, `context_fn` arg passed to `torch.utils.checkpoint` " + \
            "must generate a tuple of two `TorchDispatchMode`s."
    # Accommodates the (remote) possibility that autocast is enabled for cpu AND gpu.
    device_autocast_kwargs, cpu_autocast_kwargs = _get_autocast_kwargs(device=device)

    if preserve_rng_state:
        fwd_cpu_state = torch.get_rng_state()
        # Don't eagerly initialize the cuda context by accident.
        # (If the user intends that the context is initialized later, within their
        # run_function, we SHOULD actually stash the cuda state here.  Unfortunately,
        # we have no way to anticipate this will happen before we run the function.
        # If they do so, we raise an error.)
        had_device_in_fwd = False
        if getattr(device_module, "_initialized", False):
            had_device_in_fwd = True
            fwd_devices, fwd_device_states = get_device_states(*args)

    def recompute_fn(*inputs):
        # ``inputs`` is (kwargs, *args): the keyword dict travels as the first
        # positional value.
        kwargs, *args = inputs
        # This will be called later during recomputation. This wrapping enables
        # the necessary global state to be captured.
        rng_devices = []
        # had_device_in_fwd is only bound when preserve_rng_state is True; the
        # ``and`` short-circuits before it is read otherwise.
        if preserve_rng_state and had_device_in_fwd:
            rng_devices = fwd_devices
        with torch.random.fork_rng(
            devices=rng_devices, enabled=preserve_rng_state, device_type=device
        ):
            if preserve_rng_state:
                # Replay the RNG state captured before the forward pass.
                torch.set_rng_state(fwd_cpu_state)
                if had_device_in_fwd:
                    set_device_states(fwd_devices, fwd_device_states)

            device_autocast_ctx = torch.amp.autocast(
                device_type=device, **device_autocast_kwargs
            ) if torch.amp.is_autocast_available(device) else contextlib.nullcontext()
            with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]
                fn(*args, **kwargs)

    new_frame = _CheckpointFrame(
        recompute_fn,
        _enable_checkpoint_early_stop,
        unpack_error_cb,
        metadata_fn
    )
    # The dummy requires grad so that the saver output gets a grad_fn whenever
    # autograd is recording.
    dummy = torch.empty((0,), requires_grad=True)
    new_frame.input_saver = _NoopSaveInputs.apply(dummy, kwargs, *args)

    # When ambient grad_mode is False
    if new_frame.input_saver.grad_fn is None:
        yield
        return

    with _checkpoint_hook(new_frame), forward_context:
        yield
    new_frame.forward_completed = True

    if getattr(device_module, "_initialized", False) and \
       preserve_rng_state and not had_device_in_fwd:  # type: ignore[possibly-undefined]
        # Device was not initialized before running the forward, so we didn't
        # stash the device state.
        raise RuntimeError(
            "PyTorch's device state was initialized in the forward pass "
            "of a Checkpoint, which is not allowed. Please open an issue "
            "if you need this feature."
        )

    return