compilers.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445
  1. # mypy: ignore-errors
  2. import copy
  3. import logging
  4. import os
  5. import pickle
  6. import random
  7. from contextlib import contextmanager
  8. from functools import partial
  9. from typing import Callable, Union
  10. import sympy
  11. import torch
  12. import torch.fx as fx
  13. import torch.nn as nn
  14. import torch.utils._pytree as pytree
  15. from torch import SymInt
  16. from torch._decomp import get_decompositions
  17. from torch.fx.experimental.symbolic_shapes import bind_symbols
  18. from .aot_autograd import aot_function, aot_module, make_boxed_compiler
  19. from .compile_utils import strip_overloads
  20. from .partitioners import (
  21. default_partition,
  22. draw_graph,
  23. min_cut_rematerialization_partition,
  24. )
  25. log = logging.getLogger(__name__)
  26. # These canonicalizations are needed here (and not decompositions), as the ops
  27. # we're trying to canonicalize to CompositeImplicitAutograd.
  28. def _canonicalize(fx_g):
  29. for node in fx_g.graph.find_nodes(
  30. op="call_function", target=torch.ops.aten._to_copy
  31. ):
  32. node.target = torch.ops.aten.to
  33. fx_g.recompile()
  34. return fx_g
  35. @contextmanager
  36. def _disable_jit_autocast():
  37. old_jit_autocast_flag = torch._C._jit_set_autocast_mode(False)
  38. try:
  39. yield
  40. finally:
  41. torch._C._jit_set_autocast_mode(old_jit_autocast_flag)
  42. @make_boxed_compiler
  43. def ts_compile(fx_g: fx.GraphModule, inps) -> Callable:
  44. """
  45. Compiles the :attr:`fx_g` with Torchscript compiler.
  46. .. warning::
  47. This API is experimental and likely to change.
  48. Args:
  49. fx_g(fx.GraphModule): The input Fx graph module to be compiled.
  50. Returns:
  51. Torch scripted model.
  52. """
  53. with _disable_jit_autocast():
  54. strip_overloads(fx_g)
  55. for node in fx_g.graph.find_nodes(
  56. op="call_function", target=torch.ops.aten._to_copy
  57. ):
  58. if len(node.args) == 1 and len(node.kwargs) == 1 and "dtype" in node.kwargs:
  59. node.target = torch.ops.aten.to
  60. for node in fx_g.graph.nodes:
  61. new_kwargs = {}
  62. for k, v in node.kwargs.items():
  63. if isinstance(v, torch.device):
  64. v = v.type
  65. new_kwargs[k] = v
  66. node.kwargs = new_kwargs
  67. fx_g.graph.lint()
  68. fx_g.recompile()
  69. f = torch.jit.script(fx_g)
  70. torch._C._jit_pass_remove_mutation(f.graph)
  71. f = torch.jit.freeze(f.eval())
  72. f = torch.jit.optimize_for_inference(f)
  73. if not any(isinstance(t, torch._subclasses.FakeTensor) for t in inps):
  74. f(*inps)
  75. return f
  76. def _draw_graph_compile(fx_g, _, name, clear_meta=True):
  77. print(fx_g.code)
  78. draw_graph(fx_g, name, clear_meta=clear_meta)
  79. return fx_g
  80. def draw_graph_compile(name):
  81. return make_boxed_compiler(partial(_draw_graph_compile, name=name))
  82. @make_boxed_compiler
  83. def nop(fx_g: fx.GraphModule, _) -> Callable:
  84. """
  85. Returns the :attr:`fx_g` Fx graph module as it is. This is a no-op compiler
  86. and can be used to check accuracy.
  87. .. warning::
  88. This API is experimental and likely to change.
  89. """
  90. return fx_g
  91. class DebugInterpreter(fx.Interpreter):
  92. def run(self, *args):
  93. self.symbol_mapping = bind_symbols(self.module, *args)
  94. super().run(*args)
  95. def run_node(self, n):
  96. def subst_symint(ni):
  97. if not isinstance(ni, SymInt):
  98. return ni
  99. r = sympy.expand(ni.node.expr.xreplace(self.symbol_mapping))
  100. assert r.is_number, r
  101. return int(r)
  102. def subst_symint_tuple(nis):
  103. return tuple(subst_symint(ni) for ni in nis)
  104. def check_significant_strides(a, b):
  105. if subst_symint(a.numel()) > 0:
  106. for idx in range(a.ndim):
  107. if (
  108. subst_symint(a.stride(idx)) != b.stride(idx)
  109. and subst_symint(a.size(idx)) > 1
  110. ):
  111. return False
  112. return True
  113. def check(nv, rv, desc):
  114. assert callable(desc)
  115. assert nv.dtype == rv.dtype, f"{desc()}: {nv.dtype} != {rv.dtype}"
  116. assert (
  117. subst_symint_tuple(nv.size()) == rv.size()
  118. ), f"{desc()}: {nv.size()} aka {subst_symint_tuple(nv.size())} != {rv.size()}"
  119. same_strides = check_significant_strides(nv, rv)
  120. assert (
  121. same_strides
  122. ), f"{desc()}: {nv.stride()} aka {subst_symint_tuple(nv.stride())} != {rv.stride()}"
  123. r = super().run_node(n)
  124. if "val" in n.meta:
  125. n_vals, n_spec = pytree.tree_flatten(n.meta["val"])
  126. r_vals, r_spec = pytree.tree_flatten(r)
  127. # TODO: There is some sort of problem where we record that an
  128. # operator returned a tuple/list, and then later it turns out the
  129. # real version of the operator returned a list/tuple. Need to
  130. # figure out what's actually going on here, the error itself is
  131. # harmless enough as we only getitem out the outputs.
  132. # assert n_spec == r_spec, f"{n_spec} != {r_spec}"
  133. assert len(n_vals) == len(r_vals), f"{len(n_vals)} != {len(r_vals)}"
  134. for i, nv, rv in zip(range(len(n_vals)), n_vals, r_vals):
  135. if not isinstance(rv, torch.Tensor):
  136. continue
  137. check(nv, rv, lambda: f"output {i} where {self.symbol_mapping}")
  138. return r
  139. @make_boxed_compiler
  140. def debug_nop(fx_g: fx.GraphModule, _) -> Callable:
  141. """
  142. Returns a (slow) interpreter over the FX graph module that also checks
  143. various debugging properties (e.g., that tracing strides matched real
  144. strides.)
  145. """
  146. return DebugInterpreter(fx_g).run
  147. @make_boxed_compiler
  148. def simple_ts_compile(fx_g, _):
  149. strip_overloads(fx_g)
  150. f = torch.jit.script(fx_g)
  151. f = torch.jit.freeze(f.eval())
  152. return f
  153. def nnc_jit(f):
  154. return aot_function(f, simple_ts_compile)
  155. aten = torch.ops.aten
  156. default_decompositions = {
  157. aten.detach,
  158. aten.gelu_backward,
  159. aten.leaky_relu_backward,
  160. aten.sigmoid_backward,
  161. aten.threshold_backward,
  162. aten.hardtanh_backward,
  163. aten.hardsigmoid_backward,
  164. aten.hardswish_backward,
  165. aten.tanh_backward,
  166. aten.silu_backward,
  167. aten.elu_backward,
  168. aten.cudnn_batch_norm,
  169. aten.cudnn_batch_norm_backward,
  170. aten.masked_fill.Scalar,
  171. aten.masked_fill.Tensor,
  172. aten.elu,
  173. aten.leaky_relu,
  174. aten.hardtanh,
  175. aten.hardswish,
  176. aten.hardsigmoid,
  177. aten.conj_physical,
  178. aten.is_same_size,
  179. }
  180. default_decompositions = get_decompositions(default_decompositions)
  181. @make_boxed_compiler
  182. def print_compile(fx_g, _):
  183. print(fx_g.code)
  184. return fx_g
  185. def memory_efficient_fusion(
  186. fn: Union[Callable, nn.Module],
  187. **kwargs,
  188. ):
  189. """
  190. Wrapper function over :func:`aot_function` and :func:`aot_module` to perform
  191. memory efficient fusion. It uses the
  192. :func:`min_cut_rematerialization_partition` partitioner to perform efficient
  193. recomputation. It uses NVFuser to compile the generated forward and backward
  194. graphs.
  195. .. warning::
  196. This API is experimental and likely to change.
  197. Args:
  198. fn (Union[Callable, nn.Module]): A Python function or a ``nn.Module``
  199. that takes one ore more arguments. Must return one or more Tensors.
  200. **kwargs: Any other overrides you want to make to the settings
  201. Returns:
  202. Returns a ``Callable`` or ``nn.Module`` that retains the eager behavior
  203. of the original :attr:`fn`, but whose forward and backward graphs have
  204. gone through recomputation optimizations, and the graphs have been
  205. compiled with nvfuser.
  206. """
  207. config = {
  208. "fw_compiler": ts_compile,
  209. "bw_compiler": ts_compile,
  210. "partition_fn": min_cut_rematerialization_partition,
  211. "decompositions": default_decompositions,
  212. }
  213. config.update(kwargs)
  214. if isinstance(fn, torch.nn.Module):
  215. return aot_module(fn, **config)
  216. else:
  217. return aot_function(fn, **config)
  218. def debug_compile(fx_g, inps):
  219. fx_g.to_folder("foo")
  220. print(
  221. f"""
  222. ##############################################################
  223. # To minimize FX graph, copy and paste the below and run it #
  224. ##############################################################
  225. import torch
  226. import torch.fx as fx
  227. from functorch.compile import minifier, check_nvfuser_subprocess, check_nvfuser_correctness_subprocess
  228. inps = {[(i.shape, i.dtype) for i in inps]}
  229. inps = [torch.ones(shape, dtype=dtype, device='cuda') for (shape, dtype) in inps]
  230. from foo import FxModule
  231. mod = FxModule().cuda()
  232. with torch.jit.fuser("fuser2"):
  233. # check_nvfuser_subprocess can be replaced with check_nvfuser_correctness_subprocess
  234. minifier(fx.symbolic_trace(mod), inps, check_nvfuser_subprocess)
  235. """
  236. )
  237. from foo import FxModule
  238. FxModule().cuda()(*inps)
  239. return ts_compile(fx_g, inps)
# Module-level counter embedded in the names of dumped graphs; it is reset by
# graph_dumper_aot and advanced after each backward graph is saved.
graph_index = 0
  241. def get_inputs(input_data_path):
  242. """
  243. Return a random input for the given inputs meta generated from _save_fx_default.
  244. """
  245. inputs = []
  246. with open(input_data_path, "rb") as f:
  247. inputs_meta = pickle.load(f)
  248. inputs = []
  249. for meta in inputs_meta:
  250. if len(meta) == 1:
  251. type = meta
  252. input = type(random.rand())
  253. else:
  254. type, shape, stride, dtype, device = meta
  255. if dtype in {
  256. torch.int,
  257. torch.int32,
  258. torch.int64,
  259. torch.bool,
  260. torch.int,
  261. torch.uint8,
  262. int,
  263. float,
  264. }:
  265. input = torch.randint(0, 1, shape, dtype=dtype, device=device)
  266. else:
  267. input = torch.rand(shape, dtype=dtype, device=device)
  268. inputs.append(input)
  269. return inputs
  270. def _save_fx_default(current_name, folder_name, dump_example_input, gm, example_inputs):
  271. """
  272. The forward, backward, and joint computation graph will be stored in
  273. {folder_name}/{current_name}/{current_name}_forward_{graph_index},
  274. {folder_name}/{current_name}/{current_name}_backward_{graph_index}, and
  275. {folder_name}/{current_name}/{current_name}_joint_{graph_index} respectively.
  276. The input shape of the graphs will be stored in the .input files.
  277. These files can be loaded with pickle,
  278. and is a list of format (type, shape, stride, dtype, device).
  279. In the case of type = int or float, it is just (type,).
  280. For joint graph input, it is a nested list [[],[]]
  281. where the two inner lists have the same format.
  282. If dump_example_input is True, example_inputs will be stored in .pt file.
  283. Since each function might produce multiple graphs,
  284. the graph_index is used to distinguish difference graphs
  285. """
  286. from functorch.compile import aot_module_simplified
  287. def get_input_meta(args):
  288. input_meta = []
  289. if len(args) > 0 and isinstance(args[0], tuple): # joint input
  290. input_meta += get_input_meta(args[0])
  291. input_meta += get_input_meta(args[1])
  292. return input_meta
  293. for arg in args:
  294. if type(arg) == int or type(arg) == float:
  295. input_meta.append((type(arg),))
  296. else:
  297. input_meta.append(
  298. (type(arg), arg.shape, arg.stride(), arg.dtype, arg.device)
  299. )
  300. return input_meta
  301. def graph_saver_helper(gm_to_save, args, type_name):
  302. global graph_index
  303. if len(gm_to_save.graph.nodes) == 0:
  304. log.log(
  305. logging.WARNING,
  306. "No nodes in graph {%s}_{%s}_{%s}.",
  307. current_name,
  308. type_name,
  309. graph_index,
  310. )
  311. return
  312. gm = copy.deepcopy(gm_to_save)
  313. gm.graph.set_codegen(torch.fx.graph.CodeGen()) # remove codegen
  314. gm.recompile()
  315. input_meta = get_input_meta(args)
  316. os.makedirs(f"{folder_name}/{current_name}", exist_ok=True)
  317. gm.to_folder(
  318. f"{folder_name}/{current_name}/{current_name}_{type_name}_{graph_index}"
  319. )
  320. pickle.dump(
  321. input_meta,
  322. open(
  323. f"{folder_name}/{current_name}/{current_name}_{type_name}_{graph_index}/{current_name}_{type_name}_{graph_index}.input", # noqa: B950
  324. "wb",
  325. ),
  326. ) # noqa: E501
  327. if dump_example_input:
  328. torch.save(
  329. args,
  330. f"{folder_name}/{current_name}/{current_name}_{type_name}_{graph_index}/{current_name}_{type_name}_{graph_index}.pt", # noqa: B950
  331. ) # noqa: E501
  332. def graph_saver_forward(gm, fw_args):
  333. graph_saver_helper(gm, fw_args, "forward")
  334. return gm
  335. def graph_saver_backward(gm, bw_args):
  336. graph_saver_helper(gm, bw_args, "backward")
  337. global graph_index
  338. graph_index += 1
  339. return gm
  340. def graph_saver_joint(gm, joint_args):
  341. graph_saver_helper(gm, joint_args, "joint")
  342. return default_partition(gm, joint_args)
  343. return aot_module_simplified(
  344. gm,
  345. example_inputs,
  346. fw_compiler=graph_saver_forward,
  347. bw_compiler=graph_saver_backward,
  348. partition_fn=graph_saver_joint,
  349. decompositions=default_decompositions,
  350. )
  351. # WARNING: This isn't tested anywhere!!
  352. def graph_dumper_aot(current_name, folder_name, dump_example_input=False):
  353. """
  354. Dump the forward, backward, and joint computation graph.
  355. Example Usage:
  356. save_fx_func = graph_dumper_aot(current_name, folder_name, dump_example_input = False)
  357. optimize_ctx = torchdynamo.optimize(
  358. save_fx_func
  359. )
  360. with torch.enable_grad():
  361. with optimize_ctx:
  362. result = forward_and_backward_pass(model, example_inputs)
  363. """
  364. global graph_index
  365. graph_index = 0
  366. return partial(_save_fx_default, current_name, folder_name, dump_example_input)