# benchmark_utils.py
  1. # mypy: ignore-errors
  2. import contextlib
  3. import json
  4. import operator
  5. import os
  6. import time
  7. import torch
  8. from torch.profiler import profile, ProfilerActivity
  9. def synchronize():
  10. pass
  11. def dump_chrome_trace(
  12. f,
  13. input,
  14. trace_filename,
  15. optimize_ctx,
  16. activities,
  17. num_runs=1,
  18. devices=None,
  19. kwargs_for_f=None,
  20. kwargs_for_profiler=None,
  21. ):
  22. """
  23. Output the chrome trace of running f(input, **kwargs_for_f) with [optimize_ctx]
  24. [num_runs] times to [trace_filename].
  25. [activities] are the activities that the profiler will record, e.g. ProfilerActivity.CUDA.
  26. Return total runtime without the profiler
  27. Outputs to trace_filename
  28. """
  29. if devices is None:
  30. devices = ["cuda"]
  31. global synchronize
  32. if devices != ["cpu"] and torch.cuda.is_available():
  33. synchronize = torch.cuda.synchronize
  34. if kwargs_for_f is None:
  35. kwargs_for_f = {}
  36. if kwargs_for_profiler is None:
  37. kwargs_for_profiler = {}
  38. with optimize_ctx:
  39. torch.manual_seed(1337)
  40. for _ in range(5): # warmup runs
  41. f(input, **kwargs_for_f)
  42. synchronize()
  43. torch.manual_seed(1337)
  44. t0 = time.perf_counter()
  45. for _ in range(num_runs):
  46. f(input, **kwargs_for_f)
  47. synchronize()
  48. t1 = time.perf_counter()
  49. timing = t1 - t0
  50. with profile(activities=activities, **kwargs_for_profiler) as prof:
  51. with optimize_ctx:
  52. synchronize()
  53. torch.manual_seed(1337)
  54. for _ in range(num_runs):
  55. f(input, **kwargs_for_f)
  56. synchronize()
  57. prof.export_chrome_trace(trace_filename)
  58. return timing
  59. def get_chrome_trace_events(filename):
  60. f = open(filename)
  61. data = json.load(f)
  62. events = data["traceEvents"]
  63. return events
  64. def is_gpu_compute_event(event):
  65. global gpu_pids
  66. return (
  67. "pid" in event
  68. and event["pid"] in gpu_pids
  69. and "ph" in event
  70. and event["ph"] == "X"
  71. )
  72. def get_sorted_gpu_events(events):
  73. sorted_gpu_events = []
  74. for event in events:
  75. if not is_gpu_compute_event(event):
  76. continue
  77. sorted_gpu_events.append(event)
  78. return sorted(sorted_gpu_events, key=operator.itemgetter("ts"))
  79. def get_duration(sorted_gpu_events):
  80. if len(sorted_gpu_events) == 0:
  81. return 0
  82. event = sorted_gpu_events[0]
  83. current_end_time = event["ts"] + event["dur"]
  84. total_duration = event["dur"]
  85. for event in sorted_gpu_events[1:]:
  86. start_time = max(event["ts"], current_end_time)
  87. end_time = event["ts"] + event["dur"]
  88. total_duration = total_duration + max(end_time - start_time, 0)
  89. current_end_time = max(current_end_time, end_time)
  90. return total_duration
  91. def get_sorted_gpu_mm_conv_events(events):
  92. def is_mm_conv_event(event):
  93. return "name" in event and (
  94. "gemm" in event["name"]
  95. or "conv" in event["name"]
  96. or "cutlass" in event["name"]
  97. or "wgrad" in event["name"]
  98. )
  99. gpu_events = get_sorted_gpu_events(events)
  100. sorted_events = []
  101. for event in gpu_events:
  102. if not is_mm_conv_event(event):
  103. continue
  104. sorted_events.append(event)
  105. return sorted_events
# pids of profiler processes labeled "GPU"; (re)populated by
# compute_utilization() and read by is_gpu_compute_event().
gpu_pids = []
  107. def compute_utilization(filename: str, total_length: float):
  108. """
  109. Process the chrome traces outputs by the pytorch profiler to compute GPU Utilization
  110. and percent of times spent on matmul and convolution
  111. Args:
  112. filename(str): Name of chrome traces file produced by pytorch profiler
  113. total_length(float): total length of the process without profiler in second
  114. Return:
  115. tuple: (GPU Utilization, percent of time spent on matmul and convolution)
  116. """
  117. events = get_chrome_trace_events(filename)
  118. # get pids of GPU events
  119. global gpu_pids
  120. gpu_pids = []
  121. for event in events:
  122. if "name" not in event:
  123. continue
  124. if event["name"] == "process_labels" and "GPU" in event["args"]["labels"]:
  125. gpu_pids.append(event["pid"])
  126. total_length = total_length * 1e6
  127. sorted_gpu_events = get_sorted_gpu_events(events)
  128. utilization = get_duration(sorted_gpu_events) / total_length
  129. sorted_gpu_mm_conv_events = get_sorted_gpu_mm_conv_events(events)
  130. mm_conv_utilization = get_duration(sorted_gpu_mm_conv_events) / total_length
  131. return utilization, mm_conv_utilization
  132. def benchmark_utilization(
  133. f,
  134. input,
  135. trace_folder,
  136. optimize_ctx=None,
  137. trace_file_name="tmp_chrome_trace",
  138. num_runs=1,
  139. ):
  140. """
  141. Benchmark the GPU Utilization and percent of time spent on matmul and convolution operations of
  142. running f(input, **kwargs_for_f) with [optimize_ctx] [num_runs] times.
  143. It will produce a chrome trace file in trace_folder/trace_file_name.json
  144. Example:
  145. ```
  146. def f(a):
  147. return a.sum()
  148. a = torch.rand(2**20, device="cuda")
  149. utilization, mm_conv_utilization = benchmark_utilization(f, a, "tmp", trace_file_name = "tmp_chrome_trace")
  150. ```
  151. Args:
  152. f: function to benchmark
  153. input: input to :attr:`f`
  154. trace_folder: name of the folder to store the chrome trace
  155. optimize_ctx: the context in which f will run
  156. trace_file_name: name of the dumped chrome trace file, default to "tmp_chrome_trace"
  157. num_runs: number of times to run f, excluding the warm-up runs, default to 1.
  158. Return:
  159. tuple: (GPU Utilization, percent of time spent on matmul and convolution)
  160. """
  161. isExist = os.path.exists(trace_folder)
  162. if not isExist:
  163. os.makedirs(trace_folder)
  164. print("create folder " + trace_folder)
  165. if optimize_ctx is None:
  166. optimize_ctx = contextlib.nullcontext()
  167. chrome_trace_file_name = os.path.join(trace_folder, trace_file_name + ".json")
  168. total_length = dump_chrome_trace(
  169. f,
  170. input,
  171. chrome_trace_file_name,
  172. optimize_ctx,
  173. [ProfilerActivity.CUDA],
  174. num_runs=num_runs,
  175. devices="cuda",
  176. )
  177. utilization, mm_conv_utilization = compute_utilization(
  178. chrome_trace_file_name, total_length
  179. )
  180. return utilization, mm_conv_utilization