
//  Copyright © 2022 Apple Inc.

#pragma once

#include <ATen/Tensor.h>
#include <ATen/mps/MPSStream.h>
#include <ATen/mps/MPSAllocatorInterface.h>

#include <os/signpost.h>
#include <os/log.h>

#include <atomic>
#include <ctime>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>

namespace at::mps {

namespace Profiler {

struct BaseInfo {
  // profiling info types
  enum class Type {
    GRAPH,
    KERNEL,
    COPY,
    CPU_FALLBACK,
  };

  BaseInfo(Type infoType, uint64_t Id, const uintptr_t Handle) :
      type(infoType), profileId(Id), handle(Handle) { }
  virtual ~BaseInfo() = default;

  // type of profiling info
  Type type;
  // unique profile ID for execution instances of operations or copies
  uint64_t profileId;
  // IDs generated by os_signpost
  // since it's possible to use event and interval-based signposts at the
  // same time, we need separate IDs for each.
  os_signpost_id_t eventSignpostId = 0, intervalSignpostId = 0;
  // accumulated GPU time in ms (obtained from CompletionHandler's "GPUEndTime - GPUStartTime")
  std::atomic<double> totalGpuTime{0.0};
  // accumulated scheduling time in ms (obtained from CompletionHandler's "KernelEndTime - KernelStartTime")
  std::atomic<double> totalSchedulingTime{0.0};
  // indicates if the operation or copy execution has completed
  std::atomic_bool completed{false};
  // handle used to identify the profile info's instance (usually the pointer)
  const uintptr_t handle;

  virtual const std::string toString(double gpuTime = 0, double schedulingTime = 0) const;

  // builds a string for a tensor (format: Device:ScalarType[tensor.sizes()])
  static std::string buildTensorString(const Tensor& tensor, bool includeBufferId = false) {
    if (tensor.defined()) {
      std::stringstream tensorStr;
      auto deviceType = tensor.device().type();
      tensorStr << c10::DeviceTypeName(deviceType);
      // see comments for INCLUDE_BUFFER_ID
      if (includeBufferId && deviceType == at::kMPS) {
        id<MTLBuffer> buffer = __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
        tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer))
                  << ":" << buffer.retainCount << ")";
      }
      tensorStr << ":"
                << tensor.scalar_type() << tensor.sizes();
      return tensorStr.str();
    } else {
      return "undefined";
    }
  }
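  // Illustrative examples of the format above (hypothetical values): a 2x3
  // float tensor on the MPS device would render as "MPS:Float[2, 3]", or,
  // with includeBufferId enabled, something like "MPS(buf#42:1):Float[2, 3]".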
  static uint64_t getTime() {
    return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW);
  }
};

struct OperationInfo : BaseInfo {
  OperationInfo(const void* Handle, bool IsGraph, uint64_t Id, const std::string& StrKey) :
      BaseInfo(IsGraph ? Type::GRAPH : Type::KERNEL, Id, uintptr_t(Handle)), strKey(StrKey) { }

  uint64_t runCount = 0;
  std::string strKey;

  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;

  // builds a string for a kernel
  static std::string buildKernelString(const std::string& kernelName,
                                       const TensorList& tensors,
                                       bool includeBufferId = false) {
    std::stringstream kernelStr;
    kernelStr << kernelName;
    for (const Tensor& tensor: tensors) {
      kernelStr << ":" << BaseInfo::buildTensorString(tensor, includeBufferId);
    }
    return kernelStr.str();
  }
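  // For example (hypothetical op and shapes): buildKernelString("fill_scalar",
  // {self}) on a 4x4 float MPS tensor would produce "fill_scalar:MPS:Float[4, 4]",
  // appending one ":<tensor-string>" segment per tensor in the list.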
};

struct CpuFbInfo : BaseInfo {
  CpuFbInfo(uint64_t Id, const std::string& OpName) :
      BaseInfo(Type::CPU_FALLBACK, Id, 0), opName(OpName) { }

  uint64_t runCount = 0;
  // the current and total overhead of copies, in bytes, required to convert the
  // op's input tensors from MPS to CPU and its outputs from CPU back to MPS
  size_t currentCopyOverhead = 0;
  size_t totalCopyOverhead = 0;
  std::string opName;
  std::string strKey;
  uint64_t startTime = 0;

  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;

  void updateCopyOverhead(const TensorList& tensors) {
    currentCopyOverhead = 0;
    for (const Tensor& tensor: tensors) {
      if (tensor.defined()) {
        currentCopyOverhead += tensor.nbytes();
      }
    }
    totalCopyOverhead += currentCopyOverhead;
  }
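  // For example (hypothetical sizes): falling back an op with two 1 MiB MPS
  // inputs and one 1 MiB output sets currentCopyOverhead to 3 MiB for that run
  // and adds the same amount to the accumulated totalCopyOverhead.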
};

struct CopyInfo : BaseInfo {
  enum class Kind {
    MPS_TO_MPS,
    MPS_TO_CPU,
    CPU_TO_MPS,
  };

  CopyInfo(const void* Handle, size_t Length, uint64_t Id, bool IsNonBlocking, bool UsesBlitter) :
      BaseInfo(Type::COPY, Id, uintptr_t(Handle)), kind(Kind::MPS_TO_MPS),
      length(Length), isNonBlocking(IsNonBlocking), usesBlitter(UsesBlitter) { }

  Kind kind;
  size_t length;
  bool isNonBlocking;
  bool usesBlitter;
  std::string srcStrKey;
  std::string dstStrKey;
  // for copies that don't use blitters, we measure CPU time
  uint64_t startTime = 0;

  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;

  static std::string buildTensorString(const void* buffer, const OptionalTensorRef tensor, bool includeBufferId = false);

  static bool isStorageOnMPS(const void* buffer, const OptionalTensorRef tensor) {
    if (tensor.has_value()) {
      return tensor->device().type() == at::kMPS;
    }
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(buffer);
    // getUnalignedBufferSize() returns -1 if the input buffer is not on the MPS device
    return getIMPSAllocator()->getUnalignedBufferSize(buffer) >= 0;
  }

  static Kind getCopyKind(const void* srcBuffer, const void* dstBuffer,
                          const OptionalTensorRef srcTensor, const OptionalTensorRef dstTensor) {
    const bool isSrcOnMPS = isStorageOnMPS(srcBuffer, srcTensor);
    const bool isDstOnMPS = isStorageOnMPS(dstBuffer, dstTensor);
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isSrcOnMPS || isDstOnMPS);
    if (isSrcOnMPS && !isDstOnMPS) {
      return Kind::MPS_TO_CPU;
    } else if (!isSrcOnMPS && isDstOnMPS) {
      return Kind::CPU_TO_MPS;
    }
    return Kind::MPS_TO_MPS;
  }
};
struct CopyStat : CopyInfo {
  explicit CopyStat(std::string CopyKindStr) :
      CopyInfo(nullptr, 0, 0, false, false), kindStr(std::move(CopyKindStr)) {}

  // total number of copies
  size_t totalCount = 0;
  // number of Scalar copies (i.e., smaller than sizeof(int64_t))
  size_t scalarsCount = 0;
  // number of blocking copies (i.e., those that require syncing to the GPU)
  size_t blockingCount = 0;
  // number of copies that used memcpy() instead of the Metal Blit Encoder
  size_t memcpyCount = 0;
  // accumulated GPU time in ms for the scalar copies
  std::atomic<double> scalarsGpuTime{0.0};
  // copy kind as a string
  std::string kindStr;
};
class MPSProfiler {
 public:
  // lower 16 bits used for profiler options
  enum ProfileOptions : uint32_t {
    OPTIONS_NONE = 0,
    // ALL_* means all signpost types (RUN_OPERATION|BLIT_COPY|CPU_FALLBACK, etc.)
    // (a convenience so the bit flags don't have to be OR-ed manually)
    // trace all signpost types using events
    ALL_SIGNPOST_EVENTS = (1 << 0),
    // trace all signpost types using intervals
    ALL_SIGNPOST_INTERVALS = (1 << 1),
    // always wait for the command buffer to finish executing after each commit
    WAIT_UNTIL_COMPLETED = (1 << 2),
    // for interval-based signposts, include the scheduling portion of
    // Graph/Kernel/Copy executions as well.
    // if this flag is disabled, only the "GPU run time" is included in the
    // interval, not the scheduling time.
    INCLUDE_SCHEDULE_INTERVAL = (1 << 3),

    // use these if you need to trace signpost types individually (rarely required)
    // trace signposts using intervals
    USE_INTERVALS = (1 << 4),
    // trace signposts by emitting events
    USE_EVENTS = (1 << 5),
    // used for sanity check (change this when a new option is added)
    OPTIONS_COUNT = (USE_EVENTS << 1) - 1,
  };
  // when adding new types, #define the type string in MPSProfiler.mm as well.
  // upper 16 bits used for event types
  enum SignpostTypes : uint32_t {
    SIGNPOST_NONE = 0,
    // trace signposts for PyTorch operation executions
    RUN_OPERATION = (1 << 16),
    // trace signposts for blitter copies
    BLIT_COPY = (1 << 17),
    // trace signposts for ops that fall back on the CPU
    CPU_FALLBACK = (1 << 18),
    // used for sanity check (change this when a new type is added)
    SIGNPOST_COUNT = (CPU_FALLBACK << 1) - 1,
  };
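  // For example (a hypothetical illustration of the bit layout): since
  // signpost types occupy the upper 16 bits and profiler options the lower
  // 16 bits, a single 32-bit configuration word such as
  // (RUN_OPERATION | BLIT_COPY | USE_INTERVALS) can express "trace operations
  // and blit copies with interval-based signposts".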
  enum LogOptions : uint32_t {
    LOG_NONE = 0,

    // Info logging options during execution
    // -------------------------------------
    // prints operation info (id/key/run_count) during execution
    OPERATION_INFO = (1 << 0),
    // prints copy info (src/dst tensors/buffers, size, etc.) during execution
    COPY_INFO = (1 << 1),
    // prints CPU Fallback info (id/runCount/opName/copyOverhead) during execution
    CPU_FALLBACK_INFO = (1 << 2),

    // Profiling statistics logging options when the process terminates
    // -----------------------------------------------------------------
    // prints all stats (OPERATION_STATS, COPY_STATS, CPU_FALLBACK_STATS) before the
    // process terminates (a convenience so the following stats bit flags don't
    // have to be combined manually)
    ALL_STATS = (1 << 3),
    // prints operation stats (GPU times, run count, etc.) before the process terminates
    OPERATION_STATS = (1 << 4),
    // prints copy stats (GPU times, copy kinds, sizes, etc.) before the process terminates
    COPY_STATS = (1 << 5),
    // prints CPU Fallback stats (CPU times, run times, size of MPS<->CPU copies
    // for tensors, etc.) before the process terminates
    CPU_FALLBACK_STATS = (1 << 6),

    // Metadata format options when logging the info
    // ----------------------------------------------
    // if enabled, includes GPU run time in metadata (i.e., GPUEndTime-GPUStartTime
    // from Metal Command Buffers) (e.g., [GPU=0.324 ms])
    INCLUDE_GPU_TIME = (1 << 7),
    // if enabled, also includes GPU scheduling time in metadata separately
    // (i.e., KernelEndTime-KernelStartTime from Metal Command Buffers)
    // e.g., [GPU=0.324 ms, KRNL=0.036 ms]
    INCLUDE_KERNEL_TIME = (1 << 8),
    // if enabled, includes the unique buffer ID in metadata for the storage
    // of a tensor that was allocated on MPSAllocator. This is useful (along with
    // the EV "PYTORCH_DEBUG_MPS_ALLOCATOR") to identify buffers that are involved
    // with various operations.
    INCLUDE_BUFFER_ID = (1 << 9),

    // used for sanity check (change this when a new option is added)
    LOG_COUNT = (INCLUDE_BUFFER_ID << 1) - 1,
  };
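  // For example (hypothetical combination; the exact log line format is
  // defined in MPSProfiler.mm): (OPERATION_INFO | INCLUDE_GPU_TIME) would
  // print one line per operation during execution with its GPU time appended,
  // along the lines of "... aten::add ... [GPU=0.324 ms]".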
  explicit MPSProfiler();
  ~MPSProfiler();

  // the handle is either "MPSGraph*" or "id<MTLComputePipelineState>" for Metal Kernels
  // the beginProfile*() functions return a profileId which is unique per graph/kernel/copy
  uint64_t beginProfileKernel(const void* handle, const std::string& strKey, bool isGraph);
  uint64_t beginProfileKernel(const void* handle, const std::string& kernelName, const TensorList& tensors);
  uint64_t beginProfileCopy(const void* srcBuffer, const void* dstBuffer,
                            const OptionalTensorRef srcTensor,
                            const OptionalTensorRef dstTensor,
                            size_t length, bool isNonBlocking, bool usesBlitter = true);
  uint64_t beginProfileCPUFallback(const std::string& opName, const TensorList& tensors);
  void beginProfileGPUInterval(const void* handle);

  void endProfileCopy(uint64_t profileId, SyncType syncType);
  void endProfileKernel(const void* handle, SyncType syncType = SyncType::NONE);
  void endProfileCPUFallback(const std::string& opName);
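  // Typical call pattern (a sketch only; the actual call sites live in the
  // MPS backend's op implementations, and "cplState"/"add" are illustrative):
  //   auto& profiler = getMPSProfiler();
  //   profiler.beginProfileKernel(cplState, "add", {self, other});
  //   // ... encode and commit the kernel ...
  //   profiler.endProfileKernel(cplState);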
  // these are used to hook into the Python bindings for the torch.mps.profiler module.
  // this enables generating OS Signpost traces from MPSProfiler on demand
  // during runtime (instead of via environment variables).
  // The "mode" can be "interval", "event", or both as "interval,event"
  // for interval-based and/or event-based signpost tracing.
  void StartTrace(const std::string& mode, bool waitUntilCompleted);
  void StopTrace();
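  // For example, from Python (assuming the torch.mps.profiler bindings wrap
  // these two calls):
  //   torch.mps.profiler.start(mode="interval", wait_until_completed=False)
  //   ...  # run the workload to trace
  //   torch.mps.profiler.stop()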
  // Abstractions for GPU trace capturing
  bool isCaptureEnabled() const;
  bool isCapturing() const;
  void startCapture(const std::string& name, MPSStream* stream = nullptr);
  void stopCapture(MPSStream* stream = nullptr);
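  // A capture sketch (assuming isCaptureEnabled() reflects whether Metal
  // capture is permitted in the current environment, e.g. when launched with
  // MTL_CAPTURE_ENABLED=1):
  //   auto& profiler = getMPSProfiler();
  //   if (profiler.isCaptureEnabled()) {
  //     profiler.startCapture("my_op");
  //     // ... run the work to capture ...
  //     profiler.stopCapture();
  //   }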
  // convenience functions to indicate whether signpost tracing
  // or logging is enabled for the SignpostTypes
  bool isOperationProfilingEnabled() const {
    return (m_signpost_types & SignpostTypes::RUN_OPERATION) ||
           (m_log_options & (LogOptions::OPERATION_INFO | LogOptions::OPERATION_STATS));
  }
  bool isCopyProfilingEnabled() const {
    return (m_signpost_types & SignpostTypes::BLIT_COPY) ||
           (m_log_options & (LogOptions::COPY_INFO | LogOptions::COPY_STATS));
  }
  bool isCPUFallbackProfilingEnabled() const {
    return (m_signpost_types & SignpostTypes::CPU_FALLBACK) ||
           (m_log_options & (LogOptions::CPU_FALLBACK_INFO | LogOptions::CPU_FALLBACK_STATS));
  }
  bool isSignpostTracingEnabled() const {
    return (m_signpost_types != SignpostTypes::SIGNPOST_NONE);
  }
 private:
  // indicates which signpost types are enabled and traced by the MPS profiler.
  uint32_t m_signpost_types = 0;
  uint32_t m_profile_options = 0;
  uint32_t m_log_options = 0;
  uint64_t m_kernel_counter = 0;
  uint64_t m_graph_counter = 0;
  uint64_t m_cpu_fb_counter = 0;
  uint64_t m_copy_counter = 0;
  // technically, it's possible to trace both events and intervals at the same
  // time, so we use separate os_log categories for them
  os_log_t m_os_log_events;
  os_log_t m_os_log_intervals;
  // stats logging can run either from the destructor or from the signal handler,
  // so this is used to check whether logging has already started.
  std::atomic_bool hasLoggedStats{false};
  // indicates there are pending completionHandler callbacks that haven't been called yet.
  std::atomic_bool hasPendingCompletionHandlers{false};
  // used to catch the SIGINT signal and log profiling stats before exiting
  static struct sigaction currentSigint, previousSigint;

  // We use the following maps for two reasons:
  // 1) for interval-based signposts, the "begin" point won't be in the same
  //    function as the "end" point, where we need to retrieve the signpost's info
  // 2) operation info may need to be logged when the process ends, via
  //    LogOptions::OPERATION_INFO

  // the pointer key for this map is either "MPSGraph*" or "id<MTLComputePipelineState>" for Metal Kernels
  // this list is retained and can be logged along with aggregated profiling numbers when the process ends.
  std::unordered_map<uintptr_t, std::unique_ptr<OperationInfo>> m_op_info_list{};
  // the string key for this map is the name of the op that falls back to execute on the CPU
  // this list is retained and can be logged along with aggregated profiling numbers when the process ends.
  std::unordered_map<std::string, std::unique_ptr<CpuFbInfo>> m_cpu_fb_info_list{};
  // this list contains the info for copies, and its key is the unique profileId
  // which is generated from m_copy_counter
  // the copyInfo list is not retained.
  std::unordered_map<uint64_t, std::unique_ptr<CopyInfo>> m_copy_info_list{};
  // a short list that contains copy stats
  std::unordered_map<CopyInfo::Kind, std::unique_ptr<CopyStat>> m_copy_stat_list{};

  mutable MTLCaptureManager *captureManager = nil;
  unsigned captureCount = 0;
  void initialize();
  void beginProfileExecution(BaseInfo& info, bool cpuExecution = false);
  void endProfileExecution(BaseInfo& info, os_signpost_id_t event_signpost_id,
                           os_signpost_id_t interval_signpost_id,
                           double gpuTime, double schedulingTime);
  void addProfilerScheduledHandler(BaseInfo& info);
  void addProfilerCompletedHandler(BaseInfo& info, SyncType syncType);
  void emitSignpostEvent(SignpostTypes signpost_type, os_signpost_id_t signpost_id,
                         const std::string& msg) const;
  void beginSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id,
                             const std::string& msg) const;
  void endSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id) const;

  void updateCopyStats(const CopyInfo& copyInfo, double gpuTime, double schedulingTime);
  // returns true if logging the profiling info "during the execution" is enabled
  bool isProfileInfoLoggingEnabled(BaseInfo::Type infoType, bool isExecutionEnded);
  // logs all the profiling stats that are enabled
  void logProfilingStats();
  // logs kernel profiling stats when the process ends.
  void logOperationsProfilingStats(std::FILE* f) const;
  // logs CPU Fallback profiling stats when the process ends.
  void logCPUFallbackProfilingStats(std::FILE* f) const;
  // logs copy profiling stats when the process ends.
  void logCopyProfilingStats(std::FILE* f) const;

  os_signpost_id_t generateSignpostId(os_signpost_type_t signpostType, const void* ptr = nullptr);
  static SignpostTypes getSignpostType(BaseInfo::Type infoType);
  static void handleIntSignal(int signal);
};

} // namespace Profiler

Profiler::MPSProfiler& getMPSProfiler();

} // namespace at::mps