Config.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493
  1. /*
  2. * Copyright (c) Meta Platforms, Inc. and affiliates.
  3. * All rights reserved.
  4. *
  5. * This source code is licensed under the BSD-style license found in the
  6. * LICENSE file in the root directory of this source tree.
  7. */
  8. #pragma once
  9. #include "AbstractConfig.h"
  10. #include "ActivityType.h"
  11. #include <assert.h>
  12. #include <chrono>
  13. #include <functional>
  14. #include <set>
  15. #include <string>
  16. #include <vector>
  17. namespace libkineto {
  18. class Config : public AbstractConfig {
  19. public:
  20. Config();
  21. Config& operator=(const Config&) = delete;
  22. Config(Config&&) = delete;
  23. Config& operator=(Config&&) = delete;
  24. // Return a full copy including feature config object
  25. std::unique_ptr<Config> clone() const {
  26. auto cfg = std::unique_ptr<Config>(new Config(*this));
  27. cloneFeaturesInto(*cfg);
  28. return cfg;
  29. }
  30. bool handleOption(const std::string& name, std::string& val) override;
  31. void setClientDefaults() override;
  32. // Log events to this file
  33. const std::string& eventLogFile() const {
  34. return eventLogFile_;
  35. }
  36. bool activityProfilerEnabled() const {
  37. return activityProfilerEnabled_ ||
  38. activitiesOnDemandTimestamp_.time_since_epoch().count() > 0;
  39. }
  40. // Log activitiy trace to this file
  41. const std::string& activitiesLogFile() const {
  42. return activitiesLogFile_;
  43. }
  44. // Log activitiy trace to this url
  45. const std::string& activitiesLogUrl() const {
  46. return activitiesLogUrl_;
  47. }
  48. void setActivitiesLogUrl(const std::string& url) {
  49. activitiesLogUrl_ = url;
  50. }
  51. bool activitiesLogToMemory() const {
  52. return activitiesLogToMemory_;
  53. }
  54. bool eventProfilerEnabled() const {
  55. return !eventNames_.empty() || !metricNames_.empty();
  56. }
  57. // Is profiling enabled for the given device?
  58. bool eventProfilerEnabledForDevice(uint32_t dev) const {
  59. return 0 != (eventProfilerDeviceMask_ & (1 << dev));
  60. }
  61. // Take a sample (read hardware counters) at this frequency.
  62. // This controls how often counters are read - if all counters cannot
  63. // be collected simultaneously then multiple samples are needed to
  64. // collect all requested counters - see multiplex period.
  65. std::chrono::milliseconds samplePeriod() const {
  66. return samplePeriod_;
  67. }
  68. void setSamplePeriod(std::chrono::milliseconds period) {
  69. samplePeriod_ = period;
  70. }
  71. // When all requested counters cannot be collected simultaneously,
  72. // counters will be multiplexed at this frequency.
  73. // Multiplexing can have a large performance impact if done frequently.
  74. // To avoid a perf impact, keep this at 1s or above.
  75. std::chrono::milliseconds multiplexPeriod() const {
  76. return multiplexPeriod_;
  77. }
  78. void setMultiplexPeriod(std::chrono::milliseconds period) {
  79. multiplexPeriod_ = period;
  80. }
  81. // Report counters at this frequency. Note that several samples can
  82. // be reported each time, see samplesPerReport.
  83. std::chrono::milliseconds reportPeriod() const {
  84. return reportPeriod_;
  85. }
  86. void setReportPeriod(std::chrono::milliseconds msecs);
  87. // Number of samples dispatched each report period.
  88. // Must be in the range [1, report period / sample period].
  89. // In other words, aggregation is supported but not interpolation.
  90. int samplesPerReport() const {
  91. return samplesPerReport_;
  92. }
  93. void setSamplesPerReport(int count) {
  94. samplesPerReport_ = count;
  95. }
  96. // The names of events to collect
  97. const std::set<std::string>& eventNames() const {
  98. return eventNames_;
  99. }
  100. // Add additional events to be profiled
  101. void addEvents(const std::set<std::string>& names) {
  102. eventNames_.insert(names.begin(), names.end());
  103. }
  104. // The names of metrics to collect
  105. const std::set<std::string>& metricNames() const {
  106. return metricNames_;
  107. }
  108. // Add additional metrics to be profiled
  109. void addMetrics(const std::set<std::string>& names) {
  110. metricNames_.insert(names.begin(), names.end());
  111. }
  112. const std::vector<int>& percentiles() const {
  113. return eventReportPercentiles_;
  114. }
  115. // Profile for this long, then revert to base config
  116. std::chrono::seconds eventProfilerOnDemandDuration() const {
  117. return eventProfilerOnDemandDuration_;
  118. }
  119. void setEventProfilerOnDemandDuration(std::chrono::seconds duration) {
  120. eventProfilerOnDemandDuration_ = duration;
  121. }
  122. // Too many event profilers on a single system can overload the driver.
  123. // At some point, latencies shoot through the roof and collection of samples
  124. // becomes impossible. To avoid this situation we have a limit of profilers
  125. // per GPU.
  126. // NOTE: Communication with a daemon is needed for this feature.
  127. // Library must be built with an active DaemonConfigLoader.
  128. int maxEventProfilersPerGpu() const {
  129. return eventProfilerMaxInstancesPerGpu_;
  130. }
  131. // On Cuda11 we've seen occasional hangs when reprogramming counters
  132. // Monitor profiling threads and report when a thread is not responding
  133. // for a given number of seconds.
  134. // A period of 0 means disable.
  135. std::chrono::seconds eventProfilerHeartbeatMonitorPeriod() const {
  136. return eventProfilerHeartbeatMonitorPeriod_;
  137. }
  138. // The types of activities selected in the configuration file
  139. const std::set<ActivityType>& selectedActivityTypes() const {
  140. return selectedActivityTypes_;
  141. }
  142. void setSelectedActivityTypes(const std::set<ActivityType>& types) {
  143. selectedActivityTypes_ = types;
  144. }
  145. bool isReportInputShapesEnabled() const {
  146. return enableReportInputShapes_;
  147. }
  148. bool isProfileMemoryEnabled() const {
  149. return enableProfileMemory_;
  150. }
  151. bool isWithStackEnabled() const {
  152. return enableWithStack_;
  153. }
  154. bool isWithFlopsEnabled() const {
  155. return enableWithFlops_;
  156. }
  157. bool isWithModulesEnabled() const {
  158. return enableWithModules_;
  159. }
  160. // Trace for this long
  161. std::chrono::milliseconds activitiesDuration() const {
  162. return activitiesDuration_;
  163. }
  164. // Trace for this many iterations, determined by external API
  165. int activitiesRunIterations() const {
  166. return activitiesRunIterations_;
  167. }
  168. int activitiesMaxGpuBufferSize() const {
  169. return activitiesMaxGpuBufferSize_;
  170. }
  171. std::chrono::seconds activitiesWarmupDuration() const {
  172. return activitiesWarmupDuration_;
  173. }
  174. int activitiesWarmupIterations() const {
  175. return activitiesWarmupIterations_;
  176. }
  177. // Show CUDA Synchronization Stream Wait Events
  178. bool activitiesCudaSyncWaitEvents() const {
  179. return activitiesCudaSyncWaitEvents_;
  180. }
  181. void setActivitiesCudaSyncWaitEvents(bool enable) {
  182. activitiesCudaSyncWaitEvents_ = enable;
  183. }
  184. // Timestamp at which the profiling to start, requested by the user.
  185. const std::chrono::time_point<std::chrono::system_clock> requestTimestamp()
  186. const {
  187. if (profileStartTime_.time_since_epoch().count()) {
  188. return profileStartTime_;
  189. }
  190. // If no one requested timestamp, return 0.
  191. if (requestTimestamp_.time_since_epoch().count() == 0) {
  192. return requestTimestamp_;
  193. }
  194. // TODO(T94634890): Deprecate requestTimestamp
  195. return requestTimestamp_ + maxRequestAge() + activitiesWarmupDuration();
  196. }
  197. bool hasProfileStartTime() const {
  198. return requestTimestamp_.time_since_epoch().count() > 0 ||
  199. profileStartTime_.time_since_epoch().count() > 0;
  200. }
  201. int profileStartIteration() const {
  202. return profileStartIteration_;
  203. }
  204. bool hasProfileStartIteration() const {
  205. return profileStartIteration_ >= 0 && activitiesRunIterations_ > 0;
  206. }
  207. void setProfileStartIteration(int iter) {
  208. profileStartIteration_ = iter;
  209. }
  210. int profileStartIterationRoundUp() const {
  211. return profileStartIterationRoundUp_;
  212. }
  213. // calculate the start iteration accounting for warmup
  214. int startIterationIncludingWarmup() const {
  215. if (!hasProfileStartIteration()) {
  216. return -1;
  217. }
  218. return profileStartIteration_ - activitiesWarmupIterations_;
  219. }
  220. const std::chrono::seconds maxRequestAge() const;
  221. // All VLOG* macros will log if the verbose log level is >=
  222. // the verbosity specified for the verbose log message.
  223. // Default value is -1, so messages with log level 0 will log by default.
  224. int verboseLogLevel() const {
  225. return verboseLogLevel_;
  226. }
  227. // Modules for which verbose logging is enabled.
  228. // If empty, logging is enabled for all modules.
  229. const std::vector<std::string>& verboseLogModules() const {
  230. return verboseLogModules_;
  231. }
  232. bool sigUsr2Enabled() const {
  233. return enableSigUsr2_;
  234. }
  235. bool ipcFabricEnabled() const {
  236. return enableIpcFabric_;
  237. }
  238. std::chrono::seconds onDemandConfigUpdateIntervalSecs() const {
  239. return onDemandConfigUpdateIntervalSecs_;
  240. }
  241. static std::chrono::milliseconds alignUp(
  242. std::chrono::milliseconds duration,
  243. std::chrono::milliseconds alignment) {
  244. duration += alignment;
  245. return duration - (duration % alignment);
  246. }
  247. std::chrono::time_point<std::chrono::system_clock>
  248. eventProfilerOnDemandStartTime() const {
  249. return eventProfilerOnDemandTimestamp_;
  250. }
  251. std::chrono::time_point<std::chrono::system_clock>
  252. eventProfilerOnDemandEndTime() const {
  253. return eventProfilerOnDemandTimestamp_ + eventProfilerOnDemandDuration_;
  254. }
  255. std::chrono::time_point<std::chrono::system_clock>
  256. activityProfilerRequestReceivedTime() const {
  257. return activitiesOnDemandTimestamp_;
  258. }
  259. static constexpr std::chrono::milliseconds kControllerIntervalMsecs{1000};
  260. // Users may request and set trace id and group trace id.
  261. const std::string& requestTraceID() const {
  262. return requestTraceID_;
  263. }
  264. void setRequestTraceID(const std::string& tid) {
  265. requestTraceID_ = tid;
  266. }
  267. const std::string& requestGroupTraceID() const {
  268. return requestGroupTraceID_;
  269. }
  270. void setRequestGroupTraceID(const std::string& gtid) {
  271. requestGroupTraceID_ = gtid;
  272. }
  273. size_t cuptiDeviceBufferSize() const {
  274. return cuptiDeviceBufferSize_;
  275. }
  276. size_t cuptiDeviceBufferPoolLimit() const {
  277. return cuptiDeviceBufferPoolLimit_;
  278. }
  279. void updateActivityProfilerRequestReceivedTime();
  280. void printActivityProfilerConfig(std::ostream& s) const override;
  281. void validate(const std::chrono::time_point<std::chrono::system_clock>&
  282. fallbackProfileStartTime) override;
  283. static void addConfigFactory(
  284. std::string name,
  285. std::function<AbstractConfig*(Config&)> factory);
  286. void print(std::ostream& s) const;
  287. // Config relies on some state with global static lifetime. If other
  288. // threads are using the config, it's possible that the global state
  289. // is destroyed before the threads stop. By hanging onto this handle,
  290. // correct destruction order can be ensured.
  291. static std::shared_ptr<void> getStaticObjectsLifetimeHandle();
  292. private:
  293. explicit Config(const Config& other) = default;
  294. AbstractConfig* cloneDerived(AbstractConfig& parent) const override {
  295. // Clone from AbstractConfig not supported
  296. assert(false);
  297. return nullptr;
  298. }
  299. uint8_t createDeviceMask(const std::string& val);
  300. // Adds valid activity types from the user defined string list in the
  301. // configuration file
  302. void setActivityTypes(const std::vector<std::string>& selected_activities);
  303. // Sets the default activity types to be traced
  304. void selectDefaultActivityTypes() {
  305. // If the user has not specified an activity list, add all types
  306. for (ActivityType t : defaultActivityTypes()) {
  307. selectedActivityTypes_.insert(t);
  308. }
  309. }
  310. int verboseLogLevel_;
  311. std::vector<std::string> verboseLogModules_;
  312. // Event profiler
  313. // These settings are also supported in on-demand mode
  314. std::chrono::milliseconds samplePeriod_;
  315. std::chrono::milliseconds reportPeriod_;
  316. int samplesPerReport_;
  317. std::set<std::string> eventNames_;
  318. std::set<std::string> metricNames_;
  319. // On-demand duration
  320. std::chrono::seconds eventProfilerOnDemandDuration_;
  321. // Last on-demand request
  322. std::chrono::time_point<std::chrono::system_clock>
  323. eventProfilerOnDemandTimestamp_;
  324. int eventProfilerMaxInstancesPerGpu_;
  325. // Monitor whether event profiler threads are stuck
  326. // at this frequency
  327. std::chrono::seconds eventProfilerHeartbeatMonitorPeriod_;
  328. // These settings can not be changed on-demand
  329. std::string eventLogFile_;
  330. std::vector<int> eventReportPercentiles_ = {5, 25, 50, 75, 95};
  331. uint8_t eventProfilerDeviceMask_ = ~0;
  332. std::chrono::milliseconds multiplexPeriod_;
  333. // Activity profiler
  334. bool activityProfilerEnabled_;
  335. std::set<ActivityType> selectedActivityTypes_;
  336. // The activity profiler settings are all on-demand
  337. std::string activitiesLogFile_;
  338. std::string activitiesLogUrl_;
  339. // Log activities to memory buffer
  340. bool activitiesLogToMemory_{false};
  341. int activitiesMaxGpuBufferSize_;
  342. std::chrono::seconds activitiesWarmupDuration_;
  343. int activitiesWarmupIterations_;
  344. bool activitiesCudaSyncWaitEvents_;
  345. // Enable Profiler Config Options
  346. // Temporarily disable shape collection until we re-roll out the feature for on-demand cases
  347. bool enableReportInputShapes_{false};
  348. bool enableProfileMemory_{false};
  349. bool enableWithStack_{false};
  350. bool enableWithFlops_{false};
  351. bool enableWithModules_{false};
  352. // Profile for specified iterations and duration
  353. std::chrono::milliseconds activitiesDuration_;
  354. int activitiesRunIterations_;
  355. // Below are not used
  356. // Use this net name for iteration count
  357. std::string activitiesExternalAPIIterationsTarget_;
  358. // Only profile nets that includes this in the name
  359. std::vector<std::string> activitiesExternalAPIFilter_;
  360. // Only profile nets with at least this many operators
  361. int activitiesExternalAPINetSizeThreshold_;
  362. // Only profile nets with at least this many GPU operators
  363. int activitiesExternalAPIGpuOpCountThreshold_;
  364. // Last activity profiler request
  365. std::chrono::time_point<std::chrono::system_clock>
  366. activitiesOnDemandTimestamp_;
  367. // ActivityProfilers are triggered by either:
  368. // Synchronized start timestamps
  369. std::chrono::time_point<std::chrono::system_clock> profileStartTime_;
  370. // Or start iterations.
  371. int profileStartIteration_;
  372. int profileStartIterationRoundUp_;
  373. // DEPRECATED
  374. std::chrono::time_point<std::chrono::system_clock> requestTimestamp_;
  375. // Enable profiling via SIGUSR2
  376. bool enableSigUsr2_;
  377. // Enable IPC Fabric instead of thrift communication
  378. bool enableIpcFabric_;
  379. std::chrono::seconds onDemandConfigUpdateIntervalSecs_;
  380. // Logger Metadata
  381. std::string requestTraceID_;
  382. std::string requestGroupTraceID_;
  383. // CUPTI Device Buffer
  384. size_t cuptiDeviceBufferSize_;
  385. size_t cuptiDeviceBufferPoolLimit_;
  386. };
  387. constexpr char kUseDaemonEnvVar[] = "KINETO_USE_DAEMON";
  388. } // namespace libkineto