ConvUtils.h
#pragma once

#include <ATen/core/Tensor.h>
#include <ATen/TensorUtils.h>
#include <ATen/detail/CUDAHooksInterface.h>
#include <ATen/native/DispatchStub.h>
#include <c10/util/env.h>
#include <c10/util/irange.h>

// Standard headers used by check_args below.
#include <algorithm>
#include <iterator>
#include <sstream>

namespace at::native {

// Function-pointer types and dispatch stubs routing convolution backward
// (and mkldnn transpose forward) calls to backend-specific implementations.
using conv_depthwise2d_backward_fn = std::tuple<at::Tensor,at::Tensor>(*)(
    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
    at::IntArrayRef, at::IntArrayRef, std::array<bool, 2>);
DECLARE_DISPATCH(conv_depthwise2d_backward_fn, conv_depthwise2d_backward_stub);

using conv_depthwise3d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
    at::IntArrayRef, at::IntArrayRef, std::array<bool, 3>);
DECLARE_DISPATCH(conv_depthwise3d_backward_fn, conv_depthwise3d_backward_stub);

using cudnn_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor>(*)(
    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
    at::IntArrayRef, int64_t, bool, bool, bool, std::array<bool,2>);
DECLARE_DISPATCH(cudnn_convolution_backward_fn, cudnn_convolution_backward_stub);

using mps_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
    at::IntArrayRef, int64_t, std::array<bool,3>);
DECLARE_DISPATCH(mps_convolution_backward_fn, mps_convolution_backward_stub);

using cudnn_convolution_transpose_backward_fn = std::tuple<at::Tensor,at::Tensor>(*)(
    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
    at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, bool, std::array<bool,2>);
DECLARE_DISPATCH(cudnn_convolution_transpose_backward_fn, cudnn_convolution_transpose_backward_stub);

using miopen_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
    at::IntArrayRef, int64_t, bool, bool, std::array<bool,3>);
DECLARE_DISPATCH(miopen_convolution_backward_fn, miopen_convolution_backward_stub);

using miopen_convolution_transpose_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
    at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, std::array<bool,3>);
DECLARE_DISPATCH(miopen_convolution_transpose_backward_fn, miopen_convolution_transpose_backward_stub);

using miopen_depthwise_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
    at::IntArrayRef, int64_t, bool, bool, std::array<bool,3>);
DECLARE_DISPATCH(miopen_depthwise_convolution_backward_fn, miopen_depthwise_convolution_backward_stub);

using mkldnn_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
    at::IntArrayRef, int64_t, std::array<bool,3>);
DECLARE_DISPATCH(mkldnn_convolution_backward_fn, mkldnn_convolution_backward_stub);

using mkldnn_convolution_transpose_fn = Tensor(*)(const Tensor&, const Tensor&, const std::optional<Tensor>&,
    IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t);
DECLARE_DISPATCH(mkldnn_convolution_transpose_fn, mkldnn_convolution_transpose_stub);

using mkldnn_convolution_transpose_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
    at::IntArrayRef, at::IntArrayRef, int64_t, std::array<bool,3>);
DECLARE_DISPATCH(mkldnn_convolution_transpose_backward_fn, mkldnn_convolution_transpose_backward_stub);

using slow_conv_dilated2d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
    at::IntArrayRef, at::IntArrayRef, std::array<bool, 3>);
DECLARE_DISPATCH(slow_conv_dilated2d_backward_fn, slow_conv_dilated2d_backward_stub);

using slow_conv_dilated3d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
    at::IntArrayRef, at::IntArrayRef, std::array<bool, 3>);
DECLARE_DISPATCH(slow_conv_dilated3d_backward_fn, slow_conv_dilated3d_backward_stub);

using slow_conv_transpose2d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
    at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, std::array<bool,3>);
DECLARE_DISPATCH(slow_conv_transpose2d_backward_fn, slow_conv_transpose2d_backward_stub);

using slow_conv_transpose3d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
    at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, std::array<bool,3>);
DECLARE_DISPATCH(slow_conv_transpose3d_backward_fn, slow_conv_transpose3d_backward_stub);

namespace {
bool is_cudnnv8_heuristic_mode_b() {
  static const bool is_cudnnv8_heuristic_mode_b =
      c10::utils::check_env("TORCH_CUDNN_USE_HEURISTIC_MODE_B") == true;
  return is_cudnnv8_heuristic_mode_b;
}
} // namespace

inline bool cudnnv8_enabled_check_debug() {
  static bool cudnnv8_flag = c10::utils::check_env("TORCH_CUDNN_V8_API_DISABLED") != true;
  static bool cudnnv8_debug = c10::utils::check_env("TORCH_CUDNN_V8_API_DEBUG") == true;
  static uint8_t cudnnv8_debugcount = 0;
  if (cudnnv8_debug == 1 && cudnnv8_debugcount < 10) {
    TORCH_WARN("TORCH_CUDNN_V8_DEBUG ON, V8 ON: ", cudnnv8_flag,
               " TORCH_CUDNN_USE_HEURISTIC_MODE B: ", is_cudnnv8_heuristic_mode_b());
    cudnnv8_debugcount++;
  }
  return cudnnv8_flag == 1;
}

inline bool cudnnv8_use_heur_mode_b() {
  return is_cudnnv8_heuristic_mode_b();
}
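
// Example: running a process with
//   TORCH_CUDNN_V8_API_DEBUG=1 TORCH_CUDNN_USE_HEURISTIC_MODE_B=1
// makes cudnnv8_enabled_check_debug() emit the warning above (at most 10 times
// per process) and cudnnv8_use_heur_mode_b() return true, while setting
// TORCH_CUDNN_V8_API_DISABLED=1 makes cudnnv8_enabled_check_debug() return false.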

// Keep in sync with py::enum_ in Module.cpp
enum class ConvBackend {
  CudaDepthwise2d,
  CudaDepthwise3d,
  Cudnn,
  CudnnTranspose,
  Empty,
  Miopen,
  MiopenDepthwise,
  MiopenTranspose,
  Mkldnn,
  MkldnnTranspose,
  MkldnnEmpty,
  NnpackSpatial,
  Overrideable,
  Slow2d,
  Slow3d,
  SlowDilated2d,
  SlowDilated3d,
  SlowTranspose2d,
  SlowTranspose3d,
  Winograd3x3Depthwise,
  Xnnpack2d,
  Mps,
  MpsTranspose,
};

// Overload for selecting the convolution backend from the full set of convolution inputs.
// This overload is exposed to python for testing, etc.
TORCH_API ConvBackend select_conv_backend(
    const Tensor& input, const Tensor& weight, const std::optional<Tensor>& bias_opt,
    SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation,
    bool transposed, SymIntArrayRef output_padding, c10::SymInt groups, const at::OptionalSymIntArrayRef bias_sizes_opt);

TORCH_API at::MemoryFormat _determine_backend_memory_format(
    const Tensor& input,
    const Tensor& weight,
    const ConvBackend backend);
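
// Illustrative usage sketch (the variable names here are hypothetical; the
// real call sites live in the convolution implementation):
//
//   ConvBackend backend = select_conv_backend(
//       input, weight, bias, stride, padding, dilation,
//       /*transposed=*/false, output_padding, groups, bias_sizes);
//   auto memory_format = _determine_backend_memory_format(input, weight, backend);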

// ---------------------------------------------------------------------
//
// Math
//
// ---------------------------------------------------------------------

constexpr int input_batch_size_dim = 0;  // also grad_input
constexpr int input_channels_dim = 1;
constexpr int output_batch_size_dim = 0;  // also grad_output
constexpr int output_channels_dim = 1;
constexpr int weight_output_channels_dim = 0;
constexpr int weight_input_channels_dim = 1;

// Often written as 2 + max_dim (extra dims for batch size and channels)
constexpr int max_dim = 3;

// ---------------------------------------------------------------------
//
// Checking
//
// ---------------------------------------------------------------------

// Used on pad, stride and dilation
static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name)
{
  TORCH_CHECK(args.size() <= expected_size,
              "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ",
              expected_size, " (while checking arguments for ", c, ")");
  TORCH_CHECK(args.size() >= expected_size,
              "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ",
              expected_size, " (while checking arguments for ", c, ")");

  auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;});
  if (num_negative_values > 0){
    std::stringstream ss;
    ss << arg_name << " should be greater than zero but got (";
    std::copy(args.begin(), args.end() - 1, std::ostream_iterator<int>(ss,", "));
    ss << args.back() << ")" << " (while checking arguments for " << c << ")";
    AT_ERROR(ss.str());
  }
}

// NOTE [ Convolution checks ]
//
// NB: For many call sites, it is not strictly necessary to check all of
// these relationships (for example, for forward convolution, we compute
// the size of output ourselves, so we don't actually need to check
// output). However, writing a single function that does everything
// means we get to reuse it for both forwards and all backwards
// variants, even when the set of "real" inputs varies. The magic of
// relational computing!
//
// (There is one downside, which is that it is slightly harder to write
// error messages which are able to distinguish between real inputs
// (which the user can change) and computed inputs (which the user can
// only indirectly affect). It would be an interesting exercise to
// come up with a general framework to handle such situations.)
inline void convolution_shape_check(
    CheckedFrom c,
    const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output,
    IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups)
{
  check_args(c, padding, input->dim() - 2, "padding");
  check_args(c, stride, padding.size(), "stride");
  check_args(c, dilation, padding.size(), "dilation");

  // Input
  checkDimRange(c, input, 3, 6 /* exclusive */);
  checkSize_symint(c, input, input_channels_dim, weight->size(1) * groups);

  // Weight
  checkSameDim(c, input, weight);

  // TODO: check that output->size() matches output_sizes
  // TODO: check that weight matches output->sizes()
  checkSameDim(c, input, output);
}
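
// For reference, the shapes being related above are (for a 2d,
// non-transposed convolution):
//   input:  (batch, in_channels, iH, iW)
//   weight: (out_channels, in_channels / groups, kH, kW)
//   output: (batch, out_channels, oH, oW)
// so the channels check amounts to in_channels == weight.size(1) * groups.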

// NB: conv_output_size and conv_input_size are not bijections,
// as conv_output_size loses information; this is why conv_input_size
// takes an extra output_padding argument to resolve the ambiguity.

template <typename T>
inline std::vector<T> _conv_output_size(
    ArrayRef<T> input_size, ArrayRef<T> weight_size,
    ArrayRef<T> padding, ArrayRef<T> stride, ArrayRef<T> dilation = ArrayRef<T>()
) {
  // ASSERT(input_size.size() > 2)
  // ASSERT(input_size.size() == weight_size.size())
  bool has_dilation = !dilation.empty();
  auto dim = input_size.size();
  std::vector<T> output_size(dim);
  output_size[0] = input_size[input_batch_size_dim];
  output_size[1] = weight_size[weight_output_channels_dim];
  for (const auto d : c10::irange(2, dim)) {
    auto dilation_ = has_dilation ? dilation[d - 2] : 1;
    auto kernel = dilation_ * (weight_size[d] - 1) + 1;
    output_size[d] = (input_size[d] + (2 * padding[d - 2]) - kernel) / stride[d - 2] + 1;
  }
  return output_size;
}
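
// Example: input_size = {1, 3, 32, 32}, weight_size = {16, 3, 3, 3},
// padding = {1, 1}, stride = {1, 1}, no dilation gives, per spatial dim,
//   kernel = 1 * (3 - 1) + 1 = 3
//   output = (32 + 2*1 - 3) / 1 + 1 = 32
// i.e. output_size = {1, 16, 32, 32}.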

inline std::vector<int64_t> conv_output_size(
    IntArrayRef input_size, IntArrayRef weight_size,
    IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef()
) {
  return _conv_output_size(input_size, weight_size, padding, stride, dilation);
}

inline std::vector<c10::SymInt> conv_output_size(
    SymIntArrayRef input_size, SymIntArrayRef weight_size,
    SymIntArrayRef padding, SymIntArrayRef stride, SymIntArrayRef dilation = SymIntArrayRef()
) {
  return _conv_output_size(input_size, weight_size, padding, stride, dilation);
}

template <typename T>
std::vector<T> _conv_input_size(
    ArrayRef<T> output_size, ArrayRef<T> weight_size,
    ArrayRef<T> padding, ArrayRef<T> output_padding, ArrayRef<T> stride, ArrayRef<T> dilation, T groups
) {
  // ASSERT(output_size.size() > 2)
  // ASSERT(output_size.size() == weight_size.size())
  auto dim = output_size.size();
  std::vector<T> input_size(dim);
  input_size[0] = output_size[output_batch_size_dim];
  input_size[1] = weight_size[weight_input_channels_dim] * groups;
  for (const auto d : c10::irange(2, dim)) {
    auto kernel = (weight_size[d] - 1) * dilation[d - 2] + 1;
    input_size[d] = (output_size[d] - 1) * stride[d - 2] - (padding[d - 2] * 2) +
                    kernel + output_padding[d - 2];
  }
  return input_size;
}
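
// Example of the ambiguity mentioned above: with kernel 3, padding 1,
// stride 2, dilation 1, spatial input sizes 7 and 8 both map to output size 4
// in _conv_output_size ((7 + 2 - 3) / 2 + 1 == (8 + 2 - 3) / 2 + 1 == 4 with
// integer division). Inverting with _conv_input_size gives
//   (4 - 1) * 2 - 2 + 3 + output_padding = 7 + output_padding,
// so output_padding = 0 recovers 7 and output_padding = 1 recovers 8.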

inline std::vector<c10::SymInt> conv_input_size(
    SymIntArrayRef output_size, SymIntArrayRef weight_size,
    SymIntArrayRef padding, SymIntArrayRef output_padding, SymIntArrayRef stride, SymIntArrayRef dilation, c10::SymInt groups
) {
  return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups);
}

inline std::vector<int64_t> conv_input_size(
    IntArrayRef output_size, IntArrayRef weight_size,
    IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
) {
  return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups);
}

template <typename T>
std::vector<T> _conv_weight_size(
    ArrayRef<T> input_size, ArrayRef<T> output_size,
    ArrayRef<T> padding, ArrayRef<T> output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
) {
  auto dim = input_size.size();
  std::vector<T> weight_size(dim);
  weight_size[0] = output_size[1];
  weight_size[1] = input_size[1] / groups;
  for (const auto d : c10::irange(2, dim)) {
    auto kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2]
                + padding[d - 2] * 2 - output_padding[d - 2];
    weight_size[d] = (kernel - 1) / dilation[d - 2] + 1;
  }
  return weight_size;
}
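
// Example: input_size[d] = 7, output_size[d] = 4, stride 2, padding 1,
// output_padding 0, dilation 1 gives
//   kernel = 7 - (4 - 1) * 2 + 2*1 - 0 = 3
//   weight_size[d] = (3 - 1) / 1 + 1 = 3
// i.e. a kernel extent of 3 is recovered for that dimension.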

inline std::vector<c10::SymInt> conv_weight_size(
    SymIntArrayRef input_size, SymIntArrayRef output_size,
    SymIntArrayRef padding, SymIntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
) {
  return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups);
}

inline std::vector<int64_t> conv_weight_size(
    IntArrayRef input_size, IntArrayRef output_size,
    IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
) {
  return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups);
}

inline Tensor reshape_bias(int64_t dim, const Tensor& bias) {
  std::vector<int64_t> shape(dim, 1);
  shape[1] = -1;
  return bias.reshape(shape);
}
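
// Example: for dim = 4, a bias of shape {C} is reshaped to {1, C, 1, 1}
// so it broadcasts over the batch and spatial dimensions of an NCHW output.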

inline at::MemoryFormat cudnn_conv_suggest_memory_format(const at::Tensor& input, const at::Tensor& weight) {
  // disable NHWC for float64 input.
  if (!at::detail::getCUDAHooks().compiledWithCuDNN() ||
      input.scalar_type() == at::kDouble ||
      weight.scalar_type() == at::kDouble) {
    return at::MemoryFormat::Contiguous;
  }
  long cudnn_version = at::detail::getCUDAHooks().versionCuDNN();
  auto input_memory_format = input.suggest_memory_format();
  auto weight_memory_format = weight.suggest_memory_format();
  auto weight_ndim = weight.ndimension();

  bool can_use_cudnn_channels_last_2d = (cudnn_version >= 7603) && (weight_ndim == 4) && (
      (input_memory_format == at::MemoryFormat::ChannelsLast) ||
      (weight_memory_format == at::MemoryFormat::ChannelsLast)
  );
  if (can_use_cudnn_channels_last_2d) {
    return at::MemoryFormat::ChannelsLast;
  }

  bool can_use_cudnn_channels_last_3d = (cudnn_version >= 8005) && (weight_ndim == 5) && (
      (input_memory_format == at::MemoryFormat::ChannelsLast3d) ||
      (weight_memory_format == at::MemoryFormat::ChannelsLast3d)
  );
  if (can_use_cudnn_channels_last_3d) {
    return at::MemoryFormat::ChannelsLast3d;
  }

  return at::MemoryFormat::Contiguous;
}
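
// Example: a float32, 4-d input already in channels-last layout with
// cuDNN version >= 7603 yields at::MemoryFormat::ChannelsLast, while a
// double input always yields at::MemoryFormat::Contiguous.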

// controls whether emptyCache will be called following cudnn conv benchmarking
TORCH_API void _cudnn_set_conv_benchmark_empty_cache(bool enable);
TORCH_API bool _cudnn_get_conv_benchmark_empty_cache();

inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
  // disable NHWC for float64 input.
  if (!at::detail::getCUDAHooks().compiledWithMIOpen() ||
      input.scalar_type() == at::kDouble ||
      weight.scalar_type() == at::kDouble) {
    return false;
  }

  bool can_use_miopen_channels_last_2d = false;
  // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen
  // See #64427
  static std::optional<bool> PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC");

  auto input_memory_format = input.suggest_memory_format();
  auto weight_memory_format = weight.suggest_memory_format();

  can_use_miopen_channels_last_2d = PYTORCH_MIOPEN_SUGGEST_NHWC && *PYTORCH_MIOPEN_SUGGEST_NHWC && (
      (input_memory_format == at::MemoryFormat::ChannelsLast) ||
      (weight_memory_format == at::MemoryFormat::ChannelsLast)
  );

  bool can_use_miopen_channels_last_3d = false;

  return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d;
}
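
// Example: on a MIOpen build, channels-last is only suggested here when the
// process runs with PYTORCH_MIOPEN_SUGGEST_NHWC=1 and the input or weight
// already suggests a ChannelsLast (2d) layout; 3d channels-last is never
// suggested by this helper.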

inline bool mkldnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
  // disable NHWC for float64 input.
  if (input.scalar_type() == at::kDouble ||
      weight.scalar_type() == at::kDouble) {
    return false;
  }

  // disable NHWC for MkldnnCPU tensor.
  if (input.is_mkldnn() || weight.is_mkldnn()) {
    return false;
  }

  auto input_memory_format = input.suggest_memory_format();
  auto weight_memory_format = weight.suggest_memory_format();

  bool can_use_mkldnn_channels_last_2d =
      (input_memory_format == at::MemoryFormat::ChannelsLast) ||
      (weight_memory_format == at::MemoryFormat::ChannelsLast);

  bool can_use_mkldnn_channels_last_3d =
      (input_memory_format == at::MemoryFormat::ChannelsLast3d) ||
      (weight_memory_format == at::MemoryFormat::ChannelsLast3d);

  return can_use_mkldnn_channels_last_2d || can_use_mkldnn_channels_last_3d;
}

inline bool thnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
  auto input_memory_format = input.suggest_memory_format();
  auto weight_memory_format = weight.suggest_memory_format();

  bool can_use_thnn_channels_last_2d = input.device().is_cpu() && (
      (input_memory_format == at::MemoryFormat::ChannelsLast) ||
      (weight_memory_format == at::MemoryFormat::ChannelsLast));

  return can_use_thnn_channels_last_2d;
}

inline bool xpu_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
  // check layout only for xpu tensor.
  if (!input.is_xpu() || !weight.is_xpu()) {
    return false;
  }

  // disable NHWC for float64 input.
  if (input.scalar_type() == at::kDouble ||
      weight.scalar_type() == at::kDouble) {
    return false;
  }

  auto input_memory_format = input.suggest_memory_format();
  auto weight_memory_format = weight.suggest_memory_format();

  bool can_use_xpu_channels_last_2d =
      (input_memory_format == at::MemoryFormat::ChannelsLast) ||
      (weight_memory_format == at::MemoryFormat::ChannelsLast);

  bool can_use_xpu_channels_last_3d =
      (input_memory_format == at::MemoryFormat::ChannelsLast3d) ||
      (weight_memory_format == at::MemoryFormat::ChannelsLast3d);

  return can_use_xpu_channels_last_2d || can_use_xpu_channels_last_3d;
}

} // namespace at::native