Reduce.h

#pragma once

#include <ATen/native/cpu/Loops.h>
#include <ATen/Parallel.h>
#include <c10/util/TypeList.h>
#include <c10/core/Scalar.h>
#include <c10/util/irange.h>

#include <sstream>
#include <type_traits>

namespace at { namespace native { inline namespace CPU_CAPABILITY {

using namespace vec;
#define VEC_LOOP_HEADER(func_t, data) \
  using scalar_t = typename function_traits<func_t>::result_type; \
  using Vec = Vectorized<scalar_t>; \
  char* out_ptr = data[0]; \
  (void) out_ptr;
// reduction that is contiguous over the input in dim 0
template <typename traits>
inline bool is_contiguous_reduction(const int64_t* strides) {
  return strides[0] == 0 &&
         strides[1] == sizeof(typename traits::arg2_t);
}

// reduction that is contiguous over the input in dim 1
template <typename traits>
inline bool is_outer_reduction(const int64_t* strides) {
  return strides[0] == 0 &&
         strides[2] == sizeof(typename traits::result_type) &&
         strides[3] == sizeof(typename traits::arg2_t);
}
template <typename func_t, typename vec_func_t>
inline void vectorized_reduction(char** data, int64_t n, int64_t stride,
                                 func_t op, vec_func_t vop, bool reduce) {
  VEC_LOOP_HEADER(func_t, data)
  const char* in1_ptr = data[1];
  Vec acc[4];
  for (const auto j : c10::irange(4)) {
    acc[j] = Vec::loadu(in1_ptr + j * Vec::size() * sizeof(scalar_t));
  }
  for (const auto i : c10::irange(1, n)) {
    const char* ptr = in1_ptr + stride * i;
    acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size() * sizeof(scalar_t))));
    acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size() * sizeof(scalar_t))));
    acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size() * sizeof(scalar_t))));
    acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size() * sizeof(scalar_t))));
  }
  if (reduce) {
    scalar_t buffer[Vec::size()];
    acc[0] = vop(vop(acc[0], acc[1]), vop(acc[2], acc[3]));
    acc[0].store(buffer);
    for (const auto j : c10::irange(1, Vec::size())) {
      buffer[0] = op(buffer[0], buffer[j]);
    }
    auto dst = (scalar_t*)out_ptr;
    *dst = op(*dst, buffer[0]);
  } else {
    for (const auto j : c10::irange(4)) {
      auto dst = out_ptr + j * Vec::size() * sizeof(scalar_t);
      acc[j] = vop(acc[j], Vec::loadu(dst));
      acc[j].store(dst);
    }
  }
}
template <typename F>
inline void UNARY_OUTER_LOOP(char* data[2], const int64_t strides[2], int64_t n, F f) {
  for (const auto j C10_UNUSED : c10::irange(n)) {
    f();
    data[0] += strides[0];
    data[1] += strides[1];
  }
}
// computes the reduction out = op(out, in)
template <typename func_t, typename vec_func_t>
inline void vectorized_inner_reduction(char** data, int64_t n, func_t op, vec_func_t vop) {
  VEC_LOOP_HEADER(func_t, data)
  int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
  int64_t count = n / (4 * Vec::size());
  if (count > 0) {
    vectorized_reduction(data, count, vector_stride, op, vop, /*reduce=*/true);
  }
  char* ptrs[3] = { data[0], data[0], data[1] };
  int64_t strides[] = { 0, 0, sizeof(scalar_t) };
  basic_loop(ptrs, strides, count * 4 * Vec::size(), n, op);
}
// computes the reduction out = op(out, in)
template <typename func_t, typename vec_func_t>
inline void vectorized_outer_reduction(char** data, int64_t inner_stride, int64_t size0, int64_t size1, func_t op, vec_func_t vop) {
  VEC_LOOP_HEADER(func_t, data)
  // reduce down each column of 4 * Vec::size() elements (128 or 256 bytes)
#if defined(CPU_CAPABILITY_AVX512)
  int64_t outer_stride[2] = { 256, 256 };
#else
  int64_t outer_stride[2] = { 128, 128 };
#endif
  UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size()), [&] {
    vectorized_reduction(data, size0, inner_stride, op, vop, /*reduce=*/false);
  });

  // reduce down the remaining columns
  int64_t step[] = { sizeof(scalar_t), sizeof(scalar_t) };
  int64_t remaining = size1 % (4 * Vec::size());
  UNARY_OUTER_LOOP(data, step, remaining, [&] {
    char* ptrs[3] = { data[0], data[0], data[1] };
    int64_t strides[] = { 0, 0, inner_stride };
    basic_loop(ptrs, strides, 0, size0, op);
  });
}
template<typename traits, typename res_t>
static void set_result(const int index, const res_t result, const TensorIteratorBase &iter, const int num_outputs) {
  // static_assert(std::is_same<res_t, typename traits::arg2_t>::value, "data types must match");
  if (index < num_outputs) {
    char *out = (char *) iter.data_ptr(index);
    *(res_t *) out = result;
  }
}

template<typename traits, typename res_t>
static void set_results(const res_t result, const TensorIteratorBase &iter, const int num_outputs) {
  AT_ASSERT(num_outputs == 1);
  set_result<traits>(0, result, iter, num_outputs);
}

template<typename traits, std::size_t i = 0, typename... tuple_t>
inline typename std::enable_if<i == sizeof...(tuple_t), std::size_t>::type
for_each_in_tuple(const std::tuple<tuple_t...>& /*t*/, const TensorIteratorBase& /*iter*/, const int /*num_outputs*/) {
  return i;
}

template<typename traits, std::size_t i = 0, typename... tuple_t>
inline typename std::enable_if<i < sizeof...(tuple_t), std::size_t>::type
for_each_in_tuple(const std::tuple<tuple_t...>& t, const TensorIteratorBase &iter, const int num_outputs) {
  if (i < (size_t)num_outputs) {
    set_result<traits>(i, std::get<i>(t), iter, num_outputs);
    return for_each_in_tuple<traits, i + 1, tuple_t...>(t, iter, num_outputs);
  }
  return i;
}

template<typename traits, typename... res_t>
static void set_results(const std::tuple<res_t...>& result, const TensorIteratorBase &iter, const int num_outputs) {
  AT_ASSERT(num_outputs >= 1);
  std::size_t result_size = for_each_in_tuple<traits>(result, iter, num_outputs);
  AT_ASSERT((size_t)num_outputs == result_size);
}

template <typename T, typename... Args>
struct all_same : std::conjunction<
  std::is_same<T, Args>...
> {};
// data_t is the input/output data type.
// acc_t is a type that contains all the necessary data
// to continue reducing.
// index_t is a one-dimensional index.
//
// ops_t is such that &ops_t::reduce, &ops_t::combine, and &ops_t::project exist and satisfy
// the following.
// reduce: (acc_t, data_t, index_t) -> acc_t adds one data point to the accumulated value.
// combine: (acc_t, acc_t) -> acc_t combines two accumulated values into one.
// project: acc_t -> out_t finishes the reduction, getting the required output.
// (See the illustrative ExampleSumOps sketch after binary_kernel_reduce below.)
//
// Additionally, acc_t must be default-constructible:
// acc_t {} is an identity for combine,
// and project(acc_t {}) is the value of the operation on zero elements.
//
// The point of `combine` is to support parallelization -
// the idea is to run one sequence of `reduce` calls per thread of execution,
// and then to combine them at the end with `combine`.
//
// If there is more than one output element,
// our parallelization strategy is to use one thread for each of them,
// which means that `combine` will never be called.
//
// If, on the other hand, there is only one, then we split the input
// into several pieces, reduce each separately, and then combine them.
template <typename ops_t, typename init_t>
void binary_kernel_reduce(TensorIteratorBase& iter, ops_t ops, init_t init) {
  using rf_t = decltype(&ops_t::reduce);
  using cf_t = decltype(&ops_t::combine);
  using pf_t = decltype(&ops_t::project);
  using r_traits = binary_function_traits<rf_t>;
  using c_traits = binary_function_traits<cf_t>;
  using p_traits = unary_function_traits<pf_t>;
  using acc_t = typename p_traits::arg1_t;
  using data_t = typename r_traits::arg2_t;
  static_assert(
    all_same<
      acc_t,
      init_t,
      typename r_traits::arg1_t,
      typename r_traits::result_type,
      typename c_traits::arg1_t,
      typename c_traits::arg2_t,
      typename c_traits::result_type>::value,
    "all accumulate types must match");
  static_assert(
    std::is_default_constructible<acc_t>::value,
    "the accumulate type must be default-constructible"
  );
  const int num_outputs = iter.noutputs();
  iter.foreach_reduced_elt([&ops, &init, num_outputs](TensorIteratorBase &sub_iter) {
    auto reduction_body = [&ops, &sub_iter, num_outputs](acc_t acc, int64_t begin, int64_t end) -> acc_t {
      int ntensors = sub_iter.ntensors();
      sub_iter.serial_for_each([&acc, &ops, num_outputs, ntensors, begin](char** data, const int64_t* strides, int64_t size) {
        AT_ASSERT(ntensors - num_outputs == 1);
        char *in = data[ntensors - 1];
        int64_t stride = strides[ntensors - 1];
        for (const auto i : c10::irange(size)) {
          acc = ops.reduce(acc, c10::load<data_t>(in), begin + i);
          in += stride;
        }
      }, {begin, end});
      return ops.translate_idx(acc, sub_iter.view_offsets()[0]);
    };
    acc_t total_acc = init;
    auto numel = sub_iter.numel();
    if (numel < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 ||
        at::in_parallel_region()) {
      total_acc = reduction_body(total_acc, 0, numel);
    } else {
      int max_threads = at::get_num_threads();
      AT_ASSERT(max_threads > 0);
      static_assert(
        !std::is_same<acc_t, bool>::value,
        "Concurrently modifying different references into std::vector<bool> is UB."
      );
      std::vector<acc_t> buffer((unsigned)max_threads, init);
      at::parallel_for(0, numel, internal::GRAIN_SIZE,
        [&](int64_t begin, int64_t end) {
          auto& acc = buffer[at::get_thread_num()];
          acc = reduction_body(acc, begin, end);
        }
      );
      for (const auto i : c10::irange(max_threads)) {
        total_acc = ops.combine(total_acc, buffer[i]);
      }
    }
    set_results<r_traits>(ops.project(total_acc), sub_iter, num_outputs);
  });
}
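
// Illustrative sketch (hypothetical, not part of the original header): a
// minimal ops type that satisfies the reduce/combine/project contract
// documented above, computing a plain sum over float inputs. Note that
// binary_kernel_reduce also calls ops.translate_idx(acc, base_idx); for an
// index-agnostic reduction such as a sum it simply forwards the accumulator.
struct ExampleSumOps {
  float reduce(float acc, float data, int64_t /*idx*/) const {
    return acc + data;   // fold one element into the running accumulator
  }
  float combine(float a, float b) const {
    return a + b;        // merge partial sums produced by different threads
  }
  float project(float acc) const {
    return acc;          // the accumulator is already the final result
  }
  float translate_idx(float acc, int64_t /*base_idx*/) const {
    return acc;          // indices do not matter for a sum
  }
};
// Usage sketch: binary_kernel_reduce(iter, ExampleSumOps{}, /*init=*/0.0f);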
template <typename func_t, typename vec_func_t>
void binary_kernel_reduce_vec(TensorIteratorBase& iter, func_t op, vec_func_t vop, double ident = 0) {
  using traits = binary_function_traits<func_t>;
  static_assert(
    all_same<
      typename traits::result_type,
      typename traits::arg1_t,
      typename traits::arg2_t>::value,
    "all types must match");

  iter.output_base().fill_(ident);
  iter.parallel_reduce([&](char** data, const int64_t* strides, int64_t size0, int64_t size1) {
    int64_t outer_strides[] = { strides[2], strides[3] };
    if (is_contiguous_reduction<traits>(strides)) {
      // input is contiguous in dim 0, output is reduced in dim 0
      UNARY_OUTER_LOOP(data, outer_strides, size1, [&] {
        vectorized_inner_reduction(data, size0, op, vop);
      });
    } else if (is_outer_reduction<traits>(strides)) {
      // input and output are contiguous in dim 1
      int64_t inner_stride = strides[1]; // stride of input in dim 0
      vectorized_outer_reduction(data, inner_stride, size0, size1, op, vop);
    } else {
      UNARY_OUTER_LOOP(data, outer_strides, size1, [&] {
        char* ptrs[3] = { data[0], data[0], data[1] };
        int64_t inner_strides[3] = { strides[0], strides[0], strides[1] };
        basic_loop(ptrs, inner_strides, 0, size0, op);
      });
    }
  });
}
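
// Illustrative sketch (hypothetical helper, not part of the original header):
// driving binary_kernel_reduce_vec with a scalar op and its Vectorized
// counterpart for a float sum. A real kernel would dispatch over the
// iterator's dtypes instead of hard-coding float.
inline void example_sum_kernel_vec(TensorIteratorBase& iter) {
  binary_kernel_reduce_vec(
      iter,
      [](float a, float b) -> float { return a + b; },                 // scalar op
      [](Vectorized<float> a, Vectorized<float> b) { return a + b; },  // vectorized op
      /*ident=*/0.0);
}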
// when the reduction is over the innermost dimension (dim 0 in TensorIterator)
// and the input is contiguous along that dimension,
// `binary_kernel_reduce_lastdim` can be used.
inline bool is_reduce_lastdim(TensorIteratorBase& iter) {
  return iter.num_reduce_dims() == 1 && iter.is_dim_reduced(0)
      && iter.ninputs() == 1 && iter.strides(1)[0] == iter.element_size(1);
}

template <typename reduce_func_t>
void binary_kernel_reduce_lastdim(TensorIteratorBase& iter, reduce_func_t reduce_op) {
  auto shape = iter.shape();
  int64_t dim_size = shape[0];
  int64_t grain_size = std::max((int64_t) 1, at::internal::GRAIN_SIZE / dim_size);
  TensorIterator sub_iter(iter);
  // create a sub iterator to parallelize over all non-reduced dims
  sub_iter.narrow(0, 0, 1);
  auto loop = [&](char** data, const int64_t* strides, int64_t size) {
    char* out = data[0];
    char* in = data[1];
    for (int64_t i = 0; i < size; ++i) {
      reduce_op(out, in, dim_size);
      out += strides[0];
      in += strides[1];
    }
  };
  sub_iter.for_each(loop, grain_size);
}
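
// Illustrative sketch (hypothetical helper, not part of the original header):
// a last-dim float sum routed through binary_kernel_reduce_lastdim. The
// reduce_op receives raw output/input pointers plus the length of the reduced
// dimension and writes one reduced value per output element; the contiguity
// checked by is_reduce_lastdim makes the plain indexed loop valid.
inline void example_sum_lastdim(TensorIteratorBase& iter) {
  AT_ASSERT(is_reduce_lastdim(iter));
  binary_kernel_reduce_lastdim(iter, [](char* out, char* in, int64_t dim_size) {
    const float* in_data = reinterpret_cast<const float*>(in);
    float acc = 0.0f;
    for (int64_t i = 0; i < dim_size; ++i) {
      acc += in_data[i];  // stride along the reduced dim is sizeof(float)
    }
    *reinterpret_cast<float*>(out) = acc;
  });
}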
}}} // namespace at::native::CPU_CAPABILITY