dlpack.h

/*!
 *  Copyright (c) 2017 by Contributors
 * \file dlpack.h
 * \brief The common header of DLPack.
 */
#ifndef DLPACK_DLPACK_H_
#define DLPACK_DLPACK_H_

/**
 * \brief Compatibility with C++
 */
#ifdef __cplusplus
#define DLPACK_EXTERN_C extern "C"
#else
#define DLPACK_EXTERN_C
#endif

/*! \brief The current version of dlpack */
#define DLPACK_VERSION 80

/*! \brief The current ABI version of dlpack */
#define DLPACK_ABI_VERSION 1

/*! \brief DLPACK_DLL prefix for windows */
#ifdef _WIN32
#ifdef DLPACK_EXPORTS
#define DLPACK_DLL __declspec(dllexport)
#else
#define DLPACK_DLL __declspec(dllimport)
#endif
#else
#define DLPACK_DLL
#endif

#include <stdint.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/*!
 * \brief The device type in DLDevice.
 */
#ifdef __cplusplus
typedef enum : int32_t {
#else
typedef enum {
#endif
  /*! \brief CPU device */
  kDLCPU = 1,
  /*! \brief CUDA GPU device */
  kDLCUDA = 2,
  /*!
   * \brief Pinned CUDA CPU memory by cudaMallocHost
   */
  kDLCUDAHost = 3,
  /*! \brief OpenCL devices. */
  kDLOpenCL = 4,
  /*! \brief Vulkan buffer for next generation graphics. */
  kDLVulkan = 7,
  /*! \brief Metal for Apple GPU. */
  kDLMetal = 8,
  /*! \brief Verilog simulator buffer */
  kDLVPI = 9,
  /*! \brief ROCm GPUs for AMD GPUs */
  kDLROCM = 10,
  /*!
   * \brief Pinned ROCm CPU memory allocated by hipMallocHost
   */
  kDLROCMHost = 11,
  /*!
   * \brief Reserved extension device type,
   * used to quickly test extension devices.
   * The semantics can differ depending on the implementation.
   */
  kDLExtDev = 12,
  /*!
   * \brief CUDA managed/unified memory allocated by cudaMallocManaged
   */
  kDLCUDAManaged = 13,
  /*!
   * \brief Unified shared memory allocated on a oneAPI non-partitioned
   * device. A call to the oneAPI runtime is required to determine the device
   * type, the USM allocation type and the sycl context it is bound to.
   */
  kDLOneAPI = 14,
  /*! \brief GPU support for next generation WebGPU standard. */
  kDLWebGPU = 15,
  /*! \brief Qualcomm Hexagon DSP */
  kDLHexagon = 16,
} DLDeviceType;

/*!
 * \brief A Device for Tensor and operator.
 */
typedef struct {
  /*! \brief The device type used in the device. */
  DLDeviceType device_type;
  /*!
   * \brief The device index.
   * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
   */
  int32_t device_id;
} DLDevice;
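
/*
 * Illustrative example (not part of the DLPack API): populating a DLDevice
 * that refers to the first CUDA GPU. For vanilla CPU, pinned, or managed
 * memory the device_id is conventionally 0.
 *
 * \code{.c}
 * DLDevice gpu0;
 * gpu0.device_type = kDLCUDA;
 * gpu0.device_id = 0;
 * \endcode
 */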

/*!
 * \brief The type code options of a DLDataType.
 */
typedef enum {
  /*! \brief signed integer */
  kDLInt = 0U,
  /*! \brief unsigned integer */
  kDLUInt = 1U,
  /*! \brief IEEE floating point */
  kDLFloat = 2U,
  /*!
   * \brief Opaque handle type, reserved for testing purposes.
   * Frameworks need to agree on the handle data type for the exchange to be well-defined.
   */
  kDLOpaqueHandle = 3U,
  /*! \brief bfloat16 */
  kDLBfloat = 4U,
  /*!
   * \brief complex number
   * (C/C++/Python layout: compact struct per complex number)
   */
  kDLComplex = 5U,
  /*! \brief boolean */
  kDLBool = 6U,
} DLDataTypeCode;

/*!
 * \brief The data type the tensor can hold. The data type is assumed to follow the
 * native endian-ness. An explicit error message should be raised when attempting to
 * export an array with non-native endianness.
 *
 * Examples
 * - float: type_code = 2, bits = 32, lanes = 1
 * - float4 (vectorized 4x float): type_code = 2, bits = 32, lanes = 4
 * - int8: type_code = 0, bits = 8, lanes = 1
 * - std::complex<float>: type_code = 5, bits = 64, lanes = 1
 * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library
 *   convention, the underlying storage size of bool is 8 bits)
 */
typedef struct {
  /*!
   * \brief Type code of base types.
   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
   * footprint, but the value should be one of DLDataTypeCode enum values.
   */
  uint8_t code;
  /*!
   * \brief Number of bits, common choices are 8, 16, 32.
   */
  uint8_t bits;
  /*! \brief Number of lanes in the type, used for vector types. */
  uint16_t lanes;
} DLDataType;
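
/*
 * Illustrative example (not part of the DLPack API): building the DLDataType
 * values listed in the examples above, i.e. a scalar float32 and a vectorized
 * float32x4.
 *
 * \code{.c}
 * DLDataType f32 = {kDLFloat, 32, 1};    // float:  code = 2, bits = 32, lanes = 1
 * DLDataType f32x4 = {kDLFloat, 32, 4};  // float4: code = 2, bits = 32, lanes = 4
 * \endcode
 */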

/*!
 * \brief Plain C Tensor object, does not manage memory.
 */
typedef struct {
  /*!
   * \brief The data pointer points to the allocated data. This will be a CUDA
   * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
   * types. This pointer is always aligned to 256 bytes as in CUDA. The
   * `byte_offset` field should be used to point to the beginning of the data.
   *
   * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
   * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
   * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
   * (after which this note will be updated); at the moment it is recommended
   * to not rely on the data pointer being correctly aligned.
   *
   * For a given DLTensor, the size of the memory required to store the
   * contents of data is calculated as follows:
   *
   * \code{.c}
   * static inline size_t GetDataSize(const DLTensor* t) {
   *   size_t size = 1;
   *   for (int32_t i = 0; i < t->ndim; ++i) {
   *     size *= t->shape[i];
   *   }
   *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
   *   return size;
   * }
   * \endcode
   */
  void* data;
  /*! \brief The device of the tensor */
  DLDevice device;
  /*! \brief Number of dimensions */
  int32_t ndim;
  /*! \brief The data type of the pointer */
  DLDataType dtype;
  /*! \brief The shape of the tensor */
  const int64_t* shape;
  /*!
   * \brief strides of the tensor (in number of elements, not bytes),
   * can be NULL, indicating the tensor is compact and row-major.
   */
  const int64_t* strides;
  /*! \brief The offset in bytes to the beginning pointer to data */
  uint64_t byte_offset;
} DLTensor;
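
/*
 * Illustrative example (not part of the DLPack API): wrapping an existing
 * caller-owned CPU buffer in a DLTensor without copying. The DLTensor does
 * not take ownership of `buf`.
 *
 * \code{.c}
 * float buf[6];                 // caller-owned storage, 2 x 3 floats
 * int64_t shape[2] = {2, 3};
 * DLTensor t;
 * t.data = buf;
 * t.device.device_type = kDLCPU;
 * t.device.device_id = 0;
 * t.ndim = 2;
 * t.dtype.code = kDLFloat;
 * t.dtype.bits = 32;
 * t.dtype.lanes = 1;
 * t.shape = shape;
 * t.strides = NULL;             // compact, row-major layout
 * t.byte_offset = 0;
 * \endcode
 */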

/*!
 * \brief C Tensor object, manage memory of DLTensor. This data structure is
 * intended to facilitate the borrowing of a DLTensor by another framework. It
 * is not meant to transfer the tensor. When the borrowing framework no longer
 * needs the tensor, it should call the deleter to notify the host that the
 * resource is no longer needed.
 */
typedef struct DLManagedTensor {
  /*! \brief DLTensor which is being memory managed */
  DLTensor dl_tensor;
  /*! \brief The context of the original host framework of the DLManagedTensor,
   * i.e. the framework in which the DLManagedTensor is used. It can also be NULL.
   */
  void* manager_ctx;
  /*! \brief Destructor signature void (*)(void*) - this should be called
   * to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
   * if there is no way for the caller to provide a reasonable destructor.
   * The destructor deletes the argument self as well.
   */
  void (*deleter)(struct DLManagedTensor* self);
} DLManagedTensor;
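
/*
 * Illustrative sketch (not part of the DLPack API): one way a producing
 * framework might implement the deleter, and how a consumer releases the
 * tensor. The heap allocations here merely stand in for whatever bookkeeping
 * the producer keeps in manager_ctx.
 *
 * \code{.c}
 * static void ExampleDeleter(struct DLManagedTensor* self) {
 *   free(self->manager_ctx);  // release producer-side bookkeeping
 *   free(self);               // the deleter also frees the DLManagedTensor itself
 * }
 *
 * // Consumer side, once the borrowed tensor is no longer needed:
 * //   if (managed->deleter != NULL) managed->deleter(managed);
 * \endcode
 */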

#ifdef __cplusplus
}  // DLPACK_EXTERN_C
#endif
#endif  // DLPACK_DLPACK_H_