| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232 |
- /*!
- * Copyright (c) 2017 by Contributors
- * \file dlpack.h
- * \brief The common header of DLPack.
- */
- #ifndef DLPACK_DLPACK_H_
- #define DLPACK_DLPACK_H_
- /**
- * \brief Compatibility with C++.
- * DLPACK_EXTERN_C expands to the C linkage specifier when the header is
- * compiled as C++ and to nothing when it is compiled as C.
- */
- #ifdef __cplusplus
- #define DLPACK_EXTERN_C extern "C"
- #else
- #define DLPACK_EXTERN_C
- #endif
- /*! \brief The current version of dlpack */
- #define DLPACK_VERSION 80
- /*! \brief The current ABI version of dlpack */
- #define DLPACK_ABI_VERSION 1
- /*!
- * \brief DLPACK_DLL symbol prefix for Windows.
- * On _WIN32 it expands to __declspec(dllexport) when DLPACK_EXPORTS is
- * defined and to __declspec(dllimport) otherwise; on every other platform
- * it expands to nothing.
- */
- #ifdef _WIN32
- #ifdef DLPACK_EXPORTS
- #define DLPACK_DLL __declspec(dllexport)
- #else
- #define DLPACK_DLL __declspec(dllimport)
- #endif
- #else
- #define DLPACK_DLL
- #endif
- #include <stdint.h>
- #include <stddef.h>
- #ifdef __cplusplus
- extern "C" {
- #endif
- /*!
- * \brief The device type in DLDevice.
- *
- * Compiled as C++ the enum has a fixed underlying type of int32_t; compiled
- * as C it is a plain enum. The values 5 and 6 are not assigned to any
- * enumerator in this header.
- */
- #ifdef __cplusplus
- typedef enum : int32_t {
- #else
- typedef enum {
- #endif
- /*! \brief CPU device */
- kDLCPU = 1,
- /*! \brief CUDA GPU device */
- kDLCUDA = 2,
- /*!
- * \brief Pinned CUDA CPU memory allocated by cudaMallocHost
- */
- kDLCUDAHost = 3,
- /*! \brief OpenCL devices. */
- kDLOpenCL = 4,
- /*! \brief Vulkan buffer for next generation graphics. */
- kDLVulkan = 7,
- /*! \brief Metal for Apple GPU. */
- kDLMetal = 8,
- /*! \brief Verilog simulator buffer */
- kDLVPI = 9,
- /*! \brief ROCm GPUs for AMD GPUs */
- kDLROCM = 10,
- /*!
- * \brief Pinned ROCm CPU memory allocated by hipMallocHost
- */
- kDLROCMHost = 11,
- /*!
- * \brief Reserved extension device type,
- * used to quickly test an extension device.
- * The semantics can differ depending on the implementation.
- */
- kDLExtDev = 12,
- /*!
- * \brief CUDA managed/unified memory allocated by cudaMallocManaged
- */
- kDLCUDAManaged = 13,
- /*!
- * \brief Unified shared memory allocated on a oneAPI non-partitioned
- * device. A call to the oneAPI runtime is required to determine the device
- * type, the USM allocation type and the sycl context it is bound to.
- */
- kDLOneAPI = 14,
- /*! \brief GPU support for next generation WebGPU standard. */
- kDLWebGPU = 15,
- /*! \brief Qualcomm Hexagon DSP */
- kDLHexagon = 16,
- } DLDeviceType;
- /*!
- * \brief A Device for Tensor and operator.
- * Pairs a DLDeviceType with an integer device index.
- */
- typedef struct {
- /*! \brief The device type used in the device. */
- DLDeviceType device_type;
- /*!
- * \brief The device index.
- * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
- */
- int32_t device_id;
- } DLDevice;
- /*!
- * \brief The type code options of DLDataType.
- * DLDataType stores one of these values in its uint8_t `code` field.
- */
- typedef enum {
- /*! \brief signed integer */
- kDLInt = 0U,
- /*! \brief unsigned integer */
- kDLUInt = 1U,
- /*! \brief IEEE floating point */
- kDLFloat = 2U,
- /*!
- * \brief Opaque handle type, reserved for testing purposes.
- * Frameworks need to agree on the handle data type for the exchange to be
- * well-defined.
- */
- kDLOpaqueHandle = 3U,
- /*! \brief bfloat16 */
- kDLBfloat = 4U,
- /*!
- * \brief complex number
- * (C/C++/Python layout: compact struct per complex number)
- */
- kDLComplex = 5U,
- /*! \brief boolean */
- kDLBool = 6U,
- } DLDataTypeCode;
- /*!
- * \brief The data type the tensor can hold. The data type is assumed to follow
- * the native endianness. An explicit error message should be raised when
- * attempting to export an array with non-native endianness.
- *
- * Examples
- * - float: type_code = 2, bits = 32, lanes = 1
- * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
- * - int8: type_code = 0, bits = 8, lanes = 1
- * - std::complex<float>: type_code = 5, bits = 64, lanes = 1
- * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library
- *   convention, the underlying storage size of bool is 8 bits)
- */
- typedef struct {
- /*!
- * \brief Type code of base types.
- * We keep it uint8_t instead of DLDataTypeCode for minimal memory
- * footprint, but the value should be one of the DLDataTypeCode enum values.
- */
- uint8_t code;
- /*!
- * \brief Number of bits, common choices are 8, 16, 32.
- */
- uint8_t bits;
- /*! \brief Number of lanes in the type, used for vector types. */
- uint16_t lanes;
- } DLDataType;
- /*!
- * \brief Plain C Tensor object, does not manage memory.
- */
- typedef struct {
- /*!
- * \brief The data pointer points to the allocated data. This will be CUDA
- * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
- * types. This pointer is always aligned to 256 bytes as in CUDA. The
- * `byte_offset` field should be used to point to the beginning of the data.
- *
- * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
- * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
- * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
- * (after which this note will be updated); at the moment it is recommended
- * to not rely on the data pointer being correctly aligned.
- *
- * For a given DLTensor, the size of memory required to store the contents of
- * data is calculated as follows:
- *
- * \code{.c}
- * static inline size_t GetDataSize(const DLTensor* t) {
- *   size_t size = 1;
- *   for (int32_t i = 0; i < t->ndim; ++i) {
- *     size *= t->shape[i];
- *   }
- *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
- *   return size;
- * }
- * \endcode
- */
- void* data;
- /*! \brief The device of the tensor */
- DLDevice device;
- /*! \brief Number of dimensions */
- int32_t ndim;
- /*! \brief The data type of the pointer */
- DLDataType dtype;
- /*! \brief The shape of the tensor */
- const int64_t* shape;
- /*!
- * \brief Strides of the tensor (in number of elements, not bytes).
- * Can be NULL, indicating the tensor is compact and row-major.
- */
- const int64_t* strides;
- /*! \brief The offset in bytes to the beginning pointer to data */
- uint64_t byte_offset;
- } DLTensor;
- /*!
- * \brief C Tensor object, manage memory of DLTensor. This data structure is
- * intended to facilitate the borrowing of DLTensor by another framework. It is
- * not meant to transfer the tensor. When the borrowing framework doesn't need
- * the tensor, it should call the deleter to notify the host that the resource
- * is no longer needed.
- */
- typedef struct DLManagedTensor {
- /*! \brief DLTensor which is being memory managed */
- DLTensor dl_tensor;
- /*! \brief The context of the original host framework in which this
- * DLManagedTensor is used. It can also be NULL.
- */
- void * manager_ctx;
- /*! \brief Destructor, signature void (*)(struct DLManagedTensor*) - this
- * should be called to destruct manager_ctx which holds the DLManagedTensor.
- * It can be NULL if there is no way for the caller to provide a reasonable
- * destructor. The destructor deletes the argument self as well.
- */
- void (*deleter)(struct DLManagedTensor * self);
- } DLManagedTensor;
- #ifdef __cplusplus
- } // DLPACK_EXTERN_C
- #endif
- #endif // DLPACK_DLPACK_H_
|