# mypy: allow-untyped-defs
from typing import List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from torch import SymInt, Tensor
from torch._C import _add_docstr, _nested  # type: ignore[attr-defined]

from torch.types import _device as Device, _dtype as DType

__all__ = [
    "to_padded_tensor",
    "as_nested_tensor",
    "nested_tensor",
    "nested_tensor_from_jagged",
    "narrow",
]
# Nested Tensor constructor functions


def as_nested_tensor(
    ts: Union[Tensor, List[Tensor], Tuple[Tensor, ...]],
    dtype: Optional[DType] = None,
    device: Optional[Device] = None,
    layout=None
) -> Tensor:
    r"""
    Constructs a nested tensor preserving autograd history from a tensor or a list / tuple of
    tensors.

    If a nested tensor is passed, it will be returned directly unless the device / dtype / layout
    differ. Note that converting device / dtype will result in a copy, while converting layout
    is not currently supported by this function.

    If a non-nested tensor is passed, it is treated as a batch of constituents of consistent size.
    A copy will be incurred if the passed device / dtype differ from those of the input OR if
    the input is non-contiguous. Otherwise, the input's storage will be used directly.

    If a tensor list is provided, tensors in the list are always copied during construction of
    the nested tensor.

    Args:
        ts (Tensor or List[Tensor] or Tuple[Tensor]): a tensor to treat as a nested tensor OR a
            list / tuple of tensors with the same ndim

    Keyword arguments:
        dtype (:class:`torch.dtype`, optional): the desired type of returned nested tensor.
            Default: if None, same :class:`torch.dtype` as leftmost tensor in the list.
        device (:class:`torch.device`, optional): the desired device of returned nested tensor.
            Default: if None, same :class:`torch.device` as leftmost tensor in the list
        layout (:class:`torch.layout`, optional): the desired layout of returned nested tensor.
            Only strided and jagged layouts are supported. Default: if None, the strided layout.

    Example::

        >>> a = torch.arange(3, dtype=torch.float, requires_grad=True)
        >>> b = torch.arange(5, dtype=torch.float, requires_grad=True)
        >>> nt = torch.nested.as_nested_tensor([a, b])
        >>> nt.is_leaf
        False
        >>> fake_grad = torch.nested.nested_tensor([torch.ones_like(a), torch.zeros_like(b)])
        >>> nt.backward(fake_grad)
        >>> a.grad
        tensor([1., 1., 1.])
        >>> b.grad
        tensor([0., 0., 0., 0., 0.])
        >>> c = torch.randn(3, 5, requires_grad=True)
        >>> nt2 = torch.nested.as_nested_tensor(c)
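        >>> # A minimal sketch of jagged layout construction from the same list of tensors;
        >>> # the ragged dimension is kept symbolic in printed shapes.
        >>> nt3 = torch.nested.as_nested_tensor([a, b], layout=torch.jagged)
        >>> nt3.layout
        torch.jagged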
  57. """
    is_tensor_list = isinstance(ts, (list, tuple)) and all(isinstance(t, Tensor) for t in ts)
    if not isinstance(ts, Tensor) and not is_tensor_list:
        raise TypeError(
            "as_nested_tensor(): Expected first argument to be a tensor or a list / tuple of tensors "
        )
    # convert tuple -> list if needed
    if is_tensor_list and not isinstance(ts, list):
        ts = list(ts)

    if isinstance(ts, Tensor) and ts.dim() < 2:
        raise RuntimeError("as_nested_tensor(): Expected tensor argument to have dim() > 1")

    if isinstance(ts, Tensor) and ts.is_nested:
        if layout == ts.layout:
            # return input directly or input copied to device / dtype
            return ts.to(device=device, dtype=dtype)
        else:
            # TODO: Just use nt.to(layout=layout) when it exists.
            raise RuntimeError(
                "as_nested_tensor(): Converting between nested tensor layouts is not supported")

    if layout is None:
        layout = torch.strided
    if layout == torch.strided:
        if isinstance(ts, Tensor):
            # contiguous() might be necessary to get flattened view.
            # we could probably be more precise about when to do this as an optimization
            buffer = ts.contiguous().view(-1).to(device=device, dtype=dtype)
            nested_sizes = torch.tensor([t.shape for t in ts])
            return torch._nested_view_from_buffer(
                buffer,
                nested_sizes,
                *torch._nested_compute_contiguous_strides_offsets(nested_sizes))
        else:
            assert isinstance(ts, list)
            return torch._nested_tensor_from_tensor_list(ts, dtype, None, device, None)
    elif layout == torch.jagged:
        if isinstance(ts, Tensor):
            # contiguous() might be necessary to get flattened view.
            # we could probably be more precise about when to do this as an optimization
            values = ts.contiguous().flatten(0, 1).to(device=device, dtype=dtype)
            batch_size = ts.shape[0]
            seq_len = ts.shape[1]
            offsets = torch.arange(0, batch_size * seq_len + 1, seq_len,
                                   device=device, dtype=torch.int64)

            from torch.nested._internal.nested_tensor import nested_view_from_values_offsets

            return nested_view_from_values_offsets(values, offsets)
        else:
            from torch.nested._internal.nested_tensor import jagged_from_list

            assert isinstance(ts, list)
            nt, _ = jagged_from_list(ts, offsets=None, device=device, dtype=dtype)
            return nt
    else:
        raise RuntimeError(f"Specified layout is unsupported for nested tensors: {layout}")
# Note: This not only adds doc strings for the nested ops, but
# also connects the torch.nested Python namespace to the torch._C._nested builtins.
to_padded_tensor = _add_docstr(
    _nested.nested_to_padded_tensor,
    r"""
to_padded_tensor(input, padding, output_size=None, out=None) -> Tensor

Returns a new (non-nested) Tensor by padding the :attr:`input` nested tensor.
The leading entries will be filled with the nested data,
while the trailing entries will be padded.

.. warning::

    :func:`to_padded_tensor` always copies the underlying data,
    since the nested and the non-nested tensors differ in memory layout.

Args:
    padding (float): The padding value for the trailing entries.

Keyword args:
    output_size (Tuple[int]): The size of the output tensor.
        If given, it must be large enough to contain all nested data;
        else, will infer by taking the max size of each nested sub-tensor along each dimension.
    out (Tensor, optional): the output tensor.

Example::

    >>> nt = torch.nested.nested_tensor([torch.randn((2, 5)), torch.randn((3, 4))])
    nested_tensor([
      tensor([[ 1.6862, -1.1282,  1.1031,  0.0464, -1.3276],
              [-1.9967, -1.0054,  1.8972,  0.9174, -1.4995]]),
      tensor([[-1.8546, -0.7194, -0.2918, -0.1846],
              [ 0.2773,  0.8793, -0.5183, -0.6447],
              [ 1.8009,  1.8468, -0.9832, -1.5272]])
    ])
    >>> pt_infer = torch.nested.to_padded_tensor(nt, 0.0)
    tensor([[[ 1.6862, -1.1282,  1.1031,  0.0464, -1.3276],
             [-1.9967, -1.0054,  1.8972,  0.9174, -1.4995],
             [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],
            [[-1.8546, -0.7194, -0.2918, -0.1846,  0.0000],
             [ 0.2773,  0.8793, -0.5183, -0.6447,  0.0000],
             [ 1.8009,  1.8468, -0.9832, -1.5272,  0.0000]]])
    >>> pt_large = torch.nested.to_padded_tensor(nt, 1.0, (2, 4, 6))
    tensor([[[ 1.6862, -1.1282,  1.1031,  0.0464, -1.3276,  1.0000],
             [-1.9967, -1.0054,  1.8972,  0.9174, -1.4995,  1.0000],
             [ 1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000],
             [ 1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000]],
            [[-1.8546, -0.7194, -0.2918, -0.1846,  1.0000,  1.0000],
             [ 0.2773,  0.8793, -0.5183, -0.6447,  1.0000,  1.0000],
             [ 1.8009,  1.8468, -0.9832, -1.5272,  1.0000,  1.0000],
             [ 1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000]]])
    >>> pt_small = torch.nested.to_padded_tensor(nt, 2.0, (2, 2, 2))
    RuntimeError: Value in output_size is less than NestedTensor padded size. Truncation is not supported.
""",
)
def nested_tensor(tensor_list, *, dtype=None, layout=None, device=None, requires_grad=False, pin_memory=False) -> Tensor:
    r"""
    Constructs a nested tensor with no autograd history (also known as a "leaf tensor", see
    :ref:`Autograd mechanics <autograd-mechanics>`) from :attr:`tensor_list`, a list of tensors.

    Args:
        tensor_list (List[array_like]): a list of tensors, or anything that can be passed to torch.tensor,
            where each element of the list has the same dimensionality.

    Keyword arguments:
        dtype (:class:`torch.dtype`, optional): the desired type of returned nested tensor.
            Default: if None, same :class:`torch.dtype` as leftmost tensor in the list.
        layout (:class:`torch.layout`, optional): the desired layout of returned nested tensor.
            Only strided and jagged layouts are supported. Default: if None, the strided layout.
        device (:class:`torch.device`, optional): the desired device of returned nested tensor.
            Default: if None, same :class:`torch.device` as leftmost tensor in the list
        requires_grad (bool, optional): If autograd should record operations on the
            returned nested tensor. Default: ``False``.
        pin_memory (bool, optional): If set, the returned nested tensor will be allocated in
            pinned memory. Works only for CPU tensors. Default: ``False``.

    Example::

        >>> a = torch.arange(3, dtype=torch.float, requires_grad=True)
        >>> b = torch.arange(5, dtype=torch.float, requires_grad=True)
        >>> nt = torch.nested.nested_tensor([a, b], requires_grad=True)
        >>> nt.is_leaf
        True
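        >>> # A minimal sketch of jagged layout construction; the result is also a leaf,
        >>> # since construction happens with autograd history detached.
        >>> nt_jagged = torch.nested.nested_tensor([a, b], layout=torch.jagged, requires_grad=True)
        >>> nt_jagged.is_leaf
        True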
  181. """
    if layout is None:
        layout = torch.strided
    if layout == torch.strided:
        return _nested.nested_tensor(
            tensor_list,
            dtype=dtype,
            device=device,
            requires_grad=requires_grad,
            pin_memory=pin_memory)
    elif layout == torch.jagged:
        # Need to wrap lists of scalars as tensors
        list_of_tensors = [t if isinstance(t, Tensor) else torch.as_tensor(t) for t in tensor_list]

        from torch.nested._internal.nested_tensor import jagged_from_list

        with torch.no_grad():
            nt, _ = jagged_from_list(list_of_tensors, offsets=None, device=device, dtype=dtype)

        nt.requires_grad_(requires_grad)
        if pin_memory:
            nt = nt.pin_memory()  # type: ignore[assignment]

        return nt
    else:
        raise RuntimeError(f"Specified layout is unsupported for nested tensors: {layout}")
def narrow(tensor: Tensor, dim: int, start: Union[int, Tensor], length: Union[int, Tensor], layout=torch.strided) -> Tensor:
    r"""
    Constructs a nested tensor (which might be a view) from :attr:`tensor`, a strided tensor. This follows
    similar semantics to torch.Tensor.narrow, where in the :attr:`dim`-th dimension the new nested tensor
    shows only the elements in the interval `[start, start+length)`. As nested representations
    allow for a different `start` and `length` at each 'row' of that dimension, :attr:`start` and :attr:`length`
    can also be tensors of size `tensor.shape[0]`.

    There are some differences depending on the layout you use for the nested tensor. If using the strided
    layout, torch.narrow will copy the narrowed data into a contiguous NT with the strided layout, while
    jagged layout narrow() will create a non-contiguous view of your original strided tensor. This particular
    representation is really useful for representing kv-caches in Transformer models, as specialized
    SDPA kernels can deal with this format easily, resulting in performance improvements.

    Args:
        tensor (:class:`torch.Tensor`): a strided tensor, which will be used as the underlying data
            for the nested tensor if using the jagged layout or will be copied for the strided layout.
        dim (int): the dimension where narrow will be applied. Only `dim=1` is supported for the
            jagged layout, while the strided layout supports all dims
        start (Union[int, :class:`torch.Tensor`]): starting element for the narrow operation
        length (Union[int, :class:`torch.Tensor`]): number of elements taken during the narrow op

    Keyword arguments:
        layout (:class:`torch.layout`, optional): the desired layout of returned nested tensor.
            Only strided and jagged layouts are supported. Default: if None, the strided layout.

    Example::

        >>> starts = torch.tensor([0, 1, 2, 3, 4], dtype=torch.int64)
        >>> lengths = torch.tensor([3, 2, 2, 1, 5], dtype=torch.int64)
        >>> narrow_base = torch.randn(5, 10, 20)
        >>> nt_narrowed = torch.nested.narrow(narrow_base, 1, starts, lengths, layout=torch.jagged)
        >>> nt_narrowed.is_contiguous()
        False
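        >>> # A minimal sketch of the strided layout path, which copies the narrowed data;
        >>> # start and length must be plain ints for this layout.
        >>> nt_copy = torch.nested.narrow(narrow_base, 1, 2, 3, layout=torch.strided)
        >>> nt_copy.layout
        torch.strided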
  232. """
    if not isinstance(start, (int, SymInt, Tensor)):
        raise RuntimeError("start must be an integer or a tensor")

    if not isinstance(length, (int, SymInt, Tensor)):
        raise RuntimeError("length must be an integer or a tensor")

    if layout == torch.strided:
        if isinstance(start, Tensor) or isinstance(length, Tensor):
            raise RuntimeError("start and length must be integers for the strided layout NT impl")
        # TODO: switch to as_nested_tensor(tensor) when it is available
        nt = as_nested_tensor(torch.unbind(tensor), layout=torch.strided).narrow(dim, start, length)
    elif layout == torch.jagged:
        if dim != 1:
            raise RuntimeError("jagged layout only supports dim=1")

        from torch.nested._internal.nested_tensor import jagged_from_tensor_and_lengths

        if isinstance(start, (int, SymInt)):
            start = torch.tensor([start], device=tensor.device, dtype=torch.int64)

        if isinstance(length, (int, SymInt)):
            length = torch.tensor([length], device=tensor.device, dtype=torch.int64)

        nt, _, _ = jagged_from_tensor_and_lengths(tensor, start, length)
    else:
        raise RuntimeError(f"Specified layout is unsupported for nested narrow: {layout}")

    return nt
def nested_tensor_from_jagged(
    values: Tensor,
    offsets: Optional[Tensor] = None,
    lengths: Optional[Tensor] = None,
    jagged_dim: Optional[int] = None,
) -> Tensor:
    r"""
    Constructs a jagged layout nested tensor from the given jagged components. The jagged layout
    consists of a required values buffer with the jagged dimension packed into a single dimension.
    The offsets / lengths metadata determines how this dimension is split into batch elements
    and is expected to be allocated on the same device as the values buffer.

    Expected metadata formats:
        * offsets: Indices within the packed dimension splitting it into heterogeneously-sized
          batch elements. Example: [0, 2, 3, 6] indicates that a packed jagged dim of size 6
          should be conceptually split into batch elements of length [2, 1, 3]. Note that both the
          beginning and ending offsets are required for kernel convenience (i.e. shape batch_size + 1).
        * lengths: Lengths of the individual batch elements; shape == batch_size. Example: [2, 1, 3]
          indicates that a packed jagged dim of size 6 should be conceptually split into batch
          elements of length [2, 1, 3].

    Note that it can be useful to provide both offsets and lengths. This describes a nested tensor
    with "holes", where the offsets indicate the start position of each batch item and the length
    specifies the total number of elements (see example below).

    The returned jagged layout nested tensor will be a view of the input values tensor.

    Args:
        values (:class:`torch.Tensor`): The underlying buffer in the shape of
            (sum_B(*), D_1, ..., D_N). The jagged dimension is packed into a single dimension,
            with the offsets / lengths metadata used to distinguish batch elements.
        offsets (optional :class:`torch.Tensor`): Offsets into the jagged dimension of shape B + 1.
        lengths (optional :class:`torch.Tensor`): Lengths of the batch elements of shape B.
        jagged_dim (optional int): Indicates which dimension in values is the packed jagged
            dimension. If None, this is set to dim=1 (i.e. the dimension immediately following
            the batch dimension). Default: None

    Example::

        >>> values = torch.randn(12, 5)
        >>> offsets = torch.tensor([0, 3, 5, 6, 10, 12])
        >>> nt = nested_tensor_from_jagged(values, offsets)
        >>> # 3D shape with the middle dimension jagged
        >>> nt.shape
        torch.Size([5, j2, 5])
        >>> # Length of each item in the batch:
        >>> offsets.diff()
        tensor([3, 2, 1, 4, 2])

        >>> values = torch.randn(6, 5)
        >>> offsets = torch.tensor([0, 2, 3, 6])
        >>> lengths = torch.tensor([1, 1, 2])
        >>> # NT with holes
        >>> nt = nested_tensor_from_jagged(values, offsets, lengths)
        >>> a, b, c = nt.unbind()
        >>> # Batch item 1 consists of indices [0, 1)
        >>> torch.equal(a, values[0:1, :])
        True
        >>> # Batch item 2 consists of indices [2, 3)
        >>> torch.equal(b, values[2:3, :])
        True
        >>> # Batch item 3 consists of indices [3, 5)
        >>> torch.equal(c, values[3:5, :])
        True
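        >>> # A minimal sketch of lengths-only construction; offsets are derived internally
        >>> # by prepending 0 to the cumulative sum of lengths.
        >>> nt2 = nested_tensor_from_jagged(values, lengths=torch.tensor([2, 1, 3]))
        >>> nt2.size(0)
        3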
  311. """
    if offsets is None:
        if lengths is None:
            raise RuntimeError(
                "nested_tensor_from_jagged(): At least one of offsets or lengths is required."
            )
        else:
            # TODO: Truly support offsets=None at some point?
            # For now, just convert lengths -> offsets for kernel convenience
            offsets = F.pad(lengths.cumsum(0), (1, 0))
            lengths = None

    if jagged_dim is None:
        jagged_dim = 1

    from torch.nested._internal.nested_tensor import nested_view_from_values_offsets_lengths

    return nested_view_from_values_offsets_lengths(values, offsets, lengths, ragged_idx=jagged_dim)