_quantized_conversions.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. # mypy: allow-untyped-defs
  2. import torch
  3. # Pack pairs of int4 values into int8, in row major order; first int4
  4. # value goes into lower order bits, and second int4 value into higher
  5. # order bits of resulting int8 value.
  6. def pack_int4_to_int8(weight):
  7. assert weight.dim() == 2
  8. assert weight.shape[1] % 2 == 0
  9. assert weight.dtype == torch.int8
  10. return ((weight[:, 1::2] & 0xF) << 4) | (weight[:, 0::2] & 0xF)
  11. # Unpack quadruples of bits in int8 values into int4 values, in row
  12. # major order; lower 4 bits go into first int4 value, and upper 4
  13. # bits go into second int4 value.
  14. def unpack_int8_to_int4(weight):
  15. assert weight.dim() == 2
  16. assert weight.dtype == torch.int8
  17. return torch.stack((weight & 0xF, (weight >> 4) & 0xF), dim=2).view(
  18. weight.shape[0], 2 * weight.shape[1]
  19. )
  20. # Transpose the weight matrix, and then reorder its elements according
  21. # to underlying requirements of CUTLASS library, so that it could be
  22. # used for CUTLASS-based mixed datatypes linear operation.
  23. def quantized_weight_reorder_for_mixed_dtypes_linear_cutlass(
  24. weight, dtypeq, transpose=False
  25. ):
  26. assert weight.dim() == 2
  27. assert weight.dtype == torch.int8
  28. assert dtypeq == torch.int8 or dtypeq == torch.quint4x2
  29. assert weight.device.type == "cuda"
  30. device = weight.device
  31. # subbyte_transpose
  32. if not transpose:
  33. if dtypeq == torch.int8:
  34. outp = weight.T
  35. elif dtypeq == torch.quint4x2:
  36. outp = pack_int4_to_int8(unpack_int8_to_int4(weight.view(torch.int8)).T)
  37. else:
  38. outp = weight
  39. ncols, nrows = outp.shape # type: ignore[possibly-undefined]
  40. assert nrows % (32 if dtypeq == torch.quint4x2 else 64) == 0
  41. assert ncols % 64 == 0
  42. # permute_B_rows_for_mixed_gemm
  43. # (permute cols actually, as transpose is applied first here)
  44. if dtypeq == torch.quint4x2:
  45. cols_permuted = (
  46. torch.tensor(
  47. [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15],
  48. device=device,
  49. )
  50. + (torch.arange(0, nrows // 16, device=device).reshape(-1, 1) * 16).expand(
  51. nrows // 16, 16
  52. )
  53. ).view(-1)
  54. else:
  55. cols_permuted = (
  56. torch.tensor(
  57. [0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15],
  58. device=device,
  59. )
  60. + (torch.arange(0, nrows // 16, device=device).reshape(-1, 1) * 16).expand(
  61. nrows // 16, 16
  62. )
  63. ).view(-1)
  64. outp = outp.index_copy(1, cols_permuted, outp)
  65. # interleave_column_major_tensor
  66. magic0 = 4 if dtypeq == torch.quint4x2 else 2
  67. magic1 = 32 // magic0
  68. tmp0 = (
  69. (torch.arange(0, ncols // magic0, device=device) * (nrows // 4 * magic0))
  70. .view(-1, 1)
  71. .repeat(1, nrows // 4 * magic0)
  72. .view(-1)
  73. )
  74. tmp1 = (
  75. (torch.arange(0, nrows // 4 // magic1, device=device) * (magic0 * magic1))
  76. .view(-1, 1)
  77. .repeat(1, magic1)
  78. .view(-1)
  79. .repeat(ncols)
  80. )
  81. tmp2 = (
  82. (torch.arange(0, magic0, device=device) * magic1)
  83. .view(-1, 1)
  84. .repeat(1, nrows // 4)
  85. .view(-1)
  86. .repeat(ncols // magic0)
  87. )
  88. tmp3 = torch.arange(0, magic1, device=device).repeat(nrows // 4 * ncols // magic1)
  89. outp_offsets = tmp0 + tmp1 + tmp2 + tmp3
  90. tmp = outp.view(-1).view(torch.int32)
  91. outp = torch.zeros_like(tmp)
  92. outp.scatter_(0, outp_offsets, tmp)
  93. outp = outp.view(weight.dtype)
  94. # add_bias_and_interleave_quantized_tensor_inplace
  95. tmp = outp.view(-1)
  96. outp = torch.empty_like(tmp)
  97. if dtypeq == torch.int8:
  98. tmp = (tmp.to(torch.int) + 128).to(tmp.dtype)
  99. outp[0::4] = tmp[0::4]
  100. outp[1::4] = tmp[2::4]
  101. outp[2::4] = tmp[1::4]
  102. outp[3::4] = tmp[3::4]
  103. elif dtypeq == torch.quint4x2:
  104. tmp0 = ((tmp & 0xF) + 8) & 0xF
  105. tmp0 = (tmp0[1::2] << 4) | tmp0[0::2]
  106. tmp1 = (((tmp >> 4) & 0xF) + 8) & 0xF
  107. tmp1 = (tmp1[1::2] << 4) | tmp1[0::2]
  108. outp[0::4] = tmp0[0::2]
  109. outp[1::4] = tmp0[1::2]
  110. outp[2::4] = tmp1[0::2]
  111. outp[3::4] = tmp1[1::2]
  112. if dtypeq == torch.quint4x2:
  113. nrows *= 2
  114. ncols //= 2
  115. return outp.view(nrows, ncols).view(torch.uint8)