binning.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321
  1. """
  2. This module contains the BinMapper class.
  3. BinMapper is used for mapping a real-valued dataset into integer-valued bins.
  4. Bin thresholds are computed with the quantiles so that each bin contains
  5. approximately the same number of samples.
  6. """
  7. # Author: Nicolas Hug
  8. import numpy as np
  9. from ...base import BaseEstimator, TransformerMixin
  10. from ...utils import check_array, check_random_state
  11. from ...utils._openmp_helpers import _openmp_effective_n_threads
  12. from ...utils.fixes import percentile
  13. from ...utils.validation import check_is_fitted
  14. from ._binning import _map_to_bins
  15. from ._bitset import set_bitset_memoryview
  16. from .common import ALMOST_INF, X_BINNED_DTYPE, X_BITSET_INNER_DTYPE, X_DTYPE
  17. def _find_binning_thresholds(col_data, max_bins):
  18. """Extract quantiles from a continuous feature.
  19. Missing values are ignored for finding the thresholds.
  20. Parameters
  21. ----------
  22. col_data : array-like, shape (n_samples,)
  23. The continuous feature to bin.
  24. max_bins: int
  25. The maximum number of bins to use for non-missing values. If for a
  26. given feature the number of unique values is less than ``max_bins``,
  27. then those unique values will be used to compute the bin thresholds,
  28. instead of the quantiles
  29. Return
  30. ------
  31. binning_thresholds : ndarray of shape(min(max_bins, n_unique_values) - 1,)
  32. The increasing numeric values that can be used to separate the bins.
  33. A given value x will be mapped into bin value i iff
  34. bining_thresholds[i - 1] < x <= binning_thresholds[i]
  35. """
  36. # ignore missing values when computing bin thresholds
  37. missing_mask = np.isnan(col_data)
  38. if missing_mask.any():
  39. col_data = col_data[~missing_mask]
  40. col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE)
  41. distinct_values = np.unique(col_data)
  42. if len(distinct_values) <= max_bins:
  43. midpoints = distinct_values[:-1] + distinct_values[1:]
  44. midpoints *= 0.5
  45. else:
  46. # We sort again the data in this case. We could compute
  47. # approximate midpoint percentiles using the output of
  48. # np.unique(col_data, return_counts) instead but this is more
  49. # work and the performance benefit will be limited because we
  50. # work on a fixed-size subsample of the full data.
  51. percentiles = np.linspace(0, 100, num=max_bins + 1)
  52. percentiles = percentiles[1:-1]
  53. midpoints = percentile(col_data, percentiles, method="midpoint").astype(X_DTYPE)
  54. assert midpoints.shape[0] == max_bins - 1
  55. # We avoid having +inf thresholds: +inf thresholds are only allowed in
  56. # a "split on nan" situation.
  57. np.clip(midpoints, a_min=None, a_max=ALMOST_INF, out=midpoints)
  58. return midpoints
  59. class _BinMapper(TransformerMixin, BaseEstimator):
  60. """Transformer that maps a dataset into integer-valued bins.
  61. For continuous features, the bins are created in a feature-wise fashion,
  62. using quantiles so that each bins contains approximately the same number
  63. of samples. For large datasets, quantiles are computed on a subset of the
  64. data to speed-up the binning, but the quantiles should remain stable.
  65. For categorical features, the raw categorical values are expected to be
  66. in [0, 254] (this is not validated here though) and each category
  67. corresponds to a bin. All categorical values must be known at
  68. initialization: transform() doesn't know how to bin unknown categorical
  69. values. Note that transform() is only used on non-training data in the
  70. case of early stopping.
  71. Features with a small number of values may be binned into less than
  72. ``n_bins`` bins. The last bin (at index ``n_bins - 1``) is always reserved
  73. for missing values.
  74. Parameters
  75. ----------
  76. n_bins : int, default=256
  77. The maximum number of bins to use (including the bin for missing
  78. values). Should be in [3, 256]. Non-missing values are binned on
  79. ``max_bins = n_bins - 1`` bins. The last bin is always reserved for
  80. missing values. If for a given feature the number of unique values is
  81. less than ``max_bins``, then those unique values will be used to
  82. compute the bin thresholds, instead of the quantiles. For categorical
  83. features indicated by ``is_categorical``, the docstring for
  84. ``is_categorical`` details on this procedure.
  85. subsample : int or None, default=2e5
  86. If ``n_samples > subsample``, then ``sub_samples`` samples will be
  87. randomly chosen to compute the quantiles. If ``None``, the whole data
  88. is used.
  89. is_categorical : ndarray of bool of shape (n_features,), default=None
  90. Indicates categorical features. By default, all features are
  91. considered continuous.
  92. known_categories : list of {ndarray, None} of shape (n_features,), \
  93. default=none
  94. For each categorical feature, the array indicates the set of unique
  95. categorical values. These should be the possible values over all the
  96. data, not just the training data. For continuous features, the
  97. corresponding entry should be None.
  98. random_state: int, RandomState instance or None, default=None
  99. Pseudo-random number generator to control the random sub-sampling.
  100. Pass an int for reproducible output across multiple
  101. function calls.
  102. See :term:`Glossary <random_state>`.
  103. n_threads : int, default=None
  104. Number of OpenMP threads to use. `_openmp_effective_n_threads` is called
  105. to determine the effective number of threads use, which takes cgroups CPU
  106. quotes into account. See the docstring of `_openmp_effective_n_threads`
  107. for details.
  108. Attributes
  109. ----------
  110. bin_thresholds_ : list of ndarray
  111. For each feature, each array indicates how to map a feature into a
  112. binned feature. The semantic and size depends on the nature of the
  113. feature:
  114. - for real-valued features, the array corresponds to the real-valued
  115. bin thresholds (the upper bound of each bin). There are ``max_bins
  116. - 1`` thresholds, where ``max_bins = n_bins - 1`` is the number of
  117. bins used for non-missing values.
  118. - for categorical features, the array is a map from a binned category
  119. value to the raw category value. The size of the array is equal to
  120. ``min(max_bins, category_cardinality)`` where we ignore missing
  121. values in the cardinality.
  122. n_bins_non_missing_ : ndarray, dtype=np.uint32
  123. For each feature, gives the number of bins actually used for
  124. non-missing values. For features with a lot of unique values, this is
  125. equal to ``n_bins - 1``.
  126. is_categorical_ : ndarray of shape (n_features,), dtype=np.uint8
  127. Indicator for categorical features.
  128. missing_values_bin_idx_ : np.uint8
  129. The index of the bin where missing values are mapped. This is a
  130. constant across all features. This corresponds to the last bin, and
  131. it is always equal to ``n_bins - 1``. Note that if ``n_bins_missing_``
  132. is less than ``n_bins - 1`` for a given feature, then there are
  133. empty (and unused) bins.
  134. """
  135. def __init__(
  136. self,
  137. n_bins=256,
  138. subsample=int(2e5),
  139. is_categorical=None,
  140. known_categories=None,
  141. random_state=None,
  142. n_threads=None,
  143. ):
  144. self.n_bins = n_bins
  145. self.subsample = subsample
  146. self.is_categorical = is_categorical
  147. self.known_categories = known_categories
  148. self.random_state = random_state
  149. self.n_threads = n_threads
  150. def fit(self, X, y=None):
  151. """Fit data X by computing the binning thresholds.
  152. The last bin is reserved for missing values, whether missing values
  153. are present in the data or not.
  154. Parameters
  155. ----------
  156. X : array-like of shape (n_samples, n_features)
  157. The data to bin.
  158. y: None
  159. Ignored.
  160. Returns
  161. -------
  162. self : object
  163. """
  164. if not (3 <= self.n_bins <= 256):
  165. # min is 3: at least 2 distinct bins and a missing values bin
  166. raise ValueError(
  167. "n_bins={} should be no smaller than 3 and no larger than 256.".format(
  168. self.n_bins
  169. )
  170. )
  171. X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
  172. max_bins = self.n_bins - 1
  173. rng = check_random_state(self.random_state)
  174. if self.subsample is not None and X.shape[0] > self.subsample:
  175. subset = rng.choice(X.shape[0], self.subsample, replace=False)
  176. X = X.take(subset, axis=0)
  177. if self.is_categorical is None:
  178. self.is_categorical_ = np.zeros(X.shape[1], dtype=np.uint8)
  179. else:
  180. self.is_categorical_ = np.asarray(self.is_categorical, dtype=np.uint8)
  181. n_features = X.shape[1]
  182. known_categories = self.known_categories
  183. if known_categories is None:
  184. known_categories = [None] * n_features
  185. # validate is_categorical and known_categories parameters
  186. for f_idx in range(n_features):
  187. is_categorical = self.is_categorical_[f_idx]
  188. known_cats = known_categories[f_idx]
  189. if is_categorical and known_cats is None:
  190. raise ValueError(
  191. f"Known categories for feature {f_idx} must be provided."
  192. )
  193. if not is_categorical and known_cats is not None:
  194. raise ValueError(
  195. f"Feature {f_idx} isn't marked as a categorical feature, "
  196. "but categories were passed."
  197. )
  198. self.missing_values_bin_idx_ = self.n_bins - 1
  199. self.bin_thresholds_ = []
  200. n_bins_non_missing = []
  201. for f_idx in range(n_features):
  202. if not self.is_categorical_[f_idx]:
  203. thresholds = _find_binning_thresholds(X[:, f_idx], max_bins)
  204. n_bins_non_missing.append(thresholds.shape[0] + 1)
  205. else:
  206. # Since categories are assumed to be encoded in
  207. # [0, n_cats] and since n_cats <= max_bins,
  208. # the thresholds *are* the unique categorical values. This will
  209. # lead to the correct mapping in transform()
  210. thresholds = known_categories[f_idx]
  211. n_bins_non_missing.append(thresholds.shape[0])
  212. self.bin_thresholds_.append(thresholds)
  213. self.n_bins_non_missing_ = np.array(n_bins_non_missing, dtype=np.uint32)
  214. return self
  215. def transform(self, X):
  216. """Bin data X.
  217. Missing values will be mapped to the last bin.
  218. For categorical features, the mapping will be incorrect for unknown
  219. categories. Since the BinMapper is given known_categories of the
  220. entire training data (i.e. before the call to train_test_split() in
  221. case of early-stopping), this never happens.
  222. Parameters
  223. ----------
  224. X : array-like of shape (n_samples, n_features)
  225. The data to bin.
  226. Returns
  227. -------
  228. X_binned : array-like of shape (n_samples, n_features)
  229. The binned data (fortran-aligned).
  230. """
  231. X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
  232. check_is_fitted(self)
  233. if X.shape[1] != self.n_bins_non_missing_.shape[0]:
  234. raise ValueError(
  235. "This estimator was fitted with {} features but {} got passed "
  236. "to transform()".format(self.n_bins_non_missing_.shape[0], X.shape[1])
  237. )
  238. n_threads = _openmp_effective_n_threads(self.n_threads)
  239. binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order="F")
  240. _map_to_bins(
  241. X,
  242. self.bin_thresholds_,
  243. self.is_categorical_,
  244. self.missing_values_bin_idx_,
  245. n_threads,
  246. binned,
  247. )
  248. return binned
  249. def make_known_categories_bitsets(self):
  250. """Create bitsets of known categories.
  251. Returns
  252. -------
  253. - known_cat_bitsets : ndarray of shape (n_categorical_features, 8)
  254. Array of bitsets of known categories, for each categorical feature.
  255. - f_idx_map : ndarray of shape (n_features,)
  256. Map from original feature index to the corresponding index in the
  257. known_cat_bitsets array.
  258. """
  259. categorical_features_indices = np.flatnonzero(self.is_categorical_)
  260. n_features = self.is_categorical_.size
  261. n_categorical_features = categorical_features_indices.size
  262. f_idx_map = np.zeros(n_features, dtype=np.uint32)
  263. f_idx_map[categorical_features_indices] = np.arange(
  264. n_categorical_features, dtype=np.uint32
  265. )
  266. known_categories = self.bin_thresholds_
  267. known_cat_bitsets = np.zeros(
  268. (n_categorical_features, 8), dtype=X_BITSET_INNER_DTYPE
  269. )
  270. # TODO: complexity is O(n_categorical_features * 255). Maybe this is
  271. # worth cythonizing
  272. for mapped_f_idx, f_idx in enumerate(categorical_features_indices):
  273. for raw_cat_val in known_categories[f_idx]:
  274. set_bitset_memoryview(known_cat_bitsets[mapped_f_idx], raw_cat_val)
  275. return known_cat_bitsets, f_idx_map