stats.py 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
  1. import numpy as np
  2. from .extmath import stable_cumsum
  def _weighted_percentile(array, sample_weight, percentile=50):
      """Compute the weighted percentile.

      Computes the lower weighted percentile: for each column, the values are
      sorted, their weights are accumulated in that order, and the first value
      whose cumulative weight reaches ``percentile / 100`` of the column's
      total weight is returned. If `array` is a 2D array, the `percentile` is
      computed along the axis 0.

      .. versionchanged:: 0.24
          Accepts 2D `array`.

      Parameters
      ----------
      array : 0D, 1D or 2D ndarray
          Values to take the weighted percentile of. A 0D array is returned
          as its single element without looking at `sample_weight`.

      sample_weight : 1D or 2D ndarray
          Weights for each value in `array`. Must be same shape as `array` or
          of shape `(array.shape[0],)`; in the latter case the same weights
          are used for every column of `array`.

      percentile : int or float, default=50
          Percentile to compute. Must be value between 0 and 100 (this is not
          validated here).

      Returns
      -------
      percentile : scalar if `array` is 0D or 1D, ndarray of shape \
              (array.shape[1],) if `array` is 2D
          Weighted percentile. Every returned value is an element of `array`;
          no interpolation is performed.
      """
      n_dim = array.ndim
      if n_dim == 0:
          # A 0D array holds a single value; return it as a scalar.
          return array[()]
      if n_dim == 1:
          # Treat a 1D input as a single column so the 2D code path applies.
          array = array.reshape((-1, 1))

      # When sample_weight is 1D, repeat it for each of the array.shape[1]
      # columns.
      if (
          array.shape != sample_weight.shape
          and array.shape[0] == sample_weight.shape[0]
      ):
          sample_weight = np.tile(sample_weight, (array.shape[1], 1)).T

      # Order the weights of every column by ascending value of `array`.
      sorted_idx = np.argsort(array, axis=0)
      sorted_weights = np.take_along_axis(sample_weight, sorted_idx, axis=0)

      # Cumulative weight per column; its last row is the column's total
      # weight, so the target is the requested fraction of that total.
      weight_cdf = stable_cumsum(sorted_weights, axis=0)
      adjusted_percentile = percentile / 100 * weight_cdf[-1]

      # For percentile=0, ignore leading observations with sample_weight=0.
      # GH20528. Nudging a zero target to the next representable float makes
      # the left-sided search below skip cumulative weights equal to zero.
      mask = adjusted_percentile == 0
      adjusted_percentile[mask] = np.nextafter(
          adjusted_percentile[mask], adjusted_percentile[mask] + 1
      )

      # np.searchsorted only searches one sorted 1D sequence at a time, so the
      # columns are searched one by one, each against its own target.
      percentile_idx = np.array(
          [
              np.searchsorted(column_cdf, target)
              for column_cdf, target in zip(weight_cdf.T, adjusted_percentile)
          ]
      )

      # In rare cases the target exceeds every cumulative weight and
      # searchsorted returns sorted_idx.shape[0]; clamp to the last valid row.
      max_idx = sorted_idx.shape[0] - 1
      percentile_idx = np.clip(percentile_idx, 0, max_idx)

      # Map the position in sorted order back to a row of `array`, per column.
      col_index = np.arange(array.shape[1])
      percentile_in_sorted = sorted_idx[percentile_idx, col_index]
      result = array[percentile_in_sorted, col_index]
      return result[0] if n_dim == 1 else result