# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
#         Fabian Pedregosa <fabian.pedregosa@inria.fr>
#         Maria Telenczuk <https://github.com/maikia>
#
# License: BSD 3 clause

import warnings

import numpy as np
import pytest
from scipy import linalg, sparse

from sklearn.datasets import load_iris, make_regression, make_sparse_uncorrelated
from sklearn.linear_model import LinearRegression
from sklearn.linear_model._base import (
    _deprecate_normalize,
    _preprocess_data,
    _rescale_data,
    make_dataset,
)
from sklearn.preprocessing import StandardScaler, add_dummy_feature
from sklearn.utils._testing import (
    assert_allclose,
    assert_array_almost_equal,
    assert_array_equal,
)

rtol = 1e-6


def test_linear_regression():
    # Test LinearRegression on a simple dataset.
    # a simple dataset
    X = [[1], [2]]
    Y = [1, 2]

    reg = LinearRegression()
    reg.fit(X, Y)

    assert_array_almost_equal(reg.coef_, [1])
    assert_array_almost_equal(reg.intercept_, [0])
    assert_array_almost_equal(reg.predict(X), [1, 2])

    # test it also for degenerate input
    X = [[1]]
    Y = [0]

    reg = LinearRegression()
    reg.fit(X, Y)

    assert_array_almost_equal(reg.coef_, [0])
    assert_array_almost_equal(reg.intercept_, [0])
    assert_array_almost_equal(reg.predict(X), [0])


@pytest.mark.parametrize("array_constr", [np.array, sparse.csr_matrix])
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_linear_regression_sample_weights(
    array_constr, fit_intercept, global_random_seed
):
    rng = np.random.RandomState(global_random_seed)

    # It would not work with under-determined systems
    n_samples, n_features = 6, 5

    X = array_constr(rng.normal(size=(n_samples, n_features)))
    y = rng.normal(size=n_samples)

    sample_weight = 1.0 + rng.uniform(size=n_samples)
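    # The weights above lie in [1, 2), bounded away from zero, so the diagonal
    # matrix W used in the closed form below is guaranteed to be invertible.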

    # LinearRegression with explicit sample_weight
    reg = LinearRegression(fit_intercept=fit_intercept)
    reg.fit(X, y, sample_weight=sample_weight)
    coefs1 = reg.coef_
    inter1 = reg.intercept_

    assert reg.coef_.shape == (X.shape[1],)  # sanity checks

    # Closed form of the weighted least square
    # theta = (X^T W X)^(-1) @ X^T W y
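    # When fit_intercept=True, add_dummy_feature prepends a column of ones to
    # X so the intercept is estimated jointly as the first entry of theta.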
    W = np.diag(sample_weight)
    X_aug = X if not fit_intercept else add_dummy_feature(X)

    Xw = X_aug.T @ W @ X_aug
    yw = X_aug.T @ W @ y
    coefs2 = linalg.solve(Xw, yw)

    if not fit_intercept:
        assert_allclose(coefs1, coefs2)
    else:
        assert_allclose(coefs1, coefs2[1:])
        assert_allclose(inter1, coefs2[0])


def test_raises_value_error_if_positive_and_sparse():
    error_msg = "A sparse matrix was passed, but dense data is required."
    # X must not be sparse if positive == True
    X = sparse.eye(10)
    y = np.ones(10)

    reg = LinearRegression(positive=True)

    with pytest.raises(TypeError, match=error_msg):
        reg.fit(X, y)


@pytest.mark.parametrize("n_samples, n_features", [(2, 3), (3, 2)])
def test_raises_value_error_if_sample_weights_greater_than_1d(n_samples, n_features):
    # Sample weights must be either scalar or 1D
    rng = np.random.RandomState(0)

    X = rng.randn(n_samples, n_features)
    y = rng.randn(n_samples)

    sample_weights_OK = rng.randn(n_samples) ** 2 + 1
    sample_weights_OK_1 = 1.0
    sample_weights_OK_2 = 2.0

    reg = LinearRegression()

    # make sure the "OK" sample weights actually work
    reg.fit(X, y, sample_weights_OK)
    reg.fit(X, y, sample_weights_OK_1)
    reg.fit(X, y, sample_weights_OK_2)


def test_fit_intercept():
    # Test assertions on betas shape.
    X2 = np.array([[0.38349978, 0.61650022], [0.58853682, 0.41146318]])
    X3 = np.array(
        [[0.27677969, 0.70693172, 0.01628859], [0.08385139, 0.20692515, 0.70922346]]
    )
    y = np.array([1, 1])

    lr2_without_intercept = LinearRegression(fit_intercept=False).fit(X2, y)
    lr2_with_intercept = LinearRegression().fit(X2, y)
    lr3_without_intercept = LinearRegression(fit_intercept=False).fit(X3, y)
    lr3_with_intercept = LinearRegression().fit(X3, y)

    assert lr2_with_intercept.coef_.shape == lr2_without_intercept.coef_.shape
    assert lr3_with_intercept.coef_.shape == lr3_without_intercept.coef_.shape
    assert lr2_without_intercept.coef_.ndim == lr3_without_intercept.coef_.ndim


def test_error_on_wrong_normalize():
    normalize = "wrong"
    error_msg = "Leave 'normalize' to its default"
    with pytest.raises(ValueError, match=error_msg):
        _deprecate_normalize(normalize, "estimator")


# TODO(1.4): remove
@pytest.mark.parametrize("normalize", [True, False, "deprecated"])
def test_deprecate_normalize(normalize):
    # test all possible cases of the normalize parameter deprecation
    if normalize == "deprecated":
        # no warning
        output = False
        expected = None
        warning_msg = []
    else:
        output = normalize
        expected = FutureWarning
        warning_msg = ["1.4"]
        if not normalize:
            warning_msg.append("default value")
        else:
            warning_msg.append("StandardScaler(")
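
    # normalize=False warns about the changing default, while normalize=True
    # additionally points users to StandardScaler as the replacement; both
    # messages mention the 1.4 removal.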
    if expected is None:
        with warnings.catch_warnings():
            warnings.simplefilter("error", FutureWarning)
            _normalize = _deprecate_normalize(normalize, "estimator")
    else:
        with pytest.warns(expected) as record:
            _normalize = _deprecate_normalize(normalize, "estimator")
        assert all(warning in str(record[0].message) for warning in warning_msg)

    assert _normalize == output


def test_linear_regression_sparse(global_random_seed):
    # Test that linear regression also works with sparse data
    rng = np.random.RandomState(global_random_seed)
    n = 100
    X = sparse.eye(n, n)
    beta = rng.rand(n)
    y = X @ beta

    ols = LinearRegression()
    ols.fit(X, y.ravel())
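
    # Since X is the identity, the prediction for sample i is
    # coef_[i] + intercept_, which must reproduce beta[i].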
    assert_array_almost_equal(beta, ols.coef_ + ols.intercept_)
    assert_array_almost_equal(ols.predict(X) - y.ravel(), 0)


@pytest.mark.parametrize("fit_intercept", [True, False])
def test_linear_regression_sparse_equal_dense(fit_intercept):
    # Test that linear regression agrees between sparse and dense
    rng = np.random.RandomState(0)
    n_samples = 200
    n_features = 2
    X = rng.randn(n_samples, n_features)
    X[X < 0.1] = 0.0
    Xcsr = sparse.csr_matrix(X)
    y = rng.rand(n_samples)
    params = dict(fit_intercept=fit_intercept)
    clf_dense = LinearRegression(**params)
    clf_sparse = LinearRegression(**params)
    clf_dense.fit(X, y)
    clf_sparse.fit(Xcsr, y)

    assert clf_dense.intercept_ == pytest.approx(clf_sparse.intercept_)
    assert_allclose(clf_dense.coef_, clf_sparse.coef_)


def test_linear_regression_multiple_outcome():
    # Test multiple-outcome linear regressions
    rng = np.random.RandomState(0)
    X, y = make_regression(random_state=rng)

    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    reg = LinearRegression()
    reg.fit(X, Y)
    assert reg.coef_.shape == (2, n_features)
    Y_pred = reg.predict(X)
    reg.fit(X, y)
    y_pred = reg.predict(X)
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)


def test_linear_regression_sparse_multiple_outcome(global_random_seed):
    # Test multiple-outcome linear regressions with sparse data
    rng = np.random.RandomState(global_random_seed)
    X, y = make_sparse_uncorrelated(random_state=rng)
    X = sparse.coo_matrix(X)
    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    ols = LinearRegression()
    ols.fit(X, Y)
    assert ols.coef_.shape == (2, n_features)
    Y_pred = ols.predict(X)

    ols.fit(X, y.ravel())
    y_pred = ols.predict(X)
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)


def test_linear_regression_positive():
    # Test nonnegative LinearRegression on a simple dataset.
    X = [[1], [2]]
    y = [1, 2]

    reg = LinearRegression(positive=True)
    reg.fit(X, y)

    assert_array_almost_equal(reg.coef_, [1])
    assert_array_almost_equal(reg.intercept_, [0])
    assert_array_almost_equal(reg.predict(X), [1, 2])

    # test it also for degenerate input
    X = [[1]]
    y = [0]

    reg = LinearRegression(positive=True)
    reg.fit(X, y)

    assert_allclose(reg.coef_, [0])
    assert_allclose(reg.intercept_, [0])
    assert_allclose(reg.predict(X), [0])


def test_linear_regression_positive_multiple_outcome(global_random_seed):
    # Test multiple-outcome nonnegative linear regressions
    rng = np.random.RandomState(global_random_seed)
    X, y = make_sparse_uncorrelated(random_state=rng)
    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    ols = LinearRegression(positive=True)
    ols.fit(X, Y)
    assert ols.coef_.shape == (2, n_features)
    assert np.all(ols.coef_ >= 0.0)
    Y_pred = ols.predict(X)

    ols.fit(X, y.ravel())
    y_pred = ols.predict(X)
    assert_allclose(np.vstack((y_pred, y_pred)).T, Y_pred)


def test_linear_regression_positive_vs_nonpositive(global_random_seed):
    # Test differences with LinearRegression when positive=False.
    rng = np.random.RandomState(global_random_seed)
    X, y = make_sparse_uncorrelated(random_state=rng)

    reg = LinearRegression(positive=True)
    reg.fit(X, y)
    regn = LinearRegression(positive=False)
    regn.fit(X, y)
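
    # make_sparse_uncorrelated uses ground-truth coefficients of mixed signs
    # (positive and negative), so forcing nonnegative coefficients must move
    # the solution away from the unconstrained least squares fit.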
    assert np.mean((reg.coef_ - regn.coef_) ** 2) > 1e-3


def test_linear_regression_positive_vs_nonpositive_when_positive(global_random_seed):
    # Test LinearRegression fitted coefficients
    # when the problem is positive.
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 4
    X = rng.rand(n_samples, n_features)
    y = X[:, 0] + 2 * X[:, 1] + 3 * X[:, 2] + 1.5 * X[:, 3]
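
    # The true coefficients (1, 2, 3, 1.5) are all positive and y is
    # noise-free, so the nonnegativity constraint is inactive at the optimum
    # and both fits should agree.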
    reg = LinearRegression(positive=True)
    reg.fit(X, y)
    regn = LinearRegression(positive=False)
    regn.fit(X, y)

    assert np.mean((reg.coef_ - regn.coef_) ** 2) < 1e-6


@pytest.mark.parametrize("sparse_X", [True, False])
@pytest.mark.parametrize("use_sw", [True, False])
def test_inplace_data_preprocessing(sparse_X, use_sw, global_random_seed):
    # Check that the data is not modified inplace by the linear regression
    # estimator.
    rng = np.random.RandomState(global_random_seed)
    original_X_data = rng.randn(10, 12)
    original_y_data = rng.randn(10, 2)
    original_sw_data = rng.rand(10)

    if sparse_X:
        X = sparse.csr_matrix(original_X_data)
    else:
        X = original_X_data.copy()
    y = original_y_data.copy()
    # XXX: Note that y_sparse is not supported (broken?) in the current
    # implementation of LinearRegression.

    if use_sw:
        sample_weight = original_sw_data.copy()
    else:
        sample_weight = None

    # Do not allow inplace preprocessing of X and y:
    reg = LinearRegression()
    reg.fit(X, y, sample_weight=sample_weight)

    if sparse_X:
        assert_allclose(X.toarray(), original_X_data)
    else:
        assert_allclose(X, original_X_data)
    assert_allclose(y, original_y_data)
    if use_sw:
        assert_allclose(sample_weight, original_sw_data)

    # Allow inplace preprocessing of X and y
    reg = LinearRegression(copy_X=False)
    reg.fit(X, y, sample_weight=sample_weight)

    if sparse_X:
        # No optimization relying on the inplace modification of sparse input
        # data has been implemented at this time.
        assert_allclose(X.toarray(), original_X_data)
    else:
        # X has been offset (and optionally rescaled by sample weights)
        # inplace. The 0.42 threshold is arbitrary and has been found to be
        # robust to any random seed in the admissible range.
        assert np.linalg.norm(X - original_X_data) > 0.42

    # y should not have been modified inplace by LinearRegression.fit.
    assert_allclose(y, original_y_data)
    if use_sw:
        # Sample weights have no reason to ever be modified inplace.
        assert_allclose(sample_weight, original_sw_data)


def test_linear_regression_pd_sparse_dataframe_warning():
    pd = pytest.importorskip("pandas")

    # Warning is raised only when some of the columns are sparse
    df = pd.DataFrame({"0": np.random.randn(10)})
    for col in range(1, 4):
        arr = np.random.randn(10)
        arr[:8] = 0
        # all columns but the first are sparse
        if col != 0:
            arr = pd.arrays.SparseArray(arr, fill_value=0)
        df[str(col)] = arr

    msg = "pandas.DataFrame with sparse columns found."

    reg = LinearRegression()
    with pytest.warns(UserWarning, match=msg):
        reg.fit(df.iloc[:, 0:2], df.iloc[:, 3])

    # does not warn when the whole dataframe is sparse
    df["0"] = pd.arrays.SparseArray(df["0"], fill_value=0)
    assert hasattr(df, "sparse")

    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        reg.fit(df.iloc[:, 0:2], df.iloc[:, 3])


def test_preprocess_data(global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    expected_X_mean = np.mean(X, axis=0)
    expected_X_scale = np.std(X, axis=0) * np.sqrt(X.shape[0])
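    # With normalize=True, _preprocess_data divides each centered column of X
    # by its euclidean norm, which equals np.std(X[:, j]) * np.sqrt(n_samples).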
    expected_y_mean = np.mean(y, axis=0)

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=False, normalize=False
    )
    assert_array_almost_equal(X_mean, np.zeros(n_features))
    assert_array_almost_equal(y_mean, 0)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(Xt, X)
    assert_array_almost_equal(yt, y)

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=True, normalize=False
    )
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(Xt, X - expected_X_mean)
    assert_array_almost_equal(yt, y - expected_y_mean)

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=True, normalize=True
    )
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, expected_X_scale)
    assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_scale)
    assert_array_almost_equal(yt, y - expected_y_mean)


def test_preprocess_data_multioutput(global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 3
    n_outputs = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples, n_outputs)
    expected_y_mean = np.mean(y, axis=0)

    args = [X, sparse.csc_matrix(X)]
    for X in args:
        _, yt, _, y_mean, _ = _preprocess_data(
            X, y, fit_intercept=False, normalize=False
        )
        assert_array_almost_equal(y_mean, np.zeros(n_outputs))
        assert_array_almost_equal(yt, y)

        _, yt, _, y_mean, _ = _preprocess_data(
            X, y, fit_intercept=True, normalize=False
        )
        assert_array_almost_equal(y_mean, expected_y_mean)
        assert_array_almost_equal(yt, y - y_mean)

        _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=True, normalize=True)
        assert_array_almost_equal(y_mean, expected_y_mean)
        assert_array_almost_equal(yt, y - y_mean)


@pytest.mark.parametrize("is_sparse", [False, True])
def test_preprocess_data_weighted(is_sparse, global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 4
    # Generate random data with 50% of zero values to make sure
    # that the sparse variant of this test is actually sparse. This also
    # shifts the mean value for each column in X further away from
    # zero.
    X = rng.rand(n_samples, n_features)
    X[X < 0.5] = 0.0

    # Scale the first feature of X to be 10 times larger than the others to
    # better check the impact of feature scaling.
    X[:, 0] *= 10

    # Constant non-zero feature.
    X[:, 2] = 1.0

    # Constant zero feature (non-materialized in the sparse case)
    X[:, 3] = 0.0

    y = rng.rand(n_samples)
    sample_weight = rng.rand(n_samples)
    expected_X_mean = np.average(X, axis=0, weights=sample_weight)
    expected_y_mean = np.average(y, axis=0, weights=sample_weight)

    X_sample_weight_avg = np.average(X, weights=sample_weight, axis=0)
    X_sample_weight_var = np.average(
        (X - X_sample_weight_avg) ** 2, weights=sample_weight, axis=0
    )
    constant_mask = X_sample_weight_var < 10 * np.finfo(X.dtype).eps
    assert_array_equal(constant_mask, [0, 0, 1, 1])
    expected_X_scale = np.sqrt(X_sample_weight_var) * np.sqrt(sample_weight.sum())

    # near constant features should not be scaled
    expected_X_scale[constant_mask] = 1

    if is_sparse:
        X = sparse.csr_matrix(X)

    # normalize is False
    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X,
        y,
        fit_intercept=True,
        normalize=False,
        sample_weight=sample_weight,
    )
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    if is_sparse:
        assert_array_almost_equal(Xt.toarray(), X.toarray())
    else:
        assert_array_almost_equal(Xt, X - expected_X_mean)
    assert_array_almost_equal(yt, y - expected_y_mean)

    # normalize is True
    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X,
        y,
        fit_intercept=True,
        normalize=True,
        sample_weight=sample_weight,
    )

    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, expected_X_scale)

    if is_sparse:
        # X is not centered
        assert_array_almost_equal(Xt.toarray(), X.toarray() / expected_X_scale)
    else:
        assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_scale)

    # _preprocess_data with normalize=True scales the data by the feature-wise
    # euclidean norms while StandardScaler scales the data by the feature-wise
    # standard deviations.
    # The two are equivalent up to a ratio of np.sqrt(n_samples) if unweighted
    # or np.sqrt(sample_weight.sum()) if weighted.
    if is_sparse:
        scaler = StandardScaler(with_mean=False).fit(X, sample_weight=sample_weight)

        # Non-constant features are scaled similarly with np.sqrt(n_samples)
        assert_array_almost_equal(
            scaler.transform(X).toarray()[:, :2] / np.sqrt(sample_weight.sum()),
            Xt.toarray()[:, :2],
        )

        # Constant features go through un-scaled.
        assert_array_almost_equal(
            scaler.transform(X).toarray()[:, 2:], Xt.toarray()[:, 2:]
        )
    else:
        scaler = StandardScaler(with_mean=True).fit(X, sample_weight=sample_weight)
        assert_array_almost_equal(scaler.mean_, X_mean)
        assert_array_almost_equal(
            scaler.transform(X) / np.sqrt(sample_weight.sum()),
            Xt,
        )
    assert_array_almost_equal(yt, y - expected_y_mean)


def test_sparse_preprocess_data_offsets(global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 2
    X = sparse.rand(n_samples, n_features, density=0.5, random_state=rng)
    X = X.tolil()
    y = rng.rand(n_samples)
    XA = X.toarray()
    expected_X_scale = np.std(XA, axis=0) * np.sqrt(X.shape[0])

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=False, normalize=False
    )
    assert_array_almost_equal(X_mean, np.zeros(n_features))
    assert_array_almost_equal(y_mean, 0)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(Xt.toarray(), XA)
    assert_array_almost_equal(yt, y)

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=True, normalize=False
    )
    assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
    assert_array_almost_equal(y_mean, np.mean(y, axis=0))
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(Xt.toarray(), XA)
    assert_array_almost_equal(yt, y - np.mean(y, axis=0))

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=True, normalize=True
    )
    assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
    assert_array_almost_equal(y_mean, np.mean(y, axis=0))
    assert_array_almost_equal(X_scale, expected_X_scale)
    assert_array_almost_equal(Xt.toarray(), XA / expected_X_scale)
    assert_array_almost_equal(yt, y - np.mean(y, axis=0))


def test_csr_preprocess_data():
    # Test output format of _preprocess_data, when input is csr
    X, y = make_regression()
    X[X < 2.5] = 0.0
    csr = sparse.csr_matrix(X)
    csr_, y, _, _, _ = _preprocess_data(csr, y, True)
    assert csr_.getformat() == "csr"


@pytest.mark.parametrize("is_sparse", (True, False))
@pytest.mark.parametrize("to_copy", (True, False))
def test_preprocess_copy_data_no_checks(is_sparse, to_copy):
    X, y = make_regression()
    X[X < 2.5] = 0.0

    if is_sparse:
        X = sparse.csr_matrix(X)

    X_, y_, _, _, _ = _preprocess_data(X, y, True, copy=to_copy, check_input=False)
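
    # With copy=True a fresh buffer must be allocated even though
    # check_input=False skips input validation; with copy=False the returned
    # arrays may alias the input data.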
    if to_copy and is_sparse:
        assert not np.may_share_memory(X_.data, X.data)
    elif to_copy:
        assert not np.may_share_memory(X_, X)
    elif is_sparse:
        assert np.may_share_memory(X_.data, X.data)
    else:
        assert np.may_share_memory(X_, X)


def test_dtype_preprocess_data(global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    X_32 = np.asarray(X, dtype=np.float32)
    y_32 = np.asarray(y, dtype=np.float32)
    X_64 = np.asarray(X, dtype=np.float64)
    y_64 = np.asarray(y, dtype=np.float64)

    for fit_intercept in [True, False]:
        for normalize in [True, False]:
            Xt_32, yt_32, X_mean_32, y_mean_32, X_scale_32 = _preprocess_data(
                X_32,
                y_32,
                fit_intercept=fit_intercept,
                normalize=normalize,
            )

            Xt_64, yt_64, X_mean_64, y_mean_64, X_scale_64 = _preprocess_data(
                X_64,
                y_64,
                fit_intercept=fit_intercept,
                normalize=normalize,
            )

            Xt_3264, yt_3264, X_mean_3264, y_mean_3264, X_scale_3264 = _preprocess_data(
                X_32,
                y_64,
                fit_intercept=fit_intercept,
                normalize=normalize,
            )

            Xt_6432, yt_6432, X_mean_6432, y_mean_6432, X_scale_6432 = _preprocess_data(
                X_64,
                y_32,
                fit_intercept=fit_intercept,
                normalize=normalize,
            )

            assert Xt_32.dtype == np.float32
            assert yt_32.dtype == np.float32
            assert X_mean_32.dtype == np.float32
            assert y_mean_32.dtype == np.float32
            assert X_scale_32.dtype == np.float32

            assert Xt_64.dtype == np.float64
            assert yt_64.dtype == np.float64
            assert X_mean_64.dtype == np.float64
            assert y_mean_64.dtype == np.float64
            assert X_scale_64.dtype == np.float64

            assert Xt_3264.dtype == np.float32
            assert yt_3264.dtype == np.float32
            assert X_mean_3264.dtype == np.float32
            assert y_mean_3264.dtype == np.float32
            assert X_scale_3264.dtype == np.float32

            assert Xt_6432.dtype == np.float64
            assert yt_6432.dtype == np.float64
            assert X_mean_6432.dtype == np.float64
            assert y_mean_6432.dtype == np.float64
            assert X_scale_6432.dtype == np.float64

            assert X_32.dtype == np.float32
            assert y_32.dtype == np.float32
            assert X_64.dtype == np.float64
            assert y_64.dtype == np.float64

            assert_array_almost_equal(Xt_32, Xt_64)
            assert_array_almost_equal(yt_32, yt_64)
            assert_array_almost_equal(X_mean_32, X_mean_64)
            assert_array_almost_equal(y_mean_32, y_mean_64)
            assert_array_almost_equal(X_scale_32, X_scale_64)


@pytest.mark.parametrize("n_targets", [None, 2])
@pytest.mark.parametrize("sparse_data", [True, False])
def test_rescale_data(n_targets, sparse_data, global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 2

    sample_weight = 1.0 + rng.rand(n_samples)
    X = rng.rand(n_samples, n_features)
    if n_targets is None:
        y = rng.rand(n_samples)
    else:
        y = rng.rand(n_samples, n_targets)
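
    # _rescale_data multiplies each row of X and y by sqrt(sample_weight[i]),
    # so that plain least squares on the rescaled data minimizes the weighted
    # objective ||sqrt(W) @ (y - X @ coef)||^2.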
    expected_sqrt_sw = np.sqrt(sample_weight)
    expected_rescaled_X = X * expected_sqrt_sw[:, np.newaxis]

    if n_targets is None:
        expected_rescaled_y = y * expected_sqrt_sw
    else:
        expected_rescaled_y = y * expected_sqrt_sw[:, np.newaxis]

    if sparse_data:
        X = sparse.csr_matrix(X)
        if n_targets is None:
            y = sparse.csr_matrix(y.reshape(-1, 1))
        else:
            y = sparse.csr_matrix(y)

    rescaled_X, rescaled_y, sqrt_sw = _rescale_data(X, y, sample_weight)

    assert_allclose(sqrt_sw, expected_sqrt_sw)

    if sparse_data:
        rescaled_X = rescaled_X.toarray()
        rescaled_y = rescaled_y.toarray()
        if n_targets is None:
            rescaled_y = rescaled_y.ravel()

    assert_allclose(rescaled_X, expected_rescaled_X)
    assert_allclose(rescaled_y, expected_rescaled_y)


def test_fused_types_make_dataset():
    iris = load_iris()

    X_32 = iris.data.astype(np.float32)
    y_32 = iris.target.astype(np.float32)
    X_csr_32 = sparse.csr_matrix(X_32)
    sample_weight_32 = np.arange(y_32.size, dtype=np.float32)

    X_64 = iris.data.astype(np.float64)
    y_64 = iris.target.astype(np.float64)
    X_csr_64 = sparse.csr_matrix(X_64)
    sample_weight_64 = np.arange(y_64.size, dtype=np.float64)

    # array
    dataset_32, _ = make_dataset(X_32, y_32, sample_weight_32)
    dataset_64, _ = make_dataset(X_64, y_64, sample_weight_64)
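
    # _next_py yields one sample as (x, y, sample_weight, index), where x is
    # itself a (data, indices, indptr) triple in CSR layout for both the dense
    # and the sparse dataset variants.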
    xi_32, yi_32, _, _ = dataset_32._next_py()
    xi_64, yi_64, _, _ = dataset_64._next_py()
    xi_data_32, _, _ = xi_32
    xi_data_64, _, _ = xi_64

    assert xi_data_32.dtype == np.float32
    assert xi_data_64.dtype == np.float64
    assert_allclose(yi_64, yi_32, rtol=rtol)

    # csr
    datasetcsr_32, _ = make_dataset(X_csr_32, y_32, sample_weight_32)
    datasetcsr_64, _ = make_dataset(X_csr_64, y_64, sample_weight_64)

    xicsr_32, yicsr_32, _, _ = datasetcsr_32._next_py()
    xicsr_64, yicsr_64, _, _ = datasetcsr_64._next_py()
    xicsr_data_32, _, _ = xicsr_32
    xicsr_data_64, _, _ = xicsr_64

    assert xicsr_data_32.dtype == np.float32
    assert xicsr_data_64.dtype == np.float64

    assert_allclose(xicsr_data_64, xicsr_data_32, rtol=rtol)
    assert_allclose(yicsr_64, yicsr_32, rtol=rtol)

    assert_array_equal(xi_data_32, xicsr_data_32)
    assert_array_equal(xi_data_64, xicsr_data_64)
    assert_array_equal(yi_32, yicsr_32)
    assert_array_equal(yi_64, yicsr_64)


@pytest.mark.parametrize("sparseX", [False, True])
@pytest.mark.parametrize("fit_intercept", [False, True])
def test_linear_regression_sample_weight_consistency(
    sparseX, fit_intercept, global_random_seed
):
    """Test that the impact of sample_weight is consistent.

    Note that this test is stricter than the common test
    check_sample_weights_invariance alone and also tests sparse X.
    It is very similar to test_enet_sample_weight_consistency.
    """
    rng = np.random.RandomState(global_random_seed)
    n_samples, n_features = 10, 5

    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    if sparseX:
        X = sparse.csr_matrix(X)
    params = dict(fit_intercept=fit_intercept)

    reg = LinearRegression(**params).fit(X, y, sample_weight=None)
    coef = reg.coef_.copy()
    if fit_intercept:
        intercept = reg.intercept_

    # 1) sample_weight=np.ones(..) must be equivalent to sample_weight=None,
    # same check as check_sample_weights_invariance(name, reg, kind="ones"),
    # but we also test with sparse input.
    sample_weight = np.ones_like(y)
    reg.fit(X, y, sample_weight=sample_weight)
    assert_allclose(reg.coef_, coef, rtol=1e-6)
    if fit_intercept:
        assert_allclose(reg.intercept_, intercept)

    # 2) sample_weight=None should be equivalent to sample_weight = number
    sample_weight = 123.0
    reg.fit(X, y, sample_weight=sample_weight)
    assert_allclose(reg.coef_, coef, rtol=1e-6)
    if fit_intercept:
        assert_allclose(reg.intercept_, intercept)

    # 3) scaling of sample_weight should have no effect, cf. np.average()
    sample_weight = rng.uniform(low=0.01, high=2, size=X.shape[0])
    reg = reg.fit(X, y, sample_weight=sample_weight)
    coef = reg.coef_.copy()
    if fit_intercept:
        intercept = reg.intercept_

    reg.fit(X, y, sample_weight=np.pi * sample_weight)
    assert_allclose(reg.coef_, coef, rtol=1e-5 if sparseX else 1e-6)
    if fit_intercept:
        assert_allclose(reg.intercept_, intercept)

    # 4) setting elements of sample_weight to 0 is equivalent to removing these samples
    sample_weight_0 = sample_weight.copy()
    sample_weight_0[-5:] = 0
    y[-5:] *= 1000  # to make excluding those samples important
    reg.fit(X, y, sample_weight=sample_weight_0)
    coef_0 = reg.coef_.copy()
    if fit_intercept:
        intercept_0 = reg.intercept_
    reg.fit(X[:-5], y[:-5], sample_weight=sample_weight[:-5])
    if fit_intercept and not sparseX:
        # FIXME: https://github.com/scikit-learn/scikit-learn/issues/26164
        # This often fails, e.g. when calling
        # SKLEARN_TESTS_GLOBAL_RANDOM_SEED="all" pytest \
        # sklearn/linear_model/tests/test_base.py\
        # ::test_linear_regression_sample_weight_consistency
        pass
    else:
        assert_allclose(reg.coef_, coef_0, rtol=1e-5)
        if fit_intercept:
            assert_allclose(reg.intercept_, intercept_0)

    # 5) check that multiplying sample_weight by 2 is equivalent to repeating
    # corresponding samples twice
    if sparseX:
        X2 = sparse.vstack([X, X[: n_samples // 2]], format="csc")
    else:
        X2 = np.concatenate([X, X[: n_samples // 2]], axis=0)
    y2 = np.concatenate([y, y[: n_samples // 2]])
    sample_weight_1 = sample_weight.copy()
    sample_weight_1[: n_samples // 2] *= 2
    sample_weight_2 = np.concatenate(
        [sample_weight, sample_weight[: n_samples // 2]], axis=0
    )

    reg1 = LinearRegression(**params).fit(X, y, sample_weight=sample_weight_1)
    reg2 = LinearRegression(**params).fit(X2, y2, sample_weight=sample_weight_2)
    assert_allclose(reg1.coef_, reg2.coef_, rtol=1e-6)
    if fit_intercept:
        assert_allclose(reg1.intercept_, reg2.intercept_)