test_polynomial.py 40 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239
  1. import sys
  2. import numpy as np
  3. import pytest
  4. from numpy.testing import assert_allclose, assert_array_equal
  5. from scipy import sparse
  6. from scipy.interpolate import BSpline
  7. from scipy.sparse import random as sparse_random
  8. from sklearn.linear_model import LinearRegression
  9. from sklearn.pipeline import Pipeline
  10. from sklearn.preprocessing import (
  11. KBinsDiscretizer,
  12. PolynomialFeatures,
  13. SplineTransformer,
  14. )
  15. from sklearn.preprocessing._csr_polynomial_expansion import (
  16. _calc_expanded_nnz,
  17. _calc_total_nnz,
  18. _get_sizeof_LARGEST_INT_t,
  19. )
  20. from sklearn.utils._testing import assert_array_almost_equal
  21. from sklearn.utils.fixes import parse_version, sp_version
  22. @pytest.mark.parametrize("est", (PolynomialFeatures, SplineTransformer))
  23. def test_polynomial_and_spline_array_order(est):
  24. """Test that output array has the given order."""
  25. X = np.arange(10).reshape(5, 2)
  26. def is_c_contiguous(a):
  27. return np.isfortran(a.T)
  28. assert is_c_contiguous(est().fit_transform(X))
  29. assert is_c_contiguous(est(order="C").fit_transform(X))
  30. assert np.isfortran(est(order="F").fit_transform(X))
  31. @pytest.mark.parametrize(
  32. "params, err_msg",
  33. [
  34. ({"knots": [[1]]}, r"Number of knots, knots.shape\[0\], must be >= 2."),
  35. ({"knots": [[1, 1], [2, 2]]}, r"knots.shape\[1\] == n_features is violated"),
  36. ({"knots": [[1], [0]]}, "knots must be sorted without duplicates."),
  37. ],
  38. )
  39. def test_spline_transformer_input_validation(params, err_msg):
  40. """Test that we raise errors for invalid input in SplineTransformer."""
  41. X = [[1], [2]]
  42. with pytest.raises(ValueError, match=err_msg):
  43. SplineTransformer(**params).fit(X)
  44. @pytest.mark.parametrize("extrapolation", ["continue", "periodic"])
  45. def test_spline_transformer_integer_knots(extrapolation):
  46. """Test that SplineTransformer accepts integer value knot positions."""
  47. X = np.arange(20).reshape(10, 2)
  48. knots = [[0, 1], [1, 2], [5, 5], [11, 10], [12, 11]]
  49. _ = SplineTransformer(
  50. degree=3, knots=knots, extrapolation=extrapolation
  51. ).fit_transform(X)
  52. def test_spline_transformer_feature_names():
  53. """Test that SplineTransformer generates correct features name."""
  54. X = np.arange(20).reshape(10, 2)
  55. splt = SplineTransformer(n_knots=3, degree=3, include_bias=True).fit(X)
  56. feature_names = splt.get_feature_names_out()
  57. assert_array_equal(
  58. feature_names,
  59. [
  60. "x0_sp_0",
  61. "x0_sp_1",
  62. "x0_sp_2",
  63. "x0_sp_3",
  64. "x0_sp_4",
  65. "x1_sp_0",
  66. "x1_sp_1",
  67. "x1_sp_2",
  68. "x1_sp_3",
  69. "x1_sp_4",
  70. ],
  71. )
  72. splt = SplineTransformer(n_knots=3, degree=3, include_bias=False).fit(X)
  73. feature_names = splt.get_feature_names_out(["a", "b"])
  74. assert_array_equal(
  75. feature_names,
  76. [
  77. "a_sp_0",
  78. "a_sp_1",
  79. "a_sp_2",
  80. "a_sp_3",
  81. "b_sp_0",
  82. "b_sp_1",
  83. "b_sp_2",
  84. "b_sp_3",
  85. ],
  86. )
  87. @pytest.mark.parametrize(
  88. "extrapolation",
  89. ["constant", "linear", "continue", "periodic"],
  90. )
  91. @pytest.mark.parametrize("degree", [2, 3])
  92. def test_split_transform_feature_names_extrapolation_degree(extrapolation, degree):
  93. """Test feature names are correct for different extrapolations and degree.
  94. Non-regression test for gh-25292.
  95. """
  96. X = np.arange(20).reshape(10, 2)
  97. splt = SplineTransformer(degree=degree, extrapolation=extrapolation).fit(X)
  98. feature_names = splt.get_feature_names_out(["a", "b"])
  99. assert len(feature_names) == splt.n_features_out_
  100. X_trans = splt.transform(X)
  101. assert X_trans.shape[1] == len(feature_names)
  102. @pytest.mark.parametrize("degree", range(1, 5))
  103. @pytest.mark.parametrize("n_knots", range(3, 5))
  104. @pytest.mark.parametrize("knots", ["uniform", "quantile"])
  105. @pytest.mark.parametrize("extrapolation", ["constant", "periodic"])
  106. def test_spline_transformer_unity_decomposition(degree, n_knots, knots, extrapolation):
  107. """Test that B-splines are indeed a decomposition of unity.
  108. Splines basis functions must sum up to 1 per row, if we stay in between boundaries.
  109. """
  110. X = np.linspace(0, 1, 100)[:, None]
  111. # make the boundaries 0 and 1 part of X_train, for sure.
  112. X_train = np.r_[[[0]], X[::2, :], [[1]]]
  113. X_test = X[1::2, :]
  114. if extrapolation == "periodic":
  115. n_knots = n_knots + degree # periodic splines require degree < n_knots
  116. splt = SplineTransformer(
  117. n_knots=n_knots,
  118. degree=degree,
  119. knots=knots,
  120. include_bias=True,
  121. extrapolation=extrapolation,
  122. )
  123. splt.fit(X_train)
  124. for X in [X_train, X_test]:
  125. assert_allclose(np.sum(splt.transform(X), axis=1), 1)
  126. @pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)])
  127. def test_spline_transformer_linear_regression(bias, intercept):
  128. """Test that B-splines fit a sinusodial curve pretty well."""
  129. X = np.linspace(0, 10, 100)[:, None]
  130. y = np.sin(X[:, 0]) + 2 # +2 to avoid the value 0 in assert_allclose
  131. pipe = Pipeline(
  132. steps=[
  133. (
  134. "spline",
  135. SplineTransformer(
  136. n_knots=15,
  137. degree=3,
  138. include_bias=bias,
  139. extrapolation="constant",
  140. ),
  141. ),
  142. ("ols", LinearRegression(fit_intercept=intercept)),
  143. ]
  144. )
  145. pipe.fit(X, y)
  146. assert_allclose(pipe.predict(X), y, rtol=1e-3)
  147. @pytest.mark.parametrize(
  148. ["knots", "n_knots", "sample_weight", "expected_knots"],
  149. [
  150. ("uniform", 3, None, np.array([[0, 2], [3, 8], [6, 14]])),
  151. (
  152. "uniform",
  153. 3,
  154. np.array([0, 0, 1, 1, 0, 3, 1]),
  155. np.array([[2, 2], [4, 8], [6, 14]]),
  156. ),
  157. ("uniform", 4, None, np.array([[0, 2], [2, 6], [4, 10], [6, 14]])),
  158. ("quantile", 3, None, np.array([[0, 2], [3, 3], [6, 14]])),
  159. (
  160. "quantile",
  161. 3,
  162. np.array([0, 0, 1, 1, 0, 3, 1]),
  163. np.array([[2, 2], [5, 8], [6, 14]]),
  164. ),
  165. ],
  166. )
  167. def test_spline_transformer_get_base_knot_positions(
  168. knots, n_knots, sample_weight, expected_knots
  169. ):
  170. """Check the behaviour to find knot positions with and without sample_weight."""
  171. X = np.array([[0, 2], [0, 2], [2, 2], [3, 3], [4, 6], [5, 8], [6, 14]])
  172. base_knots = SplineTransformer._get_base_knot_positions(
  173. X=X, knots=knots, n_knots=n_knots, sample_weight=sample_weight
  174. )
  175. assert_allclose(base_knots, expected_knots)
  176. @pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)])
  177. def test_spline_transformer_periodic_linear_regression(bias, intercept):
  178. """Test that B-splines fit a periodic curve pretty well."""
  179. # "+ 3" to avoid the value 0 in assert_allclose
  180. def f(x):
  181. return np.sin(2 * np.pi * x) - np.sin(8 * np.pi * x) + 3
  182. X = np.linspace(0, 1, 101)[:, None]
  183. pipe = Pipeline(
  184. steps=[
  185. (
  186. "spline",
  187. SplineTransformer(
  188. n_knots=20,
  189. degree=3,
  190. include_bias=bias,
  191. extrapolation="periodic",
  192. ),
  193. ),
  194. ("ols", LinearRegression(fit_intercept=intercept)),
  195. ]
  196. )
  197. pipe.fit(X, f(X[:, 0]))
  198. # Generate larger array to check periodic extrapolation
  199. X_ = np.linspace(-1, 2, 301)[:, None]
  200. predictions = pipe.predict(X_)
  201. assert_allclose(predictions, f(X_[:, 0]), atol=0.01, rtol=0.01)
  202. assert_allclose(predictions[0:100], predictions[100:200], rtol=1e-3)
  203. def test_spline_transformer_periodic_spline_backport():
  204. """Test that the backport of extrapolate="periodic" works correctly"""
  205. X = np.linspace(-2, 3.5, 10)[:, None]
  206. degree = 2
  207. # Use periodic extrapolation backport in SplineTransformer
  208. transformer = SplineTransformer(
  209. degree=degree, extrapolation="periodic", knots=[[-1.0], [0.0], [1.0]]
  210. )
  211. Xt = transformer.fit_transform(X)
  212. # Use periodic extrapolation in BSpline
  213. coef = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0]])
  214. spl = BSpline(np.arange(-3, 4), coef, degree, "periodic")
  215. Xspl = spl(X[:, 0])
  216. assert_allclose(Xt, Xspl)
  217. def test_spline_transformer_periodic_splines_periodicity():
  218. """Test if shifted knots result in the same transformation up to permutation."""
  219. X = np.linspace(0, 10, 101)[:, None]
  220. transformer_1 = SplineTransformer(
  221. degree=3,
  222. extrapolation="periodic",
  223. knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]],
  224. )
  225. transformer_2 = SplineTransformer(
  226. degree=3,
  227. extrapolation="periodic",
  228. knots=[[1.0], [3.0], [4.0], [5.0], [8.0], [9.0]],
  229. )
  230. Xt_1 = transformer_1.fit_transform(X)
  231. Xt_2 = transformer_2.fit_transform(X)
  232. assert_allclose(Xt_1, Xt_2[:, [4, 0, 1, 2, 3]])
  233. @pytest.mark.parametrize("degree", [3, 5])
  234. def test_spline_transformer_periodic_splines_smoothness(degree):
  235. """Test that spline transformation is smooth at first / last knot."""
  236. X = np.linspace(-2, 10, 10_000)[:, None]
  237. transformer = SplineTransformer(
  238. degree=degree,
  239. extrapolation="periodic",
  240. knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]],
  241. )
  242. Xt = transformer.fit_transform(X)
  243. delta = (X.max() - X.min()) / len(X)
  244. tol = 10 * delta
  245. dXt = Xt
  246. # We expect splines of degree `degree` to be (`degree`-1) times
  247. # continuously differentiable. I.e. for d = 0, ..., `degree` - 1 the d-th
  248. # derivative should be continuous. This is the case if the (d+1)-th
  249. # numerical derivative is reasonably small (smaller than `tol` in absolute
  250. # value). We thus compute d-th numeric derivatives for d = 1, ..., `degree`
  251. # and compare them to `tol`.
  252. #
  253. # Note that the 0-th derivative is the function itself, such that we are
  254. # also checking its continuity.
  255. for d in range(1, degree + 1):
  256. # Check continuity of the (d-1)-th derivative
  257. diff = np.diff(dXt, axis=0)
  258. assert np.abs(diff).max() < tol
  259. # Compute d-th numeric derivative
  260. dXt = diff / delta
  261. # As degree `degree` splines are not `degree` times continuously
  262. # differentiable at the knots, the `degree + 1`-th numeric derivative
  263. # should have spikes at the knots.
  264. diff = np.diff(dXt, axis=0)
  265. assert np.abs(diff).max() > 1
  266. @pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)])
  267. @pytest.mark.parametrize("degree", [1, 2, 3, 4, 5])
  268. def test_spline_transformer_extrapolation(bias, intercept, degree):
  269. """Test that B-spline extrapolation works correctly."""
  270. # we use a straight line for that
  271. X = np.linspace(-1, 1, 100)[:, None]
  272. y = X.squeeze()
  273. # 'constant'
  274. pipe = Pipeline(
  275. [
  276. [
  277. "spline",
  278. SplineTransformer(
  279. n_knots=4,
  280. degree=degree,
  281. include_bias=bias,
  282. extrapolation="constant",
  283. ),
  284. ],
  285. ["ols", LinearRegression(fit_intercept=intercept)],
  286. ]
  287. )
  288. pipe.fit(X, y)
  289. assert_allclose(pipe.predict([[-10], [5]]), [-1, 1])
  290. # 'linear'
  291. pipe = Pipeline(
  292. [
  293. [
  294. "spline",
  295. SplineTransformer(
  296. n_knots=4,
  297. degree=degree,
  298. include_bias=bias,
  299. extrapolation="linear",
  300. ),
  301. ],
  302. ["ols", LinearRegression(fit_intercept=intercept)],
  303. ]
  304. )
  305. pipe.fit(X, y)
  306. assert_allclose(pipe.predict([[-10], [5]]), [-10, 5])
  307. # 'error'
  308. splt = SplineTransformer(
  309. n_knots=4, degree=degree, include_bias=bias, extrapolation="error"
  310. )
  311. splt.fit(X)
  312. msg = "X contains values beyond the limits of the knots"
  313. with pytest.raises(ValueError, match=msg):
  314. splt.transform([[-10]])
  315. with pytest.raises(ValueError, match=msg):
  316. splt.transform([[5]])
  317. def test_spline_transformer_kbindiscretizer():
  318. """Test that a B-spline of degree=0 is equivalent to KBinsDiscretizer."""
  319. rng = np.random.RandomState(97531)
  320. X = rng.randn(200).reshape(200, 1)
  321. n_bins = 5
  322. n_knots = n_bins + 1
  323. splt = SplineTransformer(
  324. n_knots=n_knots, degree=0, knots="quantile", include_bias=True
  325. )
  326. splines = splt.fit_transform(X)
  327. kbd = KBinsDiscretizer(n_bins=n_bins, encode="onehot-dense", strategy="quantile")
  328. kbins = kbd.fit_transform(X)
  329. # Though they should be exactly equal, we test approximately with high
  330. # accuracy.
  331. assert_allclose(splines, kbins, rtol=1e-13)
  332. @pytest.mark.skipif(
  333. sp_version < parse_version("1.8.0"),
  334. reason="The option `sparse_output` is available as of scipy 1.8.0",
  335. )
  336. @pytest.mark.parametrize("degree", range(1, 3))
  337. @pytest.mark.parametrize("knots", ["uniform", "quantile"])
  338. @pytest.mark.parametrize(
  339. "extrapolation", ["error", "constant", "linear", "continue", "periodic"]
  340. )
  341. @pytest.mark.parametrize("include_bias", [False, True])
  342. def test_spline_transformer_sparse_output(
  343. degree, knots, extrapolation, include_bias, global_random_seed
  344. ):
  345. rng = np.random.RandomState(global_random_seed)
  346. X = rng.randn(200).reshape(40, 5)
  347. splt_dense = SplineTransformer(
  348. degree=degree,
  349. knots=knots,
  350. extrapolation=extrapolation,
  351. include_bias=include_bias,
  352. sparse_output=False,
  353. )
  354. splt_sparse = SplineTransformer(
  355. degree=degree,
  356. knots=knots,
  357. extrapolation=extrapolation,
  358. include_bias=include_bias,
  359. sparse_output=True,
  360. )
  361. splt_dense.fit(X)
  362. splt_sparse.fit(X)
  363. X_trans_sparse = splt_sparse.transform(X)
  364. X_trans_dense = splt_dense.transform(X)
  365. assert sparse.issparse(X_trans_sparse) and X_trans_sparse.format == "csr"
  366. assert_allclose(X_trans_dense, X_trans_sparse.toarray())
  367. # extrapolation regime
  368. X_min = np.amin(X, axis=0)
  369. X_max = np.amax(X, axis=0)
  370. X_extra = np.r_[
  371. np.linspace(X_min - 5, X_min, 10), np.linspace(X_max, X_max + 5, 10)
  372. ]
  373. if extrapolation == "error":
  374. msg = "X contains values beyond the limits of the knots"
  375. with pytest.raises(ValueError, match=msg):
  376. splt_dense.transform(X_extra)
  377. msg = "Out of bounds"
  378. with pytest.raises(ValueError, match=msg):
  379. splt_sparse.transform(X_extra)
  380. else:
  381. assert_allclose(
  382. splt_dense.transform(X_extra), splt_sparse.transform(X_extra).toarray()
  383. )
  384. @pytest.mark.skipif(
  385. sp_version >= parse_version("1.8.0"),
  386. reason="The option `sparse_output` is available as of scipy 1.8.0",
  387. )
  388. def test_spline_transformer_sparse_output_raise_error_for_old_scipy():
  389. """Test that SplineTransformer with sparse=True raises for scipy<1.8.0."""
  390. X = [[1], [2]]
  391. with pytest.raises(ValueError, match="scipy>=1.8.0"):
  392. SplineTransformer(sparse_output=True).fit(X)
  393. @pytest.mark.parametrize("n_knots", [5, 10])
  394. @pytest.mark.parametrize("include_bias", [True, False])
  395. @pytest.mark.parametrize("degree", [3, 4])
  396. @pytest.mark.parametrize(
  397. "extrapolation", ["error", "constant", "linear", "continue", "periodic"]
  398. )
  399. @pytest.mark.parametrize("sparse_output", [False, True])
  400. def test_spline_transformer_n_features_out(
  401. n_knots, include_bias, degree, extrapolation, sparse_output
  402. ):
  403. """Test that transform results in n_features_out_ features."""
  404. if sparse_output and sp_version < parse_version("1.8.0"):
  405. pytest.skip("The option `sparse_output` is available as of scipy 1.8.0")
  406. splt = SplineTransformer(
  407. n_knots=n_knots,
  408. degree=degree,
  409. include_bias=include_bias,
  410. extrapolation=extrapolation,
  411. sparse_output=sparse_output,
  412. )
  413. X = np.linspace(0, 1, 10)[:, None]
  414. splt.fit(X)
  415. assert splt.transform(X).shape[1] == splt.n_features_out_
  416. @pytest.mark.parametrize(
  417. "params, err_msg",
  418. [
  419. ({"degree": (-1, 2)}, r"degree=\(min_degree, max_degree\) must"),
  420. ({"degree": (0, 1.5)}, r"degree=\(min_degree, max_degree\) must"),
  421. ({"degree": (3, 2)}, r"degree=\(min_degree, max_degree\) must"),
  422. ({"degree": (1, 2, 3)}, r"int or tuple \(min_degree, max_degree\)"),
  423. ],
  424. )
  425. def test_polynomial_features_input_validation(params, err_msg):
  426. """Test that we raise errors for invalid input in PolynomialFeatures."""
  427. X = [[1], [2]]
  428. with pytest.raises(ValueError, match=err_msg):
  429. PolynomialFeatures(**params).fit(X)
  430. @pytest.fixture()
  431. def single_feature_degree3():
  432. X = np.arange(6)[:, np.newaxis]
  433. P = np.hstack([np.ones_like(X), X, X**2, X**3])
  434. return X, P
  435. @pytest.mark.parametrize(
  436. "degree, include_bias, interaction_only, indices",
  437. [
  438. (3, True, False, slice(None, None)),
  439. (3, False, False, slice(1, None)),
  440. (3, True, True, [0, 1]),
  441. (3, False, True, [1]),
  442. ((2, 3), True, False, [0, 2, 3]),
  443. ((2, 3), False, False, [2, 3]),
  444. ((2, 3), True, True, [0]),
  445. ((2, 3), False, True, []),
  446. ],
  447. )
  448. @pytest.mark.parametrize(
  449. "sparse_X",
  450. [False, sparse.csr_matrix, sparse.csc_matrix],
  451. )
  452. def test_polynomial_features_one_feature(
  453. single_feature_degree3,
  454. degree,
  455. include_bias,
  456. interaction_only,
  457. indices,
  458. sparse_X,
  459. ):
  460. """Test PolynomialFeatures on single feature up to degree 3."""
  461. X, P = single_feature_degree3
  462. if sparse_X:
  463. X = sparse_X(X)
  464. tf = PolynomialFeatures(
  465. degree=degree, include_bias=include_bias, interaction_only=interaction_only
  466. ).fit(X)
  467. out = tf.transform(X)
  468. if sparse_X:
  469. out = out.toarray()
  470. assert_allclose(out, P[:, indices])
  471. if tf.n_output_features_ > 0:
  472. assert tf.powers_.shape == (tf.n_output_features_, tf.n_features_in_)
  473. @pytest.fixture()
  474. def two_features_degree3():
  475. X = np.arange(6).reshape((3, 2))
  476. x1 = X[:, :1]
  477. x2 = X[:, 1:]
  478. P = np.hstack(
  479. [
  480. x1**0 * x2**0, # 0
  481. x1**1 * x2**0, # 1
  482. x1**0 * x2**1, # 2
  483. x1**2 * x2**0, # 3
  484. x1**1 * x2**1, # 4
  485. x1**0 * x2**2, # 5
  486. x1**3 * x2**0, # 6
  487. x1**2 * x2**1, # 7
  488. x1**1 * x2**2, # 8
  489. x1**0 * x2**3, # 9
  490. ]
  491. )
  492. return X, P
  493. @pytest.mark.parametrize(
  494. "degree, include_bias, interaction_only, indices",
  495. [
  496. (2, True, False, slice(0, 6)),
  497. (2, False, False, slice(1, 6)),
  498. (2, True, True, [0, 1, 2, 4]),
  499. (2, False, True, [1, 2, 4]),
  500. ((2, 2), True, False, [0, 3, 4, 5]),
  501. ((2, 2), False, False, [3, 4, 5]),
  502. ((2, 2), True, True, [0, 4]),
  503. ((2, 2), False, True, [4]),
  504. (3, True, False, slice(None, None)),
  505. (3, False, False, slice(1, None)),
  506. (3, True, True, [0, 1, 2, 4]),
  507. (3, False, True, [1, 2, 4]),
  508. ((2, 3), True, False, [0, 3, 4, 5, 6, 7, 8, 9]),
  509. ((2, 3), False, False, slice(3, None)),
  510. ((2, 3), True, True, [0, 4]),
  511. ((2, 3), False, True, [4]),
  512. ((3, 3), True, False, [0, 6, 7, 8, 9]),
  513. ((3, 3), False, False, [6, 7, 8, 9]),
  514. ((3, 3), True, True, [0]),
  515. ((3, 3), False, True, []), # would need 3 input features
  516. ],
  517. )
  518. @pytest.mark.parametrize(
  519. "sparse_X",
  520. [False, sparse.csr_matrix, sparse.csc_matrix],
  521. )
  522. def test_polynomial_features_two_features(
  523. two_features_degree3,
  524. degree,
  525. include_bias,
  526. interaction_only,
  527. indices,
  528. sparse_X,
  529. ):
  530. """Test PolynomialFeatures on 2 features up to degree 3."""
  531. X, P = two_features_degree3
  532. if sparse_X:
  533. X = sparse_X(X)
  534. tf = PolynomialFeatures(
  535. degree=degree, include_bias=include_bias, interaction_only=interaction_only
  536. ).fit(X)
  537. out = tf.transform(X)
  538. if sparse_X:
  539. out = out.toarray()
  540. assert_allclose(out, P[:, indices])
  541. if tf.n_output_features_ > 0:
  542. assert tf.powers_.shape == (tf.n_output_features_, tf.n_features_in_)
  543. def test_polynomial_feature_names():
  544. X = np.arange(30).reshape(10, 3)
  545. poly = PolynomialFeatures(degree=2, include_bias=True).fit(X)
  546. feature_names = poly.get_feature_names_out()
  547. assert_array_equal(
  548. ["1", "x0", "x1", "x2", "x0^2", "x0 x1", "x0 x2", "x1^2", "x1 x2", "x2^2"],
  549. feature_names,
  550. )
  551. assert len(feature_names) == poly.transform(X).shape[1]
  552. poly = PolynomialFeatures(degree=3, include_bias=False).fit(X)
  553. feature_names = poly.get_feature_names_out(["a", "b", "c"])
  554. assert_array_equal(
  555. [
  556. "a",
  557. "b",
  558. "c",
  559. "a^2",
  560. "a b",
  561. "a c",
  562. "b^2",
  563. "b c",
  564. "c^2",
  565. "a^3",
  566. "a^2 b",
  567. "a^2 c",
  568. "a b^2",
  569. "a b c",
  570. "a c^2",
  571. "b^3",
  572. "b^2 c",
  573. "b c^2",
  574. "c^3",
  575. ],
  576. feature_names,
  577. )
  578. assert len(feature_names) == poly.transform(X).shape[1]
  579. poly = PolynomialFeatures(degree=(2, 3), include_bias=False).fit(X)
  580. feature_names = poly.get_feature_names_out(["a", "b", "c"])
  581. assert_array_equal(
  582. [
  583. "a^2",
  584. "a b",
  585. "a c",
  586. "b^2",
  587. "b c",
  588. "c^2",
  589. "a^3",
  590. "a^2 b",
  591. "a^2 c",
  592. "a b^2",
  593. "a b c",
  594. "a c^2",
  595. "b^3",
  596. "b^2 c",
  597. "b c^2",
  598. "c^3",
  599. ],
  600. feature_names,
  601. )
  602. assert len(feature_names) == poly.transform(X).shape[1]
  603. poly = PolynomialFeatures(
  604. degree=(3, 3), include_bias=True, interaction_only=True
  605. ).fit(X)
  606. feature_names = poly.get_feature_names_out(["a", "b", "c"])
  607. assert_array_equal(["1", "a b c"], feature_names)
  608. assert len(feature_names) == poly.transform(X).shape[1]
  609. # test some unicode
  610. poly = PolynomialFeatures(degree=1, include_bias=True).fit(X)
  611. feature_names = poly.get_feature_names_out(["\u0001F40D", "\u262e", "\u05d0"])
  612. assert_array_equal(["1", "\u0001F40D", "\u262e", "\u05d0"], feature_names)
  613. @pytest.mark.parametrize(
  614. ["deg", "include_bias", "interaction_only", "dtype"],
  615. [
  616. (1, True, False, int),
  617. (2, True, False, int),
  618. (2, True, False, np.float32),
  619. (2, True, False, np.float64),
  620. (3, False, False, np.float64),
  621. (3, False, True, np.float64),
  622. (4, False, False, np.float64),
  623. (4, False, True, np.float64),
  624. ],
  625. )
  626. def test_polynomial_features_csc_X(deg, include_bias, interaction_only, dtype):
  627. rng = np.random.RandomState(0)
  628. X = rng.randint(0, 2, (100, 2))
  629. X_csc = sparse.csc_matrix(X)
  630. est = PolynomialFeatures(
  631. deg, include_bias=include_bias, interaction_only=interaction_only
  632. )
  633. Xt_csc = est.fit_transform(X_csc.astype(dtype))
  634. Xt_dense = est.fit_transform(X.astype(dtype))
  635. assert sparse.issparse(Xt_csc) and Xt_csc.format == "csc"
  636. assert Xt_csc.dtype == Xt_dense.dtype
  637. assert_array_almost_equal(Xt_csc.toarray(), Xt_dense)
  638. @pytest.mark.parametrize(
  639. ["deg", "include_bias", "interaction_only", "dtype"],
  640. [
  641. (1, True, False, int),
  642. (2, True, False, int),
  643. (2, True, False, np.float32),
  644. (2, True, False, np.float64),
  645. (3, False, False, np.float64),
  646. (3, False, True, np.float64),
  647. ],
  648. )
  649. def test_polynomial_features_csr_X(deg, include_bias, interaction_only, dtype):
  650. rng = np.random.RandomState(0)
  651. X = rng.randint(0, 2, (100, 2))
  652. X_csr = sparse.csr_matrix(X)
  653. est = PolynomialFeatures(
  654. deg, include_bias=include_bias, interaction_only=interaction_only
  655. )
  656. Xt_csr = est.fit_transform(X_csr.astype(dtype))
  657. Xt_dense = est.fit_transform(X.astype(dtype, copy=False))
  658. assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr"
  659. assert Xt_csr.dtype == Xt_dense.dtype
  660. assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)
  661. @pytest.mark.parametrize("n_features", [1, 4, 5])
  662. @pytest.mark.parametrize(
  663. "min_degree, max_degree", [(0, 1), (0, 2), (1, 3), (0, 4), (3, 4)]
  664. )
  665. @pytest.mark.parametrize("interaction_only", [True, False])
  666. @pytest.mark.parametrize("include_bias", [True, False])
  667. def test_num_combinations(
  668. n_features,
  669. min_degree,
  670. max_degree,
  671. interaction_only,
  672. include_bias,
  673. ):
  674. """
  675. Test that n_output_features_ is calculated correctly.
  676. """
  677. x = sparse.csr_matrix(([1], ([0], [n_features - 1])))
  678. est = PolynomialFeatures(
  679. degree=max_degree,
  680. interaction_only=interaction_only,
  681. include_bias=include_bias,
  682. )
  683. est.fit(x)
  684. num_combos = est.n_output_features_
  685. combos = PolynomialFeatures._combinations(
  686. n_features=n_features,
  687. min_degree=0,
  688. max_degree=max_degree,
  689. interaction_only=interaction_only,
  690. include_bias=include_bias,
  691. )
  692. assert num_combos == sum([1 for _ in combos])
  693. @pytest.mark.parametrize(
  694. ["deg", "include_bias", "interaction_only", "dtype"],
  695. [
  696. (2, True, False, np.float32),
  697. (2, True, False, np.float64),
  698. (3, False, False, np.float64),
  699. (3, False, True, np.float64),
  700. ],
  701. )
  702. def test_polynomial_features_csr_X_floats(deg, include_bias, interaction_only, dtype):
  703. X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr()
  704. X = X_csr.toarray()
  705. est = PolynomialFeatures(
  706. deg, include_bias=include_bias, interaction_only=interaction_only
  707. )
  708. Xt_csr = est.fit_transform(X_csr.astype(dtype))
  709. Xt_dense = est.fit_transform(X.astype(dtype))
  710. assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr"
  711. assert Xt_csr.dtype == Xt_dense.dtype
  712. assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)
  713. @pytest.mark.parametrize(
  714. ["zero_row_index", "deg", "interaction_only"],
  715. [
  716. (0, 2, True),
  717. (1, 2, True),
  718. (2, 2, True),
  719. (0, 3, True),
  720. (1, 3, True),
  721. (2, 3, True),
  722. (0, 2, False),
  723. (1, 2, False),
  724. (2, 2, False),
  725. (0, 3, False),
  726. (1, 3, False),
  727. (2, 3, False),
  728. ],
  729. )
  730. def test_polynomial_features_csr_X_zero_row(zero_row_index, deg, interaction_only):
  731. X_csr = sparse_random(3, 10, 1.0, random_state=0).tocsr()
  732. X_csr[zero_row_index, :] = 0.0
  733. X = X_csr.toarray()
  734. est = PolynomialFeatures(deg, include_bias=False, interaction_only=interaction_only)
  735. Xt_csr = est.fit_transform(X_csr)
  736. Xt_dense = est.fit_transform(X)
  737. assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr"
  738. assert Xt_csr.dtype == Xt_dense.dtype
  739. assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)
  740. # This degree should always be one more than the highest degree supported by
  741. # _csr_expansion.
  742. @pytest.mark.parametrize(
  743. ["include_bias", "interaction_only"],
  744. [(True, True), (True, False), (False, True), (False, False)],
  745. )
  746. def test_polynomial_features_csr_X_degree_4(include_bias, interaction_only):
  747. X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr()
  748. X = X_csr.toarray()
  749. est = PolynomialFeatures(
  750. 4, include_bias=include_bias, interaction_only=interaction_only
  751. )
  752. Xt_csr = est.fit_transform(X_csr)
  753. Xt_dense = est.fit_transform(X)
  754. assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr"
  755. assert Xt_csr.dtype == Xt_dense.dtype
  756. assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)
  757. @pytest.mark.parametrize(
  758. ["deg", "dim", "interaction_only"],
  759. [
  760. (2, 1, True),
  761. (2, 2, True),
  762. (3, 1, True),
  763. (3, 2, True),
  764. (3, 3, True),
  765. (2, 1, False),
  766. (2, 2, False),
  767. (3, 1, False),
  768. (3, 2, False),
  769. (3, 3, False),
  770. ],
  771. )
  772. def test_polynomial_features_csr_X_dim_edges(deg, dim, interaction_only):
  773. X_csr = sparse_random(1000, dim, 0.5, random_state=0).tocsr()
  774. X = X_csr.toarray()
  775. est = PolynomialFeatures(deg, interaction_only=interaction_only)
  776. Xt_csr = est.fit_transform(X_csr)
  777. Xt_dense = est.fit_transform(X)
  778. assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr"
  779. assert Xt_csr.dtype == Xt_dense.dtype
  780. assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)
  781. @pytest.mark.parametrize("interaction_only", [True, False])
  782. @pytest.mark.parametrize("include_bias", [True, False])
  783. def test_csr_polynomial_expansion_index_overflow_non_regression(
  784. interaction_only, include_bias
  785. ):
  786. """Check the automatic index dtype promotion to `np.int64` when needed.
  787. This ensures that sufficiently large input configurations get
  788. properly promoted to use `np.int64` for index and indptr representation
  789. while preserving data integrity. Non-regression test for gh-16803.
  790. Note that this is only possible for Python runtimes with a 64 bit address
  791. space. On 32 bit platforms, a `ValueError` is raised instead.
  792. """
  793. def degree_2_calc(d, i, j):
  794. if interaction_only:
  795. return d * i - (i**2 + 3 * i) // 2 - 1 + j
  796. else:
  797. return d * i - (i**2 + i) // 2 + j
  798. n_samples = 13
  799. n_features = 120001
  800. data_dtype = np.float32
  801. data = np.arange(1, 5, dtype=np.int64)
  802. row = np.array([n_samples - 2, n_samples - 2, n_samples - 1, n_samples - 1])
  803. # An int64 dtype is required to avoid overflow error on Windows within the
  804. # `degree_2_calc` function.
  805. col = np.array(
  806. [n_features - 2, n_features - 1, n_features - 2, n_features - 1], dtype=np.int64
  807. )
  808. X = sparse.csr_matrix(
  809. (data, (row, col)),
  810. shape=(n_samples, n_features),
  811. dtype=data_dtype,
  812. )
  813. pf = PolynomialFeatures(
  814. interaction_only=interaction_only, include_bias=include_bias, degree=2
  815. )
  816. # Calculate the number of combinations a-priori, and if needed check for
  817. # the correct ValueError and terminate the test early.
  818. num_combinations = pf._num_combinations(
  819. n_features=n_features,
  820. min_degree=0,
  821. max_degree=2,
  822. interaction_only=pf.interaction_only,
  823. include_bias=pf.include_bias,
  824. )
  825. if num_combinations > np.iinfo(np.intp).max:
  826. msg = (
  827. r"The output that would result from the current configuration would have"
  828. r" \d* features which is too large to be indexed"
  829. )
  830. with pytest.raises(ValueError, match=msg):
  831. pf.fit(X)
  832. return
  833. X_trans = pf.fit_transform(X)
  834. row_nonzero, col_nonzero = X_trans.nonzero()
  835. n_degree_1_features_out = n_features + include_bias
  836. max_degree_2_idx = (
  837. degree_2_calc(n_features, col[int(not interaction_only)], col[1])
  838. + n_degree_1_features_out
  839. )
  840. # Account for bias of all samples except last one which will be handled
  841. # separately since there are distinct data values before it
  842. data_target = [1] * (n_samples - 2) if include_bias else []
  843. col_nonzero_target = [0] * (n_samples - 2) if include_bias else []
  844. for i in range(2):
  845. x = data[2 * i]
  846. y = data[2 * i + 1]
  847. x_idx = col[2 * i]
  848. y_idx = col[2 * i + 1]
  849. if include_bias:
  850. data_target.append(1)
  851. col_nonzero_target.append(0)
  852. data_target.extend([x, y])
  853. col_nonzero_target.extend(
  854. [x_idx + int(include_bias), y_idx + int(include_bias)]
  855. )
  856. if not interaction_only:
  857. data_target.extend([x * x, x * y, y * y])
  858. col_nonzero_target.extend(
  859. [
  860. degree_2_calc(n_features, x_idx, x_idx) + n_degree_1_features_out,
  861. degree_2_calc(n_features, x_idx, y_idx) + n_degree_1_features_out,
  862. degree_2_calc(n_features, y_idx, y_idx) + n_degree_1_features_out,
  863. ]
  864. )
  865. else:
  866. data_target.extend([x * y])
  867. col_nonzero_target.append(
  868. degree_2_calc(n_features, x_idx, y_idx) + n_degree_1_features_out
  869. )
  870. nnz_per_row = int(include_bias) + 3 + 2 * int(not interaction_only)
  871. assert pf.n_output_features_ == max_degree_2_idx + 1
  872. assert X_trans.dtype == data_dtype
  873. assert X_trans.shape == (n_samples, max_degree_2_idx + 1)
  874. assert X_trans.indptr.dtype == X_trans.indices.dtype == np.int64
  875. # Ensure that dtype promotion was actually required:
  876. assert X_trans.indices.max() > np.iinfo(np.int32).max
  877. row_nonzero_target = list(range(n_samples - 2)) if include_bias else []
  878. row_nonzero_target.extend(
  879. [n_samples - 2] * nnz_per_row + [n_samples - 1] * nnz_per_row
  880. )
  881. assert_allclose(X_trans.data, data_target)
  882. assert_array_equal(row_nonzero, row_nonzero_target)
  883. assert_array_equal(col_nonzero, col_nonzero_target)
  884. @pytest.mark.parametrize(
  885. "degree, n_features",
  886. [
  887. # Needs promotion to int64 when interaction_only=False
  888. (2, 65535),
  889. (3, 2344),
  890. # This guarantees that the intermediate operation when calculating
  891. # output columns would overflow a C-long, hence checks that python-
  892. # longs are being used.
  893. (2, int(np.sqrt(np.iinfo(np.int64).max) + 1)),
  894. (3, 65535),
  895. # This case tests the second clause of the overflow check which
  896. # takes into account the value of `n_features` itself.
  897. (2, int(np.sqrt(np.iinfo(np.int64).max))),
  898. ],
  899. )
  900. @pytest.mark.parametrize("interaction_only", [True, False])
  901. @pytest.mark.parametrize("include_bias", [True, False])
  902. def test_csr_polynomial_expansion_index_overflow(
  903. degree, n_features, interaction_only, include_bias
  904. ):
  905. """Tests known edge-cases to the dtype promotion strategy and custom
  906. Cython code, including a current bug in the upstream
  907. `scipy.sparse.hstack`.
  908. """
  909. data = [1.0]
  910. row = [0]
  911. col = [n_features - 1]
  912. # First degree index
  913. expected_indices = [
  914. n_features - 1 + int(include_bias),
  915. ]
  916. # Second degree index
  917. expected_indices.append(n_features * (n_features + 1) // 2 + expected_indices[0])
  918. # Third degree index
  919. expected_indices.append(
  920. n_features * (n_features + 1) * (n_features + 2) // 6 + expected_indices[1]
  921. )
  922. X = sparse.csr_matrix((data, (row, col)))
  923. pf = PolynomialFeatures(
  924. interaction_only=interaction_only, include_bias=include_bias, degree=degree
  925. )
  926. # Calculate the number of combinations a-priori, and if needed check for
  927. # the correct ValueError and terminate the test early.
  928. num_combinations = pf._num_combinations(
  929. n_features=n_features,
  930. min_degree=0,
  931. max_degree=degree,
  932. interaction_only=pf.interaction_only,
  933. include_bias=pf.include_bias,
  934. )
  935. if num_combinations > np.iinfo(np.intp).max:
  936. msg = (
  937. r"The output that would result from the current configuration would have"
  938. r" \d* features which is too large to be indexed"
  939. )
  940. with pytest.raises(ValueError, match=msg):
  941. pf.fit(X)
  942. return
  943. # In SciPy < 1.8, a bug occurs when an intermediate matrix in
  944. # `to_stack` in `hstack` fits within int32 however would require int64 when
  945. # combined with all previous matrices in `to_stack`.
  946. if sp_version < parse_version("1.8.0"):
  947. has_bug = False
  948. max_int32 = np.iinfo(np.int32).max
  949. cumulative_size = n_features + include_bias
  950. for deg in range(2, degree + 1):
  951. max_indptr = _calc_total_nnz(X.indptr, interaction_only, deg)
  952. max_indices = _calc_expanded_nnz(n_features, interaction_only, deg) - 1
  953. cumulative_size += max_indices + 1
  954. needs_int64 = max(max_indices, max_indptr) > max_int32
  955. has_bug |= not needs_int64 and cumulative_size > max_int32
  956. if has_bug:
  957. msg = r"In scipy versions `<1.8.0`, the function `scipy.sparse.hstack`"
  958. with pytest.raises(ValueError, match=msg):
  959. X_trans = pf.fit_transform(X)
  960. return
  961. # When `n_features>=65535`, `scipy.sparse.hstack` may not use the right
  962. # dtype for representing indices and indptr if `n_features` is still
  963. # small enough so that each block matrix's indices and indptr arrays
  964. # can be represented with `np.int32`. We test `n_features==65535`
  965. # since it is guaranteed to run into this bug.
  966. if (
  967. sp_version < parse_version("1.9.2")
  968. and n_features == 65535
  969. and degree == 2
  970. and not interaction_only
  971. ): # pragma: no cover
  972. msg = r"In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`"
  973. with pytest.raises(ValueError, match=msg):
  974. X_trans = pf.fit_transform(X)
  975. return
  976. X_trans = pf.fit_transform(X)
  977. expected_dtype = np.int64 if num_combinations > np.iinfo(np.int32).max else np.int32
  978. # Terms higher than first degree
  979. non_bias_terms = 1 + (degree - 1) * int(not interaction_only)
  980. expected_nnz = int(include_bias) + non_bias_terms
  981. assert X_trans.dtype == X.dtype
  982. assert X_trans.shape == (1, pf.n_output_features_)
  983. assert X_trans.indptr.dtype == X_trans.indices.dtype == expected_dtype
  984. assert X_trans.nnz == expected_nnz
  985. if include_bias:
  986. assert X_trans[0, 0] == pytest.approx(1.0)
  987. for idx in range(non_bias_terms):
  988. assert X_trans[0, expected_indices[idx]] == pytest.approx(1.0)
  989. offset = interaction_only * n_features
  990. if degree == 3:
  991. offset *= 1 + n_features
  992. assert pf.n_output_features_ == expected_indices[degree - 1] + 1 - offset
  993. @pytest.mark.parametrize("interaction_only", [True, False])
  994. @pytest.mark.parametrize("include_bias", [True, False])
  995. def test_csr_polynomial_expansion_too_large_to_index(interaction_only, include_bias):
  996. n_features = np.iinfo(np.int64).max // 2
  997. data = [1.0]
  998. row = [0]
  999. col = [n_features - 1]
  1000. X = sparse.csr_matrix((data, (row, col)))
  1001. pf = PolynomialFeatures(
  1002. interaction_only=interaction_only, include_bias=include_bias, degree=(2, 2)
  1003. )
  1004. msg = (
  1005. r"The output that would result from the current configuration would have \d*"
  1006. r" features which is too large to be indexed"
  1007. )
  1008. with pytest.raises(ValueError, match=msg):
  1009. pf.fit(X)
  1010. with pytest.raises(ValueError, match=msg):
  1011. pf.fit_transform(X)
  1012. def test_polynomial_features_behaviour_on_zero_degree():
  1013. """Check that PolynomialFeatures raises error when degree=0 and include_bias=False,
  1014. and output a single constant column when include_bias=True
  1015. """
  1016. X = np.ones((10, 2))
  1017. poly = PolynomialFeatures(degree=0, include_bias=False)
  1018. err_msg = (
  1019. "Setting degree to zero and include_bias to False would result in"
  1020. " an empty output array."
  1021. )
  1022. with pytest.raises(ValueError, match=err_msg):
  1023. poly.fit_transform(X)
  1024. poly = PolynomialFeatures(degree=(0, 0), include_bias=False)
  1025. err_msg = (
  1026. "Setting both min_degree and max_degree to zero and include_bias to"
  1027. " False would result in an empty output array."
  1028. )
  1029. with pytest.raises(ValueError, match=err_msg):
  1030. poly.fit_transform(X)
  1031. for _X in [X, sparse.csr_matrix(X), sparse.csc_matrix(X)]:
  1032. poly = PolynomialFeatures(degree=0, include_bias=True)
  1033. output = poly.fit_transform(_X)
  1034. # convert to dense array if needed
  1035. if sparse.issparse(output):
  1036. output = output.toarray()
  1037. assert_array_equal(output, np.ones((X.shape[0], 1)))
  1038. def test_sizeof_LARGEST_INT_t():
  1039. # On Windows, scikit-learn is typically compiled with MSVC that
  1040. # does not support int128 arithmetic (at the time of writing):
  1041. # https://stackoverflow.com/a/6761962/163740
  1042. if sys.platform == "win32" or (
  1043. sys.maxsize <= 2**32 and sys.platform != "emscripten"
  1044. ):
  1045. expected_size = 8
  1046. else:
  1047. expected_size = 16
  1048. assert _get_sizeof_LARGEST_INT_t() == expected_size
  1049. @pytest.mark.xfail(
  1050. sys.platform == "win32",
  1051. reason=(
  1052. "On Windows, scikit-learn is typically compiled with MSVC that does not support"
  1053. " int128 arithmetic (at the time of writing)"
  1054. ),
  1055. run=True,
  1056. )
  1057. def test_csr_polynomial_expansion_windows_fail():
  1058. # Minimum needed to ensure integer overflow occurs while guaranteeing an
  1059. # int64-indexable output.
  1060. n_features = int(np.iinfo(np.int64).max ** (1 / 3) + 3)
  1061. data = [1.0]
  1062. row = [0]
  1063. col = [n_features - 1]
  1064. # First degree index
  1065. expected_indices = [
  1066. n_features - 1,
  1067. ]
  1068. # Second degree index
  1069. expected_indices.append(
  1070. int(n_features * (n_features + 1) // 2 + expected_indices[0])
  1071. )
  1072. # Third degree index
  1073. expected_indices.append(
  1074. int(n_features * (n_features + 1) * (n_features + 2) // 6 + expected_indices[1])
  1075. )
  1076. X = sparse.csr_matrix((data, (row, col)))
  1077. pf = PolynomialFeatures(interaction_only=False, include_bias=False, degree=3)
  1078. if sys.maxsize <= 2**32:
  1079. msg = (
  1080. r"The output that would result from the current configuration would"
  1081. r" have \d*"
  1082. r" features which is too large to be indexed"
  1083. )
  1084. with pytest.raises(ValueError, match=msg):
  1085. pf.fit_transform(X)
  1086. else:
  1087. X_trans = pf.fit_transform(X)
  1088. for idx in range(3):
  1089. assert X_trans[0, expected_indices[idx]] == pytest.approx(1.0)