import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import (
    KFold,
    ShuffleSplit,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    KBinsDiscretizer,
    LabelEncoder,
    TargetEncoder,
)


def _encode_target(X_ordinal, y_int, n_categories, smooth):
    """Simple Python implementation of target encoding."""
    cur_encodings = np.zeros(n_categories, dtype=np.float64)
    y_mean = np.mean(y_int)

    if smooth == "auto":
        y_variance = np.var(y_int)
        for c in range(n_categories):
            y_subset = y_int[X_ordinal == c]
            n_i = y_subset.shape[0]

            if n_i == 0:
                cur_encodings[c] = y_mean
                continue

            y_subset_variance = np.var(y_subset)
            m = y_subset_variance / y_variance
            lambda_ = n_i / (n_i + m)

            cur_encodings[c] = lambda_ * np.mean(y_subset) + (1 - lambda_) * y_mean
        return cur_encodings
    else:  # float
        for c in range(n_categories):
            y_subset = y_int[X_ordinal == c]

            current_sum = np.sum(y_subset) + y_mean * smooth
            current_cnt = y_subset.shape[0] + smooth

            cur_encodings[c] = current_sum / current_cnt
        return cur_encodings
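

# The float-smoothing branch above implements a simple pseudo-count shrinkage,
# (sum(y_subset) + y_mean * smooth) / (n_subset + smooth), while the "auto"
# branch shrinks each category mean towards the global mean with
# lambda = n_i / (n_i + var(y_subset) / var(y)). The hand-computed check below
# is a minimal illustrative sketch of the float branch; the data are chosen
# only to keep the arithmetic easy to follow.
def test_encode_target_reference_example():
    X_ordinal = np.array([0, 0, 0, 0, 1])
    y = np.array([0.0, 1.0, 1.0, 0.0, 1.0])  # global mean is 0.6
    encodings = _encode_target(X_ordinal, y, n_categories=2, smooth=2.0)
    # category 0: (0 + 1 + 1 + 0 + 0.6 * 2) / (4 + 2)
    # category 1: (1 + 0.6 * 2) / (1 + 2)
    assert_allclose(encodings, [3.2 / 6, 2.2 / 3])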


@pytest.mark.parametrize(
    "categories, unknown_value",
    [
        ([np.array([0, 1, 2], dtype=np.int64)], 4),
        ([np.array([1.0, 3.0, np.nan], dtype=np.float64)], 6.0),
        ([np.array(["cat", "dog", "snake"], dtype=object)], "bear"),
        ("auto", 3),
    ],
)
@pytest.mark.parametrize("smooth", [5.0, "auto"])
@pytest.mark.parametrize("target_type", ["binary", "continuous"])
def test_encoding(categories, unknown_value, global_random_seed, smooth, target_type):
    """Check encoding for binary and continuous targets.

    Compare the values returned by `TargetEncoder.fit_transform` against the
    expected encodings for cv splits from a naive reference Python
    implementation in _encode_target.
    """
    n_categories = 3
    X_train_int_array = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=np.int64).T
    X_test_int_array = np.array([[0, 1, 2]], dtype=np.int64).T
    n_samples = X_train_int_array.shape[0]

    if categories == "auto":
        X_train = X_train_int_array
        X_test = X_test_int_array
    else:
        X_train = categories[0][X_train_int_array]
        X_test = categories[0][X_test_int_array]

    X_test = np.concatenate((X_test, [[unknown_value]]))

    data_rng = np.random.RandomState(global_random_seed)
    n_splits = 3
    if target_type == "binary":
        y_int = data_rng.randint(low=0, high=2, size=n_samples)
        target_names = np.array(["cat", "dog"], dtype=object)
        y_train = target_names[y_int]
    else:  # target_type == continuous
        y_int = data_rng.uniform(low=-10, high=20, size=n_samples)
        y_train = y_int

    shuffled_idx = data_rng.permutation(n_samples)
    X_train_int_array = X_train_int_array[shuffled_idx]
    X_train = X_train[shuffled_idx]
    y_train = y_train[shuffled_idx]
    y_int = y_int[shuffled_idx]

    # Define our CV splitting strategy
    if target_type == "binary":
        cv = StratifiedKFold(
            n_splits=n_splits, random_state=global_random_seed, shuffle=True
        )
    else:
        cv = KFold(n_splits=n_splits, random_state=global_random_seed, shuffle=True)

    # Compute the expected values using our reference Python implementation of
    # target encoding:
    expected_X_fit_transform = np.empty_like(X_train_int_array, dtype=np.float64)

    for train_idx, test_idx in cv.split(X_train_int_array, y_train):
        X_, y_ = X_train_int_array[train_idx, 0], y_int[train_idx]
        cur_encodings = _encode_target(X_, y_, n_categories, smooth)
        expected_X_fit_transform[test_idx, 0] = cur_encodings[
            X_train_int_array[test_idx, 0]
        ]

    # Check that we can obtain the same encodings by calling `fit_transform` on
    # the estimator with the same CV parameters:
    target_encoder = TargetEncoder(
        smooth=smooth,
        categories=categories,
        cv=n_splits,
        random_state=global_random_seed,
    )
    X_fit_transform = target_encoder.fit_transform(X_train, y_train)

    assert target_encoder.target_type_ == target_type
    assert_allclose(X_fit_transform, expected_X_fit_transform)
    assert len(target_encoder.encodings_) == 1

    # compute encodings for all data to validate `transform`
    y_mean = np.mean(y_int)
    expected_encodings = _encode_target(
        X_train_int_array[:, 0], y_int, n_categories, smooth
    )
    assert_allclose(target_encoder.encodings_[0], expected_encodings)
    assert target_encoder.target_mean_ == pytest.approx(y_mean)

    # Transform on test data; the last value is unknown, so it is encoded as
    # the target mean
    expected_X_test_transform = np.concatenate(
        (expected_encodings, np.array([y_mean]))
    ).reshape(-1, 1)

    X_test_transform = target_encoder.transform(X_test)
    assert_allclose(X_test_transform, expected_X_test_transform)
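

# The cross fitting checked above means that, on training data, the output of
# `fit_transform` (per-fold encodings) generally differs from calling
# `transform` on the same data afterwards (encodings learned on the full
# training set). A minimal sketch of that contrast, assuming the data are
# large enough for the fold-wise category means to differ from the full-data
# ones:
def test_fit_transform_differs_from_transform_on_training_data():
    rng = np.random.RandomState(0)
    X = rng.randint(0, 3, size=(60, 1))
    y = rng.normal(size=60)

    enc = TargetEncoder(smooth=5.0, cv=3, random_state=0)
    X_cross_fitted = enc.fit_transform(X, y)
    X_full_fit = enc.transform(X)  # uses `encodings_` computed on all of X, y
    assert not np.allclose(X_cross_fitted, X_full_fit)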


@pytest.mark.parametrize(
    "X, categories",
    [
        (
            np.array([[0] * 10 + [1] * 10 + [3]], dtype=np.int64).T,  # 3 is unknown
            [[0, 1, 2]],
        ),
        (
            np.array(
                [["cat"] * 10 + ["dog"] * 10 + ["snake"]], dtype=object
            ).T,  # snake is unknown
            [["dog", "cat", "cow"]],
        ),
    ],
)
@pytest.mark.parametrize("smooth", [4.0, "auto"])
def test_custom_categories(X, categories, smooth):
    """Custom categories with unknown categories that are not in training data."""
    rng = np.random.RandomState(0)
    y = rng.uniform(low=-10, high=20, size=X.shape[0])
    enc = TargetEncoder(categories=categories, smooth=smooth, random_state=0).fit(X, y)

    # The last element is unknown and encoded as the mean
    y_mean = y.mean()
    X_trans = enc.transform(X[-1:])
    assert X_trans[0, 0] == pytest.approx(y_mean)

    assert len(enc.encodings_) == 1
    # custom category that is not in training data
    assert enc.encodings_[0][-1] == pytest.approx(y_mean)


@pytest.mark.parametrize(
    "y, msg",
    [
        ([1, 2, 0, 1], "Found input variables with inconsistent"),
        (
            np.array([[1, 2, 0], [1, 2, 3]]).T,
            "Target type was inferred to be 'multiclass-multioutput'",
        ),
        (["cat", "dog", "bear"], "Target type was inferred to be 'multiclass'"),
    ],
)
def test_errors(y, msg):
    """Check that invalid input raises the expected error."""
    X = np.array([[1, 0, 1]]).T
    enc = TargetEncoder()

    with pytest.raises(ValueError, match=msg):
        enc.fit_transform(X, y)


def test_use_regression_target():
    """Custom target_type to avoid inferring the target type."""
    X = np.array([[0, 1, 0, 1, 0, 1]]).T
    # XXX: When multiclass is supported, the following `y` will be considered a
    # multiclass problem and `TargetEncoder` will not error; type_of_target
    # would be 'multiclass'.
    y = np.array([1.0, 2.0, 3.0, 2.0, 3.0, 4.0])

    enc = TargetEncoder()
    msg = "Target type was inferred to be 'multiclass'"
    with pytest.raises(ValueError, match=msg):
        enc.fit_transform(X, y)

    enc = TargetEncoder(target_type="continuous")
    enc.fit_transform(X, y)
    assert enc.target_type_ == "continuous"


def test_feature_names_out_set_output():
    """Check TargetEncoder works with set_output."""
    pd = pytest.importorskip("pandas")

    X_df = pd.DataFrame({"A": ["a", "b"] * 10, "B": [1, 2] * 10})
    y = [1, 2] * 10

    enc_default = TargetEncoder(cv=2, smooth=3.0, random_state=0)
    enc_default.set_output(transform="default")
    enc_pandas = TargetEncoder(cv=2, smooth=3.0, random_state=0)
    enc_pandas.set_output(transform="pandas")

    X_default = enc_default.fit_transform(X_df, y)
    X_pandas = enc_pandas.fit_transform(X_df, y)

    assert_allclose(X_pandas.to_numpy(), X_default)
    assert_array_equal(enc_pandas.get_feature_names_out(), ["A", "B"])
    assert_array_equal(enc_pandas.get_feature_names_out(), X_pandas.columns)


@pytest.mark.parametrize("to_pandas", [True, False])
@pytest.mark.parametrize("smooth", [1.0, "auto"])
@pytest.mark.parametrize("target_type", ["binary-ints", "binary-str", "continuous"])
def test_multiple_features_quick(to_pandas, smooth, target_type):
    """Check target encoder with multiple features."""
    X_ordinal = np.array(
        [[1, 1], [0, 1], [1, 1], [2, 1], [1, 0], [0, 1], [1, 0], [0, 0]], dtype=np.int64
    )
    if target_type == "binary-str":
        y_train = np.array(["a", "b", "a", "a", "b", "b", "a", "b"])
        y_integer = LabelEncoder().fit_transform(y_train)
        cv = StratifiedKFold(2, random_state=0, shuffle=True)
    elif target_type == "binary-ints":
        y_train = np.array([3, 4, 3, 3, 3, 4, 4, 4])
        y_integer = LabelEncoder().fit_transform(y_train)
        cv = StratifiedKFold(2, random_state=0, shuffle=True)
    else:
        y_train = np.array([3.0, 5.1, 2.4, 3.5, 4.1, 5.5, 10.3, 7.3], dtype=np.float32)
        y_integer = y_train
        cv = KFold(2, random_state=0, shuffle=True)
    y_mean = np.mean(y_integer)
    categories = [[0, 1, 2], [0, 1]]

    X_test = np.array(
        [
            [0, 1],
            [3, 0],  # 3 is unknown
            [1, 10],  # 10 is unknown
        ],
        dtype=np.int64,
    )

    if to_pandas:
        pd = pytest.importorskip("pandas")
        # convert second feature to an object
        X_train = pd.DataFrame(
            {
                "feat0": X_ordinal[:, 0],
                "feat1": np.array(["cat", "dog"], dtype=object)[X_ordinal[:, 1]],
            }
        )
        # "snake" is unknown
        X_test = pd.DataFrame({"feat0": X_test[:, 0], "feat1": ["dog", "cat", "snake"]})
    else:
        X_train = X_ordinal

    # manually compute encoding for fit_transform
    expected_X_fit_transform = np.empty_like(X_ordinal, dtype=np.float64)
    for f_idx, cats in enumerate(categories):
        for train_idx, test_idx in cv.split(X_ordinal, y_integer):
            X_, y_ = X_ordinal[train_idx, f_idx], y_integer[train_idx]
            current_encoding = _encode_target(X_, y_, len(cats), smooth)
            expected_X_fit_transform[test_idx, f_idx] = current_encoding[
                X_ordinal[test_idx, f_idx]
            ]

    # manually compute encoding for transform
    expected_encodings = []
    for f_idx, cats in enumerate(categories):
        current_encoding = _encode_target(
            X_ordinal[:, f_idx], y_integer, len(cats), smooth
        )
        expected_encodings.append(current_encoding)

    expected_X_test_transform = np.array(
        [
            [expected_encodings[0][0], expected_encodings[1][1]],
            [y_mean, expected_encodings[1][0]],
            [expected_encodings[0][1], y_mean],
        ],
        dtype=np.float64,
    )

    enc = TargetEncoder(smooth=smooth, cv=2, random_state=0)
    X_fit_transform = enc.fit_transform(X_train, y_train)
    assert_allclose(X_fit_transform, expected_X_fit_transform)

    assert len(enc.encodings_) == 2
    for i in range(2):
        assert_allclose(enc.encodings_[i], expected_encodings[i])

    X_test_transform = enc.transform(X_test)
    assert_allclose(X_test_transform, expected_X_test_transform)


@pytest.mark.parametrize(
    "y, y_mean",
    [
        (np.array([3.4] * 20), 3.4),
        (np.array([0] * 20), 0),
        (np.array(["a"] * 20, dtype=object), 0),
    ],
    ids=["continuous", "binary", "binary-string"],
)
@pytest.mark.parametrize("smooth", ["auto", 4.0, 0.0])
def test_constant_target_and_feature(y, y_mean, smooth):
    """Check edge case where the feature and the target are constant."""
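    # With a constant target, the variance ratio in the "auto" smoothing
    # formula of `_encode_target` is 0 / 0, so the reference implementation
    # above is not applicable here; the encoder is expected to fall back to the
    # global target mean, which is what the assertions below check.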
    X = np.array([[1] * 20]).T
    n_samples = X.shape[0]

    enc = TargetEncoder(cv=2, smooth=smooth, random_state=0)
    X_trans = enc.fit_transform(X, y)
    assert_allclose(X_trans, np.repeat([[y_mean]], n_samples, axis=0))
    assert enc.encodings_[0][0] == pytest.approx(y_mean)
    assert enc.target_mean_ == pytest.approx(y_mean)

    X_test = np.array([[1], [0]])
    X_test_trans = enc.transform(X_test)
    assert_allclose(X_test_trans, np.repeat([[y_mean]], 2, axis=0))


def test_fit_transform_not_associated_with_y_if_ordinal_categorical_is_not(
    global_random_seed,
):
    cardinality = 30  # not too large, otherwise we need a very large n_samples
    n_samples = 3000
    rng = np.random.RandomState(global_random_seed)
    y_train = rng.normal(size=n_samples)
    X_train = rng.randint(0, cardinality, size=n_samples).reshape(-1, 1)

    # Sort by y_train to attempt to cause a leak
    y_sorted_indices = y_train.argsort()
    y_train = y_train[y_sorted_indices]
    X_train = X_train[y_sorted_indices]

    target_encoder = TargetEncoder(shuffle=True, random_state=global_random_seed)
    X_encoded_train_shuffled = target_encoder.fit_transform(X_train, y_train)

    target_encoder = TargetEncoder(shuffle=False)
    X_encoded_train_no_shuffled = target_encoder.fit_transform(X_train, y_train)

    # Check that no information about y_train has leaked into X_train:
    regressor = RandomForestRegressor(
        n_estimators=10, min_samples_leaf=20, random_state=global_random_seed
    )

    # It's impossible to learn a good predictive model on the training set when
    # using the original representation X_train or the target encoded
    # representation with shuffled inner CV. For the latter, no information
    # about y_train has inadvertently leaked into the prior used to generate
    # `X_encoded_train_shuffled`:
    cv = ShuffleSplit(n_splits=50, random_state=global_random_seed)
    assert cross_val_score(regressor, X_train, y_train, cv=cv).mean() < 0.1
    assert (
        cross_val_score(regressor, X_encoded_train_shuffled, y_train, cv=cv).mean()
        < 0.1
    )

    # Without the inner CV shuffling, a lot of information about y_train goes
    # into the per-fold y_train.mean() priors: shrinkage is no longer effective
    # in this case and would no longer be able to prevent downstream
    # over-fitting.
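    # Concretely, because the rows are sorted by y_train, an unshuffled KFold
    # assigns contiguous blocks of low (respectively high) target values to
    # each fold, so the per-fold priors, and hence the out-of-fold encodings,
    # vary systematically with the fold's position in the sorted target, which
    # a downstream model can exploit.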
    assert (
        cross_val_score(regressor, X_encoded_train_no_shuffled, y_train, cv=cv).mean()
        > 0.5
    )


def test_smooth_zero():
    """Check the edge case with zero smoothing where a cv split does not contain a category."""
    X = np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]).T
    y = np.array([2.1, 4.3, 1.2, 3.1, 1.0, 9.0, 10.3, 14.2, 13.3, 15.0])

    enc = TargetEncoder(smooth=0.0, shuffle=False, cv=2)
    X_trans = enc.fit_transform(X, y)

    # With cv = 2, category 0 does not exist in the second half, thus
    # it will be encoded as the mean of the second half
    assert_allclose(X_trans[0], np.mean(y[5:]))

    # category 1 does not exist in the first half, thus it will be encoded as
    # the mean of the first half
    assert_allclose(X_trans[-1], np.mean(y[:5]))


@pytest.mark.parametrize("smooth", [0.0, 1e3, "auto"])
def test_invariance_of_encoding_under_label_permutation(smooth, global_random_seed):
    # Check that the encoding does not depend on the particular integer values
    # used to represent the categories. This is quite a trivial property but it
    # is helpful to understand the following test.
    rng = np.random.RandomState(global_random_seed)

    # Random y and informative categorical X to make the test non-trivial when
    # using smoothing.
    y = rng.normal(size=1000)
    n_categories = 30
    X = KBinsDiscretizer(n_bins=n_categories, encode="ordinal").fit_transform(
        y.reshape(-1, 1)
    )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=global_random_seed
    )

    # Shuffle the labels to make sure that the encoding is invariant to the
    # permutation of the labels
    permutated_labels = rng.permutation(n_categories)
    X_train_permuted = permutated_labels[X_train.astype(np.int32)]
    X_test_permuted = permutated_labels[X_test.astype(np.int32)]

    target_encoder = TargetEncoder(smooth=smooth, random_state=global_random_seed)
    X_train_encoded = target_encoder.fit_transform(X_train, y_train)
    X_test_encoded = target_encoder.transform(X_test)

    X_train_permuted_encoded = target_encoder.fit_transform(X_train_permuted, y_train)
    X_test_permuted_encoded = target_encoder.transform(X_test_permuted)

    assert_allclose(X_train_encoded, X_train_permuted_encoded)
    assert_allclose(X_test_encoded, X_test_permuted_encoded)


# TODO(1.5) remove warning filter when kbd's subsample default is changed
@pytest.mark.filterwarnings("ignore:In version 1.5 onwards, subsample=200_000")
@pytest.mark.parametrize("smooth", [0.0, "auto"])
def test_target_encoding_for_linear_regression(smooth, global_random_seed):
    # Check some expected statistical properties when fitting a linear
    # regression model on target encoded features depending on their relation
    # with that target.

    # In this test, we use the Ridge class with the "lsqr" solver and a little
    # bit of regularization to implement a linear regression model that
    # converges quickly for large `n_samples` and robustly in case of
    # correlated features. Since we will fit this model on a mean centered
    # target, we do not need to fit an intercept and this will help simplify
    # the analysis with respect to the expected coefficients.
    linear_regression = Ridge(alpha=1e-6, solver="lsqr", fit_intercept=False)

    # Construct a random target variable. We need a large number of samples for
    # this test to be stable across all values of the random seed.
    n_samples = 50_000
    rng = np.random.RandomState(global_random_seed)
    y = rng.randn(n_samples)

    # Generate a single informative ordinal feature with medium cardinality.
    # Inject some irreducible noise to make it harder for a multivariate model
    # to identify the informative feature from other pure noise features.
    noise = 0.8 * rng.randn(n_samples)
    n_categories = 100
    X_informative = KBinsDiscretizer(
        n_bins=n_categories,
        encode="ordinal",
        strategy="uniform",
        random_state=rng,
    ).fit_transform((y + noise).reshape(-1, 1))

    # Let's permute the labels to hide the fact that this feature is
    # informative to a naive linear regression model trained on the raw ordinal
    # values. As highlighted in the previous test, the target encoding should
    # be invariant to such a permutation.
    permutated_labels = rng.permutation(n_categories)
    X_informative = permutated_labels[X_informative.astype(np.int32)]

    # Generate a shuffled copy of the informative feature to destroy the
    # relationship with the target.
    X_shuffled = rng.permutation(X_informative)

    # Also include a very high cardinality categorical feature that is by
    # itself independent of the target variable: target encoding such a feature
    # without internal cross-validation should cause catastrophic overfitting
    # for the downstream regressor, even with shrinkage. This kind of feature
    # typically represents a near unique identifier of the samples. In general
    # such features should be removed from machine learning datasets, but here
    # we want to study the ability of the default behavior of TargetEncoder to
    # mitigate them automatically.
    X_near_unique_categories = rng.choice(
        int(0.9 * n_samples), size=n_samples, replace=True
    ).reshape(-1, 1)

    # Assemble the dataset and do a train-test split:
    X = np.concatenate(
        [X_informative, X_shuffled, X_near_unique_categories],
        axis=1,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Let's first check that a linear regression model trained on the raw
    # features underfits because of the meaningless ordinal encoding of the
    # labels.
    raw_model = linear_regression.fit(X_train, y_train)
    assert raw_model.score(X_train, y_train) < 0.1
    assert raw_model.score(X_test, y_test) < 0.1

    # Now do the same with target encoding using the internal CV mechanism
    # implemented when using fit_transform.
    model_with_cv = make_pipeline(
        TargetEncoder(smooth=smooth, random_state=rng), linear_regression
    ).fit(X_train, y_train)

    # This model should be able to fit the data well and also generalise to the
    # test data (assuming that the binning is fine-grained enough). The R2
    # scores are not perfect because of the noise injected during the
    # generation of the unique informative feature.
    coef = model_with_cv[-1].coef_
    assert model_with_cv.score(X_train, y_train) > 0.5, coef
    assert model_with_cv.score(X_test, y_test) > 0.5, coef

    # The target encoder recovers the linear relationship with slope 1 between
    # the target encoded unique informative predictor and the target. Since the
    # target encoding of the 2 other features is not informative thanks to the
    # use of internal cross-validation, the multivariate linear regressor
    # assigns a coef of 1 to the first feature and 0 to the other 2.
    assert coef[0] == pytest.approx(1, abs=1e-2)
    assert (np.abs(coef[1:]) < 0.2).all()

    # Let's now disable the internal cross-validation by calling fit and then
    # transform separately on the training set:
    target_encoder = TargetEncoder(smooth=smooth, random_state=rng).fit(
        X_train, y_train
    )
    X_enc_no_cv_train = target_encoder.transform(X_train)
    X_enc_no_cv_test = target_encoder.transform(X_test)
    model_no_cv = linear_regression.fit(X_enc_no_cv_train, y_train)

    # The linear regression model should always overfit because it assigns
    # too much weight to the extremely high cardinality feature relative to
    # the informative feature. Note that this is the case even when using
    # the empirical Bayes smoothing, which is not enough to prevent such
    # overfitting alone.
    coef = model_no_cv.coef_
    assert model_no_cv.score(X_enc_no_cv_train, y_train) > 0.7, coef
    assert model_no_cv.score(X_enc_no_cv_test, y_test) < 0.5, coef

    # The model overfits because it assigns too much weight to the high
    # cardinality yet non-informative feature instead of the lower
    # cardinality yet informative feature:
    assert abs(coef[0]) < abs(coef[2])