# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
#         Fabian Pedregosa <fabian.pedregosa@inria.fr>
#         Maria Telenczuk <https://github.com/maikia>
#
# License: BSD 3 clause

import warnings

import numpy as np
import pytest
from scipy import linalg, sparse

from sklearn.datasets import load_iris, make_regression, make_sparse_uncorrelated
from sklearn.linear_model import LinearRegression
from sklearn.linear_model._base import (
    _deprecate_normalize,
    _preprocess_data,
    _rescale_data,
    make_dataset,
)
from sklearn.preprocessing import StandardScaler, add_dummy_feature
from sklearn.utils._testing import (
    assert_allclose,
    assert_array_almost_equal,
    assert_array_equal,
)

rtol = 1e-6


def test_linear_regression():
    # Test LinearRegression on a simple dataset.
    # a simple dataset
    X = [[1], [2]]
    Y = [1, 2]

    reg = LinearRegression()
    reg.fit(X, Y)

    assert_array_almost_equal(reg.coef_, [1])
    assert_array_almost_equal(reg.intercept_, [0])
    assert_array_almost_equal(reg.predict(X), [1, 2])

    # test it also for degenerate input
    X = [[1]]
    Y = [0]

    reg = LinearRegression()
    reg.fit(X, Y)

    assert_array_almost_equal(reg.coef_, [0])
    assert_array_almost_equal(reg.intercept_, [0])
    assert_array_almost_equal(reg.predict(X), [0])


@pytest.mark.parametrize("array_constr", [np.array, sparse.csr_matrix])
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_linear_regression_sample_weights(
    array_constr, fit_intercept, global_random_seed
):
    rng = np.random.RandomState(global_random_seed)

    # It would not work with under-determined systems
    n_samples, n_features = 6, 5

    X = array_constr(rng.normal(size=(n_samples, n_features)))
    y = rng.normal(size=n_samples)

    sample_weight = 1.0 + rng.uniform(size=n_samples)
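    # The weights above lie in [1, 2), bounded away from zero, so the diagonal
    # matrix W used in the closed form below is guaranteed to be invertible.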

    # LinearRegression with explicit sample_weight
    reg = LinearRegression(fit_intercept=fit_intercept)
    reg.fit(X, y, sample_weight=sample_weight)
    coefs1 = reg.coef_
    inter1 = reg.intercept_

    assert reg.coef_.shape == (X.shape[1],)  # sanity checks

    # Closed form of the weighted least square
    # theta = (X^T W X)^(-1) @ X^T W y
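    # When fit_intercept=True, add_dummy_feature prepends a column of ones to
    # X so the intercept is estimated jointly as the first entry of theta.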
    W = np.diag(sample_weight)
    X_aug = X if not fit_intercept else add_dummy_feature(X)

    Xw = X_aug.T @ W @ X_aug
    yw = X_aug.T @ W @ y
    coefs2 = linalg.solve(Xw, yw)

    if not fit_intercept:
        assert_allclose(coefs1, coefs2)
    else:
        assert_allclose(coefs1, coefs2[1:])
        assert_allclose(inter1, coefs2[0])


def test_raises_value_error_if_positive_and_sparse():
    error_msg = "A sparse matrix was passed, but dense data is required."
    # X must not be sparse if positive == True
    X = sparse.eye(10)
    y = np.ones(10)

    reg = LinearRegression(positive=True)

    with pytest.raises(TypeError, match=error_msg):
        reg.fit(X, y)


@pytest.mark.parametrize("n_samples, n_features", [(2, 3), (3, 2)])
def test_raises_value_error_if_sample_weights_greater_than_1d(n_samples, n_features):
    # Sample weights must be either scalar or 1D
    rng = np.random.RandomState(0)

    X = rng.randn(n_samples, n_features)
    y = rng.randn(n_samples)

    sample_weights_OK = rng.randn(n_samples) ** 2 + 1
    sample_weights_OK_1 = 1.0
    sample_weights_OK_2 = 2.0

    reg = LinearRegression()

    # make sure the "OK" sample weights actually work
    reg.fit(X, y, sample_weights_OK)
    reg.fit(X, y, sample_weights_OK_1)
    reg.fit(X, y, sample_weights_OK_2)


def test_fit_intercept():
    # Test assertions on betas shape.
    X2 = np.array([[0.38349978, 0.61650022], [0.58853682, 0.41146318]])
    X3 = np.array(
        [[0.27677969, 0.70693172, 0.01628859], [0.08385139, 0.20692515, 0.70922346]]
    )
    y = np.array([1, 1])

    lr2_without_intercept = LinearRegression(fit_intercept=False).fit(X2, y)
    lr2_with_intercept = LinearRegression().fit(X2, y)
    lr3_without_intercept = LinearRegression(fit_intercept=False).fit(X3, y)
    lr3_with_intercept = LinearRegression().fit(X3, y)

    assert lr2_with_intercept.coef_.shape == lr2_without_intercept.coef_.shape
    assert lr3_with_intercept.coef_.shape == lr3_without_intercept.coef_.shape
    assert lr2_without_intercept.coef_.ndim == lr3_without_intercept.coef_.ndim


def test_error_on_wrong_normalize():
    normalize = "wrong"
    error_msg = "Leave 'normalize' to its default"
    with pytest.raises(ValueError, match=error_msg):
        _deprecate_normalize(normalize, "estimator")


# TODO(1.4): remove
@pytest.mark.parametrize("normalize", [True, False, "deprecated"])
def test_deprecate_normalize(normalize):
    # test all possible cases of the normalize parameter deprecation
    if normalize == "deprecated":
        # no warning
        output = False
        expected = None
        warning_msg = []
    else:
        output = normalize
        expected = FutureWarning
        warning_msg = ["1.4"]
        if not normalize:
            warning_msg.append("default value")
        else:
            warning_msg.append("StandardScaler(")
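
    # normalize=False warns about the changing default, while normalize=True
    # additionally points users to StandardScaler as the replacement; both
    # messages mention the 1.4 removal.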
    if expected is None:
        with warnings.catch_warnings():
            warnings.simplefilter("error", FutureWarning)
            _normalize = _deprecate_normalize(normalize, "estimator")
    else:
        with pytest.warns(expected) as record:
            _normalize = _deprecate_normalize(normalize, "estimator")
        assert all(warning in str(record[0].message) for warning in warning_msg)

    assert _normalize == output


def test_linear_regression_sparse(global_random_seed):
    # Test that linear regression also works with sparse data
    rng = np.random.RandomState(global_random_seed)
    n = 100
    X = sparse.eye(n, n)
    beta = rng.rand(n)
    y = X @ beta

    ols = LinearRegression()
    ols.fit(X, y.ravel())
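
    # Since X is the identity, the prediction for sample i is
    # coef_[i] + intercept_, which must reproduce beta[i].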
    assert_array_almost_equal(beta, ols.coef_ + ols.intercept_)
    assert_array_almost_equal(ols.predict(X) - y.ravel(), 0)


@pytest.mark.parametrize("fit_intercept", [True, False])
def test_linear_regression_sparse_equal_dense(fit_intercept):
    # Test that linear regression agrees between sparse and dense
    rng = np.random.RandomState(0)
    n_samples = 200
    n_features = 2
    X = rng.randn(n_samples, n_features)
    X[X < 0.1] = 0.0
    Xcsr = sparse.csr_matrix(X)
    y = rng.rand(n_samples)
    params = dict(fit_intercept=fit_intercept)
    clf_dense = LinearRegression(**params)
    clf_sparse = LinearRegression(**params)
    clf_dense.fit(X, y)
    clf_sparse.fit(Xcsr, y)

    assert clf_dense.intercept_ == pytest.approx(clf_sparse.intercept_)
    assert_allclose(clf_dense.coef_, clf_sparse.coef_)


def test_linear_regression_multiple_outcome():
    # Test multiple-outcome linear regressions
    rng = np.random.RandomState(0)
    X, y = make_regression(random_state=rng)

    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    reg = LinearRegression()
    reg.fit(X, Y)
    assert reg.coef_.shape == (2, n_features)
    Y_pred = reg.predict(X)
    reg.fit(X, y)
    y_pred = reg.predict(X)
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)


def test_linear_regression_sparse_multiple_outcome(global_random_seed):
    # Test multiple-outcome linear regressions with sparse data
    rng = np.random.RandomState(global_random_seed)
    X, y = make_sparse_uncorrelated(random_state=rng)
    X = sparse.coo_matrix(X)
    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    ols = LinearRegression()
    ols.fit(X, Y)
    assert ols.coef_.shape == (2, n_features)
    Y_pred = ols.predict(X)

    ols.fit(X, y.ravel())
    y_pred = ols.predict(X)
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)


def test_linear_regression_positive():
    # Test nonnegative LinearRegression on a simple dataset.
    X = [[1], [2]]
    y = [1, 2]

    reg = LinearRegression(positive=True)
    reg.fit(X, y)

    assert_array_almost_equal(reg.coef_, [1])
    assert_array_almost_equal(reg.intercept_, [0])
    assert_array_almost_equal(reg.predict(X), [1, 2])

    # test it also for degenerate input
    X = [[1]]
    y = [0]

    reg = LinearRegression(positive=True)
    reg.fit(X, y)

    assert_allclose(reg.coef_, [0])
    assert_allclose(reg.intercept_, [0])
    assert_allclose(reg.predict(X), [0])


def test_linear_regression_positive_multiple_outcome(global_random_seed):
    # Test multiple-outcome nonnegative linear regressions
    rng = np.random.RandomState(global_random_seed)
    X, y = make_sparse_uncorrelated(random_state=rng)
    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    ols = LinearRegression(positive=True)
    ols.fit(X, Y)
    assert ols.coef_.shape == (2, n_features)
    assert np.all(ols.coef_ >= 0.0)
    Y_pred = ols.predict(X)

    ols.fit(X, y.ravel())
    y_pred = ols.predict(X)
    assert_allclose(np.vstack((y_pred, y_pred)).T, Y_pred)


def test_linear_regression_positive_vs_nonpositive(global_random_seed):
    # Test differences with LinearRegression when positive=False.
    rng = np.random.RandomState(global_random_seed)
    X, y = make_sparse_uncorrelated(random_state=rng)

    reg = LinearRegression(positive=True)
    reg.fit(X, y)
    regn = LinearRegression(positive=False)
    regn.fit(X, y)
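
    # make_sparse_uncorrelated uses ground-truth coefficients of mixed signs
    # (positive and negative), so forcing nonnegative coefficients must move
    # the solution away from the unconstrained least squares fit.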
    assert np.mean((reg.coef_ - regn.coef_) ** 2) > 1e-3


def test_linear_regression_positive_vs_nonpositive_when_positive(global_random_seed):
    # Test LinearRegression fitted coefficients
    # when the problem is positive.
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 4
    X = rng.rand(n_samples, n_features)
    y = X[:, 0] + 2 * X[:, 1] + 3 * X[:, 2] + 1.5 * X[:, 3]
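
    # The true coefficients (1, 2, 3, 1.5) are all positive and y is
    # noise-free, so the nonnegativity constraint is inactive at the optimum
    # and both fits should agree.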
    reg = LinearRegression(positive=True)
    reg.fit(X, y)
    regn = LinearRegression(positive=False)
    regn.fit(X, y)

    assert np.mean((reg.coef_ - regn.coef_) ** 2) < 1e-6


@pytest.mark.parametrize("sparse_X", [True, False])
@pytest.mark.parametrize("use_sw", [True, False])
def test_inplace_data_preprocessing(sparse_X, use_sw, global_random_seed):
    # Check that the data is not modified inplace by the linear regression
    # estimator.
    rng = np.random.RandomState(global_random_seed)
    original_X_data = rng.randn(10, 12)
    original_y_data = rng.randn(10, 2)
    original_sw_data = rng.rand(10)

    if sparse_X:
        X = sparse.csr_matrix(original_X_data)
    else:
        X = original_X_data.copy()
    y = original_y_data.copy()
    # XXX: Note that y_sparse is not supported (broken?) in the current
    # implementation of LinearRegression.

    if use_sw:
        sample_weight = original_sw_data.copy()
    else:
        sample_weight = None

    # Do not allow inplace preprocessing of X and y:
    reg = LinearRegression()
    reg.fit(X, y, sample_weight=sample_weight)

    if sparse_X:
        assert_allclose(X.toarray(), original_X_data)
    else:
        assert_allclose(X, original_X_data)
    assert_allclose(y, original_y_data)
    if use_sw:
        assert_allclose(sample_weight, original_sw_data)

    # Allow inplace preprocessing of X and y
    reg = LinearRegression(copy_X=False)
    reg.fit(X, y, sample_weight=sample_weight)

    if sparse_X:
        # No optimization relying on the inplace modification of sparse input
        # data has been implemented at this time.
        assert_allclose(X.toarray(), original_X_data)
    else:
        # X has been offset (and optionally rescaled by sample weights)
        # inplace. The 0.42 threshold is arbitrary and has been found to be
        # robust to any random seed in the admissible range.
        assert np.linalg.norm(X - original_X_data) > 0.42

    # y should not have been modified inplace by LinearRegression.fit.
    assert_allclose(y, original_y_data)
    if use_sw:
        # Sample weights have no reason to ever be modified inplace.
        assert_allclose(sample_weight, original_sw_data)


def test_linear_regression_pd_sparse_dataframe_warning():
    pd = pytest.importorskip("pandas")

    # Warning is raised only when some of the columns are sparse
    df = pd.DataFrame({"0": np.random.randn(10)})
    for col in range(1, 4):
        arr = np.random.randn(10)
        arr[:8] = 0
        # all columns but the first are sparse
        if col != 0:
            arr = pd.arrays.SparseArray(arr, fill_value=0)
        df[str(col)] = arr

    msg = "pandas.DataFrame with sparse columns found."

    reg = LinearRegression()
    with pytest.warns(UserWarning, match=msg):
        reg.fit(df.iloc[:, 0:2], df.iloc[:, 3])

    # does not warn when the whole dataframe is sparse
    df["0"] = pd.arrays.SparseArray(df["0"], fill_value=0)
    assert hasattr(df, "sparse")

    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        reg.fit(df.iloc[:, 0:2], df.iloc[:, 3])


def test_preprocess_data(global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    expected_X_mean = np.mean(X, axis=0)
    expected_X_scale = np.std(X, axis=0) * np.sqrt(X.shape[0])
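    # With normalize=True, _preprocess_data divides each centered column of X
    # by its euclidean norm, which equals np.std(X[:, j]) * np.sqrt(n_samples).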
    expected_y_mean = np.mean(y, axis=0)

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=False, normalize=False
    )
    assert_array_almost_equal(X_mean, np.zeros(n_features))
    assert_array_almost_equal(y_mean, 0)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(Xt, X)
    assert_array_almost_equal(yt, y)

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=True, normalize=False
    )
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(Xt, X - expected_X_mean)
    assert_array_almost_equal(yt, y - expected_y_mean)

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=True, normalize=True
    )
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, expected_X_scale)
    assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_scale)
    assert_array_almost_equal(yt, y - expected_y_mean)


def test_preprocess_data_multioutput(global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 3
    n_outputs = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples, n_outputs)
    expected_y_mean = np.mean(y, axis=0)

    args = [X, sparse.csc_matrix(X)]
    for X in args:
        _, yt, _, y_mean, _ = _preprocess_data(
            X, y, fit_intercept=False, normalize=False
        )
        assert_array_almost_equal(y_mean, np.zeros(n_outputs))
        assert_array_almost_equal(yt, y)

        _, yt, _, y_mean, _ = _preprocess_data(
            X, y, fit_intercept=True, normalize=False
        )
        assert_array_almost_equal(y_mean, expected_y_mean)
        assert_array_almost_equal(yt, y - y_mean)

        _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=True, normalize=True)
        assert_array_almost_equal(y_mean, expected_y_mean)
        assert_array_almost_equal(yt, y - y_mean)


@pytest.mark.parametrize("is_sparse", [False, True])
def test_preprocess_data_weighted(is_sparse, global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 4
    # Generate random data with 50% of zero values to make sure
    # that the sparse variant of this test is actually sparse. This also
    # shifts the mean value for each column in X further away from
    # zero.
    X = rng.rand(n_samples, n_features)
    X[X < 0.5] = 0.0

    # Scale the first feature of X to be 10 times larger than the others to
    # better check the impact of feature scaling.
    X[:, 0] *= 10

    # Constant non-zero feature.
    X[:, 2] = 1.0

    # Constant zero feature (non-materialized in the sparse case)
    X[:, 3] = 0.0

    y = rng.rand(n_samples)
    sample_weight = rng.rand(n_samples)
    expected_X_mean = np.average(X, axis=0, weights=sample_weight)
    expected_y_mean = np.average(y, axis=0, weights=sample_weight)

    X_sample_weight_avg = np.average(X, weights=sample_weight, axis=0)
    X_sample_weight_var = np.average(
        (X - X_sample_weight_avg) ** 2, weights=sample_weight, axis=0
    )
    constant_mask = X_sample_weight_var < 10 * np.finfo(X.dtype).eps
    assert_array_equal(constant_mask, [0, 0, 1, 1])
    expected_X_scale = np.sqrt(X_sample_weight_var) * np.sqrt(sample_weight.sum())

    # near constant features should not be scaled
    expected_X_scale[constant_mask] = 1

    if is_sparse:
        X = sparse.csr_matrix(X)

    # normalize is False
    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X,
        y,
        fit_intercept=True,
        normalize=False,
        sample_weight=sample_weight,
    )
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    if is_sparse:
        assert_array_almost_equal(Xt.toarray(), X.toarray())
    else:
        assert_array_almost_equal(Xt, X - expected_X_mean)
    assert_array_almost_equal(yt, y - expected_y_mean)

    # normalize is True
    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X,
        y,
        fit_intercept=True,
        normalize=True,
        sample_weight=sample_weight,
    )

    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, expected_X_scale)

    if is_sparse:
        # X is not centered
        assert_array_almost_equal(Xt.toarray(), X.toarray() / expected_X_scale)
    else:
        assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_scale)

    # _preprocess_data with normalize=True scales the data by the feature-wise
    # euclidean norms while StandardScaler scales the data by the feature-wise
    # standard deviations.
    # The two are equivalent up to a ratio of np.sqrt(n_samples) if unweighted
    # or np.sqrt(sample_weight.sum()) if weighted.
    if is_sparse:
        scaler = StandardScaler(with_mean=False).fit(X, sample_weight=sample_weight)

        # Non-constant features are scaled similarly with np.sqrt(n_samples)
        assert_array_almost_equal(
            scaler.transform(X).toarray()[:, :2] / np.sqrt(sample_weight.sum()),
            Xt.toarray()[:, :2],
        )

        # Constant features go through un-scaled.
        assert_array_almost_equal(
            scaler.transform(X).toarray()[:, 2:], Xt.toarray()[:, 2:]
        )
    else:
        scaler = StandardScaler(with_mean=True).fit(X, sample_weight=sample_weight)
        assert_array_almost_equal(scaler.mean_, X_mean)
        assert_array_almost_equal(
            scaler.transform(X) / np.sqrt(sample_weight.sum()),
            Xt,
        )
    assert_array_almost_equal(yt, y - expected_y_mean)


def test_sparse_preprocess_data_offsets(global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 2
    X = sparse.rand(n_samples, n_features, density=0.5, random_state=rng)
    X = X.tolil()
    y = rng.rand(n_samples)
    XA = X.toarray()
    expected_X_scale = np.std(XA, axis=0) * np.sqrt(X.shape[0])

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=False, normalize=False
    )
    assert_array_almost_equal(X_mean, np.zeros(n_features))
    assert_array_almost_equal(y_mean, 0)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(Xt.toarray(), XA)
    assert_array_almost_equal(yt, y)

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=True, normalize=False
    )
    assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
    assert_array_almost_equal(y_mean, np.mean(y, axis=0))
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(Xt.toarray(), XA)
    assert_array_almost_equal(yt, y - np.mean(y, axis=0))

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=True, normalize=True
    )
    assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
    assert_array_almost_equal(y_mean, np.mean(y, axis=0))
    assert_array_almost_equal(X_scale, expected_X_scale)
    assert_array_almost_equal(Xt.toarray(), XA / expected_X_scale)
    assert_array_almost_equal(yt, y - np.mean(y, axis=0))


def test_csr_preprocess_data():
    # Test output format of _preprocess_data, when input is csr
    X, y = make_regression()
    X[X < 2.5] = 0.0
    csr = sparse.csr_matrix(X)
    csr_, y, _, _, _ = _preprocess_data(csr, y, True)
    assert csr_.getformat() == "csr"


@pytest.mark.parametrize("is_sparse", (True, False))
@pytest.mark.parametrize("to_copy", (True, False))
def test_preprocess_copy_data_no_checks(is_sparse, to_copy):
    X, y = make_regression()
    X[X < 2.5] = 0.0

    if is_sparse:
        X = sparse.csr_matrix(X)

    X_, y_, _, _, _ = _preprocess_data(X, y, True, copy=to_copy, check_input=False)
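
    # With copy=True a fresh buffer must be allocated even though
    # check_input=False skips input validation; with copy=False the returned
    # arrays may alias the input data.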
    if to_copy and is_sparse:
        assert not np.may_share_memory(X_.data, X.data)
    elif to_copy:
        assert not np.may_share_memory(X_, X)
    elif is_sparse:
        assert np.may_share_memory(X_.data, X.data)
    else:
        assert np.may_share_memory(X_, X)


def test_dtype_preprocess_data(global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    X_32 = np.asarray(X, dtype=np.float32)
    y_32 = np.asarray(y, dtype=np.float32)
    X_64 = np.asarray(X, dtype=np.float64)
    y_64 = np.asarray(y, dtype=np.float64)

    for fit_intercept in [True, False]:
        for normalize in [True, False]:
            Xt_32, yt_32, X_mean_32, y_mean_32, X_scale_32 = _preprocess_data(
                X_32,
                y_32,
                fit_intercept=fit_intercept,
                normalize=normalize,
            )

            Xt_64, yt_64, X_mean_64, y_mean_64, X_scale_64 = _preprocess_data(
                X_64,
                y_64,
                fit_intercept=fit_intercept,
                normalize=normalize,
            )

            Xt_3264, yt_3264, X_mean_3264, y_mean_3264, X_scale_3264 = _preprocess_data(
                X_32,
                y_64,
                fit_intercept=fit_intercept,
                normalize=normalize,
            )

            Xt_6432, yt_6432, X_mean_6432, y_mean_6432, X_scale_6432 = _preprocess_data(
                X_64,
                y_32,
                fit_intercept=fit_intercept,
                normalize=normalize,
            )

            assert Xt_32.dtype == np.float32
            assert yt_32.dtype == np.float32
            assert X_mean_32.dtype == np.float32
            assert y_mean_32.dtype == np.float32
            assert X_scale_32.dtype == np.float32

            assert Xt_64.dtype == np.float64
            assert yt_64.dtype == np.float64
            assert X_mean_64.dtype == np.float64
            assert y_mean_64.dtype == np.float64
            assert X_scale_64.dtype == np.float64

            assert Xt_3264.dtype == np.float32
            assert yt_3264.dtype == np.float32
            assert X_mean_3264.dtype == np.float32
            assert y_mean_3264.dtype == np.float32
            assert X_scale_3264.dtype == np.float32

            assert Xt_6432.dtype == np.float64
            assert yt_6432.dtype == np.float64
            assert X_mean_6432.dtype == np.float64
            assert y_mean_6432.dtype == np.float64
            assert X_scale_6432.dtype == np.float64

            assert X_32.dtype == np.float32
            assert y_32.dtype == np.float32
            assert X_64.dtype == np.float64
            assert y_64.dtype == np.float64

            assert_array_almost_equal(Xt_32, Xt_64)
            assert_array_almost_equal(yt_32, yt_64)
            assert_array_almost_equal(X_mean_32, X_mean_64)
            assert_array_almost_equal(y_mean_32, y_mean_64)
            assert_array_almost_equal(X_scale_32, X_scale_64)


@pytest.mark.parametrize("n_targets", [None, 2])
@pytest.mark.parametrize("sparse_data", [True, False])
def test_rescale_data(n_targets, sparse_data, global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 2

    sample_weight = 1.0 + rng.rand(n_samples)
    X = rng.rand(n_samples, n_features)
    if n_targets is None:
        y = rng.rand(n_samples)
    else:
        y = rng.rand(n_samples, n_targets)
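
    # _rescale_data multiplies each row of X and y by sqrt(sample_weight[i]),
    # so that plain least squares on the rescaled data minimizes the weighted
    # objective ||sqrt(W) @ (y - X @ coef)||^2.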
    expected_sqrt_sw = np.sqrt(sample_weight)
    expected_rescaled_X = X * expected_sqrt_sw[:, np.newaxis]

    if n_targets is None:
        expected_rescaled_y = y * expected_sqrt_sw
    else:
        expected_rescaled_y = y * expected_sqrt_sw[:, np.newaxis]

    if sparse_data:
        X = sparse.csr_matrix(X)
        if n_targets is None:
            y = sparse.csr_matrix(y.reshape(-1, 1))
        else:
            y = sparse.csr_matrix(y)

    rescaled_X, rescaled_y, sqrt_sw = _rescale_data(X, y, sample_weight)

    assert_allclose(sqrt_sw, expected_sqrt_sw)

    if sparse_data:
        rescaled_X = rescaled_X.toarray()
        rescaled_y = rescaled_y.toarray()
        if n_targets is None:
            rescaled_y = rescaled_y.ravel()

    assert_allclose(rescaled_X, expected_rescaled_X)
    assert_allclose(rescaled_y, expected_rescaled_y)


def test_fused_types_make_dataset():
    iris = load_iris()

    X_32 = iris.data.astype(np.float32)
    y_32 = iris.target.astype(np.float32)
    X_csr_32 = sparse.csr_matrix(X_32)
    sample_weight_32 = np.arange(y_32.size, dtype=np.float32)

    X_64 = iris.data.astype(np.float64)
    y_64 = iris.target.astype(np.float64)
    X_csr_64 = sparse.csr_matrix(X_64)
    sample_weight_64 = np.arange(y_64.size, dtype=np.float64)

    # array
    dataset_32, _ = make_dataset(X_32, y_32, sample_weight_32)
    dataset_64, _ = make_dataset(X_64, y_64, sample_weight_64)
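
    # _next_py yields one sample as (x, y, sample_weight, index), where x is
    # itself a (data, indices, indptr) triple in CSR layout for both the dense
    # and the sparse dataset variants.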
    xi_32, yi_32, _, _ = dataset_32._next_py()
    xi_64, yi_64, _, _ = dataset_64._next_py()
    xi_data_32, _, _ = xi_32
    xi_data_64, _, _ = xi_64

    assert xi_data_32.dtype == np.float32
    assert xi_data_64.dtype == np.float64
    assert_allclose(yi_64, yi_32, rtol=rtol)

    # csr
    datasetcsr_32, _ = make_dataset(X_csr_32, y_32, sample_weight_32)
    datasetcsr_64, _ = make_dataset(X_csr_64, y_64, sample_weight_64)

    xicsr_32, yicsr_32, _, _ = datasetcsr_32._next_py()
    xicsr_64, yicsr_64, _, _ = datasetcsr_64._next_py()
    xicsr_data_32, _, _ = xicsr_32
    xicsr_data_64, _, _ = xicsr_64

    assert xicsr_data_32.dtype == np.float32
    assert xicsr_data_64.dtype == np.float64

    assert_allclose(xicsr_data_64, xicsr_data_32, rtol=rtol)
    assert_allclose(yicsr_64, yicsr_32, rtol=rtol)

    assert_array_equal(xi_data_32, xicsr_data_32)
    assert_array_equal(xi_data_64, xicsr_data_64)
    assert_array_equal(yi_32, yicsr_32)
    assert_array_equal(yi_64, yicsr_64)


@pytest.mark.parametrize("sparseX", [False, True])
@pytest.mark.parametrize("fit_intercept", [False, True])
def test_linear_regression_sample_weight_consistency(
    sparseX, fit_intercept, global_random_seed
):
    """Test that the impact of sample_weight is consistent.

    Note that this test is stricter than the common test
    check_sample_weights_invariance alone and also tests sparse X.
    It is very similar to test_enet_sample_weight_consistency.
    """
    rng = np.random.RandomState(global_random_seed)
    n_samples, n_features = 10, 5

    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    if sparseX:
        X = sparse.csr_matrix(X)
    params = dict(fit_intercept=fit_intercept)

    reg = LinearRegression(**params).fit(X, y, sample_weight=None)
    coef = reg.coef_.copy()
    if fit_intercept:
        intercept = reg.intercept_

    # 1) sample_weight=np.ones(..) must be equivalent to sample_weight=None,
    # same check as check_sample_weights_invariance(name, reg, kind="ones"),
    # but we also test with sparse input.
    sample_weight = np.ones_like(y)
    reg.fit(X, y, sample_weight=sample_weight)
    assert_allclose(reg.coef_, coef, rtol=1e-6)
    if fit_intercept:
        assert_allclose(reg.intercept_, intercept)

    # 2) sample_weight=None should be equivalent to sample_weight = number
    sample_weight = 123.0
    reg.fit(X, y, sample_weight=sample_weight)
    assert_allclose(reg.coef_, coef, rtol=1e-6)
    if fit_intercept:
        assert_allclose(reg.intercept_, intercept)

    # 3) scaling of sample_weight should have no effect, cf. np.average()
    sample_weight = rng.uniform(low=0.01, high=2, size=X.shape[0])
    reg = reg.fit(X, y, sample_weight=sample_weight)
    coef = reg.coef_.copy()
    if fit_intercept:
        intercept = reg.intercept_

    reg.fit(X, y, sample_weight=np.pi * sample_weight)
    assert_allclose(reg.coef_, coef, rtol=1e-5 if sparseX else 1e-6)
    if fit_intercept:
        assert_allclose(reg.intercept_, intercept)

    # 4) setting elements of sample_weight to 0 is equivalent to removing these samples
    sample_weight_0 = sample_weight.copy()
    sample_weight_0[-5:] = 0
    y[-5:] *= 1000  # to make excluding those samples important
    reg.fit(X, y, sample_weight=sample_weight_0)
    coef_0 = reg.coef_.copy()
    if fit_intercept:
        intercept_0 = reg.intercept_
    reg.fit(X[:-5], y[:-5], sample_weight=sample_weight[:-5])
    if fit_intercept and not sparseX:
        # FIXME: https://github.com/scikit-learn/scikit-learn/issues/26164
        # This often fails, e.g. when calling
        # SKLEARN_TESTS_GLOBAL_RANDOM_SEED="all" pytest \
        # sklearn/linear_model/tests/test_base.py\
        # ::test_linear_regression_sample_weight_consistency
        pass
    else:
        assert_allclose(reg.coef_, coef_0, rtol=1e-5)
        if fit_intercept:
            assert_allclose(reg.intercept_, intercept_0)

    # 5) check that multiplying sample_weight by 2 is equivalent to repeating
    # corresponding samples twice
    if sparseX:
        X2 = sparse.vstack([X, X[: n_samples // 2]], format="csc")
    else:
        X2 = np.concatenate([X, X[: n_samples // 2]], axis=0)
    y2 = np.concatenate([y, y[: n_samples // 2]])
    sample_weight_1 = sample_weight.copy()
    sample_weight_1[: n_samples // 2] *= 2
    sample_weight_2 = np.concatenate(
        [sample_weight, sample_weight[: n_samples // 2]], axis=0
    )

    reg1 = LinearRegression(**params).fit(X, y, sample_weight=sample_weight_1)
    reg2 = LinearRegression(**params).fit(X2, y2, sample_weight=sample_weight_2)
    assert_allclose(reg1.coef_, reg2.coef_, rtol=1e-6)
    if fit_intercept:
        assert_allclose(reg1.intercept_, reg2.intercept_)