| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162 |
- """
- The :mod:`sklearn.datasets` module includes utilities to load datasets,
- including methods to load and fetch popular reference datasets. It also
- features some artificial data generators.
- """
- import textwrap
- from ._base import (
- clear_data_home,
- get_data_home,
- load_breast_cancer,
- load_diabetes,
- load_digits,
- load_files,
- load_iris,
- load_linnerud,
- load_sample_image,
- load_sample_images,
- load_wine,
- )
- from ._california_housing import fetch_california_housing
- from ._covtype import fetch_covtype
- from ._kddcup99 import fetch_kddcup99
- from ._lfw import fetch_lfw_pairs, fetch_lfw_people
- from ._olivetti_faces import fetch_olivetti_faces
- from ._openml import fetch_openml
- from ._rcv1 import fetch_rcv1
- from ._samples_generator import (
- make_biclusters,
- make_blobs,
- make_checkerboard,
- make_circles,
- make_classification,
- make_friedman1,
- make_friedman2,
- make_friedman3,
- make_gaussian_quantiles,
- make_hastie_10_2,
- make_low_rank_matrix,
- make_moons,
- make_multilabel_classification,
- make_regression,
- make_s_curve,
- make_sparse_coded_signal,
- make_sparse_spd_matrix,
- make_sparse_uncorrelated,
- make_spd_matrix,
- make_swiss_roll,
- )
- from ._species_distributions import fetch_species_distributions
- from ._svmlight_format_io import (
- dump_svmlight_file,
- load_svmlight_file,
- load_svmlight_files,
- )
- from ._twenty_newsgroups import fetch_20newsgroups, fetch_20newsgroups_vectorized
# Public API of this package: the names bound by a star-import. Every entry
# is imported at the top of this module. The list is kept as a plain literal
# so that tools which read ``__all__`` statically keep working.
# ``load_boston`` is deliberately not listed; it is only reachable through
# the module-level ``__getattr__``.
__all__ = [
    # Data-home helper and svmlight writer
    "clear_data_home",
    "dump_svmlight_file",
    # ``fetch_*`` functions
    "fetch_20newsgroups",
    "fetch_20newsgroups_vectorized",
    "fetch_lfw_pairs",
    "fetch_lfw_people",
    "fetch_olivetti_faces",
    "fetch_species_distributions",
    "fetch_california_housing",
    "fetch_covtype",
    "fetch_rcv1",
    "fetch_kddcup99",
    "fetch_openml",
    # Data-home helper
    "get_data_home",
    # ``load_*`` functions
    "load_diabetes",
    "load_digits",
    "load_files",
    "load_iris",
    "load_breast_cancer",
    "load_linnerud",
    "load_sample_image",
    "load_sample_images",
    "load_svmlight_file",
    "load_svmlight_files",
    "load_wine",
    # ``make_*`` artificial data generators
    "make_biclusters",
    "make_blobs",
    "make_circles",
    "make_classification",
    "make_checkerboard",
    "make_friedman1",
    "make_friedman2",
    "make_friedman3",
    "make_gaussian_quantiles",
    "make_hastie_10_2",
    "make_low_rank_matrix",
    "make_moons",
    "make_multilabel_classification",
    "make_regression",
    "make_s_curve",
    "make_sparse_coded_signal",
    "make_sparse_spd_matrix",
    "make_sparse_uncorrelated",
    "make_spd_matrix",
    "make_swiss_roll",
]
def __getattr__(name):
    """Resolve module attributes that regular lookup did not find.

    Python calls this module-level hook (PEP 562) only after the normal
    attribute lookup on the module has failed.

    Parameters
    ----------
    name : str
        Name of the attribute requested on the module.

    Returns
    -------
    object
        The module global bound to ``name`` when one exists.

    Raises
    ------
    ImportError
        If ``name`` is ``"load_boston"``. The message explains why the
        loader was removed and lists alternative datasets.
    AttributeError
        If ``name`` is not a module global. The message names both the
        module and the missing attribute.
    """
    if name == "load_boston":
        # Every message line shares the same indentation, so ``dedent``
        # strips it and the emitted text starts at column 0.
        msg = textwrap.dedent("""
            `load_boston` has been removed from scikit-learn since version 1.2.
            The Boston housing prices dataset has an ethical problem: as
            investigated in [1], the authors of this dataset engineered a
            non-invertible variable "B" assuming that racial self-segregation had a
            positive impact on house prices [2]. Furthermore the goal of the
            research that led to the creation of this dataset was to study the
            impact of air quality but it did not give adequate demonstration of the
            validity of this assumption.
            The scikit-learn maintainers therefore strongly discourage the use of
            this dataset unless the purpose of the code is to study and educate
            about ethical issues in data science and machine learning.
            In this special case, you can fetch the dataset from the original
            source::
            import pandas as pd
            import numpy as np
            data_url = "http://lib.stat.cmu.edu/datasets/boston"
            raw_df = pd.read_csv(data_url, sep="\\s+", skiprows=22, header=None)
            data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
            target = raw_df.values[1::2, 2]
            Alternative datasets include the California housing dataset and the
            Ames housing dataset. You can load the datasets as follows::
            from sklearn.datasets import fetch_california_housing
            housing = fetch_california_housing()
            for the California housing dataset and::
            from sklearn.datasets import fetch_openml
            housing = fetch_openml(name="house_prices", as_frame=True)
            for the Ames housing dataset.
            [1] M Carlisle.
            "Racist data destruction?"
            <https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>
            [2] Harrison Jr, David, and Daniel L. Rubinfeld.
            "Hedonic housing prices and the demand for clean air."
            Journal of environmental economics and management 5.1 (1978): 81-102.
            <https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>
            """)
        raise ImportError(msg)
    try:
        return globals()[name]
    except KeyError:
        # For ``from <module> import name`` Python turns this AttributeError
        # into the appropriate ImportError. The message serves plain
        # attribute access, and ``from None`` hides the internal KeyError.
        raise AttributeError(
            f"module {__name__!r} has no attribute {name!r}"
        ) from None
|