- """
- Base IO code for all datasets
- """
- # Copyright (c) 2007 David Cournapeau <cournape@gmail.com>
- # 2010 Fabian Pedregosa <fabian.pedregosa@inria.fr>
- # 2010 Olivier Grisel <olivier.grisel@ensta.org>
- # License: BSD 3 clause
- import csv
- import gzip
- import hashlib
- import os
- import shutil
- from collections import namedtuple
- from numbers import Integral
- from os import environ, listdir, makedirs
- from os.path import expanduser, isdir, join, splitext
- from pathlib import Path
- from urllib.request import urlretrieve
- import numpy as np
- from ..preprocessing import scale
- from ..utils import Bunch, check_pandas_support, check_random_state
- from ..utils._param_validation import Interval, StrOptions, validate_params
- from ..utils.fixes import _contents, _open_binary, _open_text, _read_text
- DATA_MODULE = "sklearn.datasets.data"
- DESCR_MODULE = "sklearn.datasets.descr"
- IMAGES_MODULE = "sklearn.datasets.images"
- RemoteFileMetadata = namedtuple("RemoteFileMetadata", ["filename", "url", "checksum"])
- @validate_params(
- {
- "data_home": [str, os.PathLike, None],
- },
- prefer_skip_nested_validation=True,
- )
- def get_data_home(data_home=None) -> str:
- """Return the path of the scikit-learn data directory.
- This folder is used by some large dataset loaders to avoid downloading the
- data several times.
- By default the data directory is set to a folder named 'scikit_learn_data' in the
- user home folder.
- Alternatively, it can be set by the 'SCIKIT_LEARN_DATA' environment
- variable or programmatically by giving an explicit folder path. The '~'
- symbol is expanded to the user home folder.
- If the folder does not already exist, it is automatically created.
- Parameters
- ----------
- data_home : str or path-like, default=None
- The path to scikit-learn data directory. If `None`, the default path
- is `~/scikit_learn_data`.
- Returns
- -------
- data_home: str
- The path to scikit-learn data directory.
- """
- if data_home is None:
- data_home = environ.get("SCIKIT_LEARN_DATA", join("~", "scikit_learn_data"))
- data_home = expanduser(data_home)
- makedirs(data_home, exist_ok=True)
- return data_home
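- # Illustrative usage sketch; the override path below is an assumption, not a
- # location shipped with scikit-learn ('~' is expanded, directory is created):
- # >>> from sklearn.datasets import get_data_home
- # >>> get_data_home()                            # doctest: +SKIP
- # '/home/user/scikit_learn_data'
- # >>> get_data_home("~/my_sklearn_cache")        # doctest: +SKIP
- # '/home/user/my_sklearn_cache'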
- @validate_params(
- {
- "data_home": [str, os.PathLike, None],
- },
- prefer_skip_nested_validation=True,
- )
- def clear_data_home(data_home=None):
- """Delete all the content of the data home cache.
- Parameters
- ----------
- data_home : str or path-like, default=None
- The path to scikit-learn data directory. If `None`, the default path
- is `~/scikit_learn_data`.
- """
- data_home = get_data_home(data_home)
- shutil.rmtree(data_home)
- def _convert_data_dataframe(
- caller_name, data, target, feature_names, target_names, sparse_data=False
- ):
- pd = check_pandas_support("{} with as_frame=True".format(caller_name))
- if not sparse_data:
- data_df = pd.DataFrame(data, columns=feature_names, copy=False)
- else:
- data_df = pd.DataFrame.sparse.from_spmatrix(data, columns=feature_names)
- target_df = pd.DataFrame(target, columns=target_names)
- combined_df = pd.concat([data_df, target_df], axis=1)
- X = combined_df[feature_names]
- y = combined_df[target_names]
- if y.shape[1] == 1:
- y = y.iloc[:, 0]
- return combined_df, X, y
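- # Minimal sketch of how the loaders below use `_convert_data_dataframe`; the
- # caller name and column names here are placeholder assumptions:
- # >>> import numpy as np
- # >>> frame, X, y = _convert_data_dataframe(
- # ...     "load_example", np.ones((3, 2)), np.zeros(3), ["f0", "f1"], ["target"]
- # ... )                                          # doctest: +SKIP
- # >>> frame.shape, X.shape, y.shape
- # ((3, 3), (3, 2), (3,))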
- @validate_params(
- {
- "container_path": [str, os.PathLike],
- "description": [str, None],
- "categories": [list, None],
- "load_content": ["boolean"],
- "shuffle": ["boolean"],
- "encoding": [str, None],
- "decode_error": [StrOptions({"strict", "ignore", "replace"})],
- "random_state": ["random_state"],
- "allowed_extensions": [list, None],
- },
- prefer_skip_nested_validation=True,
- )
- def load_files(
- container_path,
- *,
- description=None,
- categories=None,
- load_content=True,
- shuffle=True,
- encoding=None,
- decode_error="strict",
- random_state=0,
- allowed_extensions=None,
- ):
- """Load text files with categories as subfolder names.
- Individual samples are assumed to be files stored in a two-level folder
- structure such as the following:
- container_folder/
- category_1_folder/
- file_1.txt
- file_2.txt
- ...
- file_42.txt
- category_2_folder/
- file_43.txt
- file_44.txt
- ...
- The folder names are used as supervised signal label names. The individual
- file names are not important.
- This function does not try to extract features into a numpy array or scipy
- sparse matrix. In addition, if load_content is false it does not try to
- load the files in memory.
- To use text files in a scikit-learn classification or clustering algorithm,
- you will need to use the :mod:`~sklearn.feature_extraction.text` module to
- build a feature extraction transformer that suits your problem.
- If you set load_content=True, you should also specify the encoding of the
- text using the 'encoding' parameter. For many modern text files, 'utf-8'
- will be the correct encoding. If you leave encoding equal to None, then the
- content will be made of bytes instead of Unicode, and you will not be able
- to use most functions in :mod:`~sklearn.feature_extraction.text`.
- Similar feature extractors should be built for other kinds of unstructured
- data input such as images, audio, video, ...
- If you want files with a specific file extension (e.g. `.txt`) then you
- can pass a list of those file extensions to `allowed_extensions`.
- Read more in the :ref:`User Guide <datasets>`.
- Parameters
- ----------
- container_path : str
- Path to the main folder holding one subfolder per category.
- description : str, default=None
- A paragraph describing the characteristic of the dataset: its source,
- reference, etc.
- categories : list of str, default=None
- If None (default), load all the categories. If not None, list of
- category names to load (other categories ignored).
- load_content : bool, default=True
- Whether or not to load the content of the different files. If true, a
- 'data' attribute containing the text information is present in the data
- structure returned. If not, a 'filenames' attribute gives the path to the
- files.
- shuffle : bool, default=True
- Whether or not to shuffle the data: might be important for models that
- make the assumption that the samples are independent and identically
- distributed (i.i.d.), such as stochastic gradient descent.
- encoding : str, default=None
- If None, do not try to decode the content of the files (e.g. for images
- or other non-text content). If not None, encoding to use to decode text
- files to Unicode if load_content is True.
- decode_error : {'strict', 'ignore', 'replace'}, default='strict'
- Instruction on what to do if a byte sequence is given to analyze that
- contains characters not of the given `encoding`. Passed as keyword
- argument 'errors' to bytes.decode.
- random_state : int, RandomState instance or None, default=0
- Determines random number generation for dataset shuffling. Pass an int
- for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- allowed_extensions : list of str, default=None
- List of desired file extensions to filter the files to be loaded.
- Returns
- -------
- data : :class:`~sklearn.utils.Bunch`
- Dictionary-like object, with the following attributes.
- data : list of str
- Only present when `load_content=True`.
- The raw text data to learn.
- target : ndarray
- The target labels (integer index).
- target_names : list
- The names of target classes.
- DESCR : str
- The full description of the dataset.
- filenames: ndarray
- The filenames holding the dataset.
- """
- target = []
- target_names = []
- filenames = []
- folders = [
- f for f in sorted(listdir(container_path)) if isdir(join(container_path, f))
- ]
- if categories is not None:
- folders = [f for f in folders if f in categories]
- if allowed_extensions is not None:
- allowed_extensions = frozenset(allowed_extensions)
- for label, folder in enumerate(folders):
- target_names.append(folder)
- folder_path = join(container_path, folder)
- files = sorted(listdir(folder_path))
- if allowed_extensions is not None:
- documents = [
- join(folder_path, file)
- for file in files
- if os.path.splitext(file)[1] in allowed_extensions
- ]
- else:
- documents = [join(folder_path, file) for file in files]
- target.extend(len(documents) * [label])
- filenames.extend(documents)
- # convert to array for fancy indexing
- filenames = np.array(filenames)
- target = np.array(target)
- if shuffle:
- random_state = check_random_state(random_state)
- indices = np.arange(filenames.shape[0])
- random_state.shuffle(indices)
- filenames = filenames[indices]
- target = target[indices]
- if load_content:
- data = []
- for filename in filenames:
- data.append(Path(filename).read_bytes())
- if encoding is not None:
- data = [d.decode(encoding, decode_error) for d in data]
- return Bunch(
- data=data,
- filenames=filenames,
- target_names=target_names,
- target=target,
- DESCR=description,
- )
- return Bunch(
- filenames=filenames, target_names=target_names, target=target, DESCR=description
- )
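- # Illustrative usage sketch for `load_files`; the container path and folder
- # names are assumptions matching the layout sketched in the docstring:
- # >>> from sklearn.datasets import load_files
- # >>> bunch = load_files(
- # ...     "/path/to/container_folder",
- # ...     encoding="utf-8",              # decode bytes so text vectorizers work
- # ...     allowed_extensions=[".txt"],   # keep only *.txt files
- # ... )                                  # doctest: +SKIP
- # >>> bunch.target_names                 # one label per sub-folder, sorted
- # ['category_1_folder', 'category_2_folder']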
- def load_csv_data(
- data_file_name,
- *,
- data_module=DATA_MODULE,
- descr_file_name=None,
- descr_module=DESCR_MODULE,
- ):
- """Loads `data_file_name` from `data_module with `importlib.resources`.
- Parameters
- ----------
- data_file_name : str
- Name of csv file to be loaded from `data_module/data_file_name`.
- For example `'wine_data.csv'`.
- data_module : str or module, default='sklearn.datasets.data'
- Module where data lives. The default is `'sklearn.datasets.data'`.
- descr_file_name : str, default=None
- Name of rst file to be loaded from `descr_module/descr_file_name`.
- For example `'wine_data.rst'`. See also :func:`load_descr`.
- If not None, also returns the corresponding description of
- the dataset.
- descr_module : str or module, default='sklearn.datasets.descr'
- Module where `descr_file_name` lives. See also :func:`load_descr`.
- The default is `'sklearn.datasets.descr'`.
- Returns
- -------
- data : ndarray of shape (n_samples, n_features)
- A 2D array with each row representing one sample and each column
- representing the features of a given sample.
- target : ndarray of shape (n_samples,)
- A 1D array holding target variables for all the samples in `data`.
- For example target[0] is the target variable for data[0].
- target_names : ndarray of shape (n_classes,)
- A 1D array containing the names of the classifications. For example
- target_names[0] is the name of the target[0] class.
- descr : str, optional
- Description of the dataset (the content of `descr_file_name`).
- Only returned if `descr_file_name` is not None.
- """
- with _open_text(data_module, data_file_name) as csv_file:
- data_file = csv.reader(csv_file)
- temp = next(data_file)
- n_samples = int(temp[0])
- n_features = int(temp[1])
- target_names = np.array(temp[2:])
- data = np.empty((n_samples, n_features))
- target = np.empty((n_samples,), dtype=int)
- for i, ir in enumerate(data_file):
- data[i] = np.asarray(ir[:-1], dtype=np.float64)
- target[i] = np.asarray(ir[-1], dtype=int)
- if descr_file_name is None:
- return data, target, target_names
- else:
- assert descr_module is not None
- descr = load_descr(descr_module=descr_module, descr_file_name=descr_file_name)
- return data, target, target_names, descr
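- # Sketch of a typical call to this private helper, mirroring `load_wine` below
- # (shapes follow from the wine dataset: 178 samples, 13 features):
- # >>> data, target, target_names, descr = load_csv_data(
- # ...     data_file_name="wine_data.csv", descr_file_name="wine_data.rst"
- # ... )                                  # doctest: +SKIP
- # >>> data.shape, target.shape
- # ((178, 13), (178,))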
- def load_gzip_compressed_csv_data(
- data_file_name,
- *,
- data_module=DATA_MODULE,
- descr_file_name=None,
- descr_module=DESCR_MODULE,
- encoding="utf-8",
- **kwargs,
- ):
- """Loads gzip-compressed with `importlib.resources`.
- 1) Open resource file with `importlib.resources.open_binary`
- 2) Decompress file obj with `gzip.open`
- 3) Load decompressed data with `np.loadtxt`
- Parameters
- ----------
- data_file_name : str
- Name of gzip-compressed csv file (`'*.csv.gz'`) to be loaded from
- `data_module/data_file_name`. For example `'diabetes_data.csv.gz'`.
- data_module : str or module, default='sklearn.datasets.data'
- Module where data lives. The default is `'sklearn.datasets.data'`.
- descr_file_name : str, default=None
- Name of rst file to be loaded from `descr_module/descr_file_name`.
- For example `'wine_data.rst'`. See also :func:`load_descr`.
- If not None, also returns the corresponding description of
- the dataset.
- descr_module : str or module, default='sklearn.datasets.descr'
- Module where `descr_file_name` lives. See also :func:`load_descr`.
- The default is `'sklearn.datasets.descr'`.
- encoding : str, default="utf-8"
- Name of the encoding that the gzip-decompressed file will be
- decoded with. The default is 'utf-8'.
- **kwargs : dict, optional
- Keyword arguments to be passed to `np.loadtxt`;
- e.g. delimiter=','.
- Returns
- -------
- data : ndarray of shape (n_samples, n_features)
- A 2D array with each row representing one sample and each column
- representing the features and/or target of a given sample.
- descr : str, optional
- Description of the dataset (the content of `descr_file_name`).
- Only returned if `descr_file_name` is not None.
- """
- with _open_binary(data_module, data_file_name) as compressed_file:
- compressed_file = gzip.open(compressed_file, mode="rt", encoding=encoding)
- data = np.loadtxt(compressed_file, **kwargs)
- if descr_file_name is None:
- return data
- else:
- assert descr_module is not None
- descr = load_descr(descr_module=descr_module, descr_file_name=descr_file_name)
- return data, descr
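- # Sketch mirroring the call made by `load_digits` below; the digits csv holds
- # 64 pixel columns plus one trailing target column:
- # >>> data, descr = load_gzip_compressed_csv_data(
- # ...     data_file_name="digits.csv.gz",
- # ...     descr_file_name="digits.rst",
- # ...     delimiter=",",
- # ... )                                  # doctest: +SKIP
- # >>> data.shape
- # (1797, 65)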
- def load_descr(descr_file_name, *, descr_module=DESCR_MODULE):
- """Load `descr_file_name` from `descr_module` with `importlib.resources`.
- Parameters
- ----------
- descr_file_name : str, default=None
- Name of rst file to be loaded from `descr_module/descr_file_name`.
- For example `'wine_data.rst'`. See also :func:`load_descr`.
- If not None, also returns the corresponding description of
- the dataset.
- descr_module : str or module, default='sklearn.datasets.descr'
- Module where `descr_file_name` lives. See also :func:`load_descr`.
- The default is `'sklearn.datasets.descr'`.
- Returns
- -------
- fdescr : str
- Content of `descr_file_name`.
- """
- fdescr = _read_text(descr_module, descr_file_name)
- return fdescr
- @validate_params(
- {
- "return_X_y": ["boolean"],
- "as_frame": ["boolean"],
- },
- prefer_skip_nested_validation=True,
- )
- def load_wine(*, return_X_y=False, as_frame=False):
- """Load and return the wine dataset (classification).
- .. versionadded:: 0.18
- The wine dataset is a classic and very easy multi-class classification
- dataset.
- ================= ==============
- Classes 3
- Samples per class [59,71,48]
- Samples total 178
- Dimensionality 13
- Features real, positive
- ================= ==============
- The copy of the UCI ML Wine Data Set is downloaded and modified to fit
- the standard format from:
- https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data
- Read more in the :ref:`User Guide <wine_dataset>`.
- Parameters
- ----------
- return_X_y : bool, default=False
- If True, returns ``(data, target)`` instead of a Bunch object.
- See below for more information about the `data` and `target` object.
- as_frame : bool, default=False
- If True, the data is a pandas DataFrame including columns with
- appropriate dtypes (numeric). The target is
- a pandas DataFrame or Series depending on the number of target columns.
- If `return_X_y` is True, then (`data`, `target`) will be pandas
- DataFrames or Series as described below.
- .. versionadded:: 0.23
- Returns
- -------
- data : :class:`~sklearn.utils.Bunch`
- Dictionary-like object, with the following attributes.
- data : {ndarray, dataframe} of shape (178, 13)
- The data matrix. If `as_frame=True`, `data` will be a pandas
- DataFrame.
- target: {ndarray, Series} of shape (178,)
- The classification target. If `as_frame=True`, `target` will be
- a pandas Series.
- feature_names: list
- The names of the dataset columns.
- target_names: list
- The names of target classes.
- frame: DataFrame of shape (178, 14)
- Only present when `as_frame=True`. DataFrame with `data` and
- `target`.
- .. versionadded:: 0.23
- DESCR: str
- The full description of the dataset.
- (data, target) : tuple if ``return_X_y`` is True
- A tuple of two ndarrays by default. The first contains a 2D array of shape
- (178, 13) with each row representing one sample and each column representing
- the features. The second array of shape (178,) contains the target samples.
- Examples
- --------
- Let's say you are interested in the samples 10, 80, and 140, and want to
- know their class name.
- >>> from sklearn.datasets import load_wine
- >>> data = load_wine()
- >>> data.target[[10, 80, 140]]
- array([0, 1, 2])
- >>> list(data.target_names)
- ['class_0', 'class_1', 'class_2']
- """
- data, target, target_names, fdescr = load_csv_data(
- data_file_name="wine_data.csv", descr_file_name="wine_data.rst"
- )
- feature_names = [
- "alcohol",
- "malic_acid",
- "ash",
- "alcalinity_of_ash",
- "magnesium",
- "total_phenols",
- "flavanoids",
- "nonflavanoid_phenols",
- "proanthocyanins",
- "color_intensity",
- "hue",
- "od280/od315_of_diluted_wines",
- "proline",
- ]
- frame = None
- target_columns = [
- "target",
- ]
- if as_frame:
- frame, data, target = _convert_data_dataframe(
- "load_wine", data, target, feature_names, target_columns
- )
- if return_X_y:
- return data, target
- return Bunch(
- data=data,
- target=target,
- frame=frame,
- target_names=target_names,
- DESCR=fdescr,
- feature_names=feature_names,
- )
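- # Complementary sketch to the docstring example above, showing the
- # `as_frame` / `return_X_y` combination (requires pandas):
- # >>> X, y = load_wine(return_X_y=True, as_frame=True)  # doctest: +SKIP
- # >>> X.shape, y.shape
- # ((178, 13), (178,))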
- @validate_params(
- {"return_X_y": ["boolean"], "as_frame": ["boolean"]},
- prefer_skip_nested_validation=True,
- )
- def load_iris(*, return_X_y=False, as_frame=False):
- """Load and return the iris dataset (classification).
- The iris dataset is a classic and very easy multi-class classification
- dataset.
- ================= ==============
- Classes 3
- Samples per class 50
- Samples total 150
- Dimensionality 4
- Features real, positive
- ================= ==============
- Read more in the :ref:`User Guide <iris_dataset>`.
- Parameters
- ----------
- return_X_y : bool, default=False
- If True, returns ``(data, target)`` instead of a Bunch object. See
- below for more information about the `data` and `target` object.
- .. versionadded:: 0.18
- as_frame : bool, default=False
- If True, the data is a pandas DataFrame including columns with
- appropriate dtypes (numeric). The target is
- a pandas DataFrame or Series depending on the number of target columns.
- If `return_X_y` is True, then (`data`, `target`) will be pandas
- DataFrames or Series as described below.
- .. versionadded:: 0.23
- Returns
- -------
- data : :class:`~sklearn.utils.Bunch`
- Dictionary-like object, with the following attributes.
- data : {ndarray, dataframe} of shape (150, 4)
- The data matrix. If `as_frame=True`, `data` will be a pandas
- DataFrame.
- target: {ndarray, Series} of shape (150,)
- The classification target. If `as_frame=True`, `target` will be
- a pandas Series.
- feature_names: list
- The names of the dataset columns.
- target_names: list
- The names of target classes.
- frame: DataFrame of shape (150, 5)
- Only present when `as_frame=True`. DataFrame with `data` and
- `target`.
- .. versionadded:: 0.23
- DESCR: str
- The full description of the dataset.
- filename: str
- The path to the location of the data.
- .. versionadded:: 0.20
- (data, target) : tuple if ``return_X_y`` is True
- A tuple of two ndarrays. The first contains a 2D array of shape
- (n_samples, n_features) with each row representing one sample and
- each column representing the features. The second ndarray of shape
- (n_samples,) contains the target samples.
- .. versionadded:: 0.18
- Notes
- -----
- .. versionchanged:: 0.20
- Fixed two wrong data points according to Fisher's paper.
- The new version is the same as in R, but not as in the UCI
- Machine Learning Repository.
- Examples
- --------
- Let's say you are interested in the samples 10, 25, and 50, and want to
- know their class name.
- >>> from sklearn.datasets import load_iris
- >>> data = load_iris()
- >>> data.target[[10, 25, 50]]
- array([0, 0, 1])
- >>> list(data.target_names)
- ['setosa', 'versicolor', 'virginica']
- See :ref:`sphx_glr_auto_examples_datasets_plot_iris_dataset.py` for a more
- detailed example of how to work with the iris dataset.
- """
- data_file_name = "iris.csv"
- data, target, target_names, fdescr = load_csv_data(
- data_file_name=data_file_name, descr_file_name="iris.rst"
- )
- feature_names = [
- "sepal length (cm)",
- "sepal width (cm)",
- "petal length (cm)",
- "petal width (cm)",
- ]
- frame = None
- target_columns = [
- "target",
- ]
- if as_frame:
- frame, data, target = _convert_data_dataframe(
- "load_iris", data, target, feature_names, target_columns
- )
- if return_X_y:
- return data, target
- return Bunch(
- data=data,
- target=target,
- frame=frame,
- target_names=target_names,
- DESCR=fdescr,
- feature_names=feature_names,
- filename=data_file_name,
- data_module=DATA_MODULE,
- )
- @validate_params(
- {"return_X_y": ["boolean"], "as_frame": ["boolean"]},
- prefer_skip_nested_validation=True,
- )
- def load_breast_cancer(*, return_X_y=False, as_frame=False):
- """Load and return the breast cancer wisconsin dataset (classification).
- The breast cancer dataset is a classic and very easy binary classification
- dataset.
- ================= ==============
- Classes 2
- Samples per class 212(M),357(B)
- Samples total 569
- Dimensionality 30
- Features real, positive
- ================= ==============
- The copy of the UCI ML Breast Cancer Wisconsin (Diagnostic) dataset is
- downloaded from:
- https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic
- Read more in the :ref:`User Guide <breast_cancer_dataset>`.
- Parameters
- ----------
- return_X_y : bool, default=False
- If True, returns ``(data, target)`` instead of a Bunch object.
- See below for more information about the `data` and `target` object.
- .. versionadded:: 0.18
- as_frame : bool, default=False
- If True, the data is a pandas DataFrame including columns with
- appropriate dtypes (numeric). The target is
- a pandas DataFrame or Series depending on the number of target columns.
- If `return_X_y` is True, then (`data`, `target`) will be pandas
- DataFrames or Series as described below.
- .. versionadded:: 0.23
- Returns
- -------
- data : :class:`~sklearn.utils.Bunch`
- Dictionary-like object, with the following attributes.
- data : {ndarray, dataframe} of shape (569, 30)
- The data matrix. If `as_frame=True`, `data` will be a pandas
- DataFrame.
- target : {ndarray, Series} of shape (569,)
- The classification target. If `as_frame=True`, `target` will be
- a pandas Series.
- feature_names : ndarray of shape (30,)
- The names of the dataset columns.
- target_names : ndarray of shape (2,)
- The names of target classes.
- frame : DataFrame of shape (569, 31)
- Only present when `as_frame=True`. DataFrame with `data` and
- `target`.
- .. versionadded:: 0.23
- DESCR : str
- The full description of the dataset.
- filename : str
- The path to the location of the data.
- .. versionadded:: 0.20
- (data, target) : tuple if ``return_X_y`` is True
- A tuple of two ndarrays by default. The first contains a 2D ndarray of
- shape (569, 30) with each row representing one sample and each column
- representing the features. The second ndarray of shape (569,) contains
- the target samples. If `as_frame=True`, both arrays are pandas objects,
- i.e. `X` a dataframe and `y` a series.
- .. versionadded:: 0.18
- Examples
- --------
- Let's say you are interested in the samples 10, 50, and 85, and want to
- know their class name.
- >>> from sklearn.datasets import load_breast_cancer
- >>> data = load_breast_cancer()
- >>> data.target[[10, 50, 85]]
- array([0, 1, 0])
- >>> list(data.target_names)
- ['malignant', 'benign']
- """
- data_file_name = "breast_cancer.csv"
- data, target, target_names, fdescr = load_csv_data(
- data_file_name=data_file_name, descr_file_name="breast_cancer.rst"
- )
- feature_names = np.array(
- [
- "mean radius",
- "mean texture",
- "mean perimeter",
- "mean area",
- "mean smoothness",
- "mean compactness",
- "mean concavity",
- "mean concave points",
- "mean symmetry",
- "mean fractal dimension",
- "radius error",
- "texture error",
- "perimeter error",
- "area error",
- "smoothness error",
- "compactness error",
- "concavity error",
- "concave points error",
- "symmetry error",
- "fractal dimension error",
- "worst radius",
- "worst texture",
- "worst perimeter",
- "worst area",
- "worst smoothness",
- "worst compactness",
- "worst concavity",
- "worst concave points",
- "worst symmetry",
- "worst fractal dimension",
- ]
- )
- frame = None
- target_columns = [
- "target",
- ]
- if as_frame:
- frame, data, target = _convert_data_dataframe(
- "load_breast_cancer", data, target, feature_names, target_columns
- )
- if return_X_y:
- return data, target
- return Bunch(
- data=data,
- target=target,
- frame=frame,
- target_names=target_names,
- DESCR=fdescr,
- feature_names=feature_names,
- filename=data_file_name,
- data_module=DATA_MODULE,
- )
- @validate_params(
- {
- "n_class": [Interval(Integral, 1, 10, closed="both")],
- "return_X_y": ["boolean"],
- "as_frame": ["boolean"],
- },
- prefer_skip_nested_validation=True,
- )
- def load_digits(*, n_class=10, return_X_y=False, as_frame=False):
- """Load and return the digits dataset (classification).
- Each datapoint is a 8x8 image of a digit.
- ================= ==============
- Classes 10
- Samples per class ~180
- Samples total 1797
- Dimensionality 64
- Features integers 0-16
- ================= ==============
- This is a copy of the test set of the UCI ML hand-written digits dataset
- https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits
- Read more in the :ref:`User Guide <digits_dataset>`.
- Parameters
- ----------
- n_class : int, default=10
- The number of classes to return. Between 1 and 10.
- return_X_y : bool, default=False
- If True, returns ``(data, target)`` instead of a Bunch object.
- See below for more information about the `data` and `target` object.
- .. versionadded:: 0.18
- as_frame : bool, default=False
- If True, the data is a pandas DataFrame including columns with
- appropriate dtypes (numeric). The target is
- a pandas DataFrame or Series depending on the number of target columns.
- If `return_X_y` is True, then (`data`, `target`) will be pandas
- DataFrames or Series as described below.
- .. versionadded:: 0.23
- Returns
- -------
- data : :class:`~sklearn.utils.Bunch`
- Dictionary-like object, with the following attributes.
- data : {ndarray, dataframe} of shape (1797, 64)
- The flattened data matrix. If `as_frame=True`, `data` will be
- a pandas DataFrame.
- target: {ndarray, Series} of shape (1797,)
- The classification target. If `as_frame=True`, `target` will be
- a pandas Series.
- feature_names: list
- The names of the dataset columns.
- target_names: list
- The names of target classes.
- .. versionadded:: 0.20
- frame: DataFrame of shape (1797, 65)
- Only present when `as_frame=True`. DataFrame with `data` and
- `target`.
- .. versionadded:: 0.23
- images: {ndarray} of shape (1797, 8, 8)
- The raw image data.
- DESCR: str
- The full description of the dataset.
- (data, target) : tuple if ``return_X_y`` is True
- A tuple of two ndarrays by default. The first contains a 2D ndarray of
- shape (1797, 64) with each row representing one sample and each column
- representing the features. The second ndarray of shape (1797) contains
- the target samples. If `as_frame=True`, both arrays are pandas objects,
- i.e. `X` a dataframe and `y` a series.
- .. versionadded:: 0.18
- Examples
- --------
- To load the data and visualize the images::
- >>> from sklearn.datasets import load_digits
- >>> digits = load_digits()
- >>> print(digits.data.shape)
- (1797, 64)
- >>> import matplotlib.pyplot as plt
- >>> plt.gray()
- >>> plt.matshow(digits.images[0])
- <...>
- >>> plt.show()
- """
- data, fdescr = load_gzip_compressed_csv_data(
- data_file_name="digits.csv.gz", descr_file_name="digits.rst", delimiter=","
- )
- target = data[:, -1].astype(int, copy=False)
- flat_data = data[:, :-1]
- images = flat_data.view()
- images.shape = (-1, 8, 8)
- if n_class < 10:
- idx = target < n_class
- flat_data, target = flat_data[idx], target[idx]
- images = images[idx]
- feature_names = [
- "pixel_{}_{}".format(row_idx, col_idx)
- for row_idx in range(8)
- for col_idx in range(8)
- ]
- frame = None
- target_columns = [
- "target",
- ]
- if as_frame:
- frame, flat_data, target = _convert_data_dataframe(
- "load_digits", flat_data, target, feature_names, target_columns
- )
- if return_X_y:
- return flat_data, target
- return Bunch(
- data=flat_data,
- target=target,
- frame=frame,
- feature_names=feature_names,
- target_names=np.arange(10),
- images=images,
- DESCR=fdescr,
- )
- @validate_params(
- {"return_X_y": ["boolean"], "as_frame": ["boolean"], "scaled": ["boolean"]},
- prefer_skip_nested_validation=True,
- )
- def load_diabetes(*, return_X_y=False, as_frame=False, scaled=True):
- """Load and return the diabetes dataset (regression).
- ============== ==================
- Samples total 442
- Dimensionality 10
- Features real, -.2 < x < .2
- Targets integer 25 - 346
- ============== ==================
- .. note::
- The meaning of each feature (i.e. `feature_names`) might be unclear
- (especially for `ltg`) as the documentation of the original dataset is
- not explicit. We provide information that seems correct with regard to
- the scientific literature in this field of research.
- Read more in the :ref:`User Guide <diabetes_dataset>`.
- Parameters
- ----------
- return_X_y : bool, default=False
- If True, returns ``(data, target)`` instead of a Bunch object.
- See below for more information about the `data` and `target` object.
- .. versionadded:: 0.18
- as_frame : bool, default=False
- If True, the data is a pandas DataFrame including columns with
- appropriate dtypes (numeric). The target is
- a pandas DataFrame or Series depending on the number of target columns.
- If `return_X_y` is True, then (`data`, `target`) will be pandas
- DataFrames or Series as described below.
- .. versionadded:: 0.23
- scaled : bool, default=True
- If True, the feature variables are mean centered and scaled by the
- standard deviation times the square root of `n_samples`.
- If False, raw data is returned for the feature variables.
- .. versionadded:: 1.1
- Returns
- -------
- data : :class:`~sklearn.utils.Bunch`
- Dictionary-like object, with the following attributes.
- data : {ndarray, dataframe} of shape (442, 10)
- The data matrix. If `as_frame=True`, `data` will be a pandas
- DataFrame.
- target: {ndarray, Series} of shape (442,)
- The regression target. If `as_frame=True`, `target` will be
- a pandas Series.
- feature_names: list
- The names of the dataset columns.
- frame: DataFrame of shape (442, 11)
- Only present when `as_frame=True`. DataFrame with `data` and
- `target`.
- .. versionadded:: 0.23
- DESCR: str
- The full description of the dataset.
- data_filename: str
- The path to the location of the data.
- target_filename: str
- The path to the location of the target.
- (data, target) : tuple if ``return_X_y`` is True
- A tuple of two ndarrays. The first contains a 2D array of shape
- (442, 10) with each row representing one sample and each column
- representing the features. The second array of shape (442,) contains
- the target values.
- .. versionadded:: 0.18
- """
- data_filename = "diabetes_data_raw.csv.gz"
- target_filename = "diabetes_target.csv.gz"
- data = load_gzip_compressed_csv_data(data_filename)
- target = load_gzip_compressed_csv_data(target_filename)
- if scaled:
- data = scale(data, copy=False)
- data /= data.shape[0] ** 0.5
- fdescr = load_descr("diabetes.rst")
- feature_names = ["age", "sex", "bmi", "bp", "s1", "s2", "s3", "s4", "s5", "s6"]
- frame = None
- target_columns = [
- "target",
- ]
- if as_frame:
- frame, data, target = _convert_data_dataframe(
- "load_diabetes", data, target, feature_names, target_columns
- )
- if return_X_y:
- return data, target
- return Bunch(
- data=data,
- target=target,
- frame=frame,
- DESCR=fdescr,
- feature_names=feature_names,
- data_filename=data_filename,
- target_filename=target_filename,
- data_module=DATA_MODULE,
- )
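- # Illustrative usage sketch for `load_diabetes`; shapes follow the docstring
- # above (442 samples, 10 features):
- # >>> from sklearn.datasets import load_diabetes
- # >>> X, y = load_diabetes(return_X_y=True)      # doctest: +SKIP
- # >>> X.shape, y.shape
- # ((442, 10), (442,))
- # >>> raw = load_diabetes(scaled=False)          # doctest: +SKIP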
- @validate_params(
- {
- "return_X_y": ["boolean"],
- "as_frame": ["boolean"],
- },
- prefer_skip_nested_validation=True,
- )
- def load_linnerud(*, return_X_y=False, as_frame=False):
- """Load and return the physical exercise Linnerud dataset.
- This dataset is suitable for multi-output regression tasks.
- ============== ============================
- Samples total 20
- Dimensionality 3 (for both data and target)
- Features integer
- Targets integer
- ============== ============================
- Read more in the :ref:`User Guide <linnerrud_dataset>`.
- Parameters
- ----------
- return_X_y : bool, default=False
- If True, returns ``(data, target)`` instead of a Bunch object.
- See below for more information about the `data` and `target` object.
- .. versionadded:: 0.18
- as_frame : bool, default=False
- If True, the data is a pandas DataFrame including columns with
- appropriate dtypes (numeric, string or categorical). The target is
- a pandas DataFrame or Series depending on the number of target columns.
- If `return_X_y` is True, then (`data`, `target`) will be pandas
- DataFrames or Series as described below.
- .. versionadded:: 0.23
- Returns
- -------
- data : :class:`~sklearn.utils.Bunch`
- Dictionary-like object, with the following attributes.
- data : {ndarray, dataframe} of shape (20, 3)
- The data matrix. If `as_frame=True`, `data` will be a pandas
- DataFrame.
- target: {ndarray, dataframe} of shape (20, 3)
- The regression targets. If `as_frame=True`, `target` will be
- a pandas DataFrame.
- feature_names: list
- The names of the dataset columns.
- target_names: list
- The names of the target columns.
- frame: DataFrame of shape (20, 6)
- Only present when `as_frame=True`. DataFrame with `data` and
- `target`.
- .. versionadded:: 0.23
- DESCR: str
- The full description of the dataset.
- data_filename: str
- The path to the location of the data.
- target_filename: str
- The path to the location of the target.
- .. versionadded:: 0.20
- (data, target) : tuple if ``return_X_y`` is True
- Returns a tuple of two ndarrays or dataframes of shape
- `(20, 3)`. Each row represents one sample and each column represents the
- features in `X` and a target in `y` of a given sample.
- .. versionadded:: 0.18
- """
- data_filename = "linnerud_exercise.csv"
- target_filename = "linnerud_physiological.csv"
- # Read header and data
- with _open_text(DATA_MODULE, data_filename) as f:
- header_exercise = f.readline().split()
- f.seek(0) # reset file obj
- data_exercise = np.loadtxt(f, skiprows=1)
- with _open_text(DATA_MODULE, target_filename) as f:
- header_physiological = f.readline().split()
- f.seek(0) # reset file obj
- data_physiological = np.loadtxt(f, skiprows=1)
- fdescr = load_descr("linnerud.rst")
- frame = None
- if as_frame:
- (frame, data_exercise, data_physiological) = _convert_data_dataframe(
- "load_linnerud",
- data_exercise,
- data_physiological,
- header_exercise,
- header_physiological,
- )
- if return_X_y:
- return data_exercise, data_physiological
- return Bunch(
- data=data_exercise,
- feature_names=header_exercise,
- target=data_physiological,
- target_names=header_physiological,
- frame=frame,
- DESCR=fdescr,
- data_filename=data_filename,
- target_filename=target_filename,
- data_module=DATA_MODULE,
- )
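- # Illustrative usage sketch for `load_linnerud` (multi-output regression;
- # shapes follow the docstring above):
- # >>> from sklearn.datasets import load_linnerud
- # >>> X, Y = load_linnerud(return_X_y=True)      # doctest: +SKIP
- # >>> X.shape, Y.shape
- # ((20, 3), (20, 3))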
- def load_sample_images():
- """Load sample images for image manipulation.
- Loads both ``china`` and ``flower``.
- Read more in the :ref:`User Guide <sample_images>`.
- Returns
- -------
- data : :class:`~sklearn.utils.Bunch`
- Dictionary-like object, with the following attributes.
- images : list of ndarray of shape (427, 640, 3)
- The two sample images.
- filenames : list
- The filenames for the images.
- DESCR : str
- The full description of the dataset.
- Examples
- --------
- To load the data and visualize the images:
- >>> from sklearn.datasets import load_sample_images
- >>> dataset = load_sample_images() #doctest: +SKIP
- >>> len(dataset.images) #doctest: +SKIP
- 2
- >>> first_img_data = dataset.images[0] #doctest: +SKIP
- >>> first_img_data.shape #doctest: +SKIP
- (427, 640, 3)
- >>> first_img_data.dtype #doctest: +SKIP
- dtype('uint8')
- """
- try:
- from PIL import Image
- except ImportError:
- raise ImportError(
- "The Python Imaging Library (PIL) is required to load data "
- "from jpeg files. Please refer to "
- "https://pillow.readthedocs.io/en/stable/installation.html "
- "for installing PIL."
- )
- descr = load_descr("README.txt", descr_module=IMAGES_MODULE)
- filenames, images = [], []
- for filename in sorted(_contents(IMAGES_MODULE)):
- if filename.endswith(".jpg"):
- filenames.append(filename)
- with _open_binary(IMAGES_MODULE, filename) as image_file:
- pil_image = Image.open(image_file)
- image = np.asarray(pil_image)
- images.append(image)
- return Bunch(images=images, filenames=filenames, DESCR=descr)
- @validate_params(
- {
- "image_name": [StrOptions({"china.jpg", "flower.jpg"})],
- },
- prefer_skip_nested_validation=True,
- )
- def load_sample_image(image_name):
- """Load the numpy array of a single sample image.
- Read more in the :ref:`User Guide <sample_images>`.
- Parameters
- ----------
- image_name : {`china.jpg`, `flower.jpg`}
- The name of the sample image loaded.
- Returns
- -------
- img : 3D array
- The image as a numpy array: height x width x color.
- Examples
- --------
- >>> from sklearn.datasets import load_sample_image
- >>> china = load_sample_image('china.jpg') # doctest: +SKIP
- >>> china.dtype # doctest: +SKIP
- dtype('uint8')
- >>> china.shape # doctest: +SKIP
- (427, 640, 3)
- >>> flower = load_sample_image('flower.jpg') # doctest: +SKIP
- >>> flower.dtype # doctest: +SKIP
- dtype('uint8')
- >>> flower.shape # doctest: +SKIP
- (427, 640, 3)
- """
- images = load_sample_images()
- index = None
- for i, filename in enumerate(images.filenames):
- if filename.endswith(image_name):
- index = i
- break
- if index is None:
- raise AttributeError("Cannot find sample image: %s" % image_name)
- return images.images[index]
- def _pkl_filepath(*args, **kwargs):
- """Return filename for Python 3 pickles
- args[-1] is expected to be the ".pkl" filename. For compatibility with
- older scikit-learn versions, a suffix is inserted before the extension.
- _pkl_filepath('/path/to/folder', 'filename.pkl') returns
- '/path/to/folder/filename_py3.pkl'
- """
- py3_suffix = kwargs.get("py3_suffix", "_py3")
- basename, ext = splitext(args[-1])
- basename += py3_suffix
- new_args = args[:-1] + (basename + ext,)
- return join(*new_args)
- def _sha256(path):
- """Calculate the sha256 hash of the file at path."""
- sha256hash = hashlib.sha256()
- chunk_size = 8192
- with open(path, "rb") as f:
- while True:
- buffer = f.read(chunk_size)
- if not buffer:
- break
- sha256hash.update(buffer)
- return sha256hash.hexdigest()
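- # Self-contained sketch of `_sha256` on a temporary file; the digest shown is
- # the well-known SHA256 of b"abc":
- # >>> import os, tempfile
- # >>> with tempfile.NamedTemporaryFile(delete=False) as tmp:
- # ...     _ = tmp.write(b"abc")
- # >>> _sha256(tmp.name)                          # doctest: +SKIP
- # 'ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad'
- # >>> os.unlink(tmp.name)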
- def _fetch_remote(remote, dirname=None):
- """Helper function to download a remote dataset into path
- Fetch the dataset pointed to by remote's url, save it into path using
- remote's filename and ensure its integrity based on the SHA256 checksum
- of the downloaded file.
- Parameters
- ----------
- remote : RemoteFileMetadata
- Named tuple containing remote dataset meta information: url, filename
- and checksum.
- dirname : str, default=None
- Directory to save the file to. If None, the file is saved in the
- current working directory.
- Returns
- -------
- file_path: str
- Full path of the created file.
- """
- file_path = remote.filename if dirname is None else join(dirname, remote.filename)
- urlretrieve(remote.url, file_path)
- checksum = _sha256(file_path)
- if remote.checksum != checksum:
- raise OSError(
- "{} has an SHA256 checksum ({}) "
- "differing from expected ({}), "
- "file may be corrupted.".format(file_path, checksum, remote.checksum)
- )
- return file_path
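- # Sketch of how the fetchers in this package combine `RemoteFileMetadata`
- # with `_fetch_remote`; the url and checksum below are placeholders, not a
- # real downloadable resource:
- # >>> remote = RemoteFileMetadata(
- # ...     filename="some_archive.tgz",
- # ...     url="https://example.com/some_archive.tgz",
- # ...     checksum="<expected sha256 hex digest>",
- # ... )
- # >>> path = _fetch_remote(remote, dirname=get_data_home())  # doctest: +SKIP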