  1. """
  2. Base IO code for all datasets
  3. """
  4. # Copyright (c) 2007 David Cournapeau <cournape@gmail.com>
  5. # 2010 Fabian Pedregosa <fabian.pedregosa@inria.fr>
  6. # 2010 Olivier Grisel <olivier.grisel@ensta.org>
  7. # License: BSD 3 clause
  8. import csv
  9. import gzip
  10. import hashlib
  11. import os
  12. import shutil
  13. from collections import namedtuple
  14. from numbers import Integral
  15. from os import environ, listdir, makedirs
  16. from os.path import expanduser, isdir, join, splitext
  17. from pathlib import Path
  18. from urllib.request import urlretrieve
  19. import numpy as np
  20. from ..preprocessing import scale
  21. from ..utils import Bunch, check_pandas_support, check_random_state
  22. from ..utils._param_validation import Interval, StrOptions, validate_params
  23. from ..utils.fixes import _contents, _open_binary, _open_text, _read_text
  24. DATA_MODULE = "sklearn.datasets.data"
  25. DESCR_MODULE = "sklearn.datasets.descr"
  26. IMAGES_MODULE = "sklearn.datasets.images"
  27. RemoteFileMetadata = namedtuple("RemoteFileMetadata", ["filename", "url", "checksum"])
  28. @validate_params(
  29. {
  30. "data_home": [str, os.PathLike, None],
  31. },
  32. prefer_skip_nested_validation=True,
  33. )
  34. def get_data_home(data_home=None) -> str:
  35. """Return the path of the scikit-learn data directory.
  36. This folder is used by some large dataset loaders to avoid downloading the
  37. data several times.
  38. By default the data directory is set to a folder named 'scikit_learn_data' in the
  39. user home folder.
  40. Alternatively, it can be set by the 'SCIKIT_LEARN_DATA' environment
  41. variable or programmatically by giving an explicit folder path. The '~'
  42. symbol is expanded to the user home folder.
  43. If the folder does not already exist, it is automatically created.
  44. Parameters
  45. ----------
  46. data_home : str or path-like, default=None
  47. The path to scikit-learn data directory. If `None`, the default path
  48. is `~/scikit_learn_data`.
  49. Returns
  50. -------
  51. data_home: str
  52. The path to scikit-learn data directory.
  53. """
  54. if data_home is None:
  55. data_home = environ.get("SCIKIT_LEARN_DATA", join("~", "scikit_learn_data"))
  56. data_home = expanduser(data_home)
  57. makedirs(data_home, exist_ok=True)
  58. return data_home
  59. @validate_params(
  60. {
  61. "data_home": [str, os.PathLike, None],
  62. },
  63. prefer_skip_nested_validation=True,
  64. )
  65. def clear_data_home(data_home=None):
  66. """Delete all the content of the data home cache.
  67. Parameters
  68. ----------
  69. data_home : str or path-like, default=None
  70. The path to scikit-learn data directory. If `None`, the default path
  71. is `~/scikit_learn_data`.
  72. """
  73. data_home = get_data_home(data_home)
  74. shutil.rmtree(data_home)
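

# Illustrative usage sketch for the two helpers above: `get_data_home` resolves and
# creates the cache directory, `clear_data_home` removes it. The exact path depends on
# the user's home directory and the SCIKIT_LEARN_DATA environment variable.
#
#   >>> from sklearn.datasets import get_data_home, clear_data_home
#   >>> path = get_data_home()        # e.g. '/home/user/scikit_learn_data'
#   >>> import os; os.path.isdir(path)
#   True
#   >>> clear_data_home(path)         # deletes the whole cache directory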


def _convert_data_dataframe(
    caller_name, data, target, feature_names, target_names, sparse_data=False
):
    pd = check_pandas_support("{} with as_frame=True".format(caller_name))
    if not sparse_data:
        data_df = pd.DataFrame(data, columns=feature_names, copy=False)
    else:
        data_df = pd.DataFrame.sparse.from_spmatrix(data, columns=feature_names)

    target_df = pd.DataFrame(target, columns=target_names)
    combined_df = pd.concat([data_df, target_df], axis=1)
    X = combined_df[feature_names]
    y = combined_df[target_names]
    if y.shape[1] == 1:
        y = y.iloc[:, 0]
    return combined_df, X, y
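

# Sketch of what `_convert_data_dataframe` returns (assuming pandas is installed):
# a combined frame plus the X / y views used by the `as_frame=True` loaders below.
# The inputs here are made up for illustration only.
#
#   >>> import numpy as np
#   >>> frame, X, y = _convert_data_dataframe(
#   ...     "example", np.eye(3), np.arange(3), ["a", "b", "c"], ["target"]
#   ... )
#   >>> list(frame.columns)
#   ['a', 'b', 'c', 'target']
#   >>> y.shape          # a single target column is squeezed to a Series
#   (3,)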


@validate_params(
    {
        "container_path": [str, os.PathLike],
        "description": [str, None],
        "categories": [list, None],
        "load_content": ["boolean"],
        "shuffle": ["boolean"],
        "encoding": [str, None],
        "decode_error": [StrOptions({"strict", "ignore", "replace"})],
        "random_state": ["random_state"],
        "allowed_extensions": [list, None],
    },
    prefer_skip_nested_validation=True,
)
def load_files(
    container_path,
    *,
    description=None,
    categories=None,
    load_content=True,
    shuffle=True,
    encoding=None,
    decode_error="strict",
    random_state=0,
    allowed_extensions=None,
):
    """Load text files with categories as subfolder names.

    Individual samples are assumed to be files stored in a two-level folder
    structure such as the following:

        container_folder/
            category_1_folder/
                file_1.txt
                file_2.txt
                ...
                file_42.txt
            category_2_folder/
                file_43.txt
                file_44.txt
                ...

    The folder names are used as supervised signal label names. The individual
    file names are not important.

    This function does not try to extract features into a numpy array or scipy
    sparse matrix. In addition, if load_content is false it does not try to
    load the files in memory.

    To use text files in a scikit-learn classification or clustering algorithm,
    you will need to use the :mod:`~sklearn.feature_extraction.text` module to
    build a feature extraction transformer that suits your problem.

    If you set load_content=True, you should also specify the encoding of the
    text using the 'encoding' parameter. For many modern text files, 'utf-8'
    will be the correct encoding. If you leave encoding equal to None, then the
    content will be made of bytes instead of Unicode, and you will not be able
    to use most functions in :mod:`~sklearn.feature_extraction.text`.

    Similar feature extractors should be built for other kinds of unstructured
    data input such as images, audio, video, ...

    If you want files with a specific file extension (e.g. `.txt`) then you
    can pass a list of those file extensions to `allowed_extensions`.

    Read more in the :ref:`User Guide <datasets>`.

    Parameters
    ----------
    container_path : str or path-like
        Path to the main folder holding one subfolder per category.

    description : str, default=None
        A paragraph describing the characteristic of the dataset: its source,
        reference, etc.

    categories : list of str, default=None
        If None (default), load all the categories. If not None, list of
        category names to load (other categories ignored).

    load_content : bool, default=True
        Whether or not to load the content of the different files. If true a
        'data' attribute containing the text information is present in the data
        structure returned. If not, a filenames attribute gives the path to the
        files.

    shuffle : bool, default=True
        Whether or not to shuffle the data: might be important for models that
        make the assumption that the samples are independent and identically
        distributed (i.i.d.), such as stochastic gradient descent.

    encoding : str, default=None
        If None, do not try to decode the content of the files (e.g. for images
        or other non-text content). If not None, encoding to use to decode text
        files to Unicode if load_content is True.

    decode_error : {'strict', 'ignore', 'replace'}, default='strict'
        Instruction on what to do if a byte sequence is given to analyze that
        contains characters not of the given `encoding`. Passed as keyword
        argument 'errors' to bytes.decode.

    random_state : int, RandomState instance or None, default=0
        Determines random number generation for dataset shuffling. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    allowed_extensions : list of str, default=None
        List of desired file extensions to filter the files to be loaded.

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : list of str
            Only present when `load_content=True`.
            The raw text data to learn.
        target : ndarray
            The target labels (integer index).
        target_names : list
            The names of target classes.
        DESCR : str
            The full description of the dataset.
        filenames : ndarray
            The filenames holding the dataset.
    """
    target = []
    target_names = []
    filenames = []

    folders = [
        f for f in sorted(listdir(container_path)) if isdir(join(container_path, f))
    ]

    if categories is not None:
        folders = [f for f in folders if f in categories]

    if allowed_extensions is not None:
        allowed_extensions = frozenset(allowed_extensions)

    for label, folder in enumerate(folders):
        target_names.append(folder)
        folder_path = join(container_path, folder)
        files = sorted(listdir(folder_path))
        if allowed_extensions is not None:
            documents = [
                join(folder_path, file)
                for file in files
                if os.path.splitext(file)[1] in allowed_extensions
            ]
        else:
            documents = [join(folder_path, file) for file in files]
        target.extend(len(documents) * [label])
        filenames.extend(documents)

    # convert to array for fancy indexing
    filenames = np.array(filenames)
    target = np.array(target)

    if shuffle:
        random_state = check_random_state(random_state)
        indices = np.arange(filenames.shape[0])
        random_state.shuffle(indices)
        filenames = filenames[indices]
        target = target[indices]

    if load_content:
        data = []
        for filename in filenames:
            data.append(Path(filename).read_bytes())
        if encoding is not None:
            data = [d.decode(encoding, decode_error) for d in data]
        return Bunch(
            data=data,
            filenames=filenames,
            target_names=target_names,
            target=target,
            DESCR=description,
        )

    return Bunch(
        filenames=filenames, target_names=target_names, target=target, DESCR=description
    )
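

# Minimal usage sketch for `load_files`, assuming a directory laid out as described in
# the docstring (one subfolder per category). The path below is hypothetical.
#
#   >>> from sklearn.datasets import load_files
#   >>> corpus = load_files("/path/to/container_folder", encoding="utf-8")  # doctest: +SKIP
#   >>> corpus.target_names                                                 # doctest: +SKIP
#   ['category_1_folder', 'category_2_folder']
#   >>> len(corpus.data) == len(corpus.filenames) == corpus.target.shape[0] # doctest: +SKIP
#   True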


def load_csv_data(
    data_file_name,
    *,
    data_module=DATA_MODULE,
    descr_file_name=None,
    descr_module=DESCR_MODULE,
):
    """Loads `data_file_name` from `data_module` with `importlib.resources`.

    Parameters
    ----------
    data_file_name : str
        Name of csv file to be loaded from `data_module/data_file_name`.
        For example `'wine_data.csv'`.

    data_module : str or module, default='sklearn.datasets.data'
        Module where data lives. The default is `'sklearn.datasets.data'`.

    descr_file_name : str, default=None
        Name of rst file to be loaded from `descr_module/descr_file_name`.
        For example `'wine_data.rst'`. See also :func:`load_descr`.
        If not None, also returns the corresponding description of
        the dataset.

    descr_module : str or module, default='sklearn.datasets.descr'
        Module where `descr_file_name` lives. See also :func:`load_descr`.
        The default is `'sklearn.datasets.descr'`.

    Returns
    -------
    data : ndarray of shape (n_samples, n_features)
        A 2D array with each row representing one sample and each column
        representing the features of a given sample.

    target : ndarray of shape (n_samples,)
        A 1D array holding target variables for all the samples in `data`.
        For example target[0] is the target variable for data[0].

    target_names : ndarray of shape (n_classes,)
        A 1D array containing the names of the classifications. For example
        target_names[0] is the name of the target[0] class.

    descr : str, optional
        Description of the dataset (the content of `descr_file_name`).
        Only returned if `descr_file_name` is not None.
    """
    with _open_text(data_module, data_file_name) as csv_file:
        data_file = csv.reader(csv_file)
        temp = next(data_file)
        n_samples = int(temp[0])
        n_features = int(temp[1])
        target_names = np.array(temp[2:])
        data = np.empty((n_samples, n_features))
        target = np.empty((n_samples,), dtype=int)

        for i, ir in enumerate(data_file):
            data[i] = np.asarray(ir[:-1], dtype=np.float64)
            target[i] = np.asarray(ir[-1], dtype=int)

    if descr_file_name is None:
        return data, target, target_names
    else:
        assert descr_module is not None
        descr = load_descr(descr_module=descr_module, descr_file_name=descr_file_name)
        return data, target, target_names, descr
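

# Example of how the bundled loaders use `load_csv_data`: the wine CSV ships with
# scikit-learn, so this reads directly from the package data module.
#
#   >>> data, target, target_names, descr = load_csv_data(
#   ...     data_file_name="wine_data.csv", descr_file_name="wine_data.rst"
#   ... )
#   >>> data.shape, target.shape
#   ((178, 13), (178,))
#   >>> list(target_names)
#   ['class_0', 'class_1', 'class_2']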


def load_gzip_compressed_csv_data(
    data_file_name,
    *,
    data_module=DATA_MODULE,
    descr_file_name=None,
    descr_module=DESCR_MODULE,
    encoding="utf-8",
    **kwargs,
):
    """Loads gzip-compressed csv data from `data_module` with `importlib.resources`.

    1) Open resource file with `importlib.resources.open_binary`
    2) Decompress file obj with `gzip.open`
    3) Load decompressed data with `np.loadtxt`

    Parameters
    ----------
    data_file_name : str
        Name of gzip-compressed csv file (`'*.csv.gz'`) to be loaded from
        `data_module/data_file_name`. For example `'diabetes_data.csv.gz'`.

    data_module : str or module, default='sklearn.datasets.data'
        Module where data lives. The default is `'sklearn.datasets.data'`.

    descr_file_name : str, default=None
        Name of rst file to be loaded from `descr_module/descr_file_name`.
        For example `'wine_data.rst'`. See also :func:`load_descr`.
        If not None, also returns the corresponding description of
        the dataset.

    descr_module : str or module, default='sklearn.datasets.descr'
        Module where `descr_file_name` lives. See also :func:`load_descr`.
        The default is `'sklearn.datasets.descr'`.

    encoding : str, default="utf-8"
        Name of the encoding that the gzip-decompressed file will be
        decoded with. The default is 'utf-8'.

    **kwargs : dict, optional
        Keyword arguments to be passed to `np.loadtxt`;
        e.g. delimiter=','.

    Returns
    -------
    data : ndarray of shape (n_samples, n_features)
        A 2D array with each row representing one sample and each column
        representing the features and/or target of a given sample.

    descr : str, optional
        Description of the dataset (the content of `descr_file_name`).
        Only returned if `descr_file_name` is not None.
    """
    with _open_binary(data_module, data_file_name) as compressed_file:
        compressed_file = gzip.open(compressed_file, mode="rt", encoding=encoding)
        data = np.loadtxt(compressed_file, **kwargs)

    if descr_file_name is None:
        return data
    else:
        assert descr_module is not None
        descr = load_descr(descr_module=descr_module, descr_file_name=descr_file_name)
        return data, descr
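

# Example mirroring `load_digits` below: the gzip-compressed CSV is read from the
# package data module and any extra keyword (here `delimiter`) is forwarded to
# `np.loadtxt`.
#
#   >>> data, descr = load_gzip_compressed_csv_data(
#   ...     data_file_name="digits.csv.gz", descr_file_name="digits.rst", delimiter=","
#   ... )
#   >>> data.shape          # 64 pixel columns plus the target column
#   (1797, 65)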


def load_descr(descr_file_name, *, descr_module=DESCR_MODULE):
    """Load `descr_file_name` from `descr_module` with `importlib.resources`.

    Parameters
    ----------
    descr_file_name : str, default=None
        Name of rst file to be loaded from `descr_module/descr_file_name`.
        For example `'wine_data.rst'`. See also :func:`load_descr`.
        If not None, also returns the corresponding description of
        the dataset.

    descr_module : str or module, default='sklearn.datasets.descr'
        Module where `descr_file_name` lives. See also :func:`load_descr`.
        The default is `'sklearn.datasets.descr'`.

    Returns
    -------
    fdescr : str
        Content of `descr_file_name`.
    """
    fdescr = _read_text(descr_module, descr_file_name)
    return fdescr
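

# Example: fetch the bundled description of the iris dataset as plain text. The exact
# first line of the file is not guaranteed here, hence the skip marker.
#
#   >>> fdescr = load_descr("iris.rst")
#   >>> fdescr.splitlines()[0]   # doctest: +SKIP
#   '.. _iris_dataset:'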


@validate_params(
    {
        "return_X_y": ["boolean"],
        "as_frame": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def load_wine(*, return_X_y=False, as_frame=False):
    """Load and return the wine dataset (classification).

    .. versionadded:: 0.18

    The wine dataset is a classic and very easy multi-class classification
    dataset.

    =================   ==============
    Classes             3
    Samples per class   [59,71,48]
    Samples total       178
    Dimensionality      13
    Features            real, positive
    =================   ==============

    A copy of the UCI ML Wine Data Set is downloaded and modified to fit the
    standard format from:
    https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data

    Read more in the :ref:`User Guide <wine_dataset>`.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is
        a pandas DataFrame or Series depending on the number of target columns.
        If `return_X_y` is True, then (`data`, `target`) will be pandas
        DataFrames or Series as described below.

        .. versionadded:: 0.23

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (178, 13)
            The data matrix. If `as_frame=True`, `data` will be a pandas
            DataFrame.
        target : {ndarray, Series} of shape (178,)
            The classification target. If `as_frame=True`, `target` will be
            a pandas Series.
        feature_names : list
            The names of the dataset columns.
        target_names : list
            The names of target classes.
        frame : DataFrame of shape (178, 14)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

            .. versionadded:: 0.23
        DESCR : str
            The full description of the dataset.

    (data, target) : tuple if ``return_X_y`` is True
        A tuple of two ndarrays by default. The first contains a 2D array of shape
        (178, 13) with each row representing one sample and each column representing
        the features. The second array of shape (178,) contains the target samples.

    Examples
    --------
    Let's say you are interested in the samples 10, 80, and 140, and want to
    know their class name.

    >>> from sklearn.datasets import load_wine
    >>> data = load_wine()
    >>> data.target[[10, 80, 140]]
    array([0, 1, 2])
    >>> list(data.target_names)
    ['class_0', 'class_1', 'class_2']
    """
    data, target, target_names, fdescr = load_csv_data(
        data_file_name="wine_data.csv", descr_file_name="wine_data.rst"
    )

    feature_names = [
        "alcohol",
        "malic_acid",
        "ash",
        "alcalinity_of_ash",
        "magnesium",
        "total_phenols",
        "flavanoids",
        "nonflavanoid_phenols",
        "proanthocyanins",
        "color_intensity",
        "hue",
        "od280/od315_of_diluted_wines",
        "proline",
    ]

    frame = None
    target_columns = [
        "target",
    ]
    if as_frame:
        frame, data, target = _convert_data_dataframe(
            "load_wine", data, target, feature_names, target_columns
        )

    if return_X_y:
        return data, target

    return Bunch(
        data=data,
        target=target,
        frame=frame,
        target_names=target_names,
        DESCR=fdescr,
        feature_names=feature_names,
    )


@validate_params(
    {"return_X_y": ["boolean"], "as_frame": ["boolean"]},
    prefer_skip_nested_validation=True,
)
def load_iris(*, return_X_y=False, as_frame=False):
    """Load and return the iris dataset (classification).

    The iris dataset is a classic and very easy multi-class classification
    dataset.

    =================   ==============
    Classes             3
    Samples per class   50
    Samples total       150
    Dimensionality      4
    Features            real, positive
    =================   ==============

    Read more in the :ref:`User Guide <iris_dataset>`.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object. See
        below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is
        a pandas DataFrame or Series depending on the number of target columns.
        If `return_X_y` is True, then (`data`, `target`) will be pandas
        DataFrames or Series as described below.

        .. versionadded:: 0.23

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (150, 4)
            The data matrix. If `as_frame=True`, `data` will be a pandas
            DataFrame.
        target : {ndarray, Series} of shape (150,)
            The classification target. If `as_frame=True`, `target` will be
            a pandas Series.
        feature_names : list
            The names of the dataset columns.
        target_names : list
            The names of target classes.
        frame : DataFrame of shape (150, 5)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

            .. versionadded:: 0.23
        DESCR : str
            The full description of the dataset.
        filename : str
            The path to the location of the data.

            .. versionadded:: 0.20

    (data, target) : tuple if ``return_X_y`` is True
        A tuple of two ndarray. The first containing a 2D array of shape
        (n_samples, n_features) with each row representing one sample and
        each column representing the features. The second ndarray of shape
        (n_samples,) containing the target samples.

        .. versionadded:: 0.18

    Notes
    -----
    .. versionchanged:: 0.20
        Fixed two wrong data points according to Fisher's paper.
        The new version is the same as in R, but not as in the UCI
        Machine Learning Repository.

    Examples
    --------
    Let's say you are interested in the samples 10, 25, and 50, and want to
    know their class name.

    >>> from sklearn.datasets import load_iris
    >>> data = load_iris()
    >>> data.target[[10, 25, 50]]
    array([0, 0, 1])
    >>> list(data.target_names)
    ['setosa', 'versicolor', 'virginica']

    See :ref:`sphx_glr_auto_examples_datasets_plot_iris_dataset.py` for a more
    detailed example of how to work with the iris dataset.
    """
    data_file_name = "iris.csv"
    data, target, target_names, fdescr = load_csv_data(
        data_file_name=data_file_name, descr_file_name="iris.rst"
    )

    feature_names = [
        "sepal length (cm)",
        "sepal width (cm)",
        "petal length (cm)",
        "petal width (cm)",
    ]

    frame = None
    target_columns = [
        "target",
    ]
    if as_frame:
        frame, data, target = _convert_data_dataframe(
            "load_iris", data, target, feature_names, target_columns
        )

    if return_X_y:
        return data, target

    return Bunch(
        data=data,
        target=target,
        frame=frame,
        target_names=target_names,
        DESCR=fdescr,
        feature_names=feature_names,
        filename=data_file_name,
        data_module=DATA_MODULE,
    )


@validate_params(
    {"return_X_y": ["boolean"], "as_frame": ["boolean"]},
    prefer_skip_nested_validation=True,
)
def load_breast_cancer(*, return_X_y=False, as_frame=False):
    """Load and return the breast cancer wisconsin dataset (classification).

    The breast cancer dataset is a classic and very easy binary classification
    dataset.

    =================   ==============
    Classes             2
    Samples per class   212(M),357(B)
    Samples total       569
    Dimensionality      30
    Features            real, positive
    =================   ==============

    A copy of the UCI ML Breast Cancer Wisconsin (Diagnostic) dataset is
    downloaded from:
    https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic

    Read more in the :ref:`User Guide <breast_cancer_dataset>`.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is
        a pandas DataFrame or Series depending on the number of target columns.
        If `return_X_y` is True, then (`data`, `target`) will be pandas
        DataFrames or Series as described below.

        .. versionadded:: 0.23

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (569, 30)
            The data matrix. If `as_frame=True`, `data` will be a pandas
            DataFrame.
        target : {ndarray, Series} of shape (569,)
            The classification target. If `as_frame=True`, `target` will be
            a pandas Series.
        feature_names : ndarray of shape (30,)
            The names of the dataset columns.
        target_names : ndarray of shape (2,)
            The names of target classes.
        frame : DataFrame of shape (569, 31)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

            .. versionadded:: 0.23
        DESCR : str
            The full description of the dataset.
        filename : str
            The path to the location of the data.

            .. versionadded:: 0.20

    (data, target) : tuple if ``return_X_y`` is True
        A tuple of two ndarrays by default. The first contains a 2D ndarray of
        shape (569, 30) with each row representing one sample and each column
        representing the features. The second ndarray of shape (569,) contains
        the target samples. If `as_frame=True`, both arrays are pandas objects,
        i.e. `X` a dataframe and `y` a series.

        .. versionadded:: 0.18

    Examples
    --------
    Let's say you are interested in the samples 10, 50, and 85, and want to
    know their class name.

    >>> from sklearn.datasets import load_breast_cancer
    >>> data = load_breast_cancer()
    >>> data.target[[10, 50, 85]]
    array([0, 1, 0])
    >>> list(data.target_names)
    ['malignant', 'benign']
    """
    data_file_name = "breast_cancer.csv"
    data, target, target_names, fdescr = load_csv_data(
        data_file_name=data_file_name, descr_file_name="breast_cancer.rst"
    )

    feature_names = np.array(
        [
            "mean radius",
            "mean texture",
            "mean perimeter",
            "mean area",
            "mean smoothness",
            "mean compactness",
            "mean concavity",
            "mean concave points",
            "mean symmetry",
            "mean fractal dimension",
            "radius error",
            "texture error",
            "perimeter error",
            "area error",
            "smoothness error",
            "compactness error",
            "concavity error",
            "concave points error",
            "symmetry error",
            "fractal dimension error",
            "worst radius",
            "worst texture",
            "worst perimeter",
            "worst area",
            "worst smoothness",
            "worst compactness",
            "worst concavity",
            "worst concave points",
            "worst symmetry",
            "worst fractal dimension",
        ]
    )

    frame = None
    target_columns = [
        "target",
    ]
    if as_frame:
        frame, data, target = _convert_data_dataframe(
            "load_breast_cancer", data, target, feature_names, target_columns
        )

    if return_X_y:
        return data, target

    return Bunch(
        data=data,
        target=target,
        frame=frame,
        target_names=target_names,
        DESCR=fdescr,
        feature_names=feature_names,
        filename=data_file_name,
        data_module=DATA_MODULE,
    )


@validate_params(
    {
        "n_class": [Interval(Integral, 1, 10, closed="both")],
        "return_X_y": ["boolean"],
        "as_frame": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def load_digits(*, n_class=10, return_X_y=False, as_frame=False):
    """Load and return the digits dataset (classification).

    Each datapoint is an 8x8 image of a digit.

    =================   ==============
    Classes             10
    Samples per class   ~180
    Samples total       1797
    Dimensionality      64
    Features            integers 0-16
    =================   ==============

    This is a copy of the test set of the UCI ML hand-written digits dataset
    https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits

    Read more in the :ref:`User Guide <digits_dataset>`.

    Parameters
    ----------
    n_class : int, default=10
        The number of classes to return. Between 1 and 10.

    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is
        a pandas DataFrame or Series depending on the number of target columns.
        If `return_X_y` is True, then (`data`, `target`) will be pandas
        DataFrames or Series as described below.

        .. versionadded:: 0.23

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (1797, 64)
            The flattened data matrix. If `as_frame=True`, `data` will be
            a pandas DataFrame.
        target : {ndarray, Series} of shape (1797,)
            The classification target. If `as_frame=True`, `target` will be
            a pandas Series.
        feature_names : list
            The names of the dataset columns.
        target_names : list
            The names of target classes.

            .. versionadded:: 0.20
        frame : DataFrame of shape (1797, 65)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

            .. versionadded:: 0.23
        images : {ndarray} of shape (1797, 8, 8)
            The raw image data.
        DESCR : str
            The full description of the dataset.

    (data, target) : tuple if ``return_X_y`` is True
        A tuple of two ndarrays by default. The first contains a 2D ndarray of
        shape (1797, 64) with each row representing one sample and each column
        representing the features. The second ndarray of shape (1797) contains
        the target samples. If `as_frame=True`, both arrays are pandas objects,
        i.e. `X` a dataframe and `y` a series.

        .. versionadded:: 0.18

    Examples
    --------
    To load the data and visualize the images::

        >>> from sklearn.datasets import load_digits
        >>> digits = load_digits()
        >>> print(digits.data.shape)
        (1797, 64)
        >>> import matplotlib.pyplot as plt
        >>> plt.gray()
        >>> plt.matshow(digits.images[0])
        <...>
        >>> plt.show()
    """
    data, fdescr = load_gzip_compressed_csv_data(
        data_file_name="digits.csv.gz", descr_file_name="digits.rst", delimiter=","
    )

    target = data[:, -1].astype(int, copy=False)
    flat_data = data[:, :-1]
    images = flat_data.view()
    images.shape = (-1, 8, 8)

    if n_class < 10:
        idx = target < n_class
        flat_data, target = flat_data[idx], target[idx]
        images = images[idx]

    feature_names = [
        "pixel_{}_{}".format(row_idx, col_idx)
        for row_idx in range(8)
        for col_idx in range(8)
    ]

    frame = None
    target_columns = [
        "target",
    ]
    if as_frame:
        frame, flat_data, target = _convert_data_dataframe(
            "load_digits", flat_data, target, feature_names, target_columns
        )

    if return_X_y:
        return flat_data, target

    return Bunch(
        data=flat_data,
        target=target,
        frame=frame,
        feature_names=feature_names,
        target_names=np.arange(10),
        images=images,
        DESCR=fdescr,
    )
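

# Example of the `n_class` filtering above: only samples whose target is strictly
# below `n_class` are kept, so `n_class=4` keeps the digits 0-3.
#
#   >>> from sklearn.datasets import load_digits
#   >>> import numpy as np
#   >>> digits = load_digits(n_class=4)
#   >>> digits.data.shape[1]
#   64
#   >>> np.unique(digits.target)
#   array([0, 1, 2, 3])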


@validate_params(
    {"return_X_y": ["boolean"], "as_frame": ["boolean"], "scaled": ["boolean"]},
    prefer_skip_nested_validation=True,
)
def load_diabetes(*, return_X_y=False, as_frame=False, scaled=True):
    """Load and return the diabetes dataset (regression).

    ==============   ==================
    Samples total    442
    Dimensionality   10
    Features         real, -.2 < x < .2
    Targets          integer 25 - 346
    ==============   ==================

    .. note::
       The meaning of each feature (i.e. `feature_names`) might be unclear
       (especially for `ltg`) as the documentation of the original dataset is
       not explicit. We provide information that seems correct with regard to
       the scientific literature in this field of research.

    Read more in the :ref:`User Guide <diabetes_dataset>`.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is
        a pandas DataFrame or Series depending on the number of target columns.
        If `return_X_y` is True, then (`data`, `target`) will be pandas
        DataFrames or Series as described below.

        .. versionadded:: 0.23

    scaled : bool, default=True
        If True, the feature variables are mean centered and scaled by the
        standard deviation times the square root of `n_samples`.
        If False, raw data is returned for the feature variables.

        .. versionadded:: 1.1

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (442, 10)
            The data matrix. If `as_frame=True`, `data` will be a pandas
            DataFrame.
        target : {ndarray, Series} of shape (442,)
            The regression target. If `as_frame=True`, `target` will be
            a pandas Series.
        feature_names : list
            The names of the dataset columns.
        frame : DataFrame of shape (442, 11)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

            .. versionadded:: 0.23
        DESCR : str
            The full description of the dataset.
        data_filename : str
            The path to the location of the data.
        target_filename : str
            The path to the location of the target.

    (data, target) : tuple if ``return_X_y`` is True
        Returns a tuple of two ndarray of shape (n_samples, n_features)
        A 2D array with each row representing one sample and each column
        representing the features and/or target of a given sample.

        .. versionadded:: 0.18
    """
    data_filename = "diabetes_data_raw.csv.gz"
    target_filename = "diabetes_target.csv.gz"
    data = load_gzip_compressed_csv_data(data_filename)
    target = load_gzip_compressed_csv_data(target_filename)

    if scaled:
        data = scale(data, copy=False)
        data /= data.shape[0] ** 0.5

    fdescr = load_descr("diabetes.rst")

    feature_names = ["age", "sex", "bmi", "bp", "s1", "s2", "s3", "s4", "s5", "s6"]

    frame = None
    target_columns = [
        "target",
    ]
    if as_frame:
        frame, data, target = _convert_data_dataframe(
            "load_diabetes", data, target, feature_names, target_columns
        )

    if return_X_y:
        return data, target

    return Bunch(
        data=data,
        target=target,
        frame=frame,
        DESCR=fdescr,
        feature_names=feature_names,
        data_filename=data_filename,
        target_filename=target_filename,
        data_module=DATA_MODULE,
    )
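

# Example of the `scaled` flag handled above: by default each feature column is
# mean-centered and divided by (std * sqrt(n_samples)), so its squared values sum
# to 1; `scaled=False` returns the raw measurements instead.
#
#   >>> from sklearn.datasets import load_diabetes
#   >>> import numpy as np
#   >>> X, y = load_diabetes(return_X_y=True)
#   >>> bool(np.allclose((X ** 2).sum(axis=0), 1.0))
#   True
#   >>> X_raw, _ = load_diabetes(return_X_y=True, scaled=False)
#   >>> X_raw.shape
#   (442, 10)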


@validate_params(
    {
        "return_X_y": ["boolean"],
        "as_frame": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def load_linnerud(*, return_X_y=False, as_frame=False):
    """Load and return the physical exercise Linnerud dataset.

    This dataset is suitable for multi-output regression tasks.

    ==============   ============================
    Samples total    20
    Dimensionality   3 (for both data and target)
    Features         integer
    Targets          integer
    ==============   ============================

    Read more in the :ref:`User Guide <linnerrud_dataset>`.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric, string or categorical). The target is
        a pandas DataFrame or Series depending on the number of target columns.
        If `return_X_y` is True, then (`data`, `target`) will be pandas
        DataFrames or Series as described below.

        .. versionadded:: 0.23

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (20, 3)
            The data matrix. If `as_frame=True`, `data` will be a pandas
            DataFrame.
        target : {ndarray, dataframe} of shape (20, 3)
            The regression targets. If `as_frame=True`, `target` will be
            a pandas DataFrame.
        feature_names : list
            The names of the dataset columns.
        target_names : list
            The names of the target columns.
        frame : DataFrame of shape (20, 6)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

            .. versionadded:: 0.23
        DESCR : str
            The full description of the dataset.
        data_filename : str
            The path to the location of the data.
        target_filename : str
            The path to the location of the target.

            .. versionadded:: 0.20

    (data, target) : tuple if ``return_X_y`` is True
        Returns a tuple of two ndarrays or dataframe of shape
        `(20, 3)`. Each row represents one sample and each column represents the
        features in `X` and a target in `y` of a given sample.

        .. versionadded:: 0.18
    """
    data_filename = "linnerud_exercise.csv"
    target_filename = "linnerud_physiological.csv"

    # Read header and data
    with _open_text(DATA_MODULE, data_filename) as f:
        header_exercise = f.readline().split()
        f.seek(0)  # reset file obj
        data_exercise = np.loadtxt(f, skiprows=1)

    with _open_text(DATA_MODULE, target_filename) as f:
        header_physiological = f.readline().split()
        f.seek(0)  # reset file obj
        data_physiological = np.loadtxt(f, skiprows=1)

    fdescr = load_descr("linnerud.rst")

    frame = None
    if as_frame:
        (frame, data_exercise, data_physiological) = _convert_data_dataframe(
            "load_linnerud",
            data_exercise,
            data_physiological,
            header_exercise,
            header_physiological,
        )

    if return_X_y:
        return data_exercise, data_physiological

    return Bunch(
        data=data_exercise,
        feature_names=header_exercise,
        target=data_physiological,
        target_names=header_physiological,
        frame=frame,
        DESCR=fdescr,
        data_filename=data_filename,
        target_filename=target_filename,
        data_module=DATA_MODULE,
    )
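

# Example: the Linnerud loader returns a small multi-output regression problem with
# three exercise features and three physiological targets.
#
#   >>> from sklearn.datasets import load_linnerud
#   >>> linnerud = load_linnerud()
#   >>> linnerud.data.shape, linnerud.target.shape
#   ((20, 3), (20, 3))
#   >>> linnerud.feature_names
#   ['Chins', 'Situps', 'Jumps']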


def load_sample_images():
    """Load sample images for image manipulation.

    Loads both ``china`` and ``flower``.

    Read more in the :ref:`User Guide <sample_images>`.

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        images : list of ndarray of shape (427, 640, 3)
            The two sample images.
        filenames : list
            The filenames for the images.
        DESCR : str
            The full description of the dataset.

    Examples
    --------
    To load the data and visualize the images:

    >>> from sklearn.datasets import load_sample_images
    >>> dataset = load_sample_images()     # doctest: +SKIP
    >>> len(dataset.images)                # doctest: +SKIP
    2
    >>> first_img_data = dataset.images[0] # doctest: +SKIP
    >>> first_img_data.shape               # doctest: +SKIP
    (427, 640, 3)
    >>> first_img_data.dtype               # doctest: +SKIP
    dtype('uint8')
    """
    try:
        from PIL import Image
    except ImportError:
        raise ImportError(
            "The Python Imaging Library (PIL) is required to load data "
            "from jpeg files. Please refer to "
            "https://pillow.readthedocs.io/en/stable/installation.html "
            "for installing PIL."
        )

    descr = load_descr("README.txt", descr_module=IMAGES_MODULE)

    filenames, images = [], []
    for filename in sorted(_contents(IMAGES_MODULE)):
        if filename.endswith(".jpg"):
            filenames.append(filename)
            with _open_binary(IMAGES_MODULE, filename) as image_file:
                pil_image = Image.open(image_file)
                image = np.asarray(pil_image)
            images.append(image)

    return Bunch(images=images, filenames=filenames, DESCR=descr)


@validate_params(
    {
        "image_name": [StrOptions({"china.jpg", "flower.jpg"})],
    },
    prefer_skip_nested_validation=True,
)
def load_sample_image(image_name):
    """Load the numpy array of a single sample image.

    Read more in the :ref:`User Guide <sample_images>`.

    Parameters
    ----------
    image_name : {`china.jpg`, `flower.jpg`}
        The name of the sample image loaded.

    Returns
    -------
    img : 3D array
        The image as a numpy array: height x width x color.

    Examples
    --------
    >>> from sklearn.datasets import load_sample_image
    >>> china = load_sample_image('china.jpg')   # doctest: +SKIP
    >>> china.dtype                              # doctest: +SKIP
    dtype('uint8')
    >>> china.shape                              # doctest: +SKIP
    (427, 640, 3)
    >>> flower = load_sample_image('flower.jpg') # doctest: +SKIP
    >>> flower.dtype                             # doctest: +SKIP
    dtype('uint8')
    >>> flower.shape                             # doctest: +SKIP
    (427, 640, 3)
    """
    images = load_sample_images()
    index = None
    for i, filename in enumerate(images.filenames):
        if filename.endswith(image_name):
            index = i
            break
    if index is None:
        raise AttributeError("Cannot find sample image: %s" % image_name)
    return images.images[index]


def _pkl_filepath(*args, **kwargs):
    """Return filename for Python 3 pickles.

    args[-1] is expected to be the ".pkl" filename. For compatibility with
    older scikit-learn versions, a suffix is inserted before the extension.

    _pkl_filepath('/path/to/folder', 'filename.pkl') returns
    '/path/to/folder/filename_py3.pkl'
    """
    py3_suffix = kwargs.get("py3_suffix", "_py3")
    basename, ext = splitext(args[-1])
    basename += py3_suffix
    new_args = args[:-1] + (basename + ext,)
    return join(*new_args)
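

# Example of the suffix insertion performed by `_pkl_filepath` (mirrors the docstring
# above; the path separator is platform dependent).
#
#   >>> _pkl_filepath('/path/to/folder', 'filename.pkl')   # doctest: +SKIP
#   '/path/to/folder/filename_py3.pkl'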


def _sha256(path):
    """Calculate the sha256 hash of the file at path."""
    sha256hash = hashlib.sha256()
    chunk_size = 8192
    with open(path, "rb") as f:
        while True:
            buffer = f.read(chunk_size)
            if not buffer:
                break
            sha256hash.update(buffer)
    return sha256hash.hexdigest()
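

# Example: `_sha256` streams a file in 8192-byte chunks, which is equivalent to
# hashing its full contents at once. The temporary file below is only illustrative.
#
#   >>> import hashlib, os, tempfile
#   >>> fd, tmp = tempfile.mkstemp()
#   >>> with os.fdopen(fd, "wb") as f:
#   ...     _ = f.write(b"hello")
#   >>> _sha256(tmp) == hashlib.sha256(b"hello").hexdigest()
#   True
#   >>> os.remove(tmp)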


def _fetch_remote(remote, dirname=None):
    """Helper function to download a remote dataset into path.

    Fetch a dataset pointed to by remote's url, save into path using remote's
    filename and ensure its integrity based on the SHA256 checksum of the
    downloaded file.

    Parameters
    ----------
    remote : RemoteFileMetadata
        Named tuple containing remote dataset meta information: url, filename
        and checksum.

    dirname : str, default=None
        Directory to save the file to.

    Returns
    -------
    file_path : str
        Full path of the created file.
    """
    file_path = remote.filename if dirname is None else join(dirname, remote.filename)
    urlretrieve(remote.url, file_path)
    checksum = _sha256(file_path)
    if remote.checksum != checksum:
        raise OSError(
            "{} has an SHA256 checksum ({}) "
            "differing from expected ({}), "
            "file may be corrupted.".format(file_path, checksum, remote.checksum)
        )
    return file_path
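

# Sketch of how the dataset fetchers use `_fetch_remote`. The URL and checksum below
# are placeholders for illustration, not a real scikit-learn resource.
#
#   >>> ARCHIVE = RemoteFileMetadata(
#   ...     filename="some_dataset.tar.gz",
#   ...     url="https://example.com/some_dataset.tar.gz",
#   ...     checksum="0" * 64,
#   ... )
#   >>> path = _fetch_remote(ARCHIVE, dirname=get_data_home())   # doctest: +SKIP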