- """
- Generate samples of synthetic data sets.
- """
- # Authors: B. Thirion, G. Varoquaux, A. Gramfort, V. Michel, O. Grisel,
- # G. Louppe, J. Nothman
- # License: BSD 3 clause
- import array
- import numbers
- import warnings
- from collections.abc import Iterable
- from numbers import Integral, Real
- import numpy as np
- import scipy.sparse as sp
- from scipy import linalg
- from ..preprocessing import MultiLabelBinarizer
- from ..utils import check_array, check_random_state
- from ..utils import shuffle as util_shuffle
- from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params
- from ..utils.random import sample_without_replacement
- def _generate_hypercube(samples, dimensions, rng):
- """Returns distinct binary samples of length dimensions."""
- if dimensions > 30:
- return np.hstack(
- [
- rng.randint(2, size=(samples, dimensions - 30)),
- _generate_hypercube(samples, 30, rng),
- ]
- )
- out = sample_without_replacement(2**dimensions, samples, random_state=rng).astype(
- dtype=">u4", copy=False
- )
- out = np.unpackbits(out.view(">u1")).reshape((-1, 32))[:, -dimensions:]
- return out
- @validate_params(
- {
- "n_samples": [Interval(Integral, 1, None, closed="left")],
- "n_features": [Interval(Integral, 1, None, closed="left")],
- "n_informative": [Interval(Integral, 1, None, closed="left")],
- "n_redundant": [Interval(Integral, 0, None, closed="left")],
- "n_repeated": [Interval(Integral, 0, None, closed="left")],
- "n_classes": [Interval(Integral, 1, None, closed="left")],
- "n_clusters_per_class": [Interval(Integral, 1, None, closed="left")],
- "weights": ["array-like", None],
- "flip_y": [Interval(Real, 0, 1, closed="both")],
- "class_sep": [Interval(Real, 0, None, closed="neither")],
- "hypercube": ["boolean"],
- "shift": [Interval(Real, None, None, closed="neither"), "array-like", None],
- "scale": [Interval(Real, 0, None, closed="neither"), "array-like", None],
- "shuffle": ["boolean"],
- "random_state": ["random_state"],
- },
- prefer_skip_nested_validation=True,
- )
- def make_classification(
- n_samples=100,
- n_features=20,
- *,
- n_informative=2,
- n_redundant=2,
- n_repeated=0,
- n_classes=2,
- n_clusters_per_class=2,
- weights=None,
- flip_y=0.01,
- class_sep=1.0,
- hypercube=True,
- shift=0.0,
- scale=1.0,
- shuffle=True,
- random_state=None,
- ):
- """Generate a random n-class classification problem.
- This initially creates clusters of points normally distributed (std=1)
- about vertices of an ``n_informative``-dimensional hypercube with sides of
- length ``2*class_sep`` and assigns an equal number of clusters to each
- class. It introduces interdependence between these features and adds
- various types of further noise to the data.
- Without shuffling, ``X`` horizontally stacks features in the following
- order: the primary ``n_informative`` features, followed by ``n_redundant``
- linear combinations of the informative features, followed by ``n_repeated``
- duplicates, drawn randomly with replacement from the informative and
- redundant features. The remaining features are filled with random noise.
- Thus, without shuffling, all useful features are contained in the columns
- ``X[:, :n_informative + n_redundant + n_repeated]``.
- Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- n_samples : int, default=100
- The number of samples.
- n_features : int, default=20
- The total number of features. These comprise ``n_informative``
- informative features, ``n_redundant`` redundant features,
- ``n_repeated`` duplicated features and
- ``n_features-n_informative-n_redundant-n_repeated`` useless features
- drawn at random.
- n_informative : int, default=2
- The number of informative features. Each class is composed of a number
- of gaussian clusters each located around the vertices of a hypercube
- in a subspace of dimension ``n_informative``. For each cluster,
- informative features are drawn independently from N(0, 1) and then
- randomly linearly combined within each cluster in order to add
- covariance. The clusters are then placed on the vertices of the
- hypercube.
- n_redundant : int, default=2
- The number of redundant features. These features are generated as
- random linear combinations of the informative features.
- n_repeated : int, default=0
- The number of duplicated features, drawn randomly from the informative
- and the redundant features.
- n_classes : int, default=2
- The number of classes (or labels) of the classification problem.
- n_clusters_per_class : int, default=2
- The number of clusters per class.
- weights : array-like of shape (n_classes,) or (n_classes - 1,),\
- default=None
- The proportions of samples assigned to each class. If None, then
- classes are balanced. Note that if ``len(weights) == n_classes - 1``,
- then the last class weight is automatically inferred.
- More than ``n_samples`` samples may be returned if the sum of
- ``weights`` exceeds 1. Note that the actual class proportions will
- not exactly match ``weights`` when ``flip_y`` isn't 0.
- flip_y : float, default=0.01
- The fraction of samples whose class is assigned randomly. Larger
- values introduce noise in the labels and make the classification
- task harder. Note that the default setting ``flip_y > 0`` might lead
- to fewer than ``n_classes`` distinct classes in ``y`` in some cases.
- class_sep : float, default=1.0
- The factor multiplying the hypercube size. Larger values spread
- out the clusters/classes and make the classification task easier.
- hypercube : bool, default=True
- If True, the clusters are put on the vertices of a hypercube. If
- False, the clusters are put on the vertices of a random polytope.
- shift : float, ndarray of shape (n_features,) or None, default=0.0
- Shift features by the specified value. If None, then features
- are shifted by a random value drawn in [-class_sep, class_sep].
- scale : float, ndarray of shape (n_features,) or None, default=1.0
- Multiply features by the specified value. If None, then features
- are scaled by a random value drawn in [1, 100]. Note that scaling
- happens after shifting.
- shuffle : bool, default=True
- Shuffle the samples and the features.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset creation. Pass an int
- for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- X : ndarray of shape (n_samples, n_features)
- The generated samples.
- y : ndarray of shape (n_samples,)
- The integer labels for class membership of each sample.
- See Also
- --------
- make_blobs : Simplified variant.
- make_multilabel_classification : Unrelated generator for multilabel tasks.
- Notes
- -----
- The algorithm is adapted from Guyon [1] and was designed to generate
- the "Madelon" dataset.
- References
- ----------
- .. [1] I. Guyon, "Design of experiments for the NIPS 2003 variable
- selection benchmark", 2003.
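- Examples
- --------
- A minimal usage sketch; only the array shapes are shown, since the
- generated values depend on the random state.
- >>> from sklearn.datasets import make_classification
- >>> X, y = make_classification(n_samples=100, n_features=20, random_state=42)
- >>> X.shape
- (100, 20)
- >>> y.shape
- (100,)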
- """
- generator = check_random_state(random_state)
- # Count features, clusters and samples
- if n_informative + n_redundant + n_repeated > n_features:
- raise ValueError(
- "Number of informative, redundant and repeated "
- "features must sum to less than the number of total"
- " features"
- )
- # Use log2 to avoid overflow errors
- if n_informative < np.log2(n_classes * n_clusters_per_class):
- msg = "n_classes({}) * n_clusters_per_class({}) must be"
- msg += " smaller or equal 2**n_informative({})={}"
- raise ValueError(
- msg.format(
- n_classes, n_clusters_per_class, n_informative, 2**n_informative
- )
- )
- if weights is not None:
- if len(weights) not in [n_classes, n_classes - 1]:
- raise ValueError(
- "Weights specified but incompatible with number of classes."
- )
- if len(weights) == n_classes - 1:
- if isinstance(weights, list):
- weights = weights + [1.0 - sum(weights)]
- else:
- weights = np.resize(weights, n_classes)
- weights[-1] = 1.0 - sum(weights[:-1])
- else:
- weights = [1.0 / n_classes] * n_classes
- n_useless = n_features - n_informative - n_redundant - n_repeated
- n_clusters = n_classes * n_clusters_per_class
- # Distribute samples among clusters by weight
- n_samples_per_cluster = [
- int(n_samples * weights[k % n_classes] / n_clusters_per_class)
- for k in range(n_clusters)
- ]
- for i in range(n_samples - sum(n_samples_per_cluster)):
- n_samples_per_cluster[i % n_clusters] += 1
- # Initialize X and y
- X = np.zeros((n_samples, n_features))
- y = np.zeros(n_samples, dtype=int)
- # Build the polytope whose vertices become cluster centroids
- centroids = _generate_hypercube(n_clusters, n_informative, generator).astype(
- float, copy=False
- )
- centroids *= 2 * class_sep
- centroids -= class_sep
- if not hypercube:
- centroids *= generator.uniform(size=(n_clusters, 1))
- centroids *= generator.uniform(size=(1, n_informative))
- # Initially draw informative features from the standard normal
- X[:, :n_informative] = generator.standard_normal(size=(n_samples, n_informative))
- # Create each cluster; a variant of make_blobs
- stop = 0
- for k, centroid in enumerate(centroids):
- start, stop = stop, stop + n_samples_per_cluster[k]
- y[start:stop] = k % n_classes # assign labels
- X_k = X[start:stop, :n_informative] # slice a view of the cluster
- A = 2 * generator.uniform(size=(n_informative, n_informative)) - 1
- X_k[...] = np.dot(X_k, A) # introduce random covariance
- X_k += centroid # shift the cluster to a vertex
- # Create redundant features
- if n_redundant > 0:
- B = 2 * generator.uniform(size=(n_informative, n_redundant)) - 1
- X[:, n_informative : n_informative + n_redundant] = np.dot(
- X[:, :n_informative], B
- )
- # Repeat some features
- if n_repeated > 0:
- n = n_informative + n_redundant
- indices = ((n - 1) * generator.uniform(size=n_repeated) + 0.5).astype(np.intp)
- X[:, n : n + n_repeated] = X[:, indices]
- # Fill useless features
- if n_useless > 0:
- X[:, -n_useless:] = generator.standard_normal(size=(n_samples, n_useless))
- # Randomly replace labels
- if flip_y >= 0.0:
- flip_mask = generator.uniform(size=n_samples) < flip_y
- y[flip_mask] = generator.randint(n_classes, size=flip_mask.sum())
- # Randomly shift and scale
- if shift is None:
- shift = (2 * generator.uniform(size=n_features) - 1) * class_sep
- X += shift
- if scale is None:
- scale = 1 + 100 * generator.uniform(size=n_features)
- X *= scale
- if shuffle:
- # Randomly permute samples
- X, y = util_shuffle(X, y, random_state=generator)
- # Randomly permute features
- indices = np.arange(n_features)
- generator.shuffle(indices)
- X[:, :] = X[:, indices]
- return X, y
- @validate_params(
- {
- "n_samples": [Interval(Integral, 1, None, closed="left")],
- "n_features": [Interval(Integral, 1, None, closed="left")],
- "n_classes": [Interval(Integral, 1, None, closed="left")],
- "n_labels": [Interval(Integral, 0, None, closed="left")],
- "length": [Interval(Integral, 1, None, closed="left")],
- "allow_unlabeled": ["boolean"],
- "sparse": ["boolean"],
- "return_indicator": [StrOptions({"dense", "sparse"}), "boolean"],
- "return_distributions": ["boolean"],
- "random_state": ["random_state"],
- },
- prefer_skip_nested_validation=True,
- )
- def make_multilabel_classification(
- n_samples=100,
- n_features=20,
- *,
- n_classes=5,
- n_labels=2,
- length=50,
- allow_unlabeled=True,
- sparse=False,
- return_indicator="dense",
- return_distributions=False,
- random_state=None,
- ):
- """Generate a random multilabel classification problem.
- For each sample, the generative process is:
- - pick the number of labels: n ~ Poisson(n_labels)
- - n times, choose a class c: c ~ Multinomial(theta)
- - pick the document length: k ~ Poisson(length)
- - k times, choose a word: w ~ Multinomial(theta_c)
- In the above process, rejection sampling is used to make sure that
- n is never more than `n_classes` (and never zero unless
- ``allow_unlabeled=True``), and that the document length is never zero.
- Likewise, we reject classes which have already been chosen.
- Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- n_samples : int, default=100
- The number of samples.
- n_features : int, default=20
- The total number of features.
- n_classes : int, default=5
- The number of classes of the classification problem.
- n_labels : int, default=2
- The average number of labels per instance. More precisely, the number
- of labels per sample is drawn from a Poisson distribution with
- ``n_labels`` as its expected value, but samples are bounded (using
- rejection sampling) by ``n_classes``, and must be nonzero if
- ``allow_unlabeled`` is False.
- length : int, default=50
- The sum of the features (number of words if documents) is drawn from
- a Poisson distribution with this expected value.
- allow_unlabeled : bool, default=True
- If ``True``, some instances might not belong to any class.
- sparse : bool, default=False
- If ``True``, return a sparse feature matrix.
- .. versionadded:: 0.17
- parameter to allow *sparse* output.
- return_indicator : {'dense', 'sparse'} or False, default='dense'
- If ``'dense'`` return ``Y`` in the dense binary indicator format. If
- ``'sparse'`` return ``Y`` in the sparse binary indicator format.
- ``False`` returns a list of lists of labels.
- return_distributions : bool, default=False
- If ``True``, return the prior class probability and conditional
- probabilities of features given classes, from which the data was
- drawn.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset creation. Pass an int
- for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- X : ndarray of shape (n_samples, n_features)
- The generated samples.
- Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
- The label sets. When ``return_indicator='sparse'``, ``Y`` is returned
- in CSR format.
- p_c : ndarray of shape (n_classes,)
- The probability of each class being drawn. Only returned if
- ``return_distributions=True``.
- p_w_c : ndarray of shape (n_features, n_classes)
- The probability of each feature being drawn given each class.
- Only returned if ``return_distributions=True``.
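- Examples
- --------
- A brief sketch; the shapes are deterministic even though the sampled
- values are not.
- >>> from sklearn.datasets import make_multilabel_classification
- >>> X, Y = make_multilabel_classification(n_samples=10, n_classes=3, random_state=0)
- >>> X.shape
- (10, 20)
- >>> Y.shape
- (10, 3)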
- """
- generator = check_random_state(random_state)
- p_c = generator.uniform(size=n_classes)
- p_c /= p_c.sum()
- cumulative_p_c = np.cumsum(p_c)
- p_w_c = generator.uniform(size=(n_features, n_classes))
- p_w_c /= np.sum(p_w_c, axis=0)
- def sample_example():
- _, n_classes = p_w_c.shape
- # pick a nonzero number of labels per document by rejection sampling
- y_size = n_classes + 1
- while (not allow_unlabeled and y_size == 0) or y_size > n_classes:
- y_size = generator.poisson(n_labels)
- # pick n classes
- y = set()
- while len(y) != y_size:
- # pick a class with probability P(c)
- c = np.searchsorted(cumulative_p_c, generator.uniform(size=y_size - len(y)))
- y.update(c)
- y = list(y)
- # pick a non-zero document length by rejection sampling
- n_words = 0
- while n_words == 0:
- n_words = generator.poisson(length)
- # generate a document of length n_words
- if len(y) == 0:
- # if sample does not belong to any class, generate noise word
- words = generator.randint(n_features, size=n_words)
- return words, y
- # sample words with replacement from selected classes
- cumulative_p_w_sample = p_w_c.take(y, axis=1).sum(axis=1).cumsum()
- cumulative_p_w_sample /= cumulative_p_w_sample[-1]
- words = np.searchsorted(cumulative_p_w_sample, generator.uniform(size=n_words))
- return words, y
- X_indices = array.array("i")
- X_indptr = array.array("i", [0])
- Y = []
- for i in range(n_samples):
- words, y = sample_example()
- X_indices.extend(words)
- X_indptr.append(len(X_indices))
- Y.append(y)
- X_data = np.ones(len(X_indices), dtype=np.float64)
- X = sp.csr_matrix((X_data, X_indices, X_indptr), shape=(n_samples, n_features))
- X.sum_duplicates()
- if not sparse:
- X = X.toarray()
- # return_indicator can be True due to backward compatibility
- if return_indicator in (True, "sparse", "dense"):
- lb = MultiLabelBinarizer(sparse_output=(return_indicator == "sparse"))
- Y = lb.fit([range(n_classes)]).transform(Y)
- if return_distributions:
- return X, Y, p_c, p_w_c
- return X, Y
- @validate_params(
- {
- "n_samples": [Interval(Integral, 1, None, closed="left")],
- "random_state": ["random_state"],
- },
- prefer_skip_nested_validation=True,
- )
- def make_hastie_10_2(n_samples=12000, *, random_state=None):
- """Generate data for binary classification used in Hastie et al. 2009, Example 10.2.
- The ten features are standard independent Gaussian and
- the target ``y`` is defined by::
- y[i] = 1 if np.sum(X[i] ** 2) > 9.34 else -1
- Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- n_samples : int, default=12000
- The number of samples.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset creation. Pass an int
- for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- X : ndarray of shape (n_samples, 10)
- The input samples.
- y : ndarray of shape (n_samples,)
- The output values.
- See Also
- --------
- make_gaussian_quantiles : A generalization of this dataset approach.
- References
- ----------
- .. [1] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical
- Learning Ed. 2", Springer, 2009.
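- Examples
- --------
- A minimal sketch; the ten features are standard Gaussian and the labels
- take the values -1 and +1.
- >>> from sklearn.datasets import make_hastie_10_2
- >>> X, y = make_hastie_10_2(n_samples=1000, random_state=0)
- >>> X.shape
- (1000, 10)
- >>> y.shape
- (1000,)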
- """
- rs = check_random_state(random_state)
- shape = (n_samples, 10)
- X = rs.normal(size=shape)
- y = ((X**2.0).sum(axis=1) > 9.34).astype(np.float64, copy=False)
- y[y == 0.0] = -1.0
- return X, y
- @validate_params(
- {
- "n_samples": [Interval(Integral, 1, None, closed="left")],
- "n_features": [Interval(Integral, 1, None, closed="left")],
- "n_informative": [Interval(Integral, 0, None, closed="left")],
- "n_targets": [Interval(Integral, 1, None, closed="left")],
- "bias": [Interval(Real, None, None, closed="neither")],
- "effective_rank": [Interval(Integral, 1, None, closed="left"), None],
- "tail_strength": [Interval(Real, 0, 1, closed="both")],
- "noise": [Interval(Real, 0, None, closed="left")],
- "shuffle": ["boolean"],
- "coef": ["boolean"],
- "random_state": ["random_state"],
- },
- prefer_skip_nested_validation=True,
- )
- def make_regression(
- n_samples=100,
- n_features=100,
- *,
- n_informative=10,
- n_targets=1,
- bias=0.0,
- effective_rank=None,
- tail_strength=0.5,
- noise=0.0,
- shuffle=True,
- coef=False,
- random_state=None,
- ):
- """Generate a random regression problem.
- The input set can either be well conditioned (by default) or have a low
- rank-fat tail singular profile. See :func:`make_low_rank_matrix` for
- more details.
- The output is generated by applying a (potentially biased) random linear
- regression model with `n_informative` nonzero regressors to the previously
- generated input and some gaussian centered noise with some adjustable
- scale.
- Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- n_samples : int, default=100
- The number of samples.
- n_features : int, default=100
- The number of features.
- n_informative : int, default=10
- The number of informative features, i.e., the number of features used
- to build the linear model used to generate the output.
- n_targets : int, default=1
- The number of regression targets, i.e., the dimension of the y output
- vector associated with a sample. By default, the output is a scalar.
- bias : float, default=0.0
- The bias term in the underlying linear model.
- effective_rank : int, default=None
- If not None:
- The approximate number of singular vectors required to explain most
- of the input data by linear combinations. Using this kind of
- singular spectrum in the input allows the generator to reproduce
- the correlations often observed in practice.
- If None:
- The input set is well conditioned, centered and gaussian with
- unit variance.
- tail_strength : float, default=0.5
- The relative importance of the fat noisy tail of the singular values
- profile if `effective_rank` is not None. When a float, it should be
- between 0 and 1.
- noise : float, default=0.0
- The standard deviation of the gaussian noise applied to the output.
- shuffle : bool, default=True
- Shuffle the samples and the features.
- coef : bool, default=False
- If True, the coefficients of the underlying linear model are returned.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset creation. Pass an int
- for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- X : ndarray of shape (n_samples, n_features)
- The input samples.
- y : ndarray of shape (n_samples,) or (n_samples, n_targets)
- The output values.
- coef : ndarray of shape (n_features,) or (n_features, n_targets)
- The coefficient of the underlying linear model. It is returned only if
- coef is True.
- Examples
- --------
- >>> from sklearn.datasets import make_regression
- >>> X, y = make_regression(n_samples=5, n_features=2, noise=1, random_state=42)
- >>> X
- array([[ 0.4967..., -0.1382... ],
- [ 0.6476..., 1.523...],
- [-0.2341..., -0.2341...],
- [-0.4694..., 0.5425...],
- [ 1.579..., 0.7674...]])
- >>> y
- array([ 6.737..., 37.79..., -10.27..., 0.4017..., 42.22...])
- """
- n_informative = min(n_features, n_informative)
- generator = check_random_state(random_state)
- if effective_rank is None:
- # Randomly generate a well conditioned input set
- X = generator.standard_normal(size=(n_samples, n_features))
- else:
- # Randomly generate a low rank, fat tail input set
- X = make_low_rank_matrix(
- n_samples=n_samples,
- n_features=n_features,
- effective_rank=effective_rank,
- tail_strength=tail_strength,
- random_state=generator,
- )
- # Generate a ground truth model with only n_informative features being non
- # zeros (the other features are not correlated to y and should be ignored
- # by sparsifying regularizers such as L1 or elastic net)
- ground_truth = np.zeros((n_features, n_targets))
- ground_truth[:n_informative, :] = 100 * generator.uniform(
- size=(n_informative, n_targets)
- )
- y = np.dot(X, ground_truth) + bias
- # Add noise
- if noise > 0.0:
- y += generator.normal(scale=noise, size=y.shape)
- # Randomly permute samples and features
- if shuffle:
- X, y = util_shuffle(X, y, random_state=generator)
- indices = np.arange(n_features)
- generator.shuffle(indices)
- X[:, :] = X[:, indices]
- ground_truth = ground_truth[indices]
- y = np.squeeze(y)
- if coef:
- return X, y, np.squeeze(ground_truth)
- else:
- return X, y
- @validate_params(
- {
- "n_samples": [Interval(Integral, 0, None, closed="left"), tuple],
- "shuffle": ["boolean"],
- "noise": [Interval(Real, 0, None, closed="left"), None],
- "random_state": ["random_state"],
- "factor": [Interval(Real, 0, 1, closed="left")],
- },
- prefer_skip_nested_validation=True,
- )
- def make_circles(
- n_samples=100, *, shuffle=True, noise=None, random_state=None, factor=0.8
- ):
- """Make a large circle containing a smaller circle in 2d.
- A simple toy dataset to visualize clustering and classification
- algorithms.
- Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- n_samples : int or tuple of shape (2,), dtype=int, default=100
- If int, it is the total number of points generated.
- For odd numbers, the inner circle will have one point more than the
- outer circle.
- If two-element tuple, number of points in outer circle and inner
- circle.
- .. versionchanged:: 0.23
- Added two-element tuple.
- shuffle : bool, default=True
- Whether to shuffle the samples.
- noise : float, default=None
- Standard deviation of Gaussian noise added to the data.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset shuffling and noise.
- Pass an int for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- factor : float, default=0.8
- Scale factor between inner and outer circle in the range `[0, 1)`.
- Returns
- -------
- X : ndarray of shape (n_samples, 2)
- The generated samples.
- y : ndarray of shape (n_samples,)
- The integer labels (0 or 1) for class membership of each sample.
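- Examples
- --------
- A short sketch; with ``factor=0.5`` the inner circle has half the
- radius of the outer one.
- >>> from sklearn.datasets import make_circles
- >>> X, y = make_circles(n_samples=100, noise=0.05, factor=0.5, random_state=0)
- >>> X.shape
- (100, 2)
- >>> y.shape
- (100,)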
- """
- if isinstance(n_samples, numbers.Integral):
- n_samples_out = n_samples // 2
- n_samples_in = n_samples - n_samples_out
- else: # n_samples is a tuple
- if len(n_samples) != 2:
- raise ValueError("When a tuple, n_samples must have exactly two elements.")
- n_samples_out, n_samples_in = n_samples
- generator = check_random_state(random_state)
- # so as not to have the first point = last point, we set endpoint=False
- linspace_out = np.linspace(0, 2 * np.pi, n_samples_out, endpoint=False)
- linspace_in = np.linspace(0, 2 * np.pi, n_samples_in, endpoint=False)
- outer_circ_x = np.cos(linspace_out)
- outer_circ_y = np.sin(linspace_out)
- inner_circ_x = np.cos(linspace_in) * factor
- inner_circ_y = np.sin(linspace_in) * factor
- X = np.vstack(
- [np.append(outer_circ_x, inner_circ_x), np.append(outer_circ_y, inner_circ_y)]
- ).T
- y = np.hstack(
- [np.zeros(n_samples_out, dtype=np.intp), np.ones(n_samples_in, dtype=np.intp)]
- )
- if shuffle:
- X, y = util_shuffle(X, y, random_state=generator)
- if noise is not None:
- X += generator.normal(scale=noise, size=X.shape)
- return X, y
- @validate_params(
- {
- "n_samples": [Interval(Integral, 1, None, closed="left"), tuple],
- "shuffle": ["boolean"],
- "noise": [Interval(Real, 0, None, closed="left"), None],
- "random_state": ["random_state"],
- },
- prefer_skip_nested_validation=True,
- )
- def make_moons(n_samples=100, *, shuffle=True, noise=None, random_state=None):
- """Make two interleaving half circles.
- A simple toy dataset to visualize clustering and classification
- algorithms. Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- n_samples : int or tuple of shape (2,), dtype=int, default=100
- If int, the total number of points generated.
- If two-element tuple, number of points in each of two moons.
- .. versionchanged:: 0.23
- Added two-element tuple.
- shuffle : bool, default=True
- Whether to shuffle the samples.
- noise : float, default=None
- Standard deviation of Gaussian noise added to the data.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset shuffling and noise.
- Pass an int for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- X : ndarray of shape (n_samples, 2)
- The generated samples.
- y : ndarray of shape (n_samples,)
- The integer labels (0 or 1) for class membership of each sample.
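- Examples
- --------
- A minimal usage sketch; shapes only, since the values depend on the
- random state.
- >>> from sklearn.datasets import make_moons
- >>> X, y = make_moons(n_samples=200, noise=0.1, random_state=42)
- >>> X.shape
- (200, 2)
- >>> y.shape
- (200,)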
- """
- if isinstance(n_samples, numbers.Integral):
- n_samples_out = n_samples // 2
- n_samples_in = n_samples - n_samples_out
- else:
- try:
- n_samples_out, n_samples_in = n_samples
- except ValueError as e:
- raise ValueError(
- "`n_samples` can be either an int or a two-element tuple."
- ) from e
- generator = check_random_state(random_state)
- outer_circ_x = np.cos(np.linspace(0, np.pi, n_samples_out))
- outer_circ_y = np.sin(np.linspace(0, np.pi, n_samples_out))
- inner_circ_x = 1 - np.cos(np.linspace(0, np.pi, n_samples_in))
- inner_circ_y = 1 - np.sin(np.linspace(0, np.pi, n_samples_in)) - 0.5
- X = np.vstack(
- [np.append(outer_circ_x, inner_circ_x), np.append(outer_circ_y, inner_circ_y)]
- ).T
- y = np.hstack(
- [np.zeros(n_samples_out, dtype=np.intp), np.ones(n_samples_in, dtype=np.intp)]
- )
- if shuffle:
- X, y = util_shuffle(X, y, random_state=generator)
- if noise is not None:
- X += generator.normal(scale=noise, size=X.shape)
- return X, y
- @validate_params(
- {
- "n_samples": [Interval(Integral, 1, None, closed="left"), "array-like"],
- "n_features": [Interval(Integral, 1, None, closed="left")],
- "centers": [Interval(Integral, 1, None, closed="left"), "array-like", None],
- "cluster_std": [Interval(Real, 0, None, closed="left"), "array-like"],
- "center_box": [tuple],
- "shuffle": ["boolean"],
- "random_state": ["random_state"],
- "return_centers": ["boolean"],
- },
- prefer_skip_nested_validation=True,
- )
- def make_blobs(
- n_samples=100,
- n_features=2,
- *,
- centers=None,
- cluster_std=1.0,
- center_box=(-10.0, 10.0),
- shuffle=True,
- random_state=None,
- return_centers=False,
- ):
- """Generate isotropic Gaussian blobs for clustering.
- Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- n_samples : int or array-like, default=100
- If int, it is the total number of points equally divided among
- clusters.
- If array-like, each element of the sequence indicates
- the number of samples per cluster.
- .. versionchanged:: 0.20
- One can now pass an array-like to the ``n_samples`` parameter.
- n_features : int, default=2
- The number of features for each sample.
- centers : int or array-like of shape (n_centers, n_features), default=None
- The number of centers to generate, or the fixed center locations.
- If n_samples is an int and centers is None, 3 centers are generated.
- If n_samples is array-like, centers must be
- either None or an array of length equal to the length of n_samples.
- cluster_std : float or array-like of float, default=1.0
- The standard deviation of the clusters.
- center_box : tuple of float (min, max), default=(-10.0, 10.0)
- The bounding box for each cluster center when centers are
- generated at random.
- shuffle : bool, default=True
- Shuffle the samples.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset creation. Pass an int
- for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- return_centers : bool, default=False
- If True, then return the centers of each cluster.
- .. versionadded:: 0.23
- Returns
- -------
- X : ndarray of shape (n_samples, n_features)
- The generated samples.
- y : ndarray of shape (n_samples,)
- The integer labels for cluster membership of each sample.
- centers : ndarray of shape (n_centers, n_features)
- The centers of each cluster. Only returned if
- ``return_centers=True``.
- See Also
- --------
- make_classification : A more intricate variant.
- Examples
- --------
- >>> from sklearn.datasets import make_blobs
- >>> X, y = make_blobs(n_samples=10, centers=3, n_features=2,
- ... random_state=0)
- >>> print(X.shape)
- (10, 2)
- >>> y
- array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0])
- >>> X, y = make_blobs(n_samples=[3, 3, 4], centers=None, n_features=2,
- ... random_state=0)
- >>> print(X.shape)
- (10, 2)
- >>> y
- array([0, 1, 2, 0, 2, 2, 2, 1, 1, 0])
- """
- generator = check_random_state(random_state)
- if isinstance(n_samples, numbers.Integral):
- # Set n_centers by looking at centers arg
- if centers is None:
- centers = 3
- if isinstance(centers, numbers.Integral):
- n_centers = centers
- centers = generator.uniform(
- center_box[0], center_box[1], size=(n_centers, n_features)
- )
- else:
- centers = check_array(centers)
- n_features = centers.shape[1]
- n_centers = centers.shape[0]
- else:
- # Set n_centers by looking at [n_samples] arg
- n_centers = len(n_samples)
- if centers is None:
- centers = generator.uniform(
- center_box[0], center_box[1], size=(n_centers, n_features)
- )
- if not isinstance(centers, Iterable):
- raise ValueError(
- "Parameter `centers` must be array-like. Got {!r} instead".format(
- centers
- )
- )
- if len(centers) != n_centers:
- raise ValueError(
- "Length of `n_samples` not consistent with number of "
- f"centers. Got n_samples = {n_samples} and centers = {centers}"
- )
- centers = check_array(centers)
- n_features = centers.shape[1]
- # stds: if cluster_std is given as list, it must be consistent
- # with the n_centers
- if hasattr(cluster_std, "__len__") and len(cluster_std) != n_centers:
- raise ValueError(
- "Length of `clusters_std` not consistent with "
- "number of centers. Got centers = {} "
- "and cluster_std = {}".format(centers, cluster_std)
- )
- if isinstance(cluster_std, numbers.Real):
- cluster_std = np.full(len(centers), cluster_std)
- if isinstance(n_samples, Iterable):
- n_samples_per_center = n_samples
- else:
- n_samples_per_center = [int(n_samples // n_centers)] * n_centers
- for i in range(n_samples % n_centers):
- n_samples_per_center[i] += 1
- cum_sum_n_samples = np.cumsum(n_samples_per_center)
- X = np.empty(shape=(sum(n_samples_per_center), n_features), dtype=np.float64)
- y = np.empty(shape=(sum(n_samples_per_center),), dtype=int)
- for i, (n, std) in enumerate(zip(n_samples_per_center, cluster_std)):
- start_idx = cum_sum_n_samples[i - 1] if i > 0 else 0
- end_idx = cum_sum_n_samples[i]
- X[start_idx:end_idx] = generator.normal(
- loc=centers[i], scale=std, size=(n, n_features)
- )
- y[start_idx:end_idx] = i
- if shuffle:
- X, y = util_shuffle(X, y, random_state=generator)
- if return_centers:
- return X, y, centers
- else:
- return X, y
- @validate_params(
- {
- "n_samples": [Interval(Integral, 1, None, closed="left")],
- "n_features": [Interval(Integral, 5, None, closed="left")],
- "noise": [Interval(Real, 0.0, None, closed="left")],
- "random_state": ["random_state"],
- },
- prefer_skip_nested_validation=True,
- )
- def make_friedman1(n_samples=100, n_features=10, *, noise=0.0, random_state=None):
- """Generate the "Friedman #1" regression problem.
- This dataset is described in Friedman [1] and Breiman [2].
- Inputs `X` are independent features uniformly distributed on the interval
- [0, 1]. The output `y` is created according to the formula::
- y(X) = 10 * sin(pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5) ** 2 \
- + 10 * X[:, 3] + 5 * X[:, 4] + noise * N(0, 1).
- Out of the `n_features` features, only 5 are actually used to compute
- `y`. The remaining features are independent of `y`.
- The number of features has to be >= 5.
- Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- n_samples : int, default=100
- The number of samples.
- n_features : int, default=10
- The number of features. Should be at least 5.
- noise : float, default=0.0
- The standard deviation of the gaussian noise applied to the output.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset noise. Pass an int
- for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- X : ndarray of shape (n_samples, n_features)
- The input samples.
- y : ndarray of shape (n_samples,)
- The output values.
- References
- ----------
- .. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals
- of Statistics 19 (1), pages 1-67, 1991.
- .. [2] L. Breiman, "Bagging predictors", Machine Learning 24,
- pages 123-140, 1996.
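- Examples
- --------
- A minimal sketch; only the first five features influence ``y``.
- >>> from sklearn.datasets import make_friedman1
- >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
- >>> X.shape
- (50, 10)
- >>> y.shape
- (50,)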
- """
- generator = check_random_state(random_state)
- X = generator.uniform(size=(n_samples, n_features))
- y = (
- 10 * np.sin(np.pi * X[:, 0] * X[:, 1])
- + 20 * (X[:, 2] - 0.5) ** 2
- + 10 * X[:, 3]
- + 5 * X[:, 4]
- + noise * generator.standard_normal(size=(n_samples))
- )
- return X, y
- @validate_params(
- {
- "n_samples": [Interval(Integral, 1, None, closed="left")],
- "noise": [Interval(Real, 0, None, closed="left")],
- "random_state": ["random_state"],
- },
- prefer_skip_nested_validation=True,
- )
- def make_friedman2(n_samples=100, *, noise=0.0, random_state=None):
- """Generate the "Friedman #2" regression problem.
- This dataset is described in Friedman [1] and Breiman [2].
- Inputs `X` are 4 independent features uniformly distributed on the
- intervals::
- 0 <= X[:, 0] <= 100,
- 40 * pi <= X[:, 1] <= 560 * pi,
- 0 <= X[:, 2] <= 1,
- 1 <= X[:, 3] <= 11.
- The output `y` is created according to the formula::
- y(X) = (X[:, 0] ** 2 + (X[:, 1] * X[:, 2] \
- - 1 / (X[:, 1] * X[:, 3])) ** 2) ** 0.5 + noise * N(0, 1).
- Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- n_samples : int, default=100
- The number of samples.
- noise : float, default=0.0
- The standard deviation of the gaussian noise applied to the output.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset noise. Pass an int
- for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- X : ndarray of shape (n_samples, 4)
- The input samples.
- y : ndarray of shape (n_samples,)
- The output values.
- References
- ----------
- .. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals
- of Statistics 19 (1), pages 1-67, 1991.
- .. [2] L. Breiman, "Bagging predictors", Machine Learning 24,
- pages 123-140, 1996.
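- Examples
- --------
- A minimal sketch; the input always has exactly four features.
- >>> from sklearn.datasets import make_friedman2
- >>> X, y = make_friedman2(n_samples=50, noise=0.1, random_state=0)
- >>> X.shape
- (50, 4)
- >>> y.shape
- (50,)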
- """
- generator = check_random_state(random_state)
- X = generator.uniform(size=(n_samples, 4))
- X[:, 0] *= 100
- X[:, 1] *= 520 * np.pi
- X[:, 1] += 40 * np.pi
- X[:, 3] *= 10
- X[:, 3] += 1
- y = (
- X[:, 0] ** 2 + (X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) ** 2
- ) ** 0.5 + noise * generator.standard_normal(size=(n_samples))
- return X, y
- @validate_params(
- {
- "n_samples": [Interval(Integral, 1, None, closed="left")],
- "noise": [Interval(Real, 0, None, closed="left")],
- "random_state": ["random_state"],
- },
- prefer_skip_nested_validation=True,
- )
- def make_friedman3(n_samples=100, *, noise=0.0, random_state=None):
- """Generate the "Friedman #3" regression problem.
- This dataset is described in Friedman [1] and Breiman [2].
- Inputs `X` are 4 independent features uniformly distributed on the
- intervals::
- 0 <= X[:, 0] <= 100,
- 40 * pi <= X[:, 1] <= 560 * pi,
- 0 <= X[:, 2] <= 1,
- 1 <= X[:, 3] <= 11.
- The output `y` is created according to the formula::
- y(X) = arctan((X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) \
- / X[:, 0]) + noise * N(0, 1).
- Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- n_samples : int, default=100
- The number of samples.
- noise : float, default=0.0
- The standard deviation of the gaussian noise applied to the output.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset noise. Pass an int
- for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- X : ndarray of shape (n_samples, 4)
- The input samples.
- y : ndarray of shape (n_samples,)
- The output values.
- References
- ----------
- .. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals
- of Statistics 19 (1), pages 1-67, 1991.
- .. [2] L. Breiman, "Bagging predictors", Machine Learning 24,
- pages 123-140, 1996.
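- Examples
- --------
- A minimal sketch; as with Friedman #2, the input has four features.
- >>> from sklearn.datasets import make_friedman3
- >>> X, y = make_friedman3(n_samples=50, noise=0.1, random_state=0)
- >>> X.shape
- (50, 4)
- >>> y.shape
- (50,)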
- """
- generator = check_random_state(random_state)
- X = generator.uniform(size=(n_samples, 4))
- X[:, 0] *= 100
- X[:, 1] *= 520 * np.pi
- X[:, 1] += 40 * np.pi
- X[:, 3] *= 10
- X[:, 3] += 1
- y = np.arctan(
- (X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) / X[:, 0]
- ) + noise * generator.standard_normal(size=(n_samples))
- return X, y
- @validate_params(
- {
- "n_samples": [Interval(Integral, 1, None, closed="left")],
- "n_features": [Interval(Integral, 1, None, closed="left")],
- "effective_rank": [Interval(Integral, 1, None, closed="left")],
- "tail_strength": [Interval(Real, 0, 1, closed="both")],
- "random_state": ["random_state"],
- },
- prefer_skip_nested_validation=True,
- )
- def make_low_rank_matrix(
- n_samples=100,
- n_features=100,
- *,
- effective_rank=10,
- tail_strength=0.5,
- random_state=None,
- ):
- """Generate a mostly low rank matrix with bell-shaped singular values.
- Most of the variance can be explained by a bell-shaped curve of width
- effective_rank: the low rank part of the singular values profile is::
- (1 - tail_strength) * exp(-1.0 * (i / effective_rank) ** 2)
- The remaining singular values' tail is fat, decreasing as::
- tail_strength * exp(-0.1 * i / effective_rank).
- The low rank part of the profile can be considered the structured
- signal part of the data while the tail can be considered the noisy
- part of the data that cannot be summarized by a low number of linear
- components (singular vectors).
- This kind of singular profile is often seen in practice, for instance:
- - gray level pictures of faces
- - TF-IDF vectors of text documents crawled from the web
- Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- n_samples : int, default=100
- The number of samples.
- n_features : int, default=100
- The number of features.
- effective_rank : int, default=10
- The approximate number of singular vectors required to explain most of
- the data by linear combinations.
- tail_strength : float, default=0.5
- The relative importance of the fat noisy tail of the singular values
- profile. The value should be between 0 and 1.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset creation. Pass an int
- for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- X : ndarray of shape (n_samples, n_features)
- The matrix.
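- Examples
- --------
- A sketch checking the output shape; the (approximately low) numerical
- rank can be inspected via the singular values, e.g. with
- ``numpy.linalg.svd``.
- >>> from sklearn.datasets import make_low_rank_matrix
- >>> X = make_low_rank_matrix(
- ...     n_samples=50, n_features=25, effective_rank=5, random_state=0
- ... )
- >>> X.shape
- (50, 25)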
- """
- generator = check_random_state(random_state)
- n = min(n_samples, n_features)
- # Random (orthonormal) vectors
- u, _ = linalg.qr(
- generator.standard_normal(size=(n_samples, n)),
- mode="economic",
- check_finite=False,
- )
- v, _ = linalg.qr(
- generator.standard_normal(size=(n_features, n)),
- mode="economic",
- check_finite=False,
- )
- # Index of the singular values
- singular_ind = np.arange(n, dtype=np.float64)
- # Build the singular profile by assembling signal and noise components
- low_rank = (1 - tail_strength) * np.exp(-1.0 * (singular_ind / effective_rank) ** 2)
- tail = tail_strength * np.exp(-0.1 * singular_ind / effective_rank)
- s = np.identity(n) * (low_rank + tail)
- return np.dot(np.dot(u, s), v.T)
- @validate_params(
- {
- "n_samples": [Interval(Integral, 1, None, closed="left")],
- "n_components": [Interval(Integral, 1, None, closed="left")],
- "n_features": [Interval(Integral, 1, None, closed="left")],
- "n_nonzero_coefs": [Interval(Integral, 1, None, closed="left")],
- "random_state": ["random_state"],
- "data_transposed": ["boolean", Hidden(StrOptions({"deprecated"}))],
- },
- prefer_skip_nested_validation=True,
- )
- def make_sparse_coded_signal(
- n_samples,
- *,
- n_components,
- n_features,
- n_nonzero_coefs,
- random_state=None,
- data_transposed="deprecated",
- ):
- """Generate a signal as a sparse combination of dictionary elements.
- Returns a matrix `Y = DX`, such that `D` is of shape `(n_features, n_components)`,
- `X` is of shape `(n_components, n_samples)` and each column of `X` has exactly
- `n_nonzero_coefs` non-zero elements.
- Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- n_samples : int
- Number of samples to generate.
- n_components : int
- Number of components in the dictionary.
- n_features : int
- Number of features of the dataset to generate.
- n_nonzero_coefs : int
- Number of active (non-zero) coefficients in each sample.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset creation. Pass an int
- for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- data_transposed : bool, default=False
- By default, Y, D and X are not transposed.
- .. versionadded:: 1.1
- .. versionchanged:: 1.3
- Default value changed from True to False.
- .. deprecated:: 1.3
- `data_transposed` is deprecated and will be removed in 1.5.
- Returns
- -------
- data : ndarray of shape (n_features, n_samples) or (n_samples, n_features)
- The encoded signal (Y). The shape is `(n_samples, n_features)` if
- `data_transposed` is False, otherwise it's `(n_features, n_samples)`.
- dictionary : ndarray of shape (n_features, n_components) or \
- (n_components, n_features)
- The dictionary with normalized components (D). The shape is
- `(n_components, n_features)` if `data_transposed` is False, otherwise it's
- `(n_features, n_components)`.
- code : ndarray of shape (n_components, n_samples) or (n_samples, n_components)
- The sparse code such that each column of this matrix has exactly
- n_nonzero_coefs non-zero items (X). The shape is `(n_samples, n_components)`
- if `data_transposed` is False, otherwise it's `(n_components, n_samples)`.
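- Examples
- --------
- A small sketch using the default (non-transposed) layout; each row of
- the returned code has exactly ``n_nonzero_coefs`` non-zero entries.
- >>> from sklearn.datasets import make_sparse_coded_signal
- >>> data, dictionary, code = make_sparse_coded_signal(
- ...     n_samples=5, n_components=8, n_features=10, n_nonzero_coefs=3,
- ...     random_state=0)
- >>> data.shape
- (5, 10)
- >>> dictionary.shape
- (8, 10)
- >>> code.shape
- (5, 8)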
- """
- generator = check_random_state(random_state)
- # generate dictionary
- D = generator.standard_normal(size=(n_features, n_components))
- D /= np.sqrt(np.sum((D**2), axis=0))
- # generate code
- X = np.zeros((n_components, n_samples))
- for i in range(n_samples):
- idx = np.arange(n_components)
- generator.shuffle(idx)
- idx = idx[:n_nonzero_coefs]
- X[idx, i] = generator.standard_normal(size=n_nonzero_coefs)
- # encode signal
- Y = np.dot(D, X)
- # TODO(1.5) remove data_transposed
- # raise warning if data_transposed is not passed explicitly
- if data_transposed != "deprecated":
- warnings.warn(
- "data_transposed was deprecated in version 1.3 and will be removed in 1.5.",
- FutureWarning,
- )
- else:
- data_transposed = False
- # transpose if needed
- if not data_transposed:
- Y, D, X = Y.T, D.T, X.T
- return map(np.squeeze, (Y, D, X))
- @validate_params(
- {
- "n_samples": [Interval(Integral, 1, None, closed="left")],
- "n_features": [Interval(Integral, 1, None, closed="left")],
- "random_state": ["random_state"],
- },
- prefer_skip_nested_validation=True,
- )
- def make_sparse_uncorrelated(n_samples=100, n_features=10, *, random_state=None):
- """Generate a random regression problem with sparse uncorrelated design.
- This dataset is described in Celeux et al. [1] as::
- X ~ N(0, 1)
- y(X) = X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3]
- Only the first 4 features are informative. The remaining features are
- useless.
- Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- n_samples : int, default=100
- The number of samples.
- n_features : int, default=10
- The number of features.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset creation. Pass an int
- for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- X : ndarray of shape (n_samples, n_features)
- The input samples.
- y : ndarray of shape (n_samples,)
- The output values.
- References
- ----------
- .. [1] G. Celeux, M. El Anbari, J.-M. Marin, C. P. Robert,
- "Regularization in regression: comparing Bayesian and frequentist
- methods in a poorly informative situation", 2009.
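- Examples
- --------
- A minimal sketch; only the first four features affect ``y``.
- >>> from sklearn.datasets import make_sparse_uncorrelated
- >>> X, y = make_sparse_uncorrelated(n_samples=50, n_features=10, random_state=0)
- >>> X.shape
- (50, 10)
- >>> y.shape
- (50,)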
- """
- generator = check_random_state(random_state)
- X = generator.normal(loc=0, scale=1, size=(n_samples, n_features))
- y = generator.normal(
- loc=(X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3]),
- scale=np.ones(n_samples),
- )
- return X, y
- @validate_params(
- {
- "n_dim": [Interval(Integral, 1, None, closed="left")],
- "random_state": ["random_state"],
- },
- prefer_skip_nested_validation=True,
- )
- def make_spd_matrix(n_dim, *, random_state=None):
- """Generate a random symmetric, positive-definite matrix.
- Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- n_dim : int
- The matrix dimension.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset creation. Pass an int
- for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- X : ndarray of shape (n_dim, n_dim)
- The random symmetric, positive-definite matrix.
- See Also
- --------
- make_sparse_spd_matrix : Generate a sparse symmetric positive-definite matrix.
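- Examples
- --------
- A sketch verifying symmetry; positive definiteness could likewise be
- checked through the eigenvalues.
- >>> import numpy as np
- >>> from sklearn.datasets import make_spd_matrix
- >>> X = make_spd_matrix(n_dim=3, random_state=0)
- >>> X.shape
- (3, 3)
- >>> np.allclose(X, X.T)
- True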
- """
- generator = check_random_state(random_state)
- A = generator.uniform(size=(n_dim, n_dim))
- U, _, Vt = linalg.svd(np.dot(A.T, A), check_finite=False)
- X = np.dot(np.dot(U, 1.0 + np.diag(generator.uniform(size=n_dim))), Vt)
- return X
- @validate_params(
- {
- "dim": [Interval(Integral, 1, None, closed="left")],
- "alpha": [Interval(Real, 0, 1, closed="both")],
- "norm_diag": ["boolean"],
- "smallest_coef": [Interval(Real, 0, 1, closed="both")],
- "largest_coef": [Interval(Real, 0, 1, closed="both")],
- "random_state": ["random_state"],
- },
- prefer_skip_nested_validation=True,
- )
- def make_sparse_spd_matrix(
- dim=1,
- *,
- alpha=0.95,
- norm_diag=False,
- smallest_coef=0.1,
- largest_coef=0.9,
- random_state=None,
- ):
- """Generate a sparse symmetric definite positive matrix.
- Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- dim : int, default=1
- The size of the random matrix to generate.
- alpha : float, default=0.95
- The probability that a coefficient is zero (see notes). Larger values
- enforce more sparsity. The value should be between 0 and 1.
- norm_diag : bool, default=False
- Whether to normalize the output matrix to make the leading diagonal
- elements all 1.
- smallest_coef : float, default=0.1
- The value of the smallest coefficient between 0 and 1.
- largest_coef : float, default=0.9
- The value of the largest coefficient between 0 and 1.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset creation. Pass an int
- for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- prec : ndarray of shape (dim, dim)
- The generated matrix.
- See Also
- --------
- make_spd_matrix : Generate a random symmetric, positive-definite matrix.
- Notes
- -----
- The sparsity is actually imposed on the Cholesky factor of the matrix.
- Thus ``alpha`` does not translate directly into the filling fraction of
- the matrix itself.
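    Examples
    --------
    A brief sketch; only seed-independent properties (shape and symmetry)
    are asserted:

    >>> import numpy as np
    >>> from sklearn.datasets import make_sparse_spd_matrix
    >>> prec = make_sparse_spd_matrix(dim=4, alpha=0.9, random_state=0)
    >>> prec.shape
    (4, 4)
    >>> bool(np.allclose(prec, prec.T))  # symmetric by construction
    True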
- """
- random_state = check_random_state(random_state)
- chol = -np.eye(dim)
- aux = random_state.uniform(size=(dim, dim))
- aux[aux < alpha] = 0
- aux[aux > alpha] = smallest_coef + (
- largest_coef - smallest_coef
- ) * random_state.uniform(size=np.sum(aux > alpha))
- aux = np.tril(aux, k=-1)
- # Permute the lines: we don't want to have asymmetries in the final
- # SPD matrix
- permutation = random_state.permutation(dim)
- aux = aux[permutation].T[permutation]
- chol += aux
- prec = np.dot(chol.T, chol)
- if norm_diag:
- # Form the diagonal vector into a row matrix
- d = np.diag(prec).reshape(1, prec.shape[0])
- d = 1.0 / np.sqrt(d)
- prec *= d
- prec *= d.T
- return prec
@validate_params(
    {
        "n_samples": [Interval(Integral, 1, None, closed="left")],
        "noise": [Interval(Real, 0, None, closed="left")],
        "random_state": ["random_state"],
        "hole": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None, hole=False):
- """Generate a swiss roll dataset.
- Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- n_samples : int, default=100
- The number of sample points on the Swiss Roll.
- noise : float, default=0.0
- The standard deviation of the gaussian noise.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset creation. Pass an int
- for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- hole : bool, default=False
- If True generates the swiss roll with hole dataset.
- Returns
- -------
- X : ndarray of shape (n_samples, 3)
- The points.
- t : ndarray of shape (n_samples,)
- The univariate position of the sample according to the main dimension
- of the points in the manifold.
- Notes
- -----
- The algorithm is from Marsland [1].
- References
- ----------
- .. [1] S. Marsland, "Machine Learning: An Algorithmic Perspective", 2nd edition,
- Chapter 6, 2014.
- https://homepages.ecs.vuw.ac.nz/~marslast/Code/Ch6/lle.py
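    Examples
    --------
    A minimal sketch of the returned shapes:

    >>> from sklearn.datasets import make_swiss_roll
    >>> X, t = make_swiss_roll(n_samples=200, noise=0.05, random_state=0)
    >>> X.shape, t.shape
    ((200, 3), (200,))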
- """
- generator = check_random_state(random_state)
- if not hole:
- t = 1.5 * np.pi * (1 + 2 * generator.uniform(size=n_samples))
- y = 21 * generator.uniform(size=n_samples)
- else:
- corners = np.array(
- [[np.pi * (1.5 + i), j * 7] for i in range(3) for j in range(3)]
- )
- corners = np.delete(corners, 4, axis=0)
- corner_index = generator.choice(8, n_samples)
- parameters = generator.uniform(size=(2, n_samples)) * np.array([[np.pi], [7]])
- t, y = corners[corner_index].T + parameters
- x = t * np.cos(t)
- z = t * np.sin(t)
- X = np.vstack((x, y, z))
- X += noise * generator.standard_normal(size=(3, n_samples))
- X = X.T
- t = np.squeeze(t)
- return X, t
@validate_params(
    {
        "n_samples": [Interval(Integral, 1, None, closed="left")],
        "noise": [Interval(Real, 0, None, closed="left")],
        "random_state": ["random_state"],
    },
    prefer_skip_nested_validation=True,
)
def make_s_curve(n_samples=100, *, noise=0.0, random_state=None):
- """Generate an S curve dataset.
- Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- n_samples : int, default=100
- The number of sample points on the S curve.
- noise : float, default=0.0
- The standard deviation of the gaussian noise.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset creation. Pass an int
- for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- X : ndarray of shape (n_samples, 3)
- The points.
- t : ndarray of shape (n_samples,)
- The univariate position of the sample according to the main dimension
- of the points in the manifold.
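    Examples
    --------
    A minimal sketch of the returned shapes:

    >>> from sklearn.datasets import make_s_curve
    >>> X, t = make_s_curve(n_samples=150, noise=0.1, random_state=0)
    >>> X.shape, t.shape
    ((150, 3), (150,))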
- """
- generator = check_random_state(random_state)
- t = 3 * np.pi * (generator.uniform(size=(1, n_samples)) - 0.5)
- X = np.empty(shape=(n_samples, 3), dtype=np.float64)
- X[:, 0] = np.sin(t)
- X[:, 1] = 2.0 * generator.uniform(size=n_samples)
- X[:, 2] = np.sign(t) * (np.cos(t) - 1)
- X += noise * generator.standard_normal(size=(3, n_samples)).T
- t = np.squeeze(t)
- return X, t
@validate_params(
    {
        "mean": ["array-like", None],
        "cov": [Interval(Real, 0, None, closed="left")],
        "n_samples": [Interval(Integral, 1, None, closed="left")],
        "n_features": [Interval(Integral, 1, None, closed="left")],
        "n_classes": [Interval(Integral, 1, None, closed="left")],
        "shuffle": ["boolean"],
        "random_state": ["random_state"],
    },
    prefer_skip_nested_validation=True,
)
def make_gaussian_quantiles(
    *,
    mean=None,
    cov=1.0,
    n_samples=100,
    n_features=2,
    n_classes=3,
    shuffle=True,
    random_state=None,
):
- r"""Generate isotropic Gaussian and label samples by quantile.
- This classification dataset is constructed by taking a multi-dimensional
- standard normal distribution and defining classes separated by nested
- concentric multi-dimensional spheres such that roughly equal numbers of
- samples are in each class (quantiles of the :math:`\chi^2` distribution).
- Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- mean : ndarray of shape (n_features,), default=None
- The mean of the multi-dimensional normal distribution.
- If None then use the origin (0, 0, ...).
- cov : float, default=1.0
- The covariance matrix will be this value times the unit matrix. This
- dataset only produces symmetric normal distributions.
- n_samples : int, default=100
- The total number of points equally divided among classes.
- n_features : int, default=2
- The number of features for each sample.
- n_classes : int, default=3
- The number of classes.
- shuffle : bool, default=True
- Shuffle the samples.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset creation. Pass an int
- for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- X : ndarray of shape (n_samples, n_features)
- The generated samples.
- y : ndarray of shape (n_samples,)
- The integer labels for quantile membership of each sample.
- Notes
- -----
- The dataset is from Zhu et al [1].
- References
- ----------
- .. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", 2009.
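    Examples
    --------
    A short sketch; with 100 samples and 3 classes the quantile labels are
    split almost evenly:

    >>> import numpy as np
    >>> from sklearn.datasets import make_gaussian_quantiles
    >>> X, y = make_gaussian_quantiles(n_samples=100, n_features=2,
    ...                                n_classes=3, random_state=0)
    >>> X.shape, y.shape
    ((100, 2), (100,))
    >>> np.bincount(y)
    array([33, 33, 34])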
- """
- if n_samples < n_classes:
- raise ValueError("n_samples must be at least n_classes")
- generator = check_random_state(random_state)
- if mean is None:
- mean = np.zeros(n_features)
- else:
- mean = np.array(mean)
- # Build multivariate normal distribution
- X = generator.multivariate_normal(mean, cov * np.identity(n_features), (n_samples,))
- # Sort by distance from origin
- idx = np.argsort(np.sum((X - mean[np.newaxis, :]) ** 2, axis=1))
- X = X[idx, :]
- # Label by quantile
- step = n_samples // n_classes
- y = np.hstack(
- [
- np.repeat(np.arange(n_classes), step),
- np.repeat(n_classes - 1, n_samples - step * n_classes),
- ]
- )
- if shuffle:
- X, y = util_shuffle(X, y, random_state=generator)
- return X, y
def _shuffle(data, random_state=None):
    """Randomly permute the rows and columns of ``data``.

    Returns the shuffled array along with the row and column permutation
    indices, so callers can re-align their cluster labels.
    """
    generator = check_random_state(random_state)
    n_rows, n_cols = data.shape
    row_idx = generator.permutation(n_rows)
    col_idx = generator.permutation(n_cols)
    result = data[row_idx][:, col_idx]
    return result, row_idx, col_idx


@validate_params(
    {
        "shape": [tuple],
        "n_clusters": [Interval(Integral, 1, None, closed="left")],
        "noise": [Interval(Real, 0, None, closed="left")],
        "minval": [Interval(Real, None, None, closed="neither")],
        "maxval": [Interval(Real, None, None, closed="neither")],
        "shuffle": ["boolean"],
        "random_state": ["random_state"],
    },
    prefer_skip_nested_validation=True,
)
def make_biclusters(
    shape,
    n_clusters,
    *,
    noise=0.0,
    minval=10,
    maxval=100,
    shuffle=True,
    random_state=None,
):
- """Generate a constant block diagonal structure array for biclustering.
- Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- shape : tuple of shape (n_rows, n_cols)
- The shape of the result.
- n_clusters : int
- The number of biclusters.
- noise : float, default=0.0
- The standard deviation of the gaussian noise.
- minval : float, default=10
- Minimum value of a bicluster.
- maxval : float, default=100
- Maximum value of a bicluster.
- shuffle : bool, default=True
- Shuffle the samples.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset creation. Pass an int
- for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- X : ndarray of shape `shape`
- The generated array.
- rows : ndarray of shape (n_clusters, X.shape[0])
- The indicators for cluster membership of each row.
- cols : ndarray of shape (n_clusters, X.shape[1])
- The indicators for cluster membership of each column.
- See Also
- --------
- make_checkerboard: Generate an array with block checkerboard structure for
- biclustering.
- References
- ----------
- .. [1] Dhillon, I. S. (2001, August). Co-clustering documents and
- words using bipartite spectral graph partitioning. In Proceedings
- of the seventh ACM SIGKDD international conference on Knowledge
- discovery and data mining (pp. 269-274). ACM.
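    Examples
    --------
    A minimal sketch of the returned shapes:

    >>> from sklearn.datasets import make_biclusters
    >>> X, rows, cols = make_biclusters((30, 20), n_clusters=3,
    ...                                 random_state=0)
    >>> X.shape, rows.shape, cols.shape
    ((30, 20), (3, 30), (3, 20))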
- """
- generator = check_random_state(random_state)
- n_rows, n_cols = shape
- consts = generator.uniform(minval, maxval, n_clusters)
- # row and column clusters of approximately equal sizes
- row_sizes = generator.multinomial(n_rows, np.repeat(1.0 / n_clusters, n_clusters))
- col_sizes = generator.multinomial(n_cols, np.repeat(1.0 / n_clusters, n_clusters))
- row_labels = np.hstack(
- [np.repeat(val, rep) for val, rep in zip(range(n_clusters), row_sizes)]
- )
- col_labels = np.hstack(
- [np.repeat(val, rep) for val, rep in zip(range(n_clusters), col_sizes)]
- )
- result = np.zeros(shape, dtype=np.float64)
- for i in range(n_clusters):
- selector = np.outer(row_labels == i, col_labels == i)
- result[selector] += consts[i]
- if noise > 0:
- result += generator.normal(scale=noise, size=result.shape)
- if shuffle:
- result, row_idx, col_idx = _shuffle(result, random_state)
- row_labels = row_labels[row_idx]
- col_labels = col_labels[col_idx]
- rows = np.vstack([row_labels == c for c in range(n_clusters)])
- cols = np.vstack([col_labels == c for c in range(n_clusters)])
- return result, rows, cols
@validate_params(
    {
        "shape": [tuple],
        "n_clusters": [Interval(Integral, 1, None, closed="left"), "array-like"],
        "noise": [Interval(Real, 0, None, closed="left")],
        "minval": [Interval(Real, None, None, closed="neither")],
        "maxval": [Interval(Real, None, None, closed="neither")],
        "shuffle": ["boolean"],
        "random_state": ["random_state"],
    },
    prefer_skip_nested_validation=True,
)
def make_checkerboard(
    shape,
    n_clusters,
    *,
    noise=0.0,
    minval=10,
    maxval=100,
    shuffle=True,
    random_state=None,
):
- """Generate an array with block checkerboard structure for biclustering.
- Read more in the :ref:`User Guide <sample_generators>`.
- Parameters
- ----------
- shape : tuple of shape (n_rows, n_cols)
- The shape of the result.
- n_clusters : int or array-like or shape (n_row_clusters, n_column_clusters)
- The number of row and column clusters.
- noise : float, default=0.0
- The standard deviation of the gaussian noise.
- minval : float, default=10
- Minimum value of a bicluster.
- maxval : float, default=100
- Maximum value of a bicluster.
- shuffle : bool, default=True
- Shuffle the samples.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for dataset creation. Pass an int
- for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- X : ndarray of shape `shape`
- The generated array.
- rows : ndarray of shape (n_clusters, X.shape[0])
- The indicators for cluster membership of each row.
- cols : ndarray of shape (n_clusters, X.shape[1])
- The indicators for cluster membership of each column.
- See Also
- --------
- make_biclusters : Generate an array with constant block diagonal structure
- for biclustering.
- References
- ----------
- .. [1] Kluger, Y., Basri, R., Chang, J. T., & Gerstein, M. (2003).
- Spectral biclustering of microarray data: coclustering genes
- and conditions. Genome research, 13(4), 703-716.
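    Examples
    --------
    A small sketch; with (2, 3) clusters the indicator arrays have
    2 * 3 = 6 rows:

    >>> from sklearn.datasets import make_checkerboard
    >>> X, rows, cols = make_checkerboard((30, 20), n_clusters=(2, 3),
    ...                                   random_state=0)
    >>> X.shape, rows.shape, cols.shape
    ((30, 20), (6, 30), (6, 20))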
- """
- generator = check_random_state(random_state)
- if hasattr(n_clusters, "__len__"):
- n_row_clusters, n_col_clusters = n_clusters
- else:
- n_row_clusters = n_col_clusters = n_clusters
- # row and column clusters of approximately equal sizes
- n_rows, n_cols = shape
- row_sizes = generator.multinomial(
- n_rows, np.repeat(1.0 / n_row_clusters, n_row_clusters)
- )
- col_sizes = generator.multinomial(
- n_cols, np.repeat(1.0 / n_col_clusters, n_col_clusters)
- )
- row_labels = np.hstack(
- [np.repeat(val, rep) for val, rep in zip(range(n_row_clusters), row_sizes)]
- )
- col_labels = np.hstack(
- [np.repeat(val, rep) for val, rep in zip(range(n_col_clusters), col_sizes)]
- )
- result = np.zeros(shape, dtype=np.float64)
- for i in range(n_row_clusters):
- for j in range(n_col_clusters):
- selector = np.outer(row_labels == i, col_labels == j)
- result[selector] += generator.uniform(minval, maxval)
- if noise > 0:
- result += generator.normal(scale=noise, size=result.shape)
- if shuffle:
- result, row_idx, col_idx = _shuffle(result, random_state)
- row_labels = row_labels[row_idx]
- col_labels = col_labels[col_idx]
- rows = np.vstack(
- [
- row_labels == label
- for label in range(n_row_clusters)
- for _ in range(n_col_clusters)
- ]
- )
- cols = np.vstack(
- [
- col_labels == label
- for _ in range(n_row_clusters)
- for label in range(n_col_clusters)
- ]
- )
- return result, rows, cols