_samples_generator.py 69 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126
  1. """
  2. Generate samples of synthetic data sets.
  3. """
  4. # Authors: B. Thirion, G. Varoquaux, A. Gramfort, V. Michel, O. Grisel,
  5. # G. Louppe, J. Nothman
  6. # License: BSD 3 clause
  7. import array
  8. import numbers
  9. import warnings
  10. from collections.abc import Iterable
  11. from numbers import Integral, Real
  12. import numpy as np
  13. import scipy.sparse as sp
  14. from scipy import linalg
  15. from ..preprocessing import MultiLabelBinarizer
  16. from ..utils import check_array, check_random_state
  17. from ..utils import shuffle as util_shuffle
  18. from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params
  19. from ..utils.random import sample_without_replacement
  20. def _generate_hypercube(samples, dimensions, rng):
  21. """Returns distinct binary samples of length dimensions."""
  22. if dimensions > 30:
  23. return np.hstack(
  24. [
  25. rng.randint(2, size=(samples, dimensions - 30)),
  26. _generate_hypercube(samples, 30, rng),
  27. ]
  28. )
  29. out = sample_without_replacement(2**dimensions, samples, random_state=rng).astype(
  30. dtype=">u4", copy=False
  31. )
  32. out = np.unpackbits(out.view(">u1")).reshape((-1, 32))[:, -dimensions:]
  33. return out
  34. @validate_params(
  35. {
  36. "n_samples": [Interval(Integral, 1, None, closed="left")],
  37. "n_features": [Interval(Integral, 1, None, closed="left")],
  38. "n_informative": [Interval(Integral, 1, None, closed="left")],
  39. "n_redundant": [Interval(Integral, 0, None, closed="left")],
  40. "n_repeated": [Interval(Integral, 0, None, closed="left")],
  41. "n_classes": [Interval(Integral, 1, None, closed="left")],
  42. "n_clusters_per_class": [Interval(Integral, 1, None, closed="left")],
  43. "weights": ["array-like", None],
  44. "flip_y": [Interval(Real, 0, 1, closed="both")],
  45. "class_sep": [Interval(Real, 0, None, closed="neither")],
  46. "hypercube": ["boolean"],
  47. "shift": [Interval(Real, None, None, closed="neither"), "array-like", None],
  48. "scale": [Interval(Real, 0, None, closed="neither"), "array-like", None],
  49. "shuffle": ["boolean"],
  50. "random_state": ["random_state"],
  51. },
  52. prefer_skip_nested_validation=True,
  53. )
  54. def make_classification(
  55. n_samples=100,
  56. n_features=20,
  57. *,
  58. n_informative=2,
  59. n_redundant=2,
  60. n_repeated=0,
  61. n_classes=2,
  62. n_clusters_per_class=2,
  63. weights=None,
  64. flip_y=0.01,
  65. class_sep=1.0,
  66. hypercube=True,
  67. shift=0.0,
  68. scale=1.0,
  69. shuffle=True,
  70. random_state=None,
  71. ):
  72. """Generate a random n-class classification problem.
  73. This initially creates clusters of points normally distributed (std=1)
  74. about vertices of an ``n_informative``-dimensional hypercube with sides of
  75. length ``2*class_sep`` and assigns an equal number of clusters to each
  76. class. It introduces interdependence between these features and adds
  77. various types of further noise to the data.
  78. Without shuffling, ``X`` horizontally stacks features in the following
  79. order: the primary ``n_informative`` features, followed by ``n_redundant``
  80. linear combinations of the informative features, followed by ``n_repeated``
  81. duplicates, drawn randomly with replacement from the informative and
  82. redundant features. The remaining features are filled with random noise.
  83. Thus, without shuffling, all useful features are contained in the columns
  84. ``X[:, :n_informative + n_redundant + n_repeated]``.
  85. Read more in the :ref:`User Guide <sample_generators>`.
  86. Parameters
  87. ----------
  88. n_samples : int, default=100
  89. The number of samples.
  90. n_features : int, default=20
  91. The total number of features. These comprise ``n_informative``
  92. informative features, ``n_redundant`` redundant features,
  93. ``n_repeated`` duplicated features and
  94. ``n_features-n_informative-n_redundant-n_repeated`` useless features
  95. drawn at random.
  96. n_informative : int, default=2
  97. The number of informative features. Each class is composed of a number
  98. of gaussian clusters each located around the vertices of a hypercube
  99. in a subspace of dimension ``n_informative``. For each cluster,
  100. informative features are drawn independently from N(0, 1) and then
  101. randomly linearly combined within each cluster in order to add
  102. covariance. The clusters are then placed on the vertices of the
  103. hypercube.
  104. n_redundant : int, default=2
  105. The number of redundant features. These features are generated as
  106. random linear combinations of the informative features.
  107. n_repeated : int, default=0
  108. The number of duplicated features, drawn randomly from the informative
  109. and the redundant features.
  110. n_classes : int, default=2
  111. The number of classes (or labels) of the classification problem.
  112. n_clusters_per_class : int, default=2
  113. The number of clusters per class.
  114. weights : array-like of shape (n_classes,) or (n_classes - 1,),\
  115. default=None
  116. The proportions of samples assigned to each class. If None, then
  117. classes are balanced. Note that if ``len(weights) == n_classes - 1``,
  118. then the last class weight is automatically inferred.
  119. More than ``n_samples`` samples may be returned if the sum of
  120. ``weights`` exceeds 1. Note that the actual class proportions will
  121. not exactly match ``weights`` when ``flip_y`` isn't 0.
  122. flip_y : float, default=0.01
  123. The fraction of samples whose class is assigned randomly. Larger
  124. values introduce noise in the labels and make the classification
  125. task harder. Note that the default setting flip_y > 0 might lead
  126. to less than ``n_classes`` in y in some cases.
  127. class_sep : float, default=1.0
  128. The factor multiplying the hypercube size. Larger values spread
  129. out the clusters/classes and make the classification task easier.
  130. hypercube : bool, default=True
  131. If True, the clusters are put on the vertices of a hypercube. If
  132. False, the clusters are put on the vertices of a random polytope.
  133. shift : float, ndarray of shape (n_features,) or None, default=0.0
  134. Shift features by the specified value. If None, then features
  135. are shifted by a random value drawn in [-class_sep, class_sep].
  136. scale : float, ndarray of shape (n_features,) or None, default=1.0
  137. Multiply features by the specified value. If None, then features
  138. are scaled by a random value drawn in [1, 100]. Note that scaling
  139. happens after shifting.
  140. shuffle : bool, default=True
  141. Shuffle the samples and the features.
  142. random_state : int, RandomState instance or None, default=None
  143. Determines random number generation for dataset creation. Pass an int
  144. for reproducible output across multiple function calls.
  145. See :term:`Glossary <random_state>`.
  146. Returns
  147. -------
  148. X : ndarray of shape (n_samples, n_features)
  149. The generated samples.
  150. y : ndarray of shape (n_samples,)
  151. The integer labels for class membership of each sample.
  152. See Also
  153. --------
  154. make_blobs : Simplified variant.
  155. make_multilabel_classification : Unrelated generator for multilabel tasks.
  156. Notes
  157. -----
  158. The algorithm is adapted from Guyon [1] and was designed to generate
  159. the "Madelon" dataset.
  160. References
  161. ----------
  162. .. [1] I. Guyon, "Design of experiments for the NIPS 2003 variable
  163. selection benchmark", 2003.
  164. """
  165. generator = check_random_state(random_state)
  166. # Count features, clusters and samples
  167. if n_informative + n_redundant + n_repeated > n_features:
  168. raise ValueError(
  169. "Number of informative, redundant and repeated "
  170. "features must sum to less than the number of total"
  171. " features"
  172. )
  173. # Use log2 to avoid overflow errors
  174. if n_informative < np.log2(n_classes * n_clusters_per_class):
  175. msg = "n_classes({}) * n_clusters_per_class({}) must be"
  176. msg += " smaller or equal 2**n_informative({})={}"
  177. raise ValueError(
  178. msg.format(
  179. n_classes, n_clusters_per_class, n_informative, 2**n_informative
  180. )
  181. )
  182. if weights is not None:
  183. if len(weights) not in [n_classes, n_classes - 1]:
  184. raise ValueError(
  185. "Weights specified but incompatible with number of classes."
  186. )
  187. if len(weights) == n_classes - 1:
  188. if isinstance(weights, list):
  189. weights = weights + [1.0 - sum(weights)]
  190. else:
  191. weights = np.resize(weights, n_classes)
  192. weights[-1] = 1.0 - sum(weights[:-1])
  193. else:
  194. weights = [1.0 / n_classes] * n_classes
  195. n_useless = n_features - n_informative - n_redundant - n_repeated
  196. n_clusters = n_classes * n_clusters_per_class
  197. # Distribute samples among clusters by weight
  198. n_samples_per_cluster = [
  199. int(n_samples * weights[k % n_classes] / n_clusters_per_class)
  200. for k in range(n_clusters)
  201. ]
  202. for i in range(n_samples - sum(n_samples_per_cluster)):
  203. n_samples_per_cluster[i % n_clusters] += 1
  204. # Initialize X and y
  205. X = np.zeros((n_samples, n_features))
  206. y = np.zeros(n_samples, dtype=int)
  207. # Build the polytope whose vertices become cluster centroids
  208. centroids = _generate_hypercube(n_clusters, n_informative, generator).astype(
  209. float, copy=False
  210. )
  211. centroids *= 2 * class_sep
  212. centroids -= class_sep
  213. if not hypercube:
  214. centroids *= generator.uniform(size=(n_clusters, 1))
  215. centroids *= generator.uniform(size=(1, n_informative))
  216. # Initially draw informative features from the standard normal
  217. X[:, :n_informative] = generator.standard_normal(size=(n_samples, n_informative))
  218. # Create each cluster; a variant of make_blobs
  219. stop = 0
  220. for k, centroid in enumerate(centroids):
  221. start, stop = stop, stop + n_samples_per_cluster[k]
  222. y[start:stop] = k % n_classes # assign labels
  223. X_k = X[start:stop, :n_informative] # slice a view of the cluster
  224. A = 2 * generator.uniform(size=(n_informative, n_informative)) - 1
  225. X_k[...] = np.dot(X_k, A) # introduce random covariance
  226. X_k += centroid # shift the cluster to a vertex
  227. # Create redundant features
  228. if n_redundant > 0:
  229. B = 2 * generator.uniform(size=(n_informative, n_redundant)) - 1
  230. X[:, n_informative : n_informative + n_redundant] = np.dot(
  231. X[:, :n_informative], B
  232. )
  233. # Repeat some features
  234. if n_repeated > 0:
  235. n = n_informative + n_redundant
  236. indices = ((n - 1) * generator.uniform(size=n_repeated) + 0.5).astype(np.intp)
  237. X[:, n : n + n_repeated] = X[:, indices]
  238. # Fill useless features
  239. if n_useless > 0:
  240. X[:, -n_useless:] = generator.standard_normal(size=(n_samples, n_useless))
  241. # Randomly replace labels
  242. if flip_y >= 0.0:
  243. flip_mask = generator.uniform(size=n_samples) < flip_y
  244. y[flip_mask] = generator.randint(n_classes, size=flip_mask.sum())
  245. # Randomly shift and scale
  246. if shift is None:
  247. shift = (2 * generator.uniform(size=n_features) - 1) * class_sep
  248. X += shift
  249. if scale is None:
  250. scale = 1 + 100 * generator.uniform(size=n_features)
  251. X *= scale
  252. if shuffle:
  253. # Randomly permute samples
  254. X, y = util_shuffle(X, y, random_state=generator)
  255. # Randomly permute features
  256. indices = np.arange(n_features)
  257. generator.shuffle(indices)
  258. X[:, :] = X[:, indices]
  259. return X, y
  260. @validate_params(
  261. {
  262. "n_samples": [Interval(Integral, 1, None, closed="left")],
  263. "n_features": [Interval(Integral, 1, None, closed="left")],
  264. "n_classes": [Interval(Integral, 1, None, closed="left")],
  265. "n_labels": [Interval(Integral, 0, None, closed="left")],
  266. "length": [Interval(Integral, 1, None, closed="left")],
  267. "allow_unlabeled": ["boolean"],
  268. "sparse": ["boolean"],
  269. "return_indicator": [StrOptions({"dense", "sparse"}), "boolean"],
  270. "return_distributions": ["boolean"],
  271. "random_state": ["random_state"],
  272. },
  273. prefer_skip_nested_validation=True,
  274. )
  275. def make_multilabel_classification(
  276. n_samples=100,
  277. n_features=20,
  278. *,
  279. n_classes=5,
  280. n_labels=2,
  281. length=50,
  282. allow_unlabeled=True,
  283. sparse=False,
  284. return_indicator="dense",
  285. return_distributions=False,
  286. random_state=None,
  287. ):
  288. """Generate a random multilabel classification problem.
  289. For each sample, the generative process is:
  290. - pick the number of labels: n ~ Poisson(n_labels)
  291. - n times, choose a class c: c ~ Multinomial(theta)
  292. - pick the document length: k ~ Poisson(length)
  293. - k times, choose a word: w ~ Multinomial(theta_c)
  294. In the above process, rejection sampling is used to make sure that
  295. n is never zero or more than `n_classes`, and that the document length
  296. is never zero. Likewise, we reject classes which have already been chosen.
  297. Read more in the :ref:`User Guide <sample_generators>`.
  298. Parameters
  299. ----------
  300. n_samples : int, default=100
  301. The number of samples.
  302. n_features : int, default=20
  303. The total number of features.
  304. n_classes : int, default=5
  305. The number of classes of the classification problem.
  306. n_labels : int, default=2
  307. The average number of labels per instance. More precisely, the number
  308. of labels per sample is drawn from a Poisson distribution with
  309. ``n_labels`` as its expected value, but samples are bounded (using
  310. rejection sampling) by ``n_classes``, and must be nonzero if
  311. ``allow_unlabeled`` is False.
  312. length : int, default=50
  313. The sum of the features (number of words if documents) is drawn from
  314. a Poisson distribution with this expected value.
  315. allow_unlabeled : bool, default=True
  316. If ``True``, some instances might not belong to any class.
  317. sparse : bool, default=False
  318. If ``True``, return a sparse feature matrix.
  319. .. versionadded:: 0.17
  320. parameter to allow *sparse* output.
  321. return_indicator : {'dense', 'sparse'} or False, default='dense'
  322. If ``'dense'`` return ``Y`` in the dense binary indicator format. If
  323. ``'sparse'`` return ``Y`` in the sparse binary indicator format.
  324. ``False`` returns a list of lists of labels.
  325. return_distributions : bool, default=False
  326. If ``True``, return the prior class probability and conditional
  327. probabilities of features given classes, from which the data was
  328. drawn.
  329. random_state : int, RandomState instance or None, default=None
  330. Determines random number generation for dataset creation. Pass an int
  331. for reproducible output across multiple function calls.
  332. See :term:`Glossary <random_state>`.
  333. Returns
  334. -------
  335. X : ndarray of shape (n_samples, n_features)
  336. The generated samples.
  337. Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
  338. The label sets. Sparse matrix should be of CSR format.
  339. p_c : ndarray of shape (n_classes,)
  340. The probability of each class being drawn. Only returned if
  341. ``return_distributions=True``.
  342. p_w_c : ndarray of shape (n_features, n_classes)
  343. The probability of each feature being drawn given each class.
  344. Only returned if ``return_distributions=True``.
  345. """
  346. generator = check_random_state(random_state)
  347. p_c = generator.uniform(size=n_classes)
  348. p_c /= p_c.sum()
  349. cumulative_p_c = np.cumsum(p_c)
  350. p_w_c = generator.uniform(size=(n_features, n_classes))
  351. p_w_c /= np.sum(p_w_c, axis=0)
  352. def sample_example():
  353. _, n_classes = p_w_c.shape
  354. # pick a nonzero number of labels per document by rejection sampling
  355. y_size = n_classes + 1
  356. while (not allow_unlabeled and y_size == 0) or y_size > n_classes:
  357. y_size = generator.poisson(n_labels)
  358. # pick n classes
  359. y = set()
  360. while len(y) != y_size:
  361. # pick a class with probability P(c)
  362. c = np.searchsorted(cumulative_p_c, generator.uniform(size=y_size - len(y)))
  363. y.update(c)
  364. y = list(y)
  365. # pick a non-zero document length by rejection sampling
  366. n_words = 0
  367. while n_words == 0:
  368. n_words = generator.poisson(length)
  369. # generate a document of length n_words
  370. if len(y) == 0:
  371. # if sample does not belong to any class, generate noise word
  372. words = generator.randint(n_features, size=n_words)
  373. return words, y
  374. # sample words with replacement from selected classes
  375. cumulative_p_w_sample = p_w_c.take(y, axis=1).sum(axis=1).cumsum()
  376. cumulative_p_w_sample /= cumulative_p_w_sample[-1]
  377. words = np.searchsorted(cumulative_p_w_sample, generator.uniform(size=n_words))
  378. return words, y
  379. X_indices = array.array("i")
  380. X_indptr = array.array("i", [0])
  381. Y = []
  382. for i in range(n_samples):
  383. words, y = sample_example()
  384. X_indices.extend(words)
  385. X_indptr.append(len(X_indices))
  386. Y.append(y)
  387. X_data = np.ones(len(X_indices), dtype=np.float64)
  388. X = sp.csr_matrix((X_data, X_indices, X_indptr), shape=(n_samples, n_features))
  389. X.sum_duplicates()
  390. if not sparse:
  391. X = X.toarray()
  392. # return_indicator can be True due to backward compatibility
  393. if return_indicator in (True, "sparse", "dense"):
  394. lb = MultiLabelBinarizer(sparse_output=(return_indicator == "sparse"))
  395. Y = lb.fit([range(n_classes)]).transform(Y)
  396. if return_distributions:
  397. return X, Y, p_c, p_w_c
  398. return X, Y
  399. @validate_params(
  400. {
  401. "n_samples": [Interval(Integral, 1, None, closed="left")],
  402. "random_state": ["random_state"],
  403. },
  404. prefer_skip_nested_validation=True,
  405. )
  406. def make_hastie_10_2(n_samples=12000, *, random_state=None):
  407. """Generate data for binary classification used in Hastie et al. 2009, Example 10.2.
  408. The ten features are standard independent Gaussian and
  409. the target ``y`` is defined by::
  410. y[i] = 1 if np.sum(X[i] ** 2) > 9.34 else -1
  411. Read more in the :ref:`User Guide <sample_generators>`.
  412. Parameters
  413. ----------
  414. n_samples : int, default=12000
  415. The number of samples.
  416. random_state : int, RandomState instance or None, default=None
  417. Determines random number generation for dataset creation. Pass an int
  418. for reproducible output across multiple function calls.
  419. See :term:`Glossary <random_state>`.
  420. Returns
  421. -------
  422. X : ndarray of shape (n_samples, 10)
  423. The input samples.
  424. y : ndarray of shape (n_samples,)
  425. The output values.
  426. See Also
  427. --------
  428. make_gaussian_quantiles : A generalization of this dataset approach.
  429. References
  430. ----------
  431. .. [1] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical
  432. Learning Ed. 2", Springer, 2009.
  433. """
  434. rs = check_random_state(random_state)
  435. shape = (n_samples, 10)
  436. X = rs.normal(size=shape).reshape(shape)
  437. y = ((X**2.0).sum(axis=1) > 9.34).astype(np.float64, copy=False)
  438. y[y == 0.0] = -1.0
  439. return X, y
  440. @validate_params(
  441. {
  442. "n_samples": [Interval(Integral, 1, None, closed="left")],
  443. "n_features": [Interval(Integral, 1, None, closed="left")],
  444. "n_informative": [Interval(Integral, 0, None, closed="left")],
  445. "n_targets": [Interval(Integral, 1, None, closed="left")],
  446. "bias": [Interval(Real, None, None, closed="neither")],
  447. "effective_rank": [Interval(Integral, 1, None, closed="left"), None],
  448. "tail_strength": [Interval(Real, 0, 1, closed="both")],
  449. "noise": [Interval(Real, 0, None, closed="left")],
  450. "shuffle": ["boolean"],
  451. "coef": ["boolean"],
  452. "random_state": ["random_state"],
  453. },
  454. prefer_skip_nested_validation=True,
  455. )
  456. def make_regression(
  457. n_samples=100,
  458. n_features=100,
  459. *,
  460. n_informative=10,
  461. n_targets=1,
  462. bias=0.0,
  463. effective_rank=None,
  464. tail_strength=0.5,
  465. noise=0.0,
  466. shuffle=True,
  467. coef=False,
  468. random_state=None,
  469. ):
  470. """Generate a random regression problem.
  471. The input set can either be well conditioned (by default) or have a low
  472. rank-fat tail singular profile. See :func:`make_low_rank_matrix` for
  473. more details.
  474. The output is generated by applying a (potentially biased) random linear
  475. regression model with `n_informative` nonzero regressors to the previously
  476. generated input and some gaussian centered noise with some adjustable
  477. scale.
  478. Read more in the :ref:`User Guide <sample_generators>`.
  479. Parameters
  480. ----------
  481. n_samples : int, default=100
  482. The number of samples.
  483. n_features : int, default=100
  484. The number of features.
  485. n_informative : int, default=10
  486. The number of informative features, i.e., the number of features used
  487. to build the linear model used to generate the output.
  488. n_targets : int, default=1
  489. The number of regression targets, i.e., the dimension of the y output
  490. vector associated with a sample. By default, the output is a scalar.
  491. bias : float, default=0.0
  492. The bias term in the underlying linear model.
  493. effective_rank : int, default=None
  494. If not None:
  495. The approximate number of singular vectors required to explain most
  496. of the input data by linear combinations. Using this kind of
  497. singular spectrum in the input allows the generator to reproduce
  498. the correlations often observed in practice.
  499. If None:
  500. The input set is well conditioned, centered and gaussian with
  501. unit variance.
  502. tail_strength : float, default=0.5
  503. The relative importance of the fat noisy tail of the singular values
  504. profile if `effective_rank` is not None. When a float, it should be
  505. between 0 and 1.
  506. noise : float, default=0.0
  507. The standard deviation of the gaussian noise applied to the output.
  508. shuffle : bool, default=True
  509. Shuffle the samples and the features.
  510. coef : bool, default=False
  511. If True, the coefficients of the underlying linear model are returned.
  512. random_state : int, RandomState instance or None, default=None
  513. Determines random number generation for dataset creation. Pass an int
  514. for reproducible output across multiple function calls.
  515. See :term:`Glossary <random_state>`.
  516. Returns
  517. -------
  518. X : ndarray of shape (n_samples, n_features)
  519. The input samples.
  520. y : ndarray of shape (n_samples,) or (n_samples, n_targets)
  521. The output values.
  522. coef : ndarray of shape (n_features,) or (n_features, n_targets)
  523. The coefficient of the underlying linear model. It is returned only if
  524. coef is True.
  525. Examples
  526. --------
  527. >>> from sklearn.datasets import make_regression
  528. >>> X, y = make_regression(n_samples=5, n_features=2, noise=1, random_state=42)
  529. >>> X
  530. array([[ 0.4967..., -0.1382... ],
  531. [ 0.6476..., 1.523...],
  532. [-0.2341..., -0.2341...],
  533. [-0.4694..., 0.5425...],
  534. [ 1.579..., 0.7674...]])
  535. >>> y
  536. array([ 6.737..., 37.79..., -10.27..., 0.4017..., 42.22...])
  537. """
  538. n_informative = min(n_features, n_informative)
  539. generator = check_random_state(random_state)
  540. if effective_rank is None:
  541. # Randomly generate a well conditioned input set
  542. X = generator.standard_normal(size=(n_samples, n_features))
  543. else:
  544. # Randomly generate a low rank, fat tail input set
  545. X = make_low_rank_matrix(
  546. n_samples=n_samples,
  547. n_features=n_features,
  548. effective_rank=effective_rank,
  549. tail_strength=tail_strength,
  550. random_state=generator,
  551. )
  552. # Generate a ground truth model with only n_informative features being non
  553. # zeros (the other features are not correlated to y and should be ignored
  554. # by a sparsifying regularizers such as L1 or elastic net)
  555. ground_truth = np.zeros((n_features, n_targets))
  556. ground_truth[:n_informative, :] = 100 * generator.uniform(
  557. size=(n_informative, n_targets)
  558. )
  559. y = np.dot(X, ground_truth) + bias
  560. # Add noise
  561. if noise > 0.0:
  562. y += generator.normal(scale=noise, size=y.shape)
  563. # Randomly permute samples and features
  564. if shuffle:
  565. X, y = util_shuffle(X, y, random_state=generator)
  566. indices = np.arange(n_features)
  567. generator.shuffle(indices)
  568. X[:, :] = X[:, indices]
  569. ground_truth = ground_truth[indices]
  570. y = np.squeeze(y)
  571. if coef:
  572. return X, y, np.squeeze(ground_truth)
  573. else:
  574. return X, y
  575. @validate_params(
  576. {
  577. "n_samples": [Interval(Integral, 0, None, closed="left"), tuple],
  578. "shuffle": ["boolean"],
  579. "noise": [Interval(Real, 0, None, closed="left"), None],
  580. "random_state": ["random_state"],
  581. "factor": [Interval(Real, 0, 1, closed="left")],
  582. },
  583. prefer_skip_nested_validation=True,
  584. )
  585. def make_circles(
  586. n_samples=100, *, shuffle=True, noise=None, random_state=None, factor=0.8
  587. ):
  588. """Make a large circle containing a smaller circle in 2d.
  589. A simple toy dataset to visualize clustering and classification
  590. algorithms.
  591. Read more in the :ref:`User Guide <sample_generators>`.
  592. Parameters
  593. ----------
  594. n_samples : int or tuple of shape (2,), dtype=int, default=100
  595. If int, it is the total number of points generated.
  596. For odd numbers, the inner circle will have one point more than the
  597. outer circle.
  598. If two-element tuple, number of points in outer circle and inner
  599. circle.
  600. .. versionchanged:: 0.23
  601. Added two-element tuple.
  602. shuffle : bool, default=True
  603. Whether to shuffle the samples.
  604. noise : float, default=None
  605. Standard deviation of Gaussian noise added to the data.
  606. random_state : int, RandomState instance or None, default=None
  607. Determines random number generation for dataset shuffling and noise.
  608. Pass an int for reproducible output across multiple function calls.
  609. See :term:`Glossary <random_state>`.
  610. factor : float, default=.8
  611. Scale factor between inner and outer circle in the range `[0, 1)`.
  612. Returns
  613. -------
  614. X : ndarray of shape (n_samples, 2)
  615. The generated samples.
  616. y : ndarray of shape (n_samples,)
  617. The integer labels (0 or 1) for class membership of each sample.
  618. """
  619. if isinstance(n_samples, numbers.Integral):
  620. n_samples_out = n_samples // 2
  621. n_samples_in = n_samples - n_samples_out
  622. else: # n_samples is a tuple
  623. if len(n_samples) != 2:
  624. raise ValueError("When a tuple, n_samples must have exactly two elements.")
  625. n_samples_out, n_samples_in = n_samples
  626. generator = check_random_state(random_state)
  627. # so as not to have the first point = last point, we set endpoint=False
  628. linspace_out = np.linspace(0, 2 * np.pi, n_samples_out, endpoint=False)
  629. linspace_in = np.linspace(0, 2 * np.pi, n_samples_in, endpoint=False)
  630. outer_circ_x = np.cos(linspace_out)
  631. outer_circ_y = np.sin(linspace_out)
  632. inner_circ_x = np.cos(linspace_in) * factor
  633. inner_circ_y = np.sin(linspace_in) * factor
  634. X = np.vstack(
  635. [np.append(outer_circ_x, inner_circ_x), np.append(outer_circ_y, inner_circ_y)]
  636. ).T
  637. y = np.hstack(
  638. [np.zeros(n_samples_out, dtype=np.intp), np.ones(n_samples_in, dtype=np.intp)]
  639. )
  640. if shuffle:
  641. X, y = util_shuffle(X, y, random_state=generator)
  642. if noise is not None:
  643. X += generator.normal(scale=noise, size=X.shape)
  644. return X, y
  645. @validate_params(
  646. {
  647. "n_samples": [Interval(Integral, 1, None, closed="left"), tuple],
  648. "shuffle": ["boolean"],
  649. "noise": [Interval(Real, 0, None, closed="left"), None],
  650. "random_state": ["random_state"],
  651. },
  652. prefer_skip_nested_validation=True,
  653. )
  654. def make_moons(n_samples=100, *, shuffle=True, noise=None, random_state=None):
  655. """Make two interleaving half circles.
  656. A simple toy dataset to visualize clustering and classification
  657. algorithms. Read more in the :ref:`User Guide <sample_generators>`.
  658. Parameters
  659. ----------
  660. n_samples : int or tuple of shape (2,), dtype=int, default=100
  661. If int, the total number of points generated.
  662. If two-element tuple, number of points in each of two moons.
  663. .. versionchanged:: 0.23
  664. Added two-element tuple.
  665. shuffle : bool, default=True
  666. Whether to shuffle the samples.
  667. noise : float, default=None
  668. Standard deviation of Gaussian noise added to the data.
  669. random_state : int, RandomState instance or None, default=None
  670. Determines random number generation for dataset shuffling and noise.
  671. Pass an int for reproducible output across multiple function calls.
  672. See :term:`Glossary <random_state>`.
  673. Returns
  674. -------
  675. X : ndarray of shape (n_samples, 2)
  676. The generated samples.
  677. y : ndarray of shape (n_samples,)
  678. The integer labels (0 or 1) for class membership of each sample.
  679. """
  680. if isinstance(n_samples, numbers.Integral):
  681. n_samples_out = n_samples // 2
  682. n_samples_in = n_samples - n_samples_out
  683. else:
  684. try:
  685. n_samples_out, n_samples_in = n_samples
  686. except ValueError as e:
  687. raise ValueError(
  688. "`n_samples` can be either an int or a two-element tuple."
  689. ) from e
  690. generator = check_random_state(random_state)
  691. outer_circ_x = np.cos(np.linspace(0, np.pi, n_samples_out))
  692. outer_circ_y = np.sin(np.linspace(0, np.pi, n_samples_out))
  693. inner_circ_x = 1 - np.cos(np.linspace(0, np.pi, n_samples_in))
  694. inner_circ_y = 1 - np.sin(np.linspace(0, np.pi, n_samples_in)) - 0.5
  695. X = np.vstack(
  696. [np.append(outer_circ_x, inner_circ_x), np.append(outer_circ_y, inner_circ_y)]
  697. ).T
  698. y = np.hstack(
  699. [np.zeros(n_samples_out, dtype=np.intp), np.ones(n_samples_in, dtype=np.intp)]
  700. )
  701. if shuffle:
  702. X, y = util_shuffle(X, y, random_state=generator)
  703. if noise is not None:
  704. X += generator.normal(scale=noise, size=X.shape)
  705. return X, y
  706. @validate_params(
  707. {
  708. "n_samples": [Interval(Integral, 1, None, closed="left"), "array-like"],
  709. "n_features": [Interval(Integral, 1, None, closed="left")],
  710. "centers": [Interval(Integral, 1, None, closed="left"), "array-like", None],
  711. "cluster_std": [Interval(Real, 0, None, closed="left"), "array-like"],
  712. "center_box": [tuple],
  713. "shuffle": ["boolean"],
  714. "random_state": ["random_state"],
  715. "return_centers": ["boolean"],
  716. },
  717. prefer_skip_nested_validation=True,
  718. )
  719. def make_blobs(
  720. n_samples=100,
  721. n_features=2,
  722. *,
  723. centers=None,
  724. cluster_std=1.0,
  725. center_box=(-10.0, 10.0),
  726. shuffle=True,
  727. random_state=None,
  728. return_centers=False,
  729. ):
  730. """Generate isotropic Gaussian blobs for clustering.
  731. Read more in the :ref:`User Guide <sample_generators>`.
  732. Parameters
  733. ----------
  734. n_samples : int or array-like, default=100
  735. If int, it is the total number of points equally divided among
  736. clusters.
  737. If array-like, each element of the sequence indicates
  738. the number of samples per cluster.
  739. .. versionchanged:: v0.20
  740. one can now pass an array-like to the ``n_samples`` parameter
  741. n_features : int, default=2
  742. The number of features for each sample.
  743. centers : int or array-like of shape (n_centers, n_features), default=None
  744. The number of centers to generate, or the fixed center locations.
  745. If n_samples is an int and centers is None, 3 centers are generated.
  746. If n_samples is array-like, centers must be
  747. either None or an array of length equal to the length of n_samples.
  748. cluster_std : float or array-like of float, default=1.0
  749. The standard deviation of the clusters.
  750. center_box : tuple of float (min, max), default=(-10.0, 10.0)
  751. The bounding box for each cluster center when centers are
  752. generated at random.
  753. shuffle : bool, default=True
  754. Shuffle the samples.
  755. random_state : int, RandomState instance or None, default=None
  756. Determines random number generation for dataset creation. Pass an int
  757. for reproducible output across multiple function calls.
  758. See :term:`Glossary <random_state>`.
  759. return_centers : bool, default=False
  760. If True, then return the centers of each cluster.
  761. .. versionadded:: 0.23
  762. Returns
  763. -------
  764. X : ndarray of shape (n_samples, n_features)
  765. The generated samples.
  766. y : ndarray of shape (n_samples,)
  767. The integer labels for cluster membership of each sample.
  768. centers : ndarray of shape (n_centers, n_features)
  769. The centers of each cluster. Only returned if
  770. ``return_centers=True``.
  771. See Also
  772. --------
  773. make_classification : A more intricate variant.
  774. Examples
  775. --------
  776. >>> from sklearn.datasets import make_blobs
  777. >>> X, y = make_blobs(n_samples=10, centers=3, n_features=2,
  778. ... random_state=0)
  779. >>> print(X.shape)
  780. (10, 2)
  781. >>> y
  782. array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0])
  783. >>> X, y = make_blobs(n_samples=[3, 3, 4], centers=None, n_features=2,
  784. ... random_state=0)
  785. >>> print(X.shape)
  786. (10, 2)
  787. >>> y
  788. array([0, 1, 2, 0, 2, 2, 2, 1, 1, 0])
  789. """
  790. generator = check_random_state(random_state)
  791. if isinstance(n_samples, numbers.Integral):
  792. # Set n_centers by looking at centers arg
  793. if centers is None:
  794. centers = 3
  795. if isinstance(centers, numbers.Integral):
  796. n_centers = centers
  797. centers = generator.uniform(
  798. center_box[0], center_box[1], size=(n_centers, n_features)
  799. )
  800. else:
  801. centers = check_array(centers)
  802. n_features = centers.shape[1]
  803. n_centers = centers.shape[0]
  804. else:
  805. # Set n_centers by looking at [n_samples] arg
  806. n_centers = len(n_samples)
  807. if centers is None:
  808. centers = generator.uniform(
  809. center_box[0], center_box[1], size=(n_centers, n_features)
  810. )
  811. if not isinstance(centers, Iterable):
  812. raise ValueError(
  813. "Parameter `centers` must be array-like. Got {!r} instead".format(
  814. centers
  815. )
  816. )
  817. if len(centers) != n_centers:
  818. raise ValueError(
  819. "Length of `n_samples` not consistent with number of "
  820. f"centers. Got n_samples = {n_samples} and centers = {centers}"
  821. )
  822. centers = check_array(centers)
  823. n_features = centers.shape[1]
  824. # stds: if cluster_std is given as list, it must be consistent
  825. # with the n_centers
  826. if hasattr(cluster_std, "__len__") and len(cluster_std) != n_centers:
  827. raise ValueError(
  828. "Length of `clusters_std` not consistent with "
  829. "number of centers. Got centers = {} "
  830. "and cluster_std = {}".format(centers, cluster_std)
  831. )
  832. if isinstance(cluster_std, numbers.Real):
  833. cluster_std = np.full(len(centers), cluster_std)
  834. if isinstance(n_samples, Iterable):
  835. n_samples_per_center = n_samples
  836. else:
  837. n_samples_per_center = [int(n_samples // n_centers)] * n_centers
  838. for i in range(n_samples % n_centers):
  839. n_samples_per_center[i] += 1
  840. cum_sum_n_samples = np.cumsum(n_samples_per_center)
  841. X = np.empty(shape=(sum(n_samples_per_center), n_features), dtype=np.float64)
  842. y = np.empty(shape=(sum(n_samples_per_center),), dtype=int)
  843. for i, (n, std) in enumerate(zip(n_samples_per_center, cluster_std)):
  844. start_idx = cum_sum_n_samples[i - 1] if i > 0 else 0
  845. end_idx = cum_sum_n_samples[i]
  846. X[start_idx:end_idx] = generator.normal(
  847. loc=centers[i], scale=std, size=(n, n_features)
  848. )
  849. y[start_idx:end_idx] = i
  850. if shuffle:
  851. X, y = util_shuffle(X, y, random_state=generator)
  852. if return_centers:
  853. return X, y, centers
  854. else:
  855. return X, y
  856. @validate_params(
  857. {
  858. "n_samples": [Interval(Integral, 1, None, closed="left")],
  859. "n_features": [Interval(Integral, 5, None, closed="left")],
  860. "noise": [Interval(Real, 0.0, None, closed="left")],
  861. "random_state": ["random_state"],
  862. },
  863. prefer_skip_nested_validation=True,
  864. )
  865. def make_friedman1(n_samples=100, n_features=10, *, noise=0.0, random_state=None):
  866. """Generate the "Friedman #1" regression problem.
  867. This dataset is described in Friedman [1] and Breiman [2].
  868. Inputs `X` are independent features uniformly distributed on the interval
  869. [0, 1]. The output `y` is created according to the formula::
  870. y(X) = 10 * sin(pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5) ** 2 \
  871. + 10 * X[:, 3] + 5 * X[:, 4] + noise * N(0, 1).
  872. Out of the `n_features` features, only 5 are actually used to compute
  873. `y`. The remaining features are independent of `y`.
  874. The number of features has to be >= 5.
  875. Read more in the :ref:`User Guide <sample_generators>`.
  876. Parameters
  877. ----------
  878. n_samples : int, default=100
  879. The number of samples.
  880. n_features : int, default=10
  881. The number of features. Should be at least 5.
  882. noise : float, default=0.0
  883. The standard deviation of the gaussian noise applied to the output.
  884. random_state : int, RandomState instance or None, default=None
  885. Determines random number generation for dataset noise. Pass an int
  886. for reproducible output across multiple function calls.
  887. See :term:`Glossary <random_state>`.
  888. Returns
  889. -------
  890. X : ndarray of shape (n_samples, n_features)
  891. The input samples.
  892. y : ndarray of shape (n_samples,)
  893. The output values.
  894. References
  895. ----------
  896. .. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals
  897. of Statistics 19 (1), pages 1-67, 1991.
  898. .. [2] L. Breiman, "Bagging predictors", Machine Learning 24,
  899. pages 123-140, 1996.
  900. """
  901. generator = check_random_state(random_state)
  902. X = generator.uniform(size=(n_samples, n_features))
  903. y = (
  904. 10 * np.sin(np.pi * X[:, 0] * X[:, 1])
  905. + 20 * (X[:, 2] - 0.5) ** 2
  906. + 10 * X[:, 3]
  907. + 5 * X[:, 4]
  908. + noise * generator.standard_normal(size=(n_samples))
  909. )
  910. return X, y
  911. @validate_params(
  912. {
  913. "n_samples": [Interval(Integral, 1, None, closed="left")],
  914. "noise": [Interval(Real, 0, None, closed="left")],
  915. "random_state": ["random_state"],
  916. },
  917. prefer_skip_nested_validation=True,
  918. )
  919. def make_friedman2(n_samples=100, *, noise=0.0, random_state=None):
  920. """Generate the "Friedman #2" regression problem.
  921. This dataset is described in Friedman [1] and Breiman [2].
  922. Inputs `X` are 4 independent features uniformly distributed on the
  923. intervals::
  924. 0 <= X[:, 0] <= 100,
  925. 40 * pi <= X[:, 1] <= 560 * pi,
  926. 0 <= X[:, 2] <= 1,
  927. 1 <= X[:, 3] <= 11.
  928. The output `y` is created according to the formula::
  929. y(X) = (X[:, 0] ** 2 + (X[:, 1] * X[:, 2] \
  930. - 1 / (X[:, 1] * X[:, 3])) ** 2) ** 0.5 + noise * N(0, 1).
  931. Read more in the :ref:`User Guide <sample_generators>`.
  932. Parameters
  933. ----------
  934. n_samples : int, default=100
  935. The number of samples.
  936. noise : float, default=0.0
  937. The standard deviation of the gaussian noise applied to the output.
  938. random_state : int, RandomState instance or None, default=None
  939. Determines random number generation for dataset noise. Pass an int
  940. for reproducible output across multiple function calls.
  941. See :term:`Glossary <random_state>`.
  942. Returns
  943. -------
  944. X : ndarray of shape (n_samples, 4)
  945. The input samples.
  946. y : ndarray of shape (n_samples,)
  947. The output values.
  948. References
  949. ----------
  950. .. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals
  951. of Statistics 19 (1), pages 1-67, 1991.
  952. .. [2] L. Breiman, "Bagging predictors", Machine Learning 24,
  953. pages 123-140, 1996.
  954. """
  955. generator = check_random_state(random_state)
  956. X = generator.uniform(size=(n_samples, 4))
  957. X[:, 0] *= 100
  958. X[:, 1] *= 520 * np.pi
  959. X[:, 1] += 40 * np.pi
  960. X[:, 3] *= 10
  961. X[:, 3] += 1
  962. y = (
  963. X[:, 0] ** 2 + (X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) ** 2
  964. ) ** 0.5 + noise * generator.standard_normal(size=(n_samples))
  965. return X, y
  966. @validate_params(
  967. {
  968. "n_samples": [Interval(Integral, 1, None, closed="left")],
  969. "noise": [Interval(Real, 0, None, closed="left")],
  970. "random_state": ["random_state"],
  971. },
  972. prefer_skip_nested_validation=True,
  973. )
  974. def make_friedman3(n_samples=100, *, noise=0.0, random_state=None):
  975. """Generate the "Friedman #3" regression problem.
  976. This dataset is described in Friedman [1] and Breiman [2].
  977. Inputs `X` are 4 independent features uniformly distributed on the
  978. intervals::
  979. 0 <= X[:, 0] <= 100,
  980. 40 * pi <= X[:, 1] <= 560 * pi,
  981. 0 <= X[:, 2] <= 1,
  982. 1 <= X[:, 3] <= 11.
  983. The output `y` is created according to the formula::
  984. y(X) = arctan((X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) \
  985. / X[:, 0]) + noise * N(0, 1).
  986. Read more in the :ref:`User Guide <sample_generators>`.
  987. Parameters
  988. ----------
  989. n_samples : int, default=100
  990. The number of samples.
  991. noise : float, default=0.0
  992. The standard deviation of the gaussian noise applied to the output.
  993. random_state : int, RandomState instance or None, default=None
  994. Determines random number generation for dataset noise. Pass an int
  995. for reproducible output across multiple function calls.
  996. See :term:`Glossary <random_state>`.
  997. Returns
  998. -------
  999. X : ndarray of shape (n_samples, 4)
  1000. The input samples.
  1001. y : ndarray of shape (n_samples,)
  1002. The output values.
  1003. References
  1004. ----------
  1005. .. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals
  1006. of Statistics 19 (1), pages 1-67, 1991.
  1007. .. [2] L. Breiman, "Bagging predictors", Machine Learning 24,
  1008. pages 123-140, 1996.
  1009. """
  1010. generator = check_random_state(random_state)
  1011. X = generator.uniform(size=(n_samples, 4))
  1012. X[:, 0] *= 100
  1013. X[:, 1] *= 520 * np.pi
  1014. X[:, 1] += 40 * np.pi
  1015. X[:, 3] *= 10
  1016. X[:, 3] += 1
  1017. y = np.arctan(
  1018. (X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) / X[:, 0]
  1019. ) + noise * generator.standard_normal(size=(n_samples))
  1020. return X, y
  1021. @validate_params(
  1022. {
  1023. "n_samples": [Interval(Integral, 1, None, closed="left")],
  1024. "n_features": [Interval(Integral, 1, None, closed="left")],
  1025. "effective_rank": [Interval(Integral, 1, None, closed="left")],
  1026. "tail_strength": [Interval(Real, 0, 1, closed="both")],
  1027. "random_state": ["random_state"],
  1028. },
  1029. prefer_skip_nested_validation=True,
  1030. )
  1031. def make_low_rank_matrix(
  1032. n_samples=100,
  1033. n_features=100,
  1034. *,
  1035. effective_rank=10,
  1036. tail_strength=0.5,
  1037. random_state=None,
  1038. ):
  1039. """Generate a mostly low rank matrix with bell-shaped singular values.
  1040. Most of the variance can be explained by a bell-shaped curve of width
  1041. effective_rank: the low rank part of the singular values profile is::
  1042. (1 - tail_strength) * exp(-1.0 * (i / effective_rank) ** 2)
  1043. The remaining singular values' tail is fat, decreasing as::
  1044. tail_strength * exp(-0.1 * i / effective_rank).
  1045. The low rank part of the profile can be considered the structured
  1046. signal part of the data while the tail can be considered the noisy
  1047. part of the data that cannot be summarized by a low number of linear
  1048. components (singular vectors).
  1049. This kind of singular profiles is often seen in practice, for instance:
  1050. - gray level pictures of faces
  1051. - TF-IDF vectors of text documents crawled from the web
  1052. Read more in the :ref:`User Guide <sample_generators>`.
  1053. Parameters
  1054. ----------
  1055. n_samples : int, default=100
  1056. The number of samples.
  1057. n_features : int, default=100
  1058. The number of features.
  1059. effective_rank : int, default=10
  1060. The approximate number of singular vectors required to explain most of
  1061. the data by linear combinations.
  1062. tail_strength : float, default=0.5
  1063. The relative importance of the fat noisy tail of the singular values
  1064. profile. The value should be between 0 and 1.
  1065. random_state : int, RandomState instance or None, default=None
  1066. Determines random number generation for dataset creation. Pass an int
  1067. for reproducible output across multiple function calls.
  1068. See :term:`Glossary <random_state>`.
  1069. Returns
  1070. -------
  1071. X : ndarray of shape (n_samples, n_features)
  1072. The matrix.
  1073. """
  1074. generator = check_random_state(random_state)
  1075. n = min(n_samples, n_features)
  1076. # Random (ortho normal) vectors
  1077. u, _ = linalg.qr(
  1078. generator.standard_normal(size=(n_samples, n)),
  1079. mode="economic",
  1080. check_finite=False,
  1081. )
  1082. v, _ = linalg.qr(
  1083. generator.standard_normal(size=(n_features, n)),
  1084. mode="economic",
  1085. check_finite=False,
  1086. )
  1087. # Index of the singular values
  1088. singular_ind = np.arange(n, dtype=np.float64)
  1089. # Build the singular profile by assembling signal and noise components
  1090. low_rank = (1 - tail_strength) * np.exp(-1.0 * (singular_ind / effective_rank) ** 2)
  1091. tail = tail_strength * np.exp(-0.1 * singular_ind / effective_rank)
  1092. s = np.identity(n) * (low_rank + tail)
  1093. return np.dot(np.dot(u, s), v.T)
  1094. @validate_params(
  1095. {
  1096. "n_samples": [Interval(Integral, 1, None, closed="left")],
  1097. "n_components": [Interval(Integral, 1, None, closed="left")],
  1098. "n_features": [Interval(Integral, 1, None, closed="left")],
  1099. "n_nonzero_coefs": [Interval(Integral, 1, None, closed="left")],
  1100. "random_state": ["random_state"],
  1101. "data_transposed": ["boolean", Hidden(StrOptions({"deprecated"}))],
  1102. },
  1103. prefer_skip_nested_validation=True,
  1104. )
  1105. def make_sparse_coded_signal(
  1106. n_samples,
  1107. *,
  1108. n_components,
  1109. n_features,
  1110. n_nonzero_coefs,
  1111. random_state=None,
  1112. data_transposed="deprecated",
  1113. ):
  1114. """Generate a signal as a sparse combination of dictionary elements.
  1115. Returns a matrix `Y = DX`, such that `D` is of shape `(n_features, n_components)`,
  1116. `X` is of shape `(n_components, n_samples)` and each column of `X` has exactly
  1117. `n_nonzero_coefs` non-zero elements.
  1118. Read more in the :ref:`User Guide <sample_generators>`.
  1119. Parameters
  1120. ----------
  1121. n_samples : int
  1122. Number of samples to generate.
  1123. n_components : int
  1124. Number of components in the dictionary.
  1125. n_features : int
  1126. Number of features of the dataset to generate.
  1127. n_nonzero_coefs : int
  1128. Number of active (non-zero) coefficients in each sample.
  1129. random_state : int, RandomState instance or None, default=None
  1130. Determines random number generation for dataset creation. Pass an int
  1131. for reproducible output across multiple function calls.
  1132. See :term:`Glossary <random_state>`.
  1133. data_transposed : bool, default=False
  1134. By default, Y, D and X are not transposed.
  1135. .. versionadded:: 1.1
  1136. .. versionchanged:: 1.3
  1137. Default value changed from True to False.
  1138. .. deprecated:: 1.3
  1139. `data_transposed` is deprecated and will be removed in 1.5.
  1140. Returns
  1141. -------
  1142. data : ndarray of shape (n_features, n_samples) or (n_samples, n_features)
  1143. The encoded signal (Y). The shape is `(n_samples, n_features)` if
  1144. `data_transposed` is False, otherwise it's `(n_features, n_samples)`.
  1145. dictionary : ndarray of shape (n_features, n_components) or \
  1146. (n_components, n_features)
  1147. The dictionary with normalized components (D). The shape is
  1148. `(n_components, n_features)` if `data_transposed` is False, otherwise it's
  1149. `(n_features, n_components)`.
  1150. code : ndarray of shape (n_components, n_samples) or (n_samples, n_components)
  1151. The sparse code such that each column of this matrix has exactly
  1152. n_nonzero_coefs non-zero items (X). The shape is `(n_samples, n_components)`
  1153. if `data_transposed` is False, otherwise it's `(n_components, n_samples)`.
  1154. """
  1155. generator = check_random_state(random_state)
  1156. # generate dictionary
  1157. D = generator.standard_normal(size=(n_features, n_components))
  1158. D /= np.sqrt(np.sum((D**2), axis=0))
  1159. # generate code
  1160. X = np.zeros((n_components, n_samples))
  1161. for i in range(n_samples):
  1162. idx = np.arange(n_components)
  1163. generator.shuffle(idx)
  1164. idx = idx[:n_nonzero_coefs]
  1165. X[idx, i] = generator.standard_normal(size=n_nonzero_coefs)
  1166. # encode signal
  1167. Y = np.dot(D, X)
  1168. # TODO(1.5) remove data_transposed
  1169. # raise warning if data_transposed is not passed explicitly
  1170. if data_transposed != "deprecated":
  1171. warnings.warn(
  1172. "data_transposed was deprecated in version 1.3 and will be removed in 1.5.",
  1173. FutureWarning,
  1174. )
  1175. else:
  1176. data_transposed = False
  1177. # transpose if needed
  1178. if not data_transposed:
  1179. Y, D, X = Y.T, D.T, X.T
  1180. return map(np.squeeze, (Y, D, X))
  1181. @validate_params(
  1182. {
  1183. "n_samples": [Interval(Integral, 1, None, closed="left")],
  1184. "n_features": [Interval(Integral, 1, None, closed="left")],
  1185. "random_state": ["random_state"],
  1186. },
  1187. prefer_skip_nested_validation=True,
  1188. )
  1189. def make_sparse_uncorrelated(n_samples=100, n_features=10, *, random_state=None):
  1190. """Generate a random regression problem with sparse uncorrelated design.
  1191. This dataset is described in Celeux et al [1]. as::
  1192. X ~ N(0, 1)
  1193. y(X) = X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3]
  1194. Only the first 4 features are informative. The remaining features are
  1195. useless.
  1196. Read more in the :ref:`User Guide <sample_generators>`.
  1197. Parameters
  1198. ----------
  1199. n_samples : int, default=100
  1200. The number of samples.
  1201. n_features : int, default=10
  1202. The number of features.
  1203. random_state : int, RandomState instance or None, default=None
  1204. Determines random number generation for dataset creation. Pass an int
  1205. for reproducible output across multiple function calls.
  1206. See :term:`Glossary <random_state>`.
  1207. Returns
  1208. -------
  1209. X : ndarray of shape (n_samples, n_features)
  1210. The input samples.
  1211. y : ndarray of shape (n_samples,)
  1212. The output values.
  1213. References
  1214. ----------
  1215. .. [1] G. Celeux, M. El Anbari, J.-M. Marin, C. P. Robert,
  1216. "Regularization in regression: comparing Bayesian and frequentist
  1217. methods in a poorly informative situation", 2009.
  1218. """
  1219. generator = check_random_state(random_state)
  1220. X = generator.normal(loc=0, scale=1, size=(n_samples, n_features))
  1221. y = generator.normal(
  1222. loc=(X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3]),
  1223. scale=np.ones(n_samples),
  1224. )
  1225. return X, y
  1226. @validate_params(
  1227. {
  1228. "n_dim": [Interval(Integral, 1, None, closed="left")],
  1229. "random_state": ["random_state"],
  1230. },
  1231. prefer_skip_nested_validation=True,
  1232. )
  1233. def make_spd_matrix(n_dim, *, random_state=None):
  1234. """Generate a random symmetric, positive-definite matrix.
  1235. Read more in the :ref:`User Guide <sample_generators>`.
  1236. Parameters
  1237. ----------
  1238. n_dim : int
  1239. The matrix dimension.
  1240. random_state : int, RandomState instance or None, default=None
  1241. Determines random number generation for dataset creation. Pass an int
  1242. for reproducible output across multiple function calls.
  1243. See :term:`Glossary <random_state>`.
  1244. Returns
  1245. -------
  1246. X : ndarray of shape (n_dim, n_dim)
  1247. The random symmetric, positive-definite matrix.
  1248. See Also
  1249. --------
  1250. make_sparse_spd_matrix: Generate a sparse symmetric definite positive matrix.
  1251. """
  1252. generator = check_random_state(random_state)
  1253. A = generator.uniform(size=(n_dim, n_dim))
  1254. U, _, Vt = linalg.svd(np.dot(A.T, A), check_finite=False)
  1255. X = np.dot(np.dot(U, 1.0 + np.diag(generator.uniform(size=n_dim))), Vt)
  1256. return X
  1257. @validate_params(
  1258. {
  1259. "dim": [Interval(Integral, 1, None, closed="left")],
  1260. "alpha": [Interval(Real, 0, 1, closed="both")],
  1261. "norm_diag": ["boolean"],
  1262. "smallest_coef": [Interval(Real, 0, 1, closed="both")],
  1263. "largest_coef": [Interval(Real, 0, 1, closed="both")],
  1264. "random_state": ["random_state"],
  1265. },
  1266. prefer_skip_nested_validation=True,
  1267. )
  1268. def make_sparse_spd_matrix(
  1269. dim=1,
  1270. *,
  1271. alpha=0.95,
  1272. norm_diag=False,
  1273. smallest_coef=0.1,
  1274. largest_coef=0.9,
  1275. random_state=None,
  1276. ):
  1277. """Generate a sparse symmetric definite positive matrix.
  1278. Read more in the :ref:`User Guide <sample_generators>`.
  1279. Parameters
  1280. ----------
  1281. dim : int, default=1
  1282. The size of the random matrix to generate.
  1283. alpha : float, default=0.95
  1284. The probability that a coefficient is zero (see notes). Larger values
  1285. enforce more sparsity. The value should be in the range 0 and 1.
  1286. norm_diag : bool, default=False
  1287. Whether to normalize the output matrix to make the leading diagonal
  1288. elements all 1.
  1289. smallest_coef : float, default=0.1
  1290. The value of the smallest coefficient between 0 and 1.
  1291. largest_coef : float, default=0.9
  1292. The value of the largest coefficient between 0 and 1.
  1293. random_state : int, RandomState instance or None, default=None
  1294. Determines random number generation for dataset creation. Pass an int
  1295. for reproducible output across multiple function calls.
  1296. See :term:`Glossary <random_state>`.
  1297. Returns
  1298. -------
  1299. prec : sparse matrix of shape (dim, dim)
  1300. The generated matrix.
  1301. See Also
  1302. --------
  1303. make_spd_matrix : Generate a random symmetric, positive-definite matrix.
  1304. Notes
  1305. -----
  1306. The sparsity is actually imposed on the cholesky factor of the matrix.
  1307. Thus alpha does not translate directly into the filling fraction of
  1308. the matrix itself.
  1309. """
  1310. random_state = check_random_state(random_state)
  1311. chol = -np.eye(dim)
  1312. aux = random_state.uniform(size=(dim, dim))
  1313. aux[aux < alpha] = 0
  1314. aux[aux > alpha] = smallest_coef + (
  1315. largest_coef - smallest_coef
  1316. ) * random_state.uniform(size=np.sum(aux > alpha))
  1317. aux = np.tril(aux, k=-1)
  1318. # Permute the lines: we don't want to have asymmetries in the final
  1319. # SPD matrix
  1320. permutation = random_state.permutation(dim)
  1321. aux = aux[permutation].T[permutation]
  1322. chol += aux
  1323. prec = np.dot(chol.T, chol)
  1324. if norm_diag:
  1325. # Form the diagonal vector into a row matrix
  1326. d = np.diag(prec).reshape(1, prec.shape[0])
  1327. d = 1.0 / np.sqrt(d)
  1328. prec *= d
  1329. prec *= d.T
  1330. return prec
  1331. @validate_params(
  1332. {
  1333. "n_samples": [Interval(Integral, 1, None, closed="left")],
  1334. "noise": [Interval(Real, 0, None, closed="left")],
  1335. "random_state": ["random_state"],
  1336. "hole": ["boolean"],
  1337. },
  1338. prefer_skip_nested_validation=True,
  1339. )
  1340. def make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None, hole=False):
  1341. """Generate a swiss roll dataset.
  1342. Read more in the :ref:`User Guide <sample_generators>`.
  1343. Parameters
  1344. ----------
  1345. n_samples : int, default=100
  1346. The number of sample points on the Swiss Roll.
  1347. noise : float, default=0.0
  1348. The standard deviation of the gaussian noise.
  1349. random_state : int, RandomState instance or None, default=None
  1350. Determines random number generation for dataset creation. Pass an int
  1351. for reproducible output across multiple function calls.
  1352. See :term:`Glossary <random_state>`.
  1353. hole : bool, default=False
  1354. If True generates the swiss roll with hole dataset.
  1355. Returns
  1356. -------
  1357. X : ndarray of shape (n_samples, 3)
  1358. The points.
  1359. t : ndarray of shape (n_samples,)
  1360. The univariate position of the sample according to the main dimension
  1361. of the points in the manifold.
  1362. Notes
  1363. -----
  1364. The algorithm is from Marsland [1].
  1365. References
  1366. ----------
  1367. .. [1] S. Marsland, "Machine Learning: An Algorithmic Perspective", 2nd edition,
  1368. Chapter 6, 2014.
  1369. https://homepages.ecs.vuw.ac.nz/~marslast/Code/Ch6/lle.py
  1370. """
  1371. generator = check_random_state(random_state)
  1372. if not hole:
  1373. t = 1.5 * np.pi * (1 + 2 * generator.uniform(size=n_samples))
  1374. y = 21 * generator.uniform(size=n_samples)
  1375. else:
  1376. corners = np.array(
  1377. [[np.pi * (1.5 + i), j * 7] for i in range(3) for j in range(3)]
  1378. )
  1379. corners = np.delete(corners, 4, axis=0)
  1380. corner_index = generator.choice(8, n_samples)
  1381. parameters = generator.uniform(size=(2, n_samples)) * np.array([[np.pi], [7]])
  1382. t, y = corners[corner_index].T + parameters
  1383. x = t * np.cos(t)
  1384. z = t * np.sin(t)
  1385. X = np.vstack((x, y, z))
  1386. X += noise * generator.standard_normal(size=(3, n_samples))
  1387. X = X.T
  1388. t = np.squeeze(t)
  1389. return X, t
  1390. @validate_params(
  1391. {
  1392. "n_samples": [Interval(Integral, 1, None, closed="left")],
  1393. "noise": [Interval(Real, 0, None, closed="left")],
  1394. "random_state": ["random_state"],
  1395. },
  1396. prefer_skip_nested_validation=True,
  1397. )
  1398. def make_s_curve(n_samples=100, *, noise=0.0, random_state=None):
  1399. """Generate an S curve dataset.
  1400. Read more in the :ref:`User Guide <sample_generators>`.
  1401. Parameters
  1402. ----------
  1403. n_samples : int, default=100
  1404. The number of sample points on the S curve.
  1405. noise : float, default=0.0
  1406. The standard deviation of the gaussian noise.
  1407. random_state : int, RandomState instance or None, default=None
  1408. Determines random number generation for dataset creation. Pass an int
  1409. for reproducible output across multiple function calls.
  1410. See :term:`Glossary <random_state>`.
  1411. Returns
  1412. -------
  1413. X : ndarray of shape (n_samples, 3)
  1414. The points.
  1415. t : ndarray of shape (n_samples,)
  1416. The univariate position of the sample according to the main dimension
  1417. of the points in the manifold.
  1418. """
  1419. generator = check_random_state(random_state)
  1420. t = 3 * np.pi * (generator.uniform(size=(1, n_samples)) - 0.5)
  1421. X = np.empty(shape=(n_samples, 3), dtype=np.float64)
  1422. X[:, 0] = np.sin(t)
  1423. X[:, 1] = 2.0 * generator.uniform(size=n_samples)
  1424. X[:, 2] = np.sign(t) * (np.cos(t) - 1)
  1425. X += noise * generator.standard_normal(size=(3, n_samples)).T
  1426. t = np.squeeze(t)
  1427. return X, t
  1428. @validate_params(
  1429. {
  1430. "mean": ["array-like", None],
  1431. "cov": [Interval(Real, 0, None, closed="left")],
  1432. "n_samples": [Interval(Integral, 1, None, closed="left")],
  1433. "n_features": [Interval(Integral, 1, None, closed="left")],
  1434. "n_classes": [Interval(Integral, 1, None, closed="left")],
  1435. "shuffle": ["boolean"],
  1436. "random_state": ["random_state"],
  1437. },
  1438. prefer_skip_nested_validation=True,
  1439. )
  1440. def make_gaussian_quantiles(
  1441. *,
  1442. mean=None,
  1443. cov=1.0,
  1444. n_samples=100,
  1445. n_features=2,
  1446. n_classes=3,
  1447. shuffle=True,
  1448. random_state=None,
  1449. ):
  1450. r"""Generate isotropic Gaussian and label samples by quantile.
  1451. This classification dataset is constructed by taking a multi-dimensional
  1452. standard normal distribution and defining classes separated by nested
  1453. concentric multi-dimensional spheres such that roughly equal numbers of
  1454. samples are in each class (quantiles of the :math:`\chi^2` distribution).
  1455. Read more in the :ref:`User Guide <sample_generators>`.
  1456. Parameters
  1457. ----------
  1458. mean : ndarray of shape (n_features,), default=None
  1459. The mean of the multi-dimensional normal distribution.
  1460. If None then use the origin (0, 0, ...).
  1461. cov : float, default=1.0
  1462. The covariance matrix will be this value times the unit matrix. This
  1463. dataset only produces symmetric normal distributions.
  1464. n_samples : int, default=100
  1465. The total number of points equally divided among classes.
  1466. n_features : int, default=2
  1467. The number of features for each sample.
  1468. n_classes : int, default=3
  1469. The number of classes.
  1470. shuffle : bool, default=True
  1471. Shuffle the samples.
  1472. random_state : int, RandomState instance or None, default=None
  1473. Determines random number generation for dataset creation. Pass an int
  1474. for reproducible output across multiple function calls.
  1475. See :term:`Glossary <random_state>`.
  1476. Returns
  1477. -------
  1478. X : ndarray of shape (n_samples, n_features)
  1479. The generated samples.
  1480. y : ndarray of shape (n_samples,)
  1481. The integer labels for quantile membership of each sample.
  1482. Notes
  1483. -----
  1484. The dataset is from Zhu et al [1].
  1485. References
  1486. ----------
  1487. .. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", 2009.
  1488. """
  1489. if n_samples < n_classes:
  1490. raise ValueError("n_samples must be at least n_classes")
  1491. generator = check_random_state(random_state)
  1492. if mean is None:
  1493. mean = np.zeros(n_features)
  1494. else:
  1495. mean = np.array(mean)
  1496. # Build multivariate normal distribution
  1497. X = generator.multivariate_normal(mean, cov * np.identity(n_features), (n_samples,))
  1498. # Sort by distance from origin
  1499. idx = np.argsort(np.sum((X - mean[np.newaxis, :]) ** 2, axis=1))
  1500. X = X[idx, :]
  1501. # Label by quantile
  1502. step = n_samples // n_classes
  1503. y = np.hstack(
  1504. [
  1505. np.repeat(np.arange(n_classes), step),
  1506. np.repeat(n_classes - 1, n_samples - step * n_classes),
  1507. ]
  1508. )
  1509. if shuffle:
  1510. X, y = util_shuffle(X, y, random_state=generator)
  1511. return X, y
  1512. def _shuffle(data, random_state=None):
  1513. generator = check_random_state(random_state)
  1514. n_rows, n_cols = data.shape
  1515. row_idx = generator.permutation(n_rows)
  1516. col_idx = generator.permutation(n_cols)
  1517. result = data[row_idx][:, col_idx]
  1518. return result, row_idx, col_idx
  1519. @validate_params(
  1520. {
  1521. "shape": [tuple],
  1522. "n_clusters": [Interval(Integral, 1, None, closed="left")],
  1523. "noise": [Interval(Real, 0, None, closed="left")],
  1524. "minval": [Interval(Real, None, None, closed="neither")],
  1525. "maxval": [Interval(Real, None, None, closed="neither")],
  1526. "shuffle": ["boolean"],
  1527. "random_state": ["random_state"],
  1528. },
  1529. prefer_skip_nested_validation=True,
  1530. )
  1531. def make_biclusters(
  1532. shape,
  1533. n_clusters,
  1534. *,
  1535. noise=0.0,
  1536. minval=10,
  1537. maxval=100,
  1538. shuffle=True,
  1539. random_state=None,
  1540. ):
  1541. """Generate a constant block diagonal structure array for biclustering.
  1542. Read more in the :ref:`User Guide <sample_generators>`.
  1543. Parameters
  1544. ----------
  1545. shape : tuple of shape (n_rows, n_cols)
  1546. The shape of the result.
  1547. n_clusters : int
  1548. The number of biclusters.
  1549. noise : float, default=0.0
  1550. The standard deviation of the gaussian noise.
  1551. minval : float, default=10
  1552. Minimum value of a bicluster.
  1553. maxval : float, default=100
  1554. Maximum value of a bicluster.
  1555. shuffle : bool, default=True
  1556. Shuffle the samples.
  1557. random_state : int, RandomState instance or None, default=None
  1558. Determines random number generation for dataset creation. Pass an int
  1559. for reproducible output across multiple function calls.
  1560. See :term:`Glossary <random_state>`.
  1561. Returns
  1562. -------
  1563. X : ndarray of shape `shape`
  1564. The generated array.
  1565. rows : ndarray of shape (n_clusters, X.shape[0])
  1566. The indicators for cluster membership of each row.
  1567. cols : ndarray of shape (n_clusters, X.shape[1])
  1568. The indicators for cluster membership of each column.
  1569. See Also
  1570. --------
  1571. make_checkerboard: Generate an array with block checkerboard structure for
  1572. biclustering.
  1573. References
  1574. ----------
  1575. .. [1] Dhillon, I. S. (2001, August). Co-clustering documents and
  1576. words using bipartite spectral graph partitioning. In Proceedings
  1577. of the seventh ACM SIGKDD international conference on Knowledge
  1578. discovery and data mining (pp. 269-274). ACM.
  1579. """
  1580. generator = check_random_state(random_state)
  1581. n_rows, n_cols = shape
  1582. consts = generator.uniform(minval, maxval, n_clusters)
  1583. # row and column clusters of approximately equal sizes
  1584. row_sizes = generator.multinomial(n_rows, np.repeat(1.0 / n_clusters, n_clusters))
  1585. col_sizes = generator.multinomial(n_cols, np.repeat(1.0 / n_clusters, n_clusters))
  1586. row_labels = np.hstack(
  1587. [np.repeat(val, rep) for val, rep in zip(range(n_clusters), row_sizes)]
  1588. )
  1589. col_labels = np.hstack(
  1590. [np.repeat(val, rep) for val, rep in zip(range(n_clusters), col_sizes)]
  1591. )
  1592. result = np.zeros(shape, dtype=np.float64)
  1593. for i in range(n_clusters):
  1594. selector = np.outer(row_labels == i, col_labels == i)
  1595. result[selector] += consts[i]
  1596. if noise > 0:
  1597. result += generator.normal(scale=noise, size=result.shape)
  1598. if shuffle:
  1599. result, row_idx, col_idx = _shuffle(result, random_state)
  1600. row_labels = row_labels[row_idx]
  1601. col_labels = col_labels[col_idx]
  1602. rows = np.vstack([row_labels == c for c in range(n_clusters)])
  1603. cols = np.vstack([col_labels == c for c in range(n_clusters)])
  1604. return result, rows, cols
  1605. @validate_params(
  1606. {
  1607. "shape": [tuple],
  1608. "n_clusters": [Interval(Integral, 1, None, closed="left"), "array-like"],
  1609. "noise": [Interval(Real, 0, None, closed="left")],
  1610. "minval": [Interval(Real, None, None, closed="neither")],
  1611. "maxval": [Interval(Real, None, None, closed="neither")],
  1612. "shuffle": ["boolean"],
  1613. "random_state": ["random_state"],
  1614. },
  1615. prefer_skip_nested_validation=True,
  1616. )
  1617. def make_checkerboard(
  1618. shape,
  1619. n_clusters,
  1620. *,
  1621. noise=0.0,
  1622. minval=10,
  1623. maxval=100,
  1624. shuffle=True,
  1625. random_state=None,
  1626. ):
  1627. """Generate an array with block checkerboard structure for biclustering.
  1628. Read more in the :ref:`User Guide <sample_generators>`.
  1629. Parameters
  1630. ----------
  1631. shape : tuple of shape (n_rows, n_cols)
  1632. The shape of the result.
  1633. n_clusters : int or array-like or shape (n_row_clusters, n_column_clusters)
  1634. The number of row and column clusters.
  1635. noise : float, default=0.0
  1636. The standard deviation of the gaussian noise.
  1637. minval : float, default=10
  1638. Minimum value of a bicluster.
  1639. maxval : float, default=100
  1640. Maximum value of a bicluster.
  1641. shuffle : bool, default=True
  1642. Shuffle the samples.
  1643. random_state : int, RandomState instance or None, default=None
  1644. Determines random number generation for dataset creation. Pass an int
  1645. for reproducible output across multiple function calls.
  1646. See :term:`Glossary <random_state>`.
  1647. Returns
  1648. -------
  1649. X : ndarray of shape `shape`
  1650. The generated array.
  1651. rows : ndarray of shape (n_clusters, X.shape[0])
  1652. The indicators for cluster membership of each row.
  1653. cols : ndarray of shape (n_clusters, X.shape[1])
  1654. The indicators for cluster membership of each column.
  1655. See Also
  1656. --------
  1657. make_biclusters : Generate an array with constant block diagonal structure
  1658. for biclustering.
  1659. References
  1660. ----------
  1661. .. [1] Kluger, Y., Basri, R., Chang, J. T., & Gerstein, M. (2003).
  1662. Spectral biclustering of microarray data: coclustering genes
  1663. and conditions. Genome research, 13(4), 703-716.
  1664. """
  1665. generator = check_random_state(random_state)
  1666. if hasattr(n_clusters, "__len__"):
  1667. n_row_clusters, n_col_clusters = n_clusters
  1668. else:
  1669. n_row_clusters = n_col_clusters = n_clusters
  1670. # row and column clusters of approximately equal sizes
  1671. n_rows, n_cols = shape
  1672. row_sizes = generator.multinomial(
  1673. n_rows, np.repeat(1.0 / n_row_clusters, n_row_clusters)
  1674. )
  1675. col_sizes = generator.multinomial(
  1676. n_cols, np.repeat(1.0 / n_col_clusters, n_col_clusters)
  1677. )
  1678. row_labels = np.hstack(
  1679. [np.repeat(val, rep) for val, rep in zip(range(n_row_clusters), row_sizes)]
  1680. )
  1681. col_labels = np.hstack(
  1682. [np.repeat(val, rep) for val, rep in zip(range(n_col_clusters), col_sizes)]
  1683. )
  1684. result = np.zeros(shape, dtype=np.float64)
  1685. for i in range(n_row_clusters):
  1686. for j in range(n_col_clusters):
  1687. selector = np.outer(row_labels == i, col_labels == j)
  1688. result[selector] += generator.uniform(minval, maxval)
  1689. if noise > 0:
  1690. result += generator.normal(scale=noise, size=result.shape)
  1691. if shuffle:
  1692. result, row_idx, col_idx = _shuffle(result, random_state)
  1693. row_labels = row_labels[row_idx]
  1694. col_labels = col_labels[col_idx]
  1695. rows = np.vstack(
  1696. [
  1697. row_labels == label
  1698. for label in range(n_row_clusters)
  1699. for _ in range(n_col_clusters)
  1700. ]
  1701. )
  1702. cols = np.vstack(
  1703. [
  1704. col_labels == label
  1705. for _ in range(n_row_clusters)
  1706. for label in range(n_col_clusters)
  1707. ]
  1708. )
  1709. return result, rows, cols