| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
7771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231 |
- """
- Test the ColumnTransformer.
- """
- import pickle
- import re
- import numpy as np
- import pytest
- from numpy.testing import assert_allclose
- from scipy import sparse
- from sklearn.base import BaseEstimator, TransformerMixin
- from sklearn.compose import (
- ColumnTransformer,
- make_column_selector,
- make_column_transformer,
- )
- from sklearn.exceptions import NotFittedError
- from sklearn.feature_selection import VarianceThreshold
- from sklearn.preprocessing import (
- FunctionTransformer,
- Normalizer,
- OneHotEncoder,
- StandardScaler,
- )
- from sklearn.utils._testing import (
- assert_allclose_dense_sparse,
- assert_almost_equal,
- assert_array_equal,
- )
class Trans(TransformerMixin, BaseEstimator):
    """Pass-through transformer that always hands back 2D data.

    1D input (a pandas Series or a 1D array) is promoted to a single
    column; anything else is returned untouched.
    """

    def fit(self, X, y=None):
        """Do nothing and return the estimator itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X``, promoting 1D input to one 2D column."""
        if hasattr(X, "to_frame"):
            # 1D Series -> 2D DataFrame
            result = X.to_frame()
        elif X.ndim == 1:
            # 1D array -> 2D column array
            result = np.atleast_2d(X).T
        else:
            result = X
        return result
class DoubleTrans(BaseEstimator):
    """Stateless transformer whose output is twice its input."""

    def fit(self, X, y=None):
        """Do nothing and return the estimator itself."""
        return self

    def transform(self, X):
        """Return ``2 * X``."""
        doubled = 2 * X
        return doubled
class SparseMatrixTrans(BaseEstimator):
    """Transformer that ignores the values of ``X`` and emits a sparse identity."""

    def fit(self, X, y=None):
        """Do nothing and return the estimator itself."""
        return self

    def transform(self, X, y=None):
        """Return a CSR identity matrix with ``len(X)`` rows and columns."""
        size = len(X)
        identity = sparse.eye(size, size)
        return identity.tocsr()
class TransNo2D(BaseEstimator):
    """Identity transformer that never reshapes its input.

    Unlike ``Trans``, 1D input is returned as 1D.
    """

    def fit(self, X, y=None):
        """Do nothing and return the estimator itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` exactly as received."""
        unchanged = X
        return unchanged
class TransRaise(BaseEstimator):
    """Transformer whose ``fit`` and ``transform`` always raise ``ValueError``."""

    def fit(self, X, y=None):
        """Always raise ``ValueError("specific message")``."""
        error = ValueError("specific message")
        raise error

    def transform(self, X, y=None):
        """Always raise ``ValueError("specific message")``."""
        error = ValueError("specific message")
        raise error
def test_column_transformer():
    """Check column selection and transformer weights on a NumPy array.

    Each column specifier (scalar, list, array, slice, boolean mask, and a
    callable returning any of those) must select the expected columns, both
    via ``fit_transform`` and via ``fit`` followed by ``transform``.
    """
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    first_1d = np.array([0, 1, 2])
    second_1d = np.array([2, 4, 6])
    first_2d = first_1d.reshape(-1, 1)
    both_2d = X_array

    def check(transformer, expected):
        # Both fitting code paths have to produce the expected output.
        assert_array_equal(transformer.fit_transform(X_array), expected)
        assert_array_equal(transformer.fit(X_array).transform(X_array), expected)

    selections = [
        # single column 1D / 2D
        (0, first_2d),
        ([0], first_2d),
        # list-like
        ([0, 1], both_2d),
        (np.array([0, 1]), both_2d),
        # slice
        (slice(0, 1), first_2d),
        (slice(0, 2), both_2d),
        # boolean mask
        (np.array([True, False]), first_2d),
        ([True, False], first_2d),
        (np.array([True, True]), both_2d),
        ([True, True], both_2d),
    ]

    for selection, expected in selections:
        direct = ColumnTransformer([("trans", Trans(), selection)], remainder="drop")
        check(direct, expected)

        # callable that returns any of the allowed specifiers
        via_callable = ColumnTransformer(
            [("trans", Trans(), lambda x: selection)], remainder="drop"
        )
        check(via_callable, expected)

    # one transformer per column
    ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])])
    check(ct, both_2d)
    assert len(ct.transformers_) == 2

    # test with transformer_weights
    weights = {"trans1": 0.1, "trans2": 10}
    weighted = ColumnTransformer(
        [("trans1", Trans(), [0]), ("trans2", Trans(), [1])],
        transformer_weights=weights,
    )
    expected = np.vstack(
        [
            weights["trans1"] * first_1d,
            weights["trans2"] * second_1d,
        ]
    ).T
    check(weighted, expected)
    assert len(weighted.transformers_) == 2

    # a single weighted transformer covering both columns
    weighted = ColumnTransformer(
        [("trans", Trans(), [0, 1])], transformer_weights={"trans": 0.1}
    )
    check(weighted, 0.1 * both_2d)
    assert len(weighted.transformers_) == 1
def test_column_transformer_tuple_transformers_parameter():
    """Check that ``transformers`` given as a tuple behaves like a list."""
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    specs = [("trans1", Trans(), [0]), ("trans2", Trans(), [1])]
    from_list = ColumnTransformer(specs)
    from_tuple = ColumnTransformer(tuple(specs))

    # fit_transform path
    list_out = from_list.fit_transform(X_array)
    tuple_out = from_tuple.fit_transform(X_array)
    assert_array_equal(list_out, tuple_out)

    # fit followed by transform path
    list_out = from_list.fit(X_array).transform(X_array)
    tuple_out = from_tuple.fit(X_array).transform(X_array)
    assert_array_equal(list_out, tuple_out)
def test_column_transformer_dataframe():
    """Check column selection and transformer weights on a pandas DataFrame.

    Covers label-based and positional specifiers, boolean masks, callables
    returning a specifier, the type of object handed to the transformers,
    and positional lookup when the column labels are themselves integers.
    """
    pd = pytest.importorskip("pandas")

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=["first", "second"])

    first_2d = np.array([0, 1, 2]).reshape(-1, 1)
    both_2d = X_array

    def check(transformer, expected, data=X_df):
        # Both fitting code paths have to produce the expected output.
        assert_array_equal(transformer.fit_transform(data), expected)
        assert_array_equal(transformer.fit(data).transform(data), expected)

    def check_no_remainder(transformer, n_transformers):
        # All columns are consumed, so no "remainder" entry is appended.
        assert len(transformer.transformers_) == n_transformers
        assert transformer.transformers_[-1][0] != "remainder"

    selections = [
        # String keys: label based
        # scalar
        ("first", first_2d),
        # list
        (["first"], first_2d),
        (["first", "second"], both_2d),
        # slice
        (slice("first", "second"), both_2d),
        # int keys: positional
        # scalar
        (0, first_2d),
        # list
        ([0], first_2d),
        ([0, 1], both_2d),
        (np.array([0, 1]), both_2d),
        # slice
        (slice(0, 1), first_2d),
        (slice(0, 2), both_2d),
        # boolean mask
        (np.array([True, False]), first_2d),
        (pd.Series([True, False], index=["first", "second"]), first_2d),
        ([True, False], first_2d),
    ]

    for selection, expected in selections:
        direct = ColumnTransformer([("trans", Trans(), selection)], remainder="drop")
        check(direct, expected)

        # callable that returns any of the allowed specifiers
        via_callable = ColumnTransformer(
            [("trans", Trans(), lambda X: selection)], remainder="drop"
        )
        check(via_callable, expected)

    # one transformer per column, selected by label
    ct = ColumnTransformer(
        [("trans1", Trans(), ["first"]), ("trans2", Trans(), ["second"])]
    )
    check(ct, both_2d)
    check_no_remainder(ct, 2)

    # one transformer per column, selected by position
    ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])])
    check(ct, both_2d)
    check_no_remainder(ct, 2)

    # test with transformer_weights
    weights = {"trans1": 0.1, "trans2": 10}
    weighted = ColumnTransformer(
        [("trans1", Trans(), ["first"]), ("trans2", Trans(), ["second"])],
        transformer_weights=weights,
    )
    expected = np.vstack(
        [
            weights["trans1"] * X_df["first"],
            weights["trans2"] * X_df["second"],
        ]
    ).T
    check(weighted, expected)
    check_no_remainder(weighted, 2)

    # test multiple columns
    weighted = ColumnTransformer(
        [("trans", Trans(), ["first", "second"])], transformer_weights={"trans": 0.1}
    )
    check(weighted, 0.1 * both_2d)
    check_no_remainder(weighted, 1)

    weighted = ColumnTransformer(
        [("trans", Trans(), [0, 1])], transformer_weights={"trans": 0.1}
    )
    check(weighted, 0.1 * both_2d)
    check_no_remainder(weighted, 1)

    # ensure pandas object is passed through
    class TransAssert(BaseEstimator):
        """Transformer asserting the type of the object it receives."""

        def __init__(self, expected_type_transform):
            self.expected_type_transform = expected_type_transform

        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            assert isinstance(X, self.expected_type_transform)
            if isinstance(X, pd.Series):
                X = X.to_frame()
            return X

    # scalar label -> the transformer sees a Series
    series_ct = ColumnTransformer(
        [("trans", TransAssert(expected_type_transform=pd.Series), "first")],
        remainder="drop",
    )
    series_ct.fit_transform(X_df)

    # list of labels -> the transformer sees a DataFrame
    frame_ct = ColumnTransformer(
        [
            (
                "trans",
                TransAssert(expected_type_transform=pd.DataFrame),
                ["first", "second"],
            )
        ]
    )
    frame_ct.fit_transform(X_df)

    # integer column spec + integer column names -> still use positional
    X_int_labels = X_df.copy()
    X_int_labels.columns = [1, 0]
    ct = ColumnTransformer([("trans", Trans(), 0)], remainder="drop")
    check(ct, first_2d, data=X_int_labels)

    assert len(ct.transformers_) == 2
    remainder_name, remainder_trans, remainder_cols = ct.transformers_[-1][:3]
    assert remainder_name == "remainder"
    assert remainder_trans == "drop"
    assert_array_equal(remainder_cols, [1])
@pytest.mark.parametrize("pandas", [True, False], ids=["pandas", "numpy"])
@pytest.mark.parametrize(
    "column_selection",
    [[], np.array([False, False]), [False, False]],
    ids=["list", "bool", "bool_int"],
)
@pytest.mark.parametrize("callable_column", [False, True])
def test_column_transformer_empty_columns(pandas, column_selection, callable_column):
    """Check that a transformer given no columns is tolerated.

    ``TransRaise`` raises from both ``fit`` and ``transform``, so every
    scenario below can only pass if it is never actually invoked.
    """
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    both_2d = X_array

    if pandas:
        pd = pytest.importorskip("pandas")
        X = pd.DataFrame(X_array, columns=["first", "second"])
    else:
        X = X_array

    if callable_column:

        def column(X):
            return column_selection

    else:
        column = column_selection

    def check(transformer, expected, raise_position):
        # Output must match on both fitting paths, and the TransRaise
        # instance must still be listed among the fitted transformers.
        assert_array_equal(transformer.fit_transform(X), expected)
        assert_array_equal(transformer.fit(X).transform(X), expected)
        assert len(transformer.transformers_) == 2
        assert isinstance(transformer.transformers_[raise_position][1], TransRaise)

    # empty selection listed last
    ct = ColumnTransformer(
        [("trans1", Trans(), [0, 1]), ("trans2", TransRaise(), column)]
    )
    check(ct, both_2d, raise_position=1)

    # empty selection listed first
    ct = ColumnTransformer(
        [("trans1", TransRaise(), column), ("trans2", Trans(), [0, 1])]
    )
    check(ct, both_2d, raise_position=0)

    # only the remainder contributes columns (the count of 2 includes it)
    ct = ColumnTransformer([("trans", TransRaise(), column)], remainder="passthrough")
    check(ct, both_2d, raise_position=0)

    # nothing contributes columns: three rows, zero columns
    no_columns = np.array([[], [], []])
    ct = ColumnTransformer([("trans", TransRaise(), column)], remainder="drop")
    check(ct, no_columns, raise_position=0)
def test_column_transformer_output_indices():
    """Check the ``output_indices_`` attribute on NumPy array input."""
    X_array = np.arange(6).reshape(3, 2)

    def fit_and_get(transformer):
        # Return the transformed data together with the slices mapping.
        transformed = transformer.fit_transform(X_array)
        return transformed, transformer.output_indices_

    # one transformer per column
    ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])])
    Xt, indices = fit_and_get(ct)
    expected = {
        "trans1": slice(0, 1),
        "trans2": slice(1, 2),
        "remainder": slice(0, 0),
    }
    assert indices == expected
    assert_array_equal(Xt[:, [0]], Xt[:, indices["trans1"]])
    assert_array_equal(Xt[:, [1]], Xt[:, indices["trans2"]])

    # test with transformer_weights and multiple columns
    ct = ColumnTransformer(
        [("trans", Trans(), [0, 1])], transformer_weights={"trans": 0.1}
    )
    Xt, indices = fit_and_get(ct)
    expected = {"trans": slice(0, 2), "remainder": slice(0, 0)}
    assert indices == expected
    assert_array_equal(Xt[:, [0, 1]], Xt[:, indices["trans"]])
    assert_array_equal(Xt[:, []], Xt[:, indices["remainder"]])

    # test case that ensures that the attribute also works when
    # a given transformer doesn't have any columns to work on
    ct = ColumnTransformer([("trans1", Trans(), [0, 1]), ("trans2", TransRaise(), [])])
    Xt, indices = fit_and_get(ct)
    expected = {
        "trans1": slice(0, 2),
        "trans2": slice(0, 0),
        "remainder": slice(0, 0),
    }
    assert indices == expected
    assert_array_equal(Xt[:, [0, 1]], Xt[:, indices["trans1"]])
    assert_array_equal(Xt[:, []], Xt[:, indices["trans2"]])
    assert_array_equal(Xt[:, []], Xt[:, indices["remainder"]])

    # only the remainder produces output columns
    ct = ColumnTransformer([("trans", TransRaise(), [])], remainder="passthrough")
    Xt, indices = fit_and_get(ct)
    expected = {"trans": slice(0, 0), "remainder": slice(0, 2)}
    assert indices == expected
    assert_array_equal(Xt[:, []], Xt[:, indices["trans"]])
    assert_array_equal(Xt[:, [0, 1]], Xt[:, indices["remainder"]])
def test_column_transformer_output_indices_df():
    """Check the ``output_indices_`` attribute on pandas DataFrame input."""
    pd = pytest.importorskip("pandas")

    X_df = pd.DataFrame(np.arange(6).reshape(3, 2), columns=["first", "second"])

    # The same mapping is expected for label-based and positional selection.
    expected = {
        "trans1": slice(0, 1),
        "trans2": slice(1, 2),
        "remainder": slice(0, 0),
    }

    def check(transformer):
        Xt = transformer.fit_transform(X_df)
        indices = transformer.output_indices_
        assert indices == expected
        assert_array_equal(Xt[:, [0]], Xt[:, indices["trans1"]])
        assert_array_equal(Xt[:, [1]], Xt[:, indices["trans2"]])
        assert_array_equal(Xt[:, []], Xt[:, indices["remainder"]])

    # columns selected by label
    check(
        ColumnTransformer(
            [("trans1", Trans(), ["first"]), ("trans2", Trans(), ["second"])]
        )
    )

    # columns selected by position
    check(ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])]))
def test_column_transformer_sparse_array():
    """Check column selection on sparse input with a sparse result."""
    X_sparse = sparse.eye(3, 2).tocsr()

    # no distinction between 1D and 2D
    first_col = X_sparse[:, 0]
    all_cols = X_sparse

    def check(transformer, expected):
        # Output must stay sparse and match on both fitting paths.
        assert sparse.issparse(transformer.fit_transform(X_sparse))
        assert_allclose_dense_sparse(transformer.fit_transform(X_sparse), expected)
        assert_allclose_dense_sparse(
            transformer.fit(X_sparse).transform(X_sparse), expected
        )

    # single-column selections, with and without the remaining column
    expected_by_remainder = [("drop", first_col), ("passthrough", all_cols)]
    for col in [0, [0], slice(0, 1)]:
        for remainder, expected in expected_by_remainder:
            ct = ColumnTransformer(
                [("trans", Trans(), col)], remainder=remainder, sparse_threshold=0.8
            )
            check(ct, expected)

    # selections covering every column
    for col in [[0, 1], slice(0, 2)]:
        ct = ColumnTransformer([("trans", Trans(), col)], sparse_threshold=0.8)
        check(ct, all_cols)
def test_column_transformer_list():
    """Check that a plain list of lists is accepted as input."""
    nan = float("nan")
    X_list = [[1, nan, "a"], [0, 0, "b"]]

    # two scaled numeric columns followed by the one-hot encoded third column
    expected = np.array(
        [
            [1, nan, 1, 0],
            [-1, 0, 0, 1],
        ]
    )

    steps = [
        ("numerical", StandardScaler(), [0, 1]),
        ("categorical", OneHotEncoder(), [2]),
    ]
    ct = ColumnTransformer(steps)

    assert_array_equal(ct.fit_transform(X_list), expected)
    assert_array_equal(ct.fit(X_list).transform(X_list), expected)
def test_column_transformer_sparse_stacking():
    """Check stacking of dense and sparse transformer outputs."""
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    def fit_then_transform(threshold):
        # Build a fresh dense + sparse ColumnTransformer for this threshold.
        ct = ColumnTransformer(
            [("trans1", Trans(), [0]), ("trans2", SparseMatrixTrans(), 1)],
            sparse_threshold=threshold,
        )
        ct.fit(X_array)
        return ct, ct.transform(X_array)

    # threshold 0.8: the stacked output is sparse
    col_trans, X_trans = fit_then_transform(0.8)
    n_rows = X_trans.shape[0]
    assert sparse.issparse(X_trans)
    assert X_trans.shape == (n_rows, n_rows + 1)
    assert_array_equal(X_trans.toarray()[:, 1:], np.eye(n_rows))
    assert len(col_trans.transformers_) == 2
    assert col_trans.transformers_[-1][0] != "remainder"

    # threshold 0.1: the stacked output is dense
    col_trans, X_trans = fit_then_transform(0.1)
    n_rows = X_trans.shape[0]
    assert not sparse.issparse(X_trans)
    assert X_trans.shape == (n_rows, n_rows + 1)
    assert_array_equal(X_trans[:, 1:], np.eye(n_rows))
def test_column_transformer_mixed_cols_sparse():
    """Check sparse output when passthrough columns have mixed types."""
    X = np.array([["a", 1, True], ["b", 2, False]], dtype="O")

    # this shouldn't fail, since boolean can be coerced into a numeric
    # See: https://github.com/scikit-learn/scikit-learn/issues/11912
    numeric_passthrough = make_column_transformer(
        (OneHotEncoder(), [0]), ("passthrough", [1, 2]), sparse_threshold=1.0
    )
    X_trans = numeric_passthrough.fit_transform(X)
    assert X_trans.getformat() == "csr"
    expected = np.array([[1, 0, 1, 1], [0, 1, 2, 0]])
    assert_array_equal(X_trans.toarray(), expected)

    # this fails since strings `a` and `b` cannot be
    # coerced into a numeric.
    string_passthrough = make_column_transformer(
        (OneHotEncoder(), [0]), ("passthrough", [0]), sparse_threshold=1.0
    )
    with pytest.raises(ValueError, match="For a sparse output, all columns should"):
        string_passthrough.fit_transform(X)
def test_column_transformer_sparse_threshold():
    """Check how ``sparse_threshold`` decides between sparse and dense output."""
    X_array = np.array([["a", "b"], ["A", "B"]], dtype=object).T
    # above data has sparsity of 4 / 8 = 0.5

    def one_hot_pair(first_sparse, second_sparse, threshold):
        # Two one-hot encoders, one per column, with explicit output kinds.
        return ColumnTransformer(
            [
                ("trans1", OneHotEncoder(sparse_output=first_sparse), [0]),
                ("trans2", OneHotEncoder(sparse_output=second_sparse), [1]),
            ],
            sparse_threshold=threshold,
        )

    # apply threshold even if all sparse
    col_trans = ColumnTransformer(
        [("trans1", OneHotEncoder(), [0]), ("trans2", OneHotEncoder(), [1])],
        sparse_threshold=0.2,
    )
    res = col_trans.fit_transform(X_array)
    assert not sparse.issparse(res)
    assert not col_trans.sparse_output_

    # mixed -> sparsity of (4 + 2) / 8 = 0.75
    for thres in [0.75001, 1]:
        col_trans = one_hot_pair(True, False, thres)
        res = col_trans.fit_transform(X_array)
        assert sparse.issparse(res)
        assert col_trans.sparse_output_

    for thres in [0.75, 0]:
        col_trans = one_hot_pair(True, False, thres)
        res = col_trans.fit_transform(X_array)
        assert not sparse.issparse(res)
        assert not col_trans.sparse_output_

    # if nothing is sparse -> no sparse
    for thres in [0.33, 0, 1]:
        col_trans = one_hot_pair(False, False, thres)
        res = col_trans.fit_transform(X_array)
        assert not sparse.issparse(res)
        assert not col_trans.sparse_output_
def test_column_transformer_error_msg_1D():
    """Check the errors raised when a scalar column spec feeds a transformer."""
    X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T

    # A scalar column spec is used with StandardScaler; both entry points
    # must report the 1D-data message.
    scaler_ct = ColumnTransformer([("trans", StandardScaler(), 0)])
    expected_msg = "1D data passed to a transformer"
    for entry_point in (scaler_ct.fit, scaler_ct.fit_transform):
        with pytest.raises(ValueError, match=expected_msg):
            entry_point(X_array)

    # An error raised by the transformer itself must surface with its
    # own message.
    raising_ct = ColumnTransformer([("trans", TransRaise(), 0)])
    for entry_point in (raising_ct.fit, raising_ct.fit_transform):
        with pytest.raises(ValueError, match="specific message"):
            entry_point(X_array)
def test_2D_transformer_output():
    """Check the error raised when a transformer returns non-2D output."""
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    # if one transformer is dropped, test that name is still correct
    ct = ColumnTransformer([("trans1", "drop", 0), ("trans2", TransNo2D(), 1)])
    expected_msg = "the 'trans2' transformer should be 2D"

    # The second entry is ``fit``: the error must already be raised there.
    for entry_point in (ct.fit_transform, ct.fit):
        with pytest.raises(ValueError, match=expected_msg):
            entry_point(X_array)
def test_2D_transformer_output_pandas():
    """Check the non-2D output error when a DataFrame column is selected."""
    pd = pytest.importorskip("pandas")

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=["col1", "col2"])

    # A scalar label is used as the column spec and TransNo2D returns its
    # input unchanged; the error must name the transformer.
    ct = ColumnTransformer([("trans1", TransNo2D(), "col1")])
    expected_msg = "the 'trans1' transformer should be 2D"

    # The second entry is ``fit``: the error must already be raised there.
    for entry_point in (ct.fit_transform, ct.fit):
        with pytest.raises(ValueError, match=expected_msg):
            entry_point(X_df)
@pytest.mark.parametrize("remainder", ["drop", "passthrough"])
def test_column_transformer_invalid_columns(remainder):
    """Check the errors raised for invalid column specs and feature counts.

    Parameters
    ----------
    remainder : {"drop", "passthrough"}
        Value passed as ``remainder`` to every ``ColumnTransformer`` built
        in this test.
    """
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    # general invalid
    for col in [1.5, ["string", 1], slice(1, "s"), np.array([1.0])]:
        ct = ColumnTransformer([("trans", Trans(), col)], remainder=remainder)
        with pytest.raises(ValueError, match="No valid specification"):
            ct.fit(X_array)

    # invalid for arrays
    for col in ["string", ["string", "other"], slice("a", "b")]:
        ct = ColumnTransformer([("trans", Trans(), col)], remainder=remainder)
        with pytest.raises(ValueError, match="Specifying the columns"):
            ct.fit(X_array)

    # transformed n_features does not match fitted n_features
    col = [0, 1]
    ct = ColumnTransformer([("trans", Trans(), col)], remainder=remainder)
    ct.fit(X_array)

    # ``match`` is treated as a regular expression, so the messages are
    # escaped to make the trailing "." match literally.
    X_array_more = np.array([[0, 1, 2], [2, 4, 6], [3, 6, 9]]).T
    msg = re.escape(
        "X has 3 features, but ColumnTransformer is expecting 2 features as input."
    )
    with pytest.raises(ValueError, match=msg):
        ct.transform(X_array_more)

    X_array_fewer = np.array(
        [
            [0, 1, 2],
        ]
    ).T
    err_msg = re.escape(
        "X has 1 features, but ColumnTransformer is expecting 2 features as input."
    )
    with pytest.raises(ValueError, match=err_msg):
        ct.transform(X_array_fewer)
def test_column_transformer_invalid_transformer():
    """Check that an estimator without ``transform`` is rejected on fit."""

    class NoTrans(BaseEstimator):
        """Estimator exposing ``fit`` and ``predict`` but no ``transform``."""

        def fit(self, X, y=None):
            return self

        def predict(self, X):
            return X

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    ct = ColumnTransformer([("trans", NoTrans(), [0])])

    expected_msg = "All estimators should implement fit and transform"
    with pytest.raises(TypeError, match=expected_msg):
        ct.fit(X_array)
def test_make_column_transformer():
    """Check the names, estimators and columns set by ``make_column_transformer``."""
    scaler = StandardScaler()
    norm = Normalizer()

    ct = make_column_transformer((scaler, "first"), (norm, ["second"]))

    # Split the (name, transformer, columns) triples into three tuples.
    names = tuple(name for name, _, _ in ct.transformers)
    transformers = tuple(trans for _, trans, _ in ct.transformers)
    columns = tuple(cols for _, _, cols in ct.transformers)

    assert names == ("standardscaler", "normalizer")
    assert transformers == (scaler, norm)
    assert columns == ("first", ["second"])
def test_make_column_transformer_pandas():
    """Check ``make_column_transformer`` with a pandas Index as column spec."""
    pd = pytest.importorskip("pandas")

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=["first", "second"])
    norm = Normalizer()

    explicit = ColumnTransformer([("norm", Normalizer(), X_df.columns)])
    from_helper = make_column_transformer((norm, X_df.columns))

    # Both construction routes must give the same transformed output.
    explicit_out = explicit.fit_transform(X_df)
    helper_out = from_helper.fit_transform(X_df)
    assert_almost_equal(explicit_out, helper_out)
def test_make_column_transformer_kwargs():
    """Check keyword handling of ``make_column_transformer``."""
    scaler = StandardScaler()
    norm = Normalizer()

    configured = make_column_transformer(
        (scaler, "first"),
        (norm, ["second"]),
        n_jobs=3,
        remainder="drop",
        sparse_threshold=0.5,
    )
    plain = make_column_transformer((scaler, "first"), (norm, ["second"]))

    # Keyword arguments must not alter the generated transformers list ...
    assert configured.transformers == plain.transformers
    # ... and must be stored on the resulting estimator.
    assert configured.n_jobs == 3
    assert configured.remainder == "drop"
    assert configured.sparse_threshold == 0.5

    # invalid keyword parameters should raise an error message
    expected_msg = re.escape(
        "make_column_transformer() got an unexpected "
        "keyword argument 'transformer_weights'"
    )
    with pytest.raises(TypeError, match=expected_msg):
        make_column_transformer(
            (scaler, "first"),
            (norm, ["second"]),
            transformer_weights={"pca": 10, "Transf": 1},
        )
def test_make_column_transformer_remainder_transformer():
    """Check that an estimator passed as ``remainder`` is stored as given."""
    scaler = StandardScaler()
    norm = Normalizer()
    remainder = StandardScaler()

    specs = [(scaler, "first"), (norm, ["second"])]
    ct = make_column_transformer(*specs, remainder=remainder)

    assert ct.remainder == remainder
def test_column_transformer_get_set_params():
    """Check ``get_params`` / ``set_params``, including nested parameters."""
    ct = ColumnTransformer(
        [("trans1", StandardScaler(), [0]), ("trans2", StandardScaler(), [1])]
    )

    # Top-level parameters that are not modified anywhere in this test.
    top_level = {
        "n_jobs": None,
        "remainder": "drop",
        "sparse_threshold": 0.3,
        "transformer_weights": None,
        "verbose_feature_names_out": True,
        "verbose": False,
    }

    expected = {
        **top_level,
        "trans1": ct.transformers[0][1],
        "trans1__copy": True,
        "trans1__with_mean": True,
        "trans1__with_std": True,
        "trans2": ct.transformers[1][1],
        "trans2__copy": True,
        "trans2__with_mean": True,
        "trans2__with_std": True,
        "transformers": ct.transformers,
    }
    assert ct.get_params() == expected

    # nested parameter of a single transformer
    ct.set_params(trans1__with_mean=False)
    assert not ct.get_params()["trans1__with_mean"]

    # Replace a whole transformer. The expected dict is built only after
    # the call so that it reads the updated ``ct.transformers``.
    ct.set_params(trans1="passthrough")
    expected = {
        **top_level,
        "trans1": "passthrough",
        "trans2": ct.transformers[1][1],
        "trans2__copy": True,
        "trans2__with_mean": True,
        "trans2__with_std": True,
        "transformers": ct.transformers,
    }
    assert ct.get_params() == expected
def test_column_transformer_named_estimators():
    """``named_transformers_`` gives key and attribute access to fitted steps."""
    X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T
    steps = [
        ("trans1", StandardScaler(), [0]),
        ("trans2", StandardScaler(with_std=False), [1]),
    ]
    ct = ColumnTransformer(steps)

    assert not hasattr(ct, "transformers_")
    ct.fit(X_array)
    assert hasattr(ct, "transformers_")

    for step_name in ("trans1", "trans2"):
        assert isinstance(ct.named_transformers_[step_name], StandardScaler)
        assert isinstance(getattr(ct.named_transformers_, step_name), StandardScaler)

    assert not ct.named_transformers_.trans2.with_std
    # check that the exposed transformers are the fitted ones
    assert ct.named_transformers_.trans1.mean_ == 1.0
def test_column_transformer_cloning():
    """Fitting leaves the user-supplied estimators unfitted.

    Only the entries of ``transformers_`` carry fitted state, for both
    ``fit`` and ``fit_transform``.
    """
    X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T

    for method_name in ("fit", "fit_transform"):
        ct = ColumnTransformer([("trans", StandardScaler(), [0])])
        getattr(ct, method_name)(X_array)

        original_estimator = ct.transformers[0][1]
        fitted_estimator = ct.transformers_[0][1]
        assert not hasattr(original_estimator, "mean_")
        assert hasattr(fitted_estimator, "mean_")
def test_column_transformer_get_feature_names():
    """``get_feature_names_out`` errors when unfitted or when a step lacks it."""
    X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T
    ct = ColumnTransformer([("trans", Trans(), [0, 1])])

    # raise correct error when not fitted
    with pytest.raises(NotFittedError):
        ct.get_feature_names_out()

    # raise correct error when no feature names are available
    ct.fit(X_array)
    expected_message = re.escape(
        "Transformer trans (type Trans) does not provide get_feature_names_out"
    )
    with pytest.raises(AttributeError, match=expected_message):
        ct.get_feature_names_out()
def test_column_transformer_special_strings():
    """Exercise the special ``"drop"`` and ``"passthrough"`` transformer strings."""
    X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T

    def assert_two_steps_and_no_remainder(fitted):
        # Both columns are claimed explicitly, so no remainder entry is added.
        assert len(fitted.transformers_) == 2
        assert fitted.transformers_[-1][0] != "remainder"

    # one 'drop' -> ignore
    ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", "drop", [1])])
    expected = np.array([[0.0], [1.0], [2.0]])
    assert_array_equal(ct.fit_transform(X_array), expected)
    assert_array_equal(ct.fit(X_array).transform(X_array), expected)
    assert_two_steps_and_no_remainder(ct)

    # all 'drop' -> return shape 0 array
    ct = ColumnTransformer([("trans1", "drop", [0]), ("trans2", "drop", [1])])
    assert_array_equal(ct.fit(X_array).transform(X_array).shape, (3, 0))
    assert_array_equal(ct.fit_transform(X_array).shape, (3, 0))
    assert_two_steps_and_no_remainder(ct)

    # 'passthrough'
    X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T
    ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", "passthrough", [1])])
    assert_array_equal(ct.fit_transform(X_array), X_array)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_array)
    assert_two_steps_and_no_remainder(ct)
def test_column_transformer_remainder():
    """Check output and the fitted ``remainder`` entry for several setups."""
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    first_column = np.array([0, 1, 2]).reshape(-1, 1)
    second_column = np.array([2, 4, 6]).reshape(-1, 1)

    def check(ct, expected_output, remainder_value, remainder_columns):
        # Shared assertions: transformed data plus the trailing remainder step.
        assert_array_equal(ct.fit_transform(X_array), expected_output)
        assert_array_equal(ct.fit(X_array).transform(X_array), expected_output)
        assert len(ct.transformers_) == 2
        last_step = ct.transformers_[-1]
        assert last_step[0] == "remainder"
        assert last_step[1] == remainder_value
        assert_array_equal(last_step[2], remainder_columns)

    # default drop
    check(ColumnTransformer([("trans1", Trans(), [0])]), first_column, "drop", [1])

    # specify passthrough
    check(
        ColumnTransformer([("trans", Trans(), [0])], remainder="passthrough"),
        X_array,
        "passthrough",
        [1],
    )

    # column order is not preserved (passed through added to end)
    check(
        ColumnTransformer([("trans1", Trans(), [1])], remainder="passthrough"),
        X_array[:, ::-1],
        "passthrough",
        [0],
    )

    # passthrough when all actual transformers are skipped
    check(
        ColumnTransformer([("trans1", "drop", [0])], remainder="passthrough"),
        second_column,
        "passthrough",
        [1],
    )

    # check default for make_column_transformer
    assert make_column_transformer((Trans(), [0])).remainder == "drop"
@pytest.mark.parametrize(
    "key",
    [
        [0],
        np.array([0]),
        slice(0, 1),
        np.array([True, False]),
    ],
)
def test_column_transformer_remainder_numpy(key):
    """Passthrough remainder works for each way of selecting the first column."""
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    ct = ColumnTransformer([("trans1", Trans(), key)], remainder="passthrough")

    # Selected column first, remainder appended: the result equals the input.
    assert_array_equal(ct.fit_transform(X_array), X_array)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_array)

    assert len(ct.transformers_) == 2
    last_step = ct.transformers_[-1]
    assert last_step[0] == "remainder"
    assert last_step[1] == "passthrough"
    assert_array_equal(last_step[2], [1])
@pytest.mark.parametrize(
    "key",
    [
        [0],
        slice(0, 1),
        np.array([True, False]),
        ["first"],
        "pd-index",
        np.array(["first"]),
        np.array(["first"], dtype=object),
        slice(None, "first"),
        slice("first", "first"),
    ],
)
def test_column_transformer_remainder_pandas(key):
    """Passthrough remainder works for each DataFrame column specifier."""
    pd = pytest.importorskip("pandas")
    # "pd-index" is a placeholder: pandas may be missing at collection time,
    # so the real Index is only built once the import has succeeded.
    if isinstance(key, str) and key == "pd-index":
        key = pd.Index(["first"])

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=["first", "second"])
    ct = ColumnTransformer([("trans1", Trans(), key)], remainder="passthrough")

    assert_array_equal(ct.fit_transform(X_df), X_array)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_array)

    assert len(ct.transformers_) == 2
    last_step = ct.transformers_[-1]
    assert last_step[0] == "remainder"
    assert last_step[1] == "passthrough"
    assert_array_equal(last_step[2], [1])
@pytest.mark.parametrize(
    "key",
    [
        [0],
        np.array([0]),
        slice(0, 1),
        np.array([True, False, False]),
    ],
)
def test_column_transformer_remainder_transformer(key):
    """An estimator used as ``remainder`` transforms the unselected columns."""
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T

    # second and third columns are doubled when remainder = DoubleTrans
    expected = X_array.copy()
    expected[:, 1:3] *= 2

    ct = ColumnTransformer([("trans1", Trans(), key)], remainder=DoubleTrans())
    assert_array_equal(ct.fit_transform(X_array), expected)
    assert_array_equal(ct.fit(X_array).transform(X_array), expected)

    assert len(ct.transformers_) == 2
    last_step = ct.transformers_[-1]
    assert last_step[0] == "remainder"
    assert isinstance(last_step[1], DoubleTrans)
    assert_array_equal(last_step[2], [1, 2])
def test_column_transformer_no_remaining_remainder_transformer():
    """No remainder step is recorded when every column is already selected."""
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T
    all_columns = [0, 1, 2]
    ct = ColumnTransformer(
        [("trans1", Trans(), all_columns)], remainder=DoubleTrans()
    )

    assert_array_equal(ct.fit_transform(X_array), X_array)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_array)

    fitted_steps = ct.transformers_
    assert len(fitted_steps) == 1
    assert fitted_steps[-1][0] != "remainder"
def test_column_transformer_drops_all_remainder_transformer():
    """Only the remainder estimator contributes when the explicit step drops."""
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T

    # columns are doubled when remainder = DoubleTrans
    expected = 2 * X_array.copy()[:, 1:3]

    ct = ColumnTransformer([("trans1", "drop", [0])], remainder=DoubleTrans())
    assert_array_equal(ct.fit_transform(X_array), expected)
    assert_array_equal(ct.fit(X_array).transform(X_array), expected)

    assert len(ct.transformers_) == 2
    last_step = ct.transformers_[-1]
    assert last_step[0] == "remainder"
    assert isinstance(last_step[1], DoubleTrans)
    assert_array_equal(last_step[2], [1, 2])
def test_column_transformer_sparse_remainder_transformer():
    """A sparse remainder output combined with a dense step stays sparse here."""
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T
    ct = ColumnTransformer(
        [("trans1", Trans(), [0])],
        remainder=SparseMatrixTrans(),
        sparse_threshold=0.8,
    )

    X_trans = ct.fit_transform(X_array)
    assert sparse.issparse(X_trans)

    # SparseMatrixTrans creates 3 features for each column. There is
    # one column in ``transformers``, thus:
    assert X_trans.shape == (3, 3 + 1)

    first_column = X_array[:, 0].reshape(-1, 1)
    expected = np.hstack((first_column, np.eye(3)))
    assert_array_equal(X_trans.toarray(), expected)

    assert len(ct.transformers_) == 2
    last_step = ct.transformers_[-1]
    assert last_step[0] == "remainder"
    assert isinstance(last_step[1], SparseMatrixTrans)
    assert_array_equal(last_step[2], [1, 2])
def test_column_transformer_drop_all_sparse_remainder_transformer():
    """With the explicit step dropped, output is the sparse remainder alone."""
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T
    ct = ColumnTransformer(
        [("trans1", "drop", [0])],
        remainder=SparseMatrixTrans(),
        sparse_threshold=0.8,
    )

    X_trans = ct.fit_transform(X_array)
    assert sparse.issparse(X_trans)

    # SparseMatrixTrans creates 3 features for each column, thus:
    assert X_trans.shape == (3, 3)
    assert_array_equal(X_trans.toarray(), np.eye(3))

    assert len(ct.transformers_) == 2
    last_step = ct.transformers_[-1]
    assert last_step[0] == "remainder"
    assert isinstance(last_step[1], SparseMatrixTrans)
    assert_array_equal(last_step[2], [1, 2])
def test_column_transformer_get_set_params_with_remainder():
    """Check ``get_params``/``set_params`` when ``remainder`` is an estimator."""
    ct = ColumnTransformer(
        [("trans1", StandardScaler(), [0])], remainder=StandardScaler()
    )

    # Entries whose values are identical in both expected dictionaries.
    unchanged = {
        "n_jobs": None,
        "sparse_threshold": 0.3,
        "transformer_weights": None,
        "verbose_feature_names_out": True,
        "verbose": False,
        "remainder__copy": True,
        "remainder__with_mean": True,
    }

    expected = {
        **unchanged,
        "remainder": ct.remainder,
        "remainder__with_std": True,
        "trans1": ct.transformers[0][1],
        "trans1__copy": True,
        "trans1__with_mean": True,
        "trans1__with_std": True,
        "transformers": ct.transformers,
    }
    assert ct.get_params() == expected

    ct.set_params(remainder__with_std=False)
    assert not ct.get_params()["remainder__with_std"]

    # Replacing trans1 by a special string removes its nested parameters.
    ct.set_params(trans1="passthrough")
    expected = {
        **unchanged,
        "remainder": ct.remainder,
        "remainder__with_std": False,
        "trans1": "passthrough",
        "transformers": ct.transformers,
    }
    assert ct.get_params() == expected
def test_column_transformer_no_estimators():
    """With an empty transformer list the remainder handles every column."""
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).astype("float").T
    ct = ColumnTransformer([], remainder=StandardScaler())

    # Nested remainder parameters are exposed even without other steps.
    assert ct.get_params()["remainder__with_mean"]

    X_trans = ct.fit_transform(X_array)
    assert X_trans.shape == X_array.shape

    assert len(ct.transformers_) == 1
    only_step = ct.transformers_[-1]
    assert only_step[0] == "remainder"
    assert only_step[2] == [0, 1, 2]
@pytest.mark.parametrize(
    ["est", "pattern"],
    [
        # estimator remainder: reported as a third step
        (
            ColumnTransformer(
                [("trans1", Trans(), [0]), ("trans2", Trans(), [1])],
                remainder=DoubleTrans(),
            ),
            (
                r"\[ColumnTransformer\].*\(1 of 3\) Processing trans1.* total=.*\n"
                r"\[ColumnTransformer\].*\(2 of 3\) Processing trans2.* total=.*\n"
                r"\[ColumnTransformer\].*\(3 of 3\) Processing remainder.* total=.*\n$"
            ),
        ),
        # passthrough remainder: reported as a third step
        (
            ColumnTransformer(
                [("trans1", Trans(), [0]), ("trans2", Trans(), [1])],
                remainder="passthrough",
            ),
            (
                r"\[ColumnTransformer\].*\(1 of 3\) Processing trans1.* total=.*\n"
                r"\[ColumnTransformer\].*\(2 of 3\) Processing trans2.* total=.*\n"
                r"\[ColumnTransformer\].*\(3 of 3\) Processing remainder.* total=.*\n$"
            ),
        ),
        # a dropped step is not reported
        (
            ColumnTransformer(
                [("trans1", Trans(), [0]), ("trans2", "drop", [1])],
                remainder="passthrough",
            ),
            (
                r"\[ColumnTransformer\].*\(1 of 2\) Processing trans1.* total=.*\n"
                r"\[ColumnTransformer\].*\(2 of 2\) Processing remainder.* total=.*\n$"
            ),
        ),
        # an explicit passthrough step is reported
        (
            ColumnTransformer(
                [("trans1", Trans(), [0]), ("trans2", "passthrough", [1])],
                remainder="passthrough",
            ),
            (
                r"\[ColumnTransformer\].*\(1 of 3\) Processing trans1.* total=.*\n"
                r"\[ColumnTransformer\].*\(2 of 3\) Processing trans2.* total=.*\n"
                r"\[ColumnTransformer\].*\(3 of 3\) Processing remainder.* total=.*\n$"
            ),
        ),
        # single step plus passthrough remainder
        (
            ColumnTransformer([("trans1", Trans(), [0])], remainder="passthrough"),
            (
                r"\[ColumnTransformer\].*\(1 of 2\) Processing trans1.* total=.*\n"
                r"\[ColumnTransformer\].*\(2 of 2\) Processing remainder.* total=.*\n$"
            ),
        ),
        # remainder="drop" is not reported
        (
            ColumnTransformer(
                [("trans1", Trans(), [0]), ("trans2", Trans(), [1])],
                remainder="drop",
            ),
            (
                r"\[ColumnTransformer\].*\(1 of 2\) Processing trans1.* total=.*\n"
                r"\[ColumnTransformer\].*\(2 of 2\) Processing trans2.* total=.*\n$"
            ),
        ),
        (
            ColumnTransformer([("trans1", Trans(), [0])], remainder="drop"),
            r"\[ColumnTransformer\].*\(1 of 1\) Processing trans1.* total=.*\n$",
        ),
    ],
)
@pytest.mark.parametrize("method", ["fit", "fit_transform"])
def test_column_transformer_verbose(est, pattern, method, capsys):
    """``verbose=True`` prints one progress line per step; ``False`` prints none."""
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T
    run = getattr(est, method)

    est.set_params(verbose=False)
    run(X_array)
    silent_output = capsys.readouterr().out
    assert not silent_output, "Got output for verbose=False"

    est.set_params(verbose=True)
    run(X_array)
    verbose_output = capsys.readouterr().out
    assert re.match(pattern, verbose_output)
def test_column_transformer_no_estimators_set_params():
    """``set_params`` works on a ColumnTransformer with no transformers."""
    ct = ColumnTransformer([])
    returned = ct.set_params(n_jobs=2)
    assert returned.n_jobs == 2
def test_column_transformer_callable_specifier():
    """A callable column specifier receives the full array and is resolved."""
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    expected_first = np.array([[0, 1, 2]]).T

    def select_first(X):
        # assert that function gets the full array
        assert_array_equal(X, X_array)
        return [0]

    ct = ColumnTransformer([("trans", Trans(), select_first)], remainder="drop")
    assert_array_equal(ct.fit_transform(X_array), expected_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), expected_first)

    # The input spec keeps the callable; the fitted spec holds its result.
    assert callable(ct.transformers[0][2])
    assert ct.transformers_[0][2] == [0]
def test_column_transformer_callable_specifier_dataframe():
    """A callable column specifier receives the full DataFrame and is resolved."""
    pd = pytest.importorskip("pandas")
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    expected_first = np.array([[0, 1, 2]]).T
    X_df = pd.DataFrame(X_array, columns=["first", "second"])

    def select_first(X):
        # assert that function gets the full dataframe
        assert_array_equal(X.columns, X_df.columns)
        assert_array_equal(X.values, X_df.values)
        return ["first"]

    ct = ColumnTransformer([("trans", Trans(), select_first)], remainder="drop")
    assert_array_equal(ct.fit_transform(X_df), expected_first)
    assert_array_equal(ct.fit(X_df).transform(X_df), expected_first)

    # The input spec keeps the callable; the fitted spec holds its result.
    assert callable(ct.transformers[0][2])
    assert ct.transformers_[0][2] == ["first"]
def test_column_transformer_negative_column_indexes():
    """A negative column index selects the same column as its positive form.

    The numeric part of the input comes from a locally seeded
    ``RandomState`` so the data is reproducible and the test does not
    depend on, or advance, the global NumPy random state.
    """
    rng = np.random.RandomState(0)
    X_numeric = rng.randn(2, 2)
    X_categories = np.array([[1], [2]])
    X = np.concatenate([X_numeric, X_categories], axis=1)

    ohe = OneHotEncoder()

    # Index -1 and index 2 both address the last (categorical) column.
    tf_1 = ColumnTransformer([("ohe", ohe, [-1])], remainder="passthrough")
    tf_2 = ColumnTransformer([("ohe", ohe, [2])], remainder="passthrough")
    assert_array_equal(tf_1.fit_transform(X), tf_2.fit_transform(X))
@pytest.mark.parametrize("array_type", [np.asarray, sparse.csr_matrix])
def test_column_transformer_mask_indexing(array_type):
    """Boolean list masks select columns for dense and sparse input alike.

    Regression test for #14510: boolean array-like does not behave as
    boolean array with sparse matrices.
    """
    dense = np.transpose([[1, 2, 3], [4, 5, 6], [5, 6, 7], [8, 9, 10]])
    X = array_type(dense)

    column_mask = [False, True, False, True]
    column_transformer = ColumnTransformer(
        [("identity", FunctionTransformer(), column_mask)]
    )

    X_trans = column_transformer.fit_transform(X)
    assert X_trans.shape == (3, 2)
def test_n_features_in():
    """``n_features_in_`` matches the input passed to the column transformer."""
    X = [[1, 2], [3, 4], [5, 6]]
    steps = [("a", DoubleTrans(), [0]), ("b", DoubleTrans(), [1])]
    ct = ColumnTransformer(steps)

    # The attribute only exists once the transformer has been fitted.
    assert not hasattr(ct, "n_features_in_")
    ct.fit(X)
    assert ct.n_features_in_ == 2
@pytest.mark.parametrize(
    "cols, pattern, include, exclude",
    [
        # dtype-only selection
        (["col_int", "col_float"], None, np.number, None),
        (["col_int", "col_float"], None, None, object),
        (["col_int", "col_float"], None, [int, float], None),
        (["col_str"], None, [object], None),
        (["col_str"], None, object, None),
        (["col_float"], None, float, None),
        # dtype and/or name pattern
        (["col_float"], "at$", [np.number], None),
        (["col_int"], None, [int], None),
        (["col_int"], "^col_int", [np.number], None),
        (["col_float", "col_str"], "float|str", None, None),
        (["col_str"], "^col_s", None, [int]),
        ([], "str$", float, None),
        (["col_int", "col_float", "col_str"], None, [np.number, object], None),
    ],
)
def test_make_column_selector_with_select_dtypes(cols, pattern, include, exclude):
    """``make_column_selector`` returns the columns expected for each filter."""
    pd = pytest.importorskip("pandas")
    column_order = ["col_int", "col_float", "col_str"]
    data = {
        "col_int": np.array([0, 1, 2], dtype=int),
        "col_float": np.array([0.0, 1.0, 2.0], dtype=float),
        "col_str": ["one", "two", "three"],
    }
    X_df = pd.DataFrame(data, columns=column_order)

    selector = make_column_selector(
        dtype_include=include, dtype_exclude=exclude, pattern=pattern
    )
    assert_array_equal(selector(X_df), cols)
def test_column_transformer_with_make_column_selector():
    """Functional test: selectors give the same result as explicit columns."""
    pd = pytest.importorskip("pandas")
    column_order = ["col_int", "col_float", "col_cat", "col_str"]
    data = {
        "col_int": np.array([0, 1, 2], dtype=int),
        "col_float": np.array([0.0, 1.0, 2.0], dtype=float),
        "col_cat": ["one", "two", "one"],
        "col_str": ["low", "middle", "high"],
    }
    X_df = pd.DataFrame(data, columns=column_order)
    X_df["col_str"] = X_df["col_str"].astype("category")

    ohe = OneHotEncoder()
    scaler = StandardScaler()

    ct_selector = make_column_transformer(
        (ohe, make_column_selector(dtype_include=["category", object])),
        (scaler, make_column_selector(dtype_include=np.number)),
    )
    ct_direct = make_column_transformer(
        (ohe, ["col_cat", "col_str"]), (scaler, ["col_float", "col_int"])
    )

    X_selector = ct_selector.fit_transform(X_df)
    X_direct = ct_direct.fit_transform(X_df)
    assert_allclose(X_selector, X_direct)
def test_make_column_selector_error():
    """Calling a column selector on a NumPy array raises ValueError."""
    not_a_dataframe = np.array([[0.1, 0.2]])
    selector = make_column_selector(dtype_include=np.number)

    expected_message = "make_column_selector can only be applied to pandas dataframes"
    with pytest.raises(ValueError, match=expected_message):
        selector(not_a_dataframe)
def test_make_column_selector_pickle():
    """A column selector selects the same columns after a pickle round trip."""
    pd = pytest.importorskip("pandas")
    column_order = ["col_int", "col_float", "col_str"]
    data = {
        "col_int": np.array([0, 1, 2], dtype=int),
        "col_float": np.array([0.0, 1.0, 2.0], dtype=float),
        "col_str": ["one", "two", "three"],
    }
    X_df = pd.DataFrame(data, columns=column_order)

    selector = make_column_selector(dtype_include=[object])
    serialized = pickle.dumps(selector)
    selector_pickled = pickle.loads(serialized)

    assert_array_equal(selector(X_df), selector_pickled(X_df))
@pytest.mark.parametrize(
    "empty_col",
    [[], np.array([], dtype=int), lambda x: []],
    ids=["list", "array", "callable"],
)
def test_feature_names_empty_columns(empty_col):
    """A step with an empty column selection adds no output feature names."""
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]})

    steps = [
        ("ohe", OneHotEncoder(), ["col1", "col2"]),
        ("empty_features", OneHotEncoder(), empty_col),
    ]
    ct = ColumnTransformer(transformers=steps)
    ct.fit(df)

    expected_names = ["ohe__col1_a", "ohe__col1_b", "ohe__col2_z"]
    assert_array_equal(ct.get_feature_names_out(), expected_names)
@pytest.mark.parametrize(
    "selector",
    [
        # positional index
        [1],
        lambda x: [1],
        # column name
        ["col2"],
        lambda x: ["col2"],
        # boolean mask
        [False, True],
        lambda x: [False, True],
    ],
)
def test_feature_names_out_pandas(selector):
    """Check the output name when only the second DataFrame column is selected."""
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]})

    ct = ColumnTransformer([("ohe", OneHotEncoder(), selector)])
    ct.fit(df)

    feature_names = ct.get_feature_names_out()
    assert_array_equal(feature_names, ["ohe__col2_z"])
@pytest.mark.parametrize(
    "selector",
    [
        [1],
        lambda x: [1],
        [False, True],
        lambda x: [False, True],
    ],
)
def test_feature_names_out_non_pandas(selector):
    """Check the output name when the second column of a nested list is selected."""
    X = [["a", "z"], ["a", "z"], ["b", "z"]]

    ct = ColumnTransformer([("ohe", OneHotEncoder(), selector)])
    ct.fit(X)

    feature_names = ct.get_feature_names_out()
    assert_array_equal(feature_names, ["ohe__x1_z"])
@pytest.mark.parametrize("remainder", ["passthrough", StandardScaler()])
def test_sk_visual_block_remainder(remainder):
    """``remainder='passthrough'`` or an estimator is part of the visual block."""
    ohe = OneHotEncoder()
    selected_columns = ["col1", "col2"]
    ct = ColumnTransformer(
        transformers=[("ohe", ohe, selected_columns)], remainder=remainder
    )

    block = ct._sk_visual_block_()
    assert block.names == ("ohe", "remainder")
    # Before fitting, no column details are available for the remainder.
    assert block.name_details == (["col1", "col2"], "")
    assert block.estimators == (ohe, remainder)
def test_sk_visual_block_remainder_drop():
    """``remainder='drop'`` (the default) is left out of the visual block."""
    ohe = OneHotEncoder()
    selected_columns = ["col1", "col2"]
    ct = ColumnTransformer(transformers=[("ohe", ohe, selected_columns)])

    block = ct._sk_visual_block_()
    assert block.names == ("ohe",)
    assert block.name_details == (["col1", "col2"],)
    assert block.estimators == (ohe,)
@pytest.mark.parametrize("remainder", ["passthrough", StandardScaler()])
def test_sk_visual_block_remainder_fitted_pandas(remainder):
    """After fitting on a DataFrame, the remainder shows its column names."""
    pd = pytest.importorskip("pandas")
    ohe = OneHotEncoder()
    ct = ColumnTransformer(
        transformers=[("ohe", ohe, ["col1", "col2"])], remainder=remainder
    )

    data = {
        "col1": ["a", "b", "c"],
        "col2": ["z", "z", "z"],
        "col3": [1, 2, 3],
        "col4": [3, 4, 5],
    }
    ct.fit(pd.DataFrame(data))

    block = ct._sk_visual_block_()
    assert block.names == ("ohe", "remainder")
    assert block.name_details == (["col1", "col2"], ["col3", "col4"])
    assert block.estimators == (ohe, remainder)
@pytest.mark.parametrize("remainder", ["passthrough", StandardScaler()])
def test_sk_visual_block_remainder_fitted_numpy(remainder):
    """After fitting on an array, the remainder shows its column indices."""
    X = np.array([[1, 2, 3], [4, 5, 6]], dtype=float)
    scaler = StandardScaler()
    ct = ColumnTransformer(
        transformers=[("scale", scaler, [0, 2])], remainder=remainder
    )
    ct.fit(X)

    block = ct._sk_visual_block_()
    assert block.names == ("scale", "remainder")
    assert block.name_details == ([0, 2], [1])
    assert block.estimators == (scaler, remainder)
@pytest.mark.parametrize("explicit_colname", ["first", "second", 0, 1])
@pytest.mark.parametrize("remainder", [Trans(), "passthrough", "drop"])
def test_column_transformer_reordered_column_names_remainder(
    explicit_colname, remainder
):
    """Check remainder handling when transform-time columns differ from fit."""
    pd = pytest.importorskip("pandas")

    X_fit_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_fit_df = pd.DataFrame(X_fit_array, columns=["first", "second"])

    # Same data as the fit frame, with the two columns swapped.
    X_swapped_array = np.array([[2, 4, 6], [0, 1, 2]]).T
    X_swapped_df = pd.DataFrame(X_swapped_array, columns=["second", "first"])

    tf = ColumnTransformer([("bycol", Trans(), explicit_colname)], remainder=remainder)
    tf.fit(X_fit_df)
    reference = tf.transform(X_fit_df)

    # Changing the order still works
    assert_allclose(tf.transform(X_swapped_df), reference)

    # extra columns are ignored
    X_extended_df = X_fit_df.copy()
    X_extended_df["third"] = [3, 6, 9]
    assert_allclose(tf.transform(X_extended_df), reference)

    if not isinstance(explicit_colname, str):
        return

    # Raise error if columns are specified by names but input only allows
    # to specify by position, e.g. numpy array instead of a pandas df.
    X_array = X_fit_array.copy()
    with pytest.raises(ValueError, match="Specifying the columns"):
        tf.transform(X_array)
def test_feature_name_validation_missing_columns_drop_passthough():
    """Check how {'drop', 'passthrough'} interact with columns missing at
    transform time."""
    pd = pytest.importorskip("pandas")

    df = pd.DataFrame(np.ones(shape=(3, 4)), columns=["a", "b", "c", "d"])
    df_without_c = df.drop("c", axis=1)

    # with remainder='passthrough', all columns seen during `fit` must be
    # present
    tf = ColumnTransformer([("bycol", Trans(), [1])], remainder="passthrough")
    tf.fit(df)
    expected_message = r"columns are missing: {'c'}"
    with pytest.raises(ValueError, match=expected_message):
        tf.transform(df_without_c)

    # with remainder='drop', it is allowed to have column 'c' missing
    tf = ColumnTransformer([("bycol", Trans(), [1])], remainder="drop")
    tf.fit(df)
    trans_without_c = tf.transform(df_without_c)
    trans_full = tf.transform(df)
    assert_allclose(trans_without_c, trans_full)

    # bycol drops 'c', thus it is allowed for 'c' to be missing
    tf = ColumnTransformer([("bycol", "drop", ["c"])], remainder="passthrough")
    tf.fit(df)
    trans_without_c = tf.transform(df_without_c)
    trans_full = tf.transform(df)
    assert_allclose(trans_without_c, trans_full)
def test_feature_names_in_():
    """Feature names seen during ``fit`` are stored on the column transformer.

    Column transformer deliberately does not check for column name
    consistency. It only checks that the non-dropped names seen in `fit`
    are seen in `transform`. This behavior is already tested in
    `test_feature_name_validation_missing_columns_drop_passthough`.
    """
    pd = pytest.importorskip("pandas")

    feature_names = ["a", "c", "d"]
    df = pd.DataFrame([[1, 2, 3]], columns=feature_names)

    ct = ColumnTransformer([("bycol", Trans(), ["a", "d"])], remainder="passthrough")
    ct.fit(df)

    stored_names = ct.feature_names_in_
    assert_array_equal(stored_names, feature_names)
    assert isinstance(stored_names, np.ndarray)
    assert stored_names.dtype == object
class TransWithNames(Trans):
    """``Trans`` variant that also implements ``get_feature_names_out``.

    Parameters
    ----------
    feature_names_out : sequence of str or None, default=None
        Names to report as output features. When None, the input feature
        names are returned unchanged.
    """

    def __init__(self, feature_names_out=None):
        self.feature_names_out = feature_names_out

    def get_feature_names_out(self, input_features=None):
        """Return the configured names, or ``input_features`` if none are set."""
        configured_names = self.feature_names_out
        if configured_names is None:
            return input_features
        return np.asarray(configured_names, dtype=object)
@pytest.mark.parametrize(
    "transformers, remainder, expected_names",
    [
        # --- columns given as lists of names ---
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", ["d"]),
            ],
            "passthrough",
            ["bycol1__d", "bycol1__c", "bycol2__d", "remainder__a", "remainder__b"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", ["d"]),
            ],
            "drop",
            ["bycol1__d", "bycol1__c", "bycol2__d"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["b"]),
                ("bycol2", "drop", ["d"]),
            ],
            "passthrough",
            ["bycol1__b", "remainder__a", "remainder__c"],
        ),
        (
            [("bycol1", TransWithNames(["pca1", "pca2"]), ["a", "b", "d"])],
            "passthrough",
            ["bycol1__pca1", "bycol1__pca2", "remainder__c"],
        ),
        (
            [
                ("bycol1", TransWithNames(["a", "b"]), ["d"]),
                ("bycol2", "passthrough", ["b"]),
            ],
            "drop",
            ["bycol1__a", "bycol1__b", "bycol2__b"],
        ),
        (
            [
                ("bycol1", TransWithNames([f"pca{i}" for i in range(2)]), ["b"]),
                ("bycol2", TransWithNames([f"pca{i}" for i in range(2)]), ["b"]),
            ],
            "passthrough",
            [
                "bycol1__pca0",
                "bycol1__pca1",
                "bycol2__pca0",
                "bycol2__pca1",
                "remainder__a",
                "remainder__c",
                "remainder__d",
            ],
        ),
        (
            [("bycol1", "drop", ["d"])],
            "drop",
            [],
        ),
        # --- columns given as positional slices ---
        (
            [("bycol1", TransWithNames(), slice(1, 3))],
            "drop",
            ["bycol1__b", "bycol1__c"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["b"]),
                ("bycol2", "drop", slice(3, 4)),
            ],
            "passthrough",
            ["bycol1__b", "remainder__a", "remainder__c"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", slice(3, 4)),
            ],
            "passthrough",
            ["bycol1__d", "bycol1__c", "bycol2__d", "remainder__a", "remainder__b"],
        ),
        # --- columns given as label slices ---
        (
            [("bycol1", TransWithNames(), slice("b", "c"))],
            "drop",
            ["bycol1__b", "bycol1__c"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["b"]),
                ("bycol2", "drop", slice("c", "d")),
            ],
            "passthrough",
            ["bycol1__b", "remainder__a"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", slice("c", "d")),
            ],
            "passthrough",
            [
                "bycol1__d",
                "bycol1__c",
                "bycol2__c",
                "bycol2__d",
                "remainder__a",
                "remainder__b",
            ],
        ),
    ],
)
def test_verbose_feature_names_out_true(transformers, remainder, expected_names):
    """Check feature_names_out for verbose_feature_names_out=True (default)"""
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"])

    ct = ColumnTransformer(transformers, remainder=remainder)
    ct.fit(df)

    feature_names = ct.get_feature_names_out()
    assert isinstance(feature_names, np.ndarray)
    assert feature_names.dtype == object
    assert_array_equal(feature_names, expected_names)
@pytest.mark.parametrize(
    "transformers, remainder, expected_names",
    [
        # --- columns given as lists of names ---
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", ["a"]),
            ],
            "passthrough",
            ["d", "c", "a", "b"],
        ),
        (
            [
                ("bycol1", TransWithNames(["a"]), ["d", "c"]),
                ("bycol2", "passthrough", ["d"]),
            ],
            "drop",
            ["a", "d"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["b"]),
                ("bycol2", "drop", ["d"]),
            ],
            "passthrough",
            ["b", "a", "c"],
        ),
        (
            [("bycol1", TransWithNames(["pca1", "pca2"]), ["a", "b", "d"])],
            "passthrough",
            ["pca1", "pca2", "c"],
        ),
        (
            [
                ("bycol1", TransWithNames(["a", "c"]), ["d"]),
                ("bycol2", "passthrough", ["d"]),
            ],
            "drop",
            ["a", "c", "d"],
        ),
        (
            [
                ("bycol1", TransWithNames([f"pca{i}" for i in range(2)]), ["b"]),
                ("bycol2", TransWithNames([f"kpca{i}" for i in range(2)]), ["b"]),
            ],
            "passthrough",
            ["pca0", "pca1", "kpca0", "kpca1", "a", "c", "d"],
        ),
        (
            [("bycol1", "drop", ["d"])],
            "drop",
            [],
        ),
        # --- columns given as positional slices ---
        (
            [
                ("bycol1", TransWithNames(), slice(1, 2)),
                ("bycol2", "drop", ["d"]),
            ],
            "passthrough",
            ["b", "a", "c"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["b"]),
                ("bycol2", "drop", slice(3, 4)),
            ],
            "passthrough",
            ["b", "a", "c"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", slice(0, 2)),
            ],
            "drop",
            ["d", "c", "a", "b"],
        ),
        # --- columns given as label slices ---
        (
            [
                ("bycol1", TransWithNames(), slice("a", "b")),
                ("bycol2", "drop", ["d"]),
            ],
            "passthrough",
            ["a", "b", "c"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["b"]),
                ("bycol2", "drop", slice("c", "d")),
            ],
            "passthrough",
            ["b", "a"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", slice("a", "b")),
            ],
            "drop",
            ["d", "c", "a", "b"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", slice("b", "b")),
            ],
            "drop",
            ["d", "c", "b"],
        ),
    ],
)
def test_verbose_feature_names_out_false(transformers, remainder, expected_names):
    """Check feature_names_out for verbose_feature_names_out=False"""
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"])

    ct = ColumnTransformer(
        transformers, remainder=remainder, verbose_feature_names_out=False
    )
    ct.fit(df)

    feature_names = ct.get_feature_names_out()
    assert isinstance(feature_names, np.ndarray)
    assert feature_names.dtype == object
    assert_array_equal(feature_names, expected_names)
@pytest.mark.parametrize(
    "transformers, remainder, colliding_columns",
    [
        (
            [
                ("bycol1", TransWithNames(), ["b"]),
                ("bycol2", "passthrough", ["b"]),
            ],
            "drop",
            "['b']",
        ),
        (
            [
                ("bycol1", TransWithNames(["c", "d"]), ["c"]),
                ("bycol2", "passthrough", ["c"]),
            ],
            "drop",
            "['c']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a"]), ["b"]),
                ("bycol2", "passthrough", ["b"]),
            ],
            "passthrough",
            "['a']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a"]), ["b"]),
                ("bycol2", "drop", ["b"]),
            ],
            "passthrough",
            "['a']",
        ),
        (
            [
                ("bycol1", TransWithNames(["c", "b"]), ["b"]),
                ("bycol2", "passthrough", ["c", "b"]),
            ],
            "drop",
            "['b', 'c']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a"]), ["b"]),
                ("bycol2", "passthrough", ["a"]),
                ("bycol3", TransWithNames(["a"]), ["b"]),
            ],
            "passthrough",
            "['a']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a", "b"]), ["b"]),
                ("bycol2", "passthrough", ["a"]),
                ("bycol3", TransWithNames(["b"]), ["c"]),
            ],
            "passthrough",
            "['a', 'b']",
        ),
        (
            # More than five collisions: the expected message is truncated
            # with an ellipsis after the fifth name.
            [
                ("bycol1", TransWithNames([f"pca{i}" for i in range(6)]), ["b"]),
                ("bycol2", TransWithNames([f"pca{i}" for i in range(6)]), ["b"]),
            ],
            "passthrough",
            "['pca0', 'pca1', 'pca2', 'pca3', 'pca4', ...]",
        ),
        # The remaining cases repeat the three-transformer collision with the
        # columns selected through positional or label-based slices.
        (
            [
                ("bycol1", TransWithNames(["a", "b"]), slice(1, 2)),
                ("bycol2", "passthrough", ["a"]),
                ("bycol3", TransWithNames(["b"]), ["c"]),
            ],
            "passthrough",
            "['a', 'b']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a", "b"]), ["b"]),
                ("bycol2", "passthrough", slice(0, 1)),
                ("bycol3", TransWithNames(["b"]), ["c"]),
            ],
            "passthrough",
            "['a', 'b']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a", "b"]), slice("b", "c")),
                ("bycol2", "passthrough", ["a"]),
                ("bycol3", TransWithNames(["b"]), ["c"]),
            ],
            "passthrough",
            "['a', 'b']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a", "b"]), ["b"]),
                ("bycol2", "passthrough", slice("a", "a")),
                ("bycol3", TransWithNames(["b"]), ["c"]),
            ],
            "passthrough",
            "['a', 'b']",
        ),
    ],
)
def test_verbose_feature_names_out_false_errors(
    transformers, remainder, colliding_columns
):
    """Check get_feature_names_out raises on non-unique names when not verbose.

    With verbose_feature_names_out=False, a ValueError is expected whose
    message lists the duplicated names given by `colliding_columns`.
    """
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"])

    column_transformer = ColumnTransformer(
        transformers,
        remainder=remainder,
        verbose_feature_names_out=False,
    )
    column_transformer.fit(frame)

    expected_message = (
        f"Output feature names: {colliding_columns} are not unique. Please set "
        "verbose_feature_names_out=True to add prefixes to feature names"
    )
    # The message contains brackets and dots, so it is escaped before being
    # used as a regular expression by pytest.raises.
    with pytest.raises(ValueError, match=re.escape(expected_message)):
        column_transformer.get_feature_names_out()
@pytest.mark.parametrize("verbose_feature_names_out", [True, False])
@pytest.mark.parametrize("remainder", ["drop", "passthrough"])
def test_column_transformer_set_output(verbose_feature_names_out, remainder):
    """Check the output container of ColumnTransformer around set_output.

    Before `set_output` the result is an ndarray; after
    `set_output(transform="pandas")` it is a DataFrame whose columns equal
    `get_feature_names_out()` and whose index is the one of the input.
    """
    pd = pytest.importorskip("pandas")
    train_frame = pd.DataFrame(
        [[1, 2, 3, 4]], columns=["a", "b", "c", "d"], index=[10]
    )
    column_transformer = ColumnTransformer(
        [
            ("first", TransWithNames(), ["a", "c"]),
            ("second", TransWithNames(), ["d"]),
        ],
        remainder=remainder,
        verbose_feature_names_out=verbose_feature_names_out,
    )

    # Default configuration: plain ndarray output.
    default_output = column_transformer.fit_transform(train_frame)
    assert isinstance(default_output, np.ndarray)

    # Switch to pandas output after fitting; use a different index so that
    # index propagation from the transformed input can be verified.
    column_transformer.set_output(transform="pandas")
    test_frame = pd.DataFrame(
        [[1, 2, 3, 4]], columns=train_frame.columns, index=[20]
    )
    pandas_output = column_transformer.transform(test_frame)
    assert isinstance(pandas_output, pd.DataFrame)

    expected_columns = column_transformer.get_feature_names_out()
    assert_array_equal(pandas_output.columns, expected_columns)
    assert_array_equal(pandas_output.index, test_frame.index)
@pytest.mark.parametrize("remainder", ["drop", "passthrough"])
@pytest.mark.parametrize("fit_transform", [True, False])
def test_column_transform_set_output_mixed(remainder, fit_transform):
    """Check pandas output of ColumnTransformer keeps per-column dtypes.

    The input mixes categorical, object, float, integer and nullable integer
    columns; every column present in the output must have the expected dtype.
    """
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame(
        {
            "pet": pd.Series(["dog", "cat", "snake"], dtype="category"),
            "color": pd.Series(["green", "blue", "red"], dtype="object"),
            "age": [1.4, 2.1, 4.4],
            "height": [20, 40, 10],
            "distance": pd.Series([20, pd.NA, 100], dtype="Int32"),
        }
    )

    color_encoder = OneHotEncoder(sparse_output=False, dtype="int8")
    column_transformer = ColumnTransformer(
        [
            ("color_encode", color_encoder, ["color"]),
            ("age", StandardScaler(), ["age"]),
        ],
        remainder=remainder,
        verbose_feature_names_out=False,
    ).set_output(transform="pandas")

    # Exercise both code paths: fit_transform and fit followed by transform.
    transformed = (
        column_transformer.fit_transform(frame)
        if fit_transform
        else column_transformer.fit(frame).transform(frame)
    )

    assert isinstance(transformed, pd.DataFrame)
    assert_array_equal(
        transformed.columns, column_transformer.get_feature_names_out()
    )

    expected_dtypes = {
        "color_blue": "int8",
        "color_green": "int8",
        "color_red": "int8",
        "age": "float64",
        "pet": "category",
        "height": "int64",
        "distance": "Int32",
    }
    # Only the columns actually present are checked, so the same mapping
    # serves both values of `remainder`.
    for column_name, column_dtype in transformed.dtypes.items():
        assert column_dtype == expected_dtypes[column_name]
@pytest.mark.parametrize("remainder", ["drop", "passthrough"])
def test_column_transform_set_output_after_fitting(remainder):
    """Check dtypes of the pandas output when set_output is called after fit.

    The ndarray produced before `set_output` is float64, while the DataFrame
    produced afterwards must carry the expected dtype for each column.
    """
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame(
        {
            "pet": pd.Series(["dog", "cat", "snake"], dtype="category"),
            "age": [1.4, 2.1, 4.4],
            "height": [20, 40, 10],
        }
    )

    pet_encoder = OneHotEncoder(sparse_output=False, dtype="int16")
    column_transformer = ColumnTransformer(
        [
            ("color_encode", pet_encoder, ["pet"]),
            ("age", StandardScaler(), ["age"]),
        ],
        remainder=remainder,
        verbose_feature_names_out=False,
    )

    # Fit first, without any output configuration: a float64 ndarray is expected.
    array_output = column_transformer.fit_transform(frame)
    assert isinstance(array_output, np.ndarray)
    assert array_output.dtype == "float64"

    # Configure pandas output only now and transform with the fitted estimator.
    column_transformer.set_output(transform="pandas")
    frame_output = column_transformer.transform(frame)

    expected_dtypes = {
        "pet_cat": "int16",
        "pet_dog": "int16",
        "pet_snake": "int16",
        "height": "int64",
        "age": "float64",
    }
    for column_name, column_dtype in frame_output.dtypes.items():
        assert column_dtype == expected_dtypes[column_name]
# PandasOutTransformer that does not define get_feature_names_out and always expects
# the input to be a DataFrame.
class PandasOutTransformer(BaseEstimator):
    """Test helper: DataFrame-in, DataFrame-out transformer without feature names.

    `transform` subtracts `offset` from the input. Both `fit` and `transform`
    assert that the input is a pandas DataFrame.
    """

    def __init__(self, offset=1.0):
        self.offset = offset

    def fit(self, X, y=None):
        """Validate that `X` is a DataFrame and return the estimator itself."""
        pandas = pytest.importorskip("pandas")
        assert isinstance(X, pandas.DataFrame)
        return self

    def transform(self, X, y=None):
        """Return `X` shifted down by `self.offset`; `X` must be a DataFrame."""
        pandas = pytest.importorskip("pandas")
        assert isinstance(X, pandas.DataFrame)
        shifted = X - self.offset
        return shifted

    def set_output(self, transform=None):
        """Accept and ignore the output configuration.

        This transformer will always output a DataFrame regardless of the
        configuration.
        """
        return self
@pytest.mark.parametrize(
    "trans_1, expected_verbose_names, expected_non_verbose_names",
    [
        (
            PandasOutTransformer(offset=2.0),
            ["trans_0__feat1", "trans_1__feat0"],
            ["feat1", "feat0"],
        ),
        (
            "drop",
            ["trans_0__feat1"],
            ["feat1"],
        ),
        (
            "passthrough",
            ["trans_0__feat1", "trans_1__feat0"],
            ["feat1", "feat0"],
        ),
    ],
)
def test_transformers_with_pandas_out_but_not_feature_names_out(
    trans_1, expected_verbose_names, expected_non_verbose_names
):
    """Check set_output(transform="pandas") works with DataFrame-only transformers.

    Specifically, covers transformers that return a DataFrame but do not
    define `get_feature_names_out`.
    """
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame({"feat0": [1.0, 2.0, 3.0], "feat1": [2.0, 3.0, 4.0]})
    column_transformer = ColumnTransformer(
        [
            ("trans_0", PandasOutTransformer(offset=3.0), ["feat1"]),
            ("trans_1", trans_1, ["feat0"]),
        ]
    )

    array_output = column_transformer.fit_transform(frame)
    assert isinstance(array_output, np.ndarray)

    # The column transformer cannot provide `get_feature_names_out` because
    # `PandasOutTransformer` does not define the method.
    with pytest.raises(AttributeError, match="not provide get_feature_names_out"):
        column_transformer.get_feature_names_out()

    # verbose_feature_names_out=True is the default, so names are prefixed.
    column_transformer.set_output(transform="pandas")
    verbose_output = column_transformer.fit_transform(frame)
    assert_array_equal(verbose_output.columns, expected_verbose_names)

    # Without verbose names, the original column names are expected.
    column_transformer.set_params(verbose_feature_names_out=False)
    non_verbose_output = column_transformer.fit_transform(frame)
    assert_array_equal(non_verbose_output.columns, expected_non_verbose_names)
@pytest.mark.parametrize(
    "empty_selection",
    [[], np.array([False, False]), [False, False]],
    ids=["list", "bool", "bool_int"],
)
def test_empty_selection_pandas_output(empty_selection):
    """Check pandas output when one transformer selects no column at all.

    Non-regression test for gh-25487
    """
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame([[1.0, 2.2], [3.0, 1.0]], columns=["a", "b"])

    column_transformer = ColumnTransformer(
        [
            ("categorical", "passthrough", empty_selection),
            ("numerical", StandardScaler(), ["a", "b"]),
        ],
        verbose_feature_names_out=True,
    )
    column_transformer.set_output(transform="pandas")

    # Verbose names: only the "numerical" transformer contributes columns.
    verbose_output = column_transformer.fit_transform(frame)
    assert_array_equal(verbose_output.columns, ["numerical__a", "numerical__b"])

    # Non-verbose names: the plain input column names are expected.
    column_transformer.set_params(verbose_feature_names_out=False)
    plain_output = column_transformer.fit_transform(frame)
    assert_array_equal(plain_output.columns, ["a", "b"])
def test_raise_error_if_index_not_aligned():
    """Check column transformer raises error if indices are not aligned.

    One transformer passes column "a" through unchanged while the other
    resets the index of column "b", so the two DataFrames to concatenate no
    longer share the same index and a ValueError is expected.

    Non-regression test for gh-26210.
    """
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame([[1.0, 2.2], [3.0, 1.0]], columns=["a", "b"], index=[8, 3])
    # `reset_index(drop=True)` discards the custom [8, 3] index of the input.
    reset_index_transformer = FunctionTransformer(
        lambda x: x.reset_index(drop=True), feature_names_out="one-to-one"
    )

    ct = ColumnTransformer(
        [
            ("num1", "passthrough", ["a"]),
            ("num2", reset_index_transformer, ["b"]),
        ],
    )
    ct.set_output(transform="pandas")

    # `pytest.raises(match=...)` interprets its argument as a regular
    # expression; escape the message so that "." only matches a literal dot
    # and the check pins the exact error text.
    msg = re.escape(
        "Concatenating DataFrames from the transformer's output lead to"
        " an inconsistent number of samples. The output may have Pandas"
        " Indexes that do not match."
    )
    with pytest.raises(ValueError, match=msg):
        ct.fit_transform(X)
def test_remainder_set_output():
    """Check that `set_output` is also applied to the remainder estimator.

    Non-regression test for #26306.
    """
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame({"a": [True, False, True], "b": [1, 2, 3]})

    bool_columns = make_column_selector(dtype_include=bool)
    column_transformer = make_column_transformer(
        (VarianceThreshold(), bool_columns),
        remainder=VarianceThreshold(),
        verbose_feature_names_out=False,
    )

    # Pandas output: the result is expected to equal the input frame.
    column_transformer.set_output(transform="pandas")
    pandas_result = column_transformer.fit_transform(frame)
    pd.testing.assert_frame_equal(pandas_result, frame)

    # Switching back to the default output must yield an ndarray again.
    column_transformer.set_output(transform="default")
    default_result = column_transformer.fit_transform(frame)
    assert isinstance(default_result, np.ndarray)
|