| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404 |
- import warnings
- from numbers import Integral, Real
- import numpy as np
- from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone
- from ..utils import safe_mask
- from ..utils._param_validation import HasMethods, Interval, StrOptions
- from ..utils.metaestimators import available_if
- from ..utils.validation import check_is_fitted
- __all__ = ["SelfTrainingClassifier"]
- # Authors: Oliver Rausch <rauscho@ethz.ch>
- # Patrice Becker <beckerp@ethz.ch>
- # License: BSD 3 clause
- def _estimator_has(attr):
- """Check if `self.base_estimator_ `or `self.base_estimator_` has `attr`."""
- return lambda self: (
- hasattr(self.base_estimator_, attr)
- if hasattr(self, "base_estimator_")
- else hasattr(self.base_estimator, attr)
- )
- class SelfTrainingClassifier(MetaEstimatorMixin, BaseEstimator):
- """Self-training classifier.
- This :term:`metaestimator` allows a given supervised classifier to function as a
- semi-supervised classifier, allowing it to learn from unlabeled data. It
- does this by iteratively predicting pseudo-labels for the unlabeled data
- and adding them to the training set.
- The classifier will continue iterating until either max_iter is reached, or
- no pseudo-labels were added to the training set in the previous iteration.
- Read more in the :ref:`User Guide <self_training>`.
- Parameters
- ----------
- base_estimator : estimator object
- An estimator object implementing `fit` and `predict_proba`.
- Invoking the `fit` method will fit a clone of the passed estimator,
- which will be stored in the `base_estimator_` attribute.
- threshold : float, default=0.75
- The decision threshold for use with `criterion='threshold'`.
- Should be in [0, 1). When using the `'threshold'` criterion, a
- :ref:`well calibrated classifier <calibration>` should be used.
- criterion : {'threshold', 'k_best'}, default='threshold'
- The selection criterion used to select which labels to add to the
- training set. If `'threshold'`, pseudo-labels with prediction
- probabilities above `threshold` are added to the dataset. If `'k_best'`,
- the `k_best` pseudo-labels with highest prediction probabilities are
- added to the dataset. When using the 'threshold' criterion, a
- :ref:`well calibrated classifier <calibration>` should be used.
- k_best : int, default=10
- The amount of samples to add in each iteration. Only used when
- `criterion='k_best'`.
- max_iter : int or None, default=10
- Maximum number of iterations allowed. Should be greater than or equal
- to 0. If it is `None`, the classifier will continue to predict labels
- until no new pseudo-labels are added, or all unlabeled samples have
- been labeled.
- verbose : bool, default=False
- Enable verbose output.
- Attributes
- ----------
- base_estimator_ : estimator object
- The fitted estimator.
- classes_ : ndarray or list of ndarray of shape (n_classes,)
- Class labels for each output. (Taken from the trained
- `base_estimator_`).
- transduction_ : ndarray of shape (n_samples,)
- The labels used for the final fit of the classifier, including
- pseudo-labels added during fit.
- labeled_iter_ : ndarray of shape (n_samples,)
- The iteration in which each sample was labeled. When a sample has
- iteration 0, the sample was already labeled in the original dataset.
- When a sample has iteration -1, the sample was not labeled in any
- iteration.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- n_iter_ : int
- The number of rounds of self-training, that is the number of times the
- base estimator is fitted on relabeled variants of the training set.
- termination_condition_ : {'max_iter', 'no_change', 'all_labeled'}
- The reason that fitting was stopped.
- - `'max_iter'`: `n_iter_` reached `max_iter`.
- - `'no_change'`: no new labels were predicted.
- - `'all_labeled'`: all unlabeled samples were labeled before `max_iter`
- was reached.
- See Also
- --------
- LabelPropagation : Label propagation classifier.
- LabelSpreading : Label spreading model for semi-supervised learning.
- References
- ----------
- :doi:`David Yarowsky. 1995. Unsupervised word sense disambiguation rivaling
- supervised methods. In Proceedings of the 33rd annual meeting on
- Association for Computational Linguistics (ACL '95). Association for
- Computational Linguistics, Stroudsburg, PA, USA, 189-196.
- <10.3115/981658.981684>`
- Examples
- --------
- >>> import numpy as np
- >>> from sklearn import datasets
- >>> from sklearn.semi_supervised import SelfTrainingClassifier
- >>> from sklearn.svm import SVC
- >>> rng = np.random.RandomState(42)
- >>> iris = datasets.load_iris()
- >>> random_unlabeled_points = rng.rand(iris.target.shape[0]) < 0.3
- >>> iris.target[random_unlabeled_points] = -1
- >>> svc = SVC(probability=True, gamma="auto")
- >>> self_training_model = SelfTrainingClassifier(svc)
- >>> self_training_model.fit(iris.data, iris.target)
- SelfTrainingClassifier(...)
- """
- _estimator_type = "classifier"
- _parameter_constraints: dict = {
- # We don't require `predic_proba` here to allow passing a meta-estimator
- # that only exposes `predict_proba` after fitting.
- "base_estimator": [HasMethods(["fit"])],
- "threshold": [Interval(Real, 0.0, 1.0, closed="left")],
- "criterion": [StrOptions({"threshold", "k_best"})],
- "k_best": [Interval(Integral, 1, None, closed="left")],
- "max_iter": [Interval(Integral, 0, None, closed="left"), None],
- "verbose": ["verbose"],
- }
- def __init__(
- self,
- base_estimator,
- threshold=0.75,
- criterion="threshold",
- k_best=10,
- max_iter=10,
- verbose=False,
- ):
- self.base_estimator = base_estimator
- self.threshold = threshold
- self.criterion = criterion
- self.k_best = k_best
- self.max_iter = max_iter
- self.verbose = verbose
- @_fit_context(
- # SelfTrainingClassifier.base_estimator is not validated yet
- prefer_skip_nested_validation=False
- )
- def fit(self, X, y):
- """
- Fit self-training classifier using `X`, `y` as training data.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Array representing the data.
- y : {array-like, sparse matrix} of shape (n_samples,)
- Array representing the labels. Unlabeled samples should have the
- label -1.
- Returns
- -------
- self : object
- Fitted estimator.
- """
- # we need row slicing support for sparce matrices, but costly finiteness check
- # can be delegated to the base estimator.
- X, y = self._validate_data(
- X, y, accept_sparse=["csr", "csc", "lil", "dok"], force_all_finite=False
- )
- self.base_estimator_ = clone(self.base_estimator)
- if y.dtype.kind in ["U", "S"]:
- raise ValueError(
- "y has dtype string. If you wish to predict on "
- "string targets, use dtype object, and use -1"
- " as the label for unlabeled samples."
- )
- has_label = y != -1
- if np.all(has_label):
- warnings.warn("y contains no unlabeled samples", UserWarning)
- if self.criterion == "k_best" and (
- self.k_best > X.shape[0] - np.sum(has_label)
- ):
- warnings.warn(
- (
- "k_best is larger than the amount of unlabeled "
- "samples. All unlabeled samples will be labeled in "
- "the first iteration"
- ),
- UserWarning,
- )
- self.transduction_ = np.copy(y)
- self.labeled_iter_ = np.full_like(y, -1)
- self.labeled_iter_[has_label] = 0
- self.n_iter_ = 0
- while not np.all(has_label) and (
- self.max_iter is None or self.n_iter_ < self.max_iter
- ):
- self.n_iter_ += 1
- self.base_estimator_.fit(
- X[safe_mask(X, has_label)], self.transduction_[has_label]
- )
- # Predict on the unlabeled samples
- prob = self.base_estimator_.predict_proba(X[safe_mask(X, ~has_label)])
- pred = self.base_estimator_.classes_[np.argmax(prob, axis=1)]
- max_proba = np.max(prob, axis=1)
- # Select new labeled samples
- if self.criterion == "threshold":
- selected = max_proba > self.threshold
- else:
- n_to_select = min(self.k_best, max_proba.shape[0])
- if n_to_select == max_proba.shape[0]:
- selected = np.ones_like(max_proba, dtype=bool)
- else:
- # NB these are indices, not a mask
- selected = np.argpartition(-max_proba, n_to_select)[:n_to_select]
- # Map selected indices into original array
- selected_full = np.nonzero(~has_label)[0][selected]
- # Add newly labeled confident predictions to the dataset
- self.transduction_[selected_full] = pred[selected]
- has_label[selected_full] = True
- self.labeled_iter_[selected_full] = self.n_iter_
- if selected_full.shape[0] == 0:
- # no changed labels
- self.termination_condition_ = "no_change"
- break
- if self.verbose:
- print(
- f"End of iteration {self.n_iter_},"
- f" added {selected_full.shape[0]} new labels."
- )
- if self.n_iter_ == self.max_iter:
- self.termination_condition_ = "max_iter"
- if np.all(has_label):
- self.termination_condition_ = "all_labeled"
- self.base_estimator_.fit(
- X[safe_mask(X, has_label)], self.transduction_[has_label]
- )
- self.classes_ = self.base_estimator_.classes_
- return self
- @available_if(_estimator_has("predict"))
- def predict(self, X):
- """Predict the classes of `X`.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Array representing the data.
- Returns
- -------
- y : ndarray of shape (n_samples,)
- Array with predicted labels.
- """
- check_is_fitted(self)
- X = self._validate_data(
- X,
- accept_sparse=True,
- force_all_finite=False,
- reset=False,
- )
- return self.base_estimator_.predict(X)
- @available_if(_estimator_has("predict_proba"))
- def predict_proba(self, X):
- """Predict probability for each possible outcome.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Array representing the data.
- Returns
- -------
- y : ndarray of shape (n_samples, n_features)
- Array with prediction probabilities.
- """
- check_is_fitted(self)
- X = self._validate_data(
- X,
- accept_sparse=True,
- force_all_finite=False,
- reset=False,
- )
- return self.base_estimator_.predict_proba(X)
- @available_if(_estimator_has("decision_function"))
- def decision_function(self, X):
- """Call decision function of the `base_estimator`.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Array representing the data.
- Returns
- -------
- y : ndarray of shape (n_samples, n_features)
- Result of the decision function of the `base_estimator`.
- """
- check_is_fitted(self)
- X = self._validate_data(
- X,
- accept_sparse=True,
- force_all_finite=False,
- reset=False,
- )
- return self.base_estimator_.decision_function(X)
- @available_if(_estimator_has("predict_log_proba"))
- def predict_log_proba(self, X):
- """Predict log probability for each possible outcome.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Array representing the data.
- Returns
- -------
- y : ndarray of shape (n_samples, n_features)
- Array with log prediction probabilities.
- """
- check_is_fitted(self)
- X = self._validate_data(
- X,
- accept_sparse=True,
- force_all_finite=False,
- reset=False,
- )
- return self.base_estimator_.predict_log_proba(X)
- @available_if(_estimator_has("score"))
- def score(self, X, y):
- """Call score on the `base_estimator`.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Array representing the data.
- y : array-like of shape (n_samples,)
- Array representing the labels.
- Returns
- -------
- score : float
- Result of calling score on the `base_estimator`.
- """
- check_is_fitted(self)
- X = self._validate_data(
- X,
- accept_sparse=True,
- force_all_finite=False,
- reset=False,
- )
- return self.base_estimator_.score(X, y)
|