# Source code for mlbox.model.classification.stacking_classifier

# coding: utf-8
# Author: Axel ARONIO DE ROMBLAY <axelderomblay@gmail.com>
# License: BSD 3 clause


import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from copy import copy as make_copy
from .classifier import Classifier
import warnings


class StackingClassifier():

    """A stacking classifier.

    A stacking classifier is a classifier that uses the predictions of
    several first layer estimators (generated with a cross validation
    method) for a second layer estimator.

    Parameters
    ----------
    base_estimators : list, default = [Classifier(strategy="LightGBM"), Classifier(strategy="RandomForest"), Classifier(strategy="ExtraTrees")]
        List of estimators to fit in the first level using a cross
        validation.

    level_estimator : object, default = LogisticRegression()
        The estimator used in second and last level.

    n_folds : int, default = 5
        Number of folds used to generate the meta features for the
        training set.

    copy : bool, default = False
        If true, meta features are added to the original dataset.

    drop_first : bool, default = True
        If True, each estimator output n_classes-1 probabilities.

    random_state : None or int or RandomState. default = 1
        Pseudo-random number generator state used for shuffling. If None,
        use default numpy RNG for shuffling.

    verbose : bool, default = True
        Verbose mode.
    """

    def __init__(self,
                 base_estimators=None,
                 level_estimator=LogisticRegression(n_jobs=-1),
                 n_folds=5,
                 copy=False,
                 drop_first=True,
                 random_state=1,
                 verbose=True):

        # FIX: the previous default was a shared mutable list which
        # __init__ then mutated in place (also clobbering any list the
        # caller passed). Use a None sentinel and copy the estimators
        # into a fresh list instead.
        if base_estimators is None:
            base_estimators = [Classifier(strategy="LightGBM"),
                               Classifier(strategy="RandomForest"),
                               Classifier(strategy="ExtraTrees")]

        if type(base_estimators) != list:
            raise ValueError("base_estimators must be a list")
        # keep private copies so later fits do not alter the caller's objects
        self.base_estimators = [make_copy(est) for est in base_estimators]

        self.level_estimator = level_estimator

        self.n_folds = n_folds
        if type(self.n_folds) != int:
            raise ValueError("n_folds must be an integer")

        self.copy = copy
        if type(self.copy) != bool:
            raise ValueError("copy must be a boolean")

        self.drop_first = drop_first
        if type(self.drop_first) != bool:
            raise ValueError("drop_first must be a boolean")

        self.random_state = random_state
        if ((type(self.random_state) != int) and
                (self.random_state is not None)):
            raise ValueError("random_state must be either None or an integer")

        self.verbose = verbose
        if type(self.verbose) != bool:
            raise ValueError("verbose must be a boolean")

        # set by fit / fit_transform; guard the predict/transform methods
        self.__fitOK = False
        self.__fittransformOK = False

    def get_params(self, deep=True):
        """Returns the parameters of the stacking classifier as a dict.

        `deep` is accepted for scikit-learn API compatibility and ignored.
        """
        return {'level_estimator': self.level_estimator,
                'base_estimators': self.base_estimators,
                'n_folds': self.n_folds,
                'copy': self.copy,
                'drop_first': self.drop_first,
                'random_state': self.random_state,
                'verbose': self.verbose}

    def set_params(self, **params):
        """Sets parameters by keyword; unknown keys warn and are ignored.

        Any call invalidates previous fits (the fitted flags are reset).
        """
        self.__fitOK = False
        self.__fittransformOK = False

        for k, v in params.items():
            if k not in self.get_params():
                # FIX: the original warning contained a stray token
                # ("Invalid parameter a for stacking_classifier ...") and
                # never named the offending key.
                warnings.warn("Invalid parameter '" + str(k) + "' for "
                              "StackingClassifier. Parameter IGNORED. Check "
                              "the list of available parameters with "
                              "`stacking_classifier.get_params().keys()`")
            else:
                setattr(self, k, v)

    def __cross_val_predict_proba(self, estimator, df, y, cv):
        """Evaluates the target by cross-validation

        Parameters
        ----------
        estimator : estimator object implementing 'fit'
            The object to use to fit the data.

        df : pandas DataFrame
            The data to fit.

        y : pandas Serie
            The target variable to try to predict in the case of
            supervised learning.

        cv : a STRATIFIED cross-validation generator

        Returns
        -------
        y_pred : array-like of shape = (n_samples, n_classes)
            The predicted class probabilities for X.
        """
        # classes with fewer than 2 samples cannot be stratified; their
        # rows are removed from every training fold below
        classes = y.value_counts()
        classes_to_drop = classes[classes < 2].index
        indexes_to_drop = y[y.apply(lambda x: x in classes_to_drop)].index

        y_pred = np.zeros((len(y), len(classes) - len(classes_to_drop)))

        for train_index, test_index in cv.split(df, y):

            # defining train and validation sets for each fold
            df_train, df_test = df.iloc[train_index], df.iloc[test_index]
            y_train = y.iloc[train_index]

            # FIX: the original wrapped both drops in one bare try/except;
            # if the first drop succeeded and the second raised, df_train
            # and y_train were left inconsistently filtered. errors="ignore"
            # keeps them in sync.
            df_train = df_train.drop(indexes_to_drop, errors="ignore")
            y_train = y_train.drop(indexes_to_drop, errors="ignore")

            # learning the model
            estimator.fit(df_train, y_train)

            # predicting the probability (the original's trailing "[:, ]"
            # slice was a no-op and has been removed)
            y_pred[test_index] = estimator.predict_proba(df_test)

        return y_pred
[docs] def fit_transform(self, df_train, y_train): """Creates meta-features for the training dataset. Parameters ---------- df_train : pandas dataframe of shape = (n_samples, n_features) The training dataset. y_train : pandas series of shape = (n_samples, ) The target. Returns ------- pandas dataframe of shape = (n_samples, n_features*int(copy)+n_metafeatures) The transformed training dataset. """ # noqa # sanity checks if((type(df_train) != pd.SparseDataFrame) and (type(df_train) != pd.DataFrame)): raise ValueError("df_train must be a DataFrame") if(type(y_train) != pd.core.series.Series): raise ValueError("y_train must be a Series") # stratified k fold cv = StratifiedKFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state) preds = pd.DataFrame([], index=y_train.index) classes = y_train.value_counts() classes_to_drop = classes[classes < 2].index indexes_to_drop = y_train[y_train.apply(lambda x: x in classes_to_drop)].index if(self.verbose): print("") print("[==========================================================" "===================] LAYER [===============================" "====================================================]") print("") for c, clf in enumerate(self.base_estimators): if(self.verbose): print("> fitting estimator n°" + str(c+1) + " : " + str(clf.get_params()) + " ...") print("") # for each base estimator, we create the meta feature on train set y_pred = self.__cross_val_predict_proba(clf, df_train, y_train, cv) for i in range(0, y_pred.shape[1] - int(self.drop_first)): preds["est" + str(c+1) + "_class" + str(i)] = y_pred[:, i] # and we refit the base estimator on entire train set clf.fit(df_train.drop(indexes_to_drop), y_train.drop(indexes_to_drop)) layer = 1 columns = ["layer" + str(layer) + "_" + s for s in preds.columns] while(len(np.intersect1d(df_train.columns, columns)) > 0): layer = layer + 1 columns = ["layer" + str(layer) + "_" + s for s in preds.columns] preds.columns = ["layer" + str(layer) + "_" + s for s in 
preds.columns] self.__fittransformOK = True if(self.copy): # we keep also the initial features return pd.concat([df_train, preds], axis=1) else: # we keep only the meta features return preds
[docs] def transform(self, df_test): """Creates meta-features for the test dataset. Parameters ---------- df_test : pandas dataframe of shape = (n_samples_test, n_features) The test dataset. Returns ------- pandas dataframe of shape = (n_samples_test, n_features*int(copy)+n_metafeatures) The transformed test dataset. """ # sanity checks if((type(df_test) != pd.SparseDataFrame) and (type(df_test) != pd.DataFrame)): raise ValueError("df_test must be a DataFrame") if(self.__fittransformOK): preds_test = pd.DataFrame([], index=df_test.index) # for each base estimator, we predict the meta feature on test set for c, clf in enumerate(self.base_estimators): y_pred_test = clf.predict_proba(df_test) for i in range(0, y_pred_test.shape[1] - int(self.drop_first)): idx_name = "est" + str(c+1) + "_class" + str(i) preds_test[idx_name] = y_pred_test[:, i] layer = 1 columns = ["layer" + str(layer) + "_" + s for s in preds_test.columns] while(len(np.intersect1d(df_test.columns, columns)) > 0): layer = layer + 1 columns = ["layer" + str(layer) + "_" + s for s in preds_test.columns] preds_test.columns = ["layer" + str(layer) + "_" + s for s in preds_test.columns] if(self.copy): # we keep also the initial features return pd.concat([df_test, preds_test], axis=1) else: # we keep only the meta features return preds_test else: raise ValueError("Call fit_transform before !")
[docs] def fit(self, df_train, y_train): """Fits the first level estimators and the second level estimator on X. Parameters ---------- df_train : pandas dataframe of shape (n_samples, n_features) Input data y_train : pandas series of shape = (n_samples, ) The target Returns ------- object self. """ df_train = self.fit_transform(df_train, y_train) # we fit the base estimators if(self.verbose): print("") print("[==========================================================" "===============] PREDICTION LAYER [========================" "====================================================]") print("") print("> fitting estimator : ") print(str(self.level_estimator.get_params()) + " ...") print("") # we fit the second level estimator self.level_estimator.fit(df_train.values, y_train.values) self.__fitOK = True return self
[docs] def predict_proba(self, df_test): """Predicts class probabilities for the test set using the meta-features. Parameters ---------- df_test : pandas DataFrame of shape = (n_samples_test, n_features) The testing samples Returns ------- array of shape = (n_samples_test, n_classes) The class probabilities of the testing samples. """ if(self.__fitOK): # we predict the meta features on test set df_test = self.transform(df_test) # we predict the probability of class 1 using the meta features return self.level_estimator.predict_proba(df_test) else: raise ValueError("Call fit before !")
[docs] def predict(self, df_test): """Predicts class for the test set using the meta-features. Parameters ---------- df_test : pandas DataFrame of shape = (n_samples_test, n_features) The testing samples Returns ------- array of shape = (n_samples_test,) The predicted classes. """ if(self.__fitOK): # we predict the meta features on test set df_test = self.transform(df_test) # we predict the target using the meta features return self.level_estimator.predict(df_test) else: raise ValueError("Call fit before !")