Source code for mlbox.model.regression.stacking_regressor

# coding: utf-8
# Author: Axel ARONIO DE ROMBLAY <axelderomblay@gmail.com>
# License: BSD 3 clause


import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_predict
from copy import copy as make_copy
from .regressor import Regressor
import warnings


class StackingRegressor():

    """A Stacking regressor.

    A stacking regressor is a regressor that uses the predictions of
    several first layer estimators (generated with a cross validation method)
    for a second layer estimator.

    Parameters
    ----------
    base_estimators : list, default = [Regressor(strategy="LightGBM"), Regressor(strategy="RandomForest"), Regressor(strategy="ExtraTrees")]
        List of estimators to fit in the first level using a cross validation.

    level_estimator : object, default = LinearRegression()
        The estimator used in the second and last level.

    n_folds : int, default = 5
        Number of folds used to generate the meta features for the training set.

    copy : bool, default = False
        If true, meta features are added to the original dataset.

    random_state : None, int or RandomState, default = 1
        Pseudo-random number generator state used for shuffling.
        If None, use default numpy RNG for shuffling.

    verbose : bool, default = True
        Verbose mode.
    """

    def __init__(self,
                 base_estimators=[Regressor(strategy="LightGBM"),
                                  Regressor(strategy="RandomForest"),
                                  Regressor(strategy="ExtraTrees")],
                 level_estimator=LinearRegression(),
                 n_folds=5,
                 copy=False,
                 random_state=1,
                 verbose=True):
        """Init method for StackingRegressor."""
        self.base_estimators = base_estimators
        if(type(base_estimators) != list):
            raise ValueError("base_estimators must be a list")
        else:
            for i, est in enumerate(self.base_estimators):
                self.base_estimators[i] = make_copy(est)

        self.level_estimator = level_estimator

        self.n_folds = n_folds
        if(type(n_folds) != int):
            raise ValueError("n_folds must be an integer")

        self.copy = copy
        if(type(copy) != bool):
            raise ValueError("copy must be a boolean")

        self.random_state = random_state
        if((type(self.random_state) != int) and
           (self.random_state is not None)):
            raise ValueError("random_state must be either None or an integer")

        self.verbose = verbose
        if(type(self.verbose) != bool):
            raise ValueError("verbose must be a boolean")

        self.__fitOK = False
        self.__fittransformOK = False
    def get_params(self, deep=True):
        """Get parameters of a StackingRegressor object."""
        return {'level_estimator': self.level_estimator,
                'base_estimators': self.base_estimators,
                'n_folds': self.n_folds,
                'copy': self.copy,
                'random_state': self.random_state,
                'verbose': self.verbose}
    def set_params(self, **params):
        """Set parameters of a StackingRegressor object."""
        self.__fitOK = False
        self.__fittransformOK = False

        for k, v in params.items():
            if k not in self.get_params():
                warnings.warn("Invalid parameter for StackingRegressor. "
                              "Parameter IGNORED. Check the list of "
                              "available parameters with "
                              "`stacking_regressor.get_params().keys()`")
            else:
                setattr(self, k, v)
    def fit_transform(self, df_train, y_train):
        """Create meta-features for the training dataset.

        Parameters
        ----------
        df_train : pandas DataFrame of shape = (n_samples, n_features)
            The training dataset.

        y_train : pandas Series of shape = (n_samples, )
            The target.

        Returns
        -------
        pandas DataFrame of shape = (n_samples, n_features*int(copy) + n_metafeatures)
            The transformed training dataset.
        """
        # sanity checks
        if((type(df_train) != pd.SparseDataFrame) &
           (type(df_train) != pd.DataFrame)):
            raise ValueError("df_train must be a DataFrame")

        if(type(y_train) != pd.core.series.Series):
            raise ValueError("y_train must be a Series")

        cv = KFold(n_splits=self.n_folds, shuffle=True,
                   random_state=self.random_state)

        preds = pd.DataFrame([], index=y_train.index)

        if(self.verbose):
            print("")
            print("[=========================================================="
                  "===================] LAYER [==============================="
                  "====================================================]")
            print("")

        for c, reg in enumerate(self.base_estimators):
            if(self.verbose):
                print("> fitting estimator n°" + str(c + 1) +
                      " : " + str(reg.get_params()) + " ...")
                print("")

            # for each base estimator, we create the meta feature on train set
            y_pred = cross_val_predict(estimator=reg, X=df_train, y=y_train,
                                       cv=cv)
            preds["est" + str(c + 1)] = y_pred

            # and we refit the base estimator on the entire train set
            reg.fit(df_train, y_train)

        layer = 1
        columns = ["layer" + str(layer) + "_" + s for s in preds.columns]
        while(len(np.intersect1d(df_train.columns, columns)) > 0):
            layer = layer + 1
            columns = ["layer" + str(layer) + "_" + s for s in preds.columns]
        preds.columns = ["layer" + str(layer) + "_" + s for s in preds.columns]

        self.__fittransformOK = True

        if(self.copy):
            # we keep also the initial features
            return pd.concat([df_train, preds], axis=1)
        else:
            # we keep only the meta features
            return preds
    def transform(self, df_test):
        """Create meta-features for the test dataset.

        Parameters
        ----------
        df_test : pandas DataFrame of shape = (n_samples_test, n_features)
            The test dataset.

        Returns
        -------
        pandas DataFrame of shape = (n_samples_test, n_features*int(copy) + n_metafeatures)
            The transformed test dataset.
        """
        # sanity checks
        if((type(df_test) != pd.SparseDataFrame) and
           (type(df_test) != pd.DataFrame)):
            raise ValueError("df_test must be a DataFrame")

        if(self.__fittransformOK):

            preds_test = pd.DataFrame([], index=df_test.index)

            for c, reg in enumerate(self.base_estimators):
                # we predict the meta feature on test set
                y_pred_test = reg.predict(df_test)
                preds_test["est" + str(c + 1)] = y_pred_test

            layer = 1
            columns = ["layer" + str(layer) + "_" + s
                       for s in preds_test.columns]
            while(len(np.intersect1d(df_test.columns, columns)) > 0):
                layer = layer + 1
                columns = ["layer" + str(layer) + "_" + s
                           for s in preds_test.columns]
            preds_test.columns = ["layer" + str(layer) + "_" + s
                                  for s in preds_test.columns]

            if(self.copy):
                # we keep also the initial features
                return pd.concat([df_test, preds_test], axis=1)
            else:
                # we keep only the meta features
                return preds_test
        else:
            raise ValueError("Call fit_transform before !")
    def fit(self, df_train, y_train):
        """Fit the first level estimators and the second level estimator on X.

        Parameters
        ----------
        df_train : pandas DataFrame of shape = (n_samples, n_features)
            Input data.

        y_train : pandas Series of shape = (n_samples, )
            The target.

        Returns
        -------
        object
            self
        """
        # fit the base estimators and build the meta features
        df_train = self.fit_transform(df_train, y_train)

        if(self.verbose):
            print("")
            print("[=========================================================="
                  "===============] PREDICTION LAYER [========================"
                  "====================================================]")
            print("")
            print("> fitting estimator : " +
                  str(self.level_estimator.get_params()) + " ...")
            print("")

        # we fit the second level estimator
        self.level_estimator.fit(df_train.values, y_train.values)

        self.__fitOK = True

        return self
    def predict(self, df_test):
        """Predict regression target for X_test using the meta-features.

        Parameters
        ----------
        df_test : pandas DataFrame of shape = (n_samples_test, n_features)
            The testing samples.

        Returns
        -------
        array of shape = (n_samples_test, )
            The predicted values.
        """
        if(self.__fitOK):
            # we predict the meta features on test set
            df_test = self.transform(df_test)

            # we predict the target using the meta features
            return self.level_estimator.predict(df_test)
        else:
            raise ValueError("Call fit before !")
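
A minimal usage sketch (not part of the library source): it assumes MLBox and its dependencies are installed, and the file name "data.csv" and the "target" column are placeholders to adapt to your own dataset. The stacker expects a pandas DataFrame for the features and a pandas Series for the target.

# --- Usage sketch (illustrative only, not part of the library source) ---
# Assumptions: MLBox is installed, "data.csv" is a placeholder file with
# numeric feature columns and a "target" column. Adapt the names to your data.
import pandas as pd
from sklearn.model_selection import train_test_split
from mlbox.model.regression.stacking_regressor import StackingRegressor

data = pd.read_csv("data.csv")                      # hypothetical dataset
y = data["target"]                                  # pandas Series (required)
X = data.drop("target", axis=1)                     # pandas DataFrame (required)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

stk = StackingRegressor(n_folds=5, copy=False, verbose=True)
stk.fit(X_train, y_train)       # fits the base estimators, then the level estimator
y_pred = stk.predict(X_test)    # numpy array of shape (n_samples_test, )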
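
The meta-feature step can also be used on its own, for example to inspect the first-layer predictions or to feed them into a deeper stack. Continuing the sketch above (the column names shown are illustrative; the "layer" prefix is only incremented if it would collide with an existing column of the input DataFrame):

# --- Meta-feature sketch, continuing the example above (illustrative only) ---
meta_train = stk.fit_transform(X_train, y_train)   # out-of-fold predictions, one column per base estimator
meta_test = stk.transform(X_test)                  # predictions of the refitted base estimators
print(meta_train.columns.tolist())                 # e.g. ['layer1_est1', 'layer1_est2', 'layer1_est3']

# With copy=True the original features are kept alongside the meta features:
stk_copy = StackingRegressor(copy=True, verbose=False)
full_train = stk_copy.fit_transform(X_train, y_train)   # shape (n_samples, n_features + 3)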