# Source code for mlbox.model.classification.stacking_classifier
# coding: utf-8
# Author: Axel ARONIO DE ROMBLAY <axelderomblay@gmail.com>
# License: BSD 3 clause
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from copy import copy as make_copy
from .classifier import Classifier
import warnings
class StackingClassifier():
    """A stacking classifier.

    A stacking classifier is a classifier that uses the predictions of
    several first layer estimators (generated with a cross validation method)
    for a second layer estimator.

    Parameters
    ----------
    base_estimators : list or None, default = None
        List of estimators to fit in the first level using a cross
        validation. When None, a new list
        ``[Classifier(strategy="LightGBM"),
        Classifier(strategy="RandomForest"),
        Classifier(strategy="ExtraTrees")]`` is built for this instance.
        The elements of the list are replaced, in place, by shallow copies.

    level_estimator : object or None, default = None
        The estimator used in second and last level. When None, a new
        ``LogisticRegression(n_jobs=-1)`` is created for this instance.

    n_folds : int, default = 5
        Number of folds used to generate the meta features for the
        training set.

    copy : bool, default = False
        If True, meta features are added to the original dataset.

    drop_first : bool, default = True
        If True, each estimator outputs n_classes-1 probabilities.

    random_state : None or int, default = 1
        Seed handed to the shuffling ``StratifiedKFold``. Any other type
        (including a RandomState instance) is rejected by the constructor.

    verbose : bool, default = True
        Verbose mode.
    """

    def __init__(self,
                 base_estimators=None,
                 level_estimator=None,
                 n_folds=5, copy=False, drop_first=True, random_state=1,
                 verbose=True):
        # Defaults are created here rather than in the signature: a default
        # built in the signature is a single object shared (and mutated) by
        # every instance that relies on it.
        if base_estimators is None:
            base_estimators = [Classifier(strategy="LightGBM"),
                               Classifier(strategy="RandomForest"),
                               Classifier(strategy="ExtraTrees")]
        if level_estimator is None:
            level_estimator = LogisticRegression(n_jobs=-1)

        self.base_estimators = base_estimators
        if not isinstance(self.base_estimators, list):
            raise ValueError("base_estimators must be a list")
        # The list object itself is kept and only its elements are swapped
        # for shallow copies, so get_params() hands back the very list that
        # was passed in.
        # NOTE(review): identity-based cloning (e.g. sklearn.base.clone)
        # presumably relies on this - confirm before rebuilding the list.
        for i, est in enumerate(self.base_estimators):
            self.base_estimators[i] = make_copy(est)

        self.level_estimator = level_estimator

        self.n_folds = n_folds
        # bool is a subclass of int but is not a valid number of folds
        if (isinstance(self.n_folds, bool) or
                not isinstance(self.n_folds, int)):
            raise ValueError("n_folds must be an integer")

        self.copy = copy
        if not isinstance(self.copy, bool):
            raise ValueError("copy must be a boolean")

        self.drop_first = drop_first
        if not isinstance(self.drop_first, bool):
            raise ValueError("drop_first must be a boolean")

        self.random_state = random_state
        if (self.random_state is not None and
                (isinstance(self.random_state, bool) or
                 not isinstance(self.random_state, int))):
            raise ValueError("random_state must be either None or an integer")

        self.verbose = verbose
        if not isinstance(self.verbose, bool):
            raise ValueError("verbose must be a boolean")

        # state flags: set by fit_transform / fit, reset by set_params
        self.__fitOK = False
        self.__fittransformOK = False

    def get_params(self, deep=True):
        """Returns the constructor parameters of the stacking classifier.

        Parameters
        ----------
        deep : bool, default = True
            Unused. Accepted so the method can be called with the same
            arguments as other estimators' ``get_params``.

        Returns
        -------
        dict
            Parameter names mapped to their current values.
        """
        return {'level_estimator': self.level_estimator,
                'base_estimators': self.base_estimators,
                'n_folds': self.n_folds,
                'copy': self.copy,
                'drop_first': self.drop_first,
                'random_state': self.random_state,
                'verbose': self.verbose}

    def set_params(self, **params):
        """Sets parameters of the stacking classifier.

        The "fitted" flags are reset, so fit / fit_transform must be called
        again afterwards. Unknown parameter names are ignored with a
        warning. Values are assigned as given: the validation and the
        copies done by the constructor are not applied here.

        Parameters
        ----------
        **params : keyword arguments
            Parameter names (see ``get_params().keys()``) and new values.
        """
        self.__fitOK = False
        self.__fittransformOK = False

        valid_params = self.get_params()
        for k, v in params.items():
            if k not in valid_params:
                # name the offending parameter in the warning
                warnings.warn("Invalid parameter " + str(k) +
                              " for stacking_classifier "
                              "StackingClassifier. Parameter IGNORED. Check "
                              "the list of available parameters with "
                              "`stacking_classifier.get_params().keys()`")
            else:
                setattr(self, k, v)

    @staticmethod
    def _layer_column_names(existing_columns, names):
        """Prefixes ``names`` with the first free ``layer<k>_`` prefix.

        Parameters
        ----------
        existing_columns : iterable of hashable
            Column names already used by the input dataset.

        names : iterable of str
            Raw meta-feature names.

        Returns
        -------
        list of str
            ``names`` prefixed with ``"layer<k>_"``, where k is the smallest
            integer >= 1 for which no prefixed name is in
            ``existing_columns``.
        """
        taken = set(existing_columns)
        layer = 1
        columns = ["layer" + str(layer) + "_" + s for s in names]
        while any(col in taken for col in columns):
            layer = layer + 1
            columns = ["layer" + str(layer) + "_" + s for s in names]
        return columns

    def __cross_val_predict_proba(self, estimator, df, y, cv):
        """Evaluates the target by cross-validation

        Classes with fewer than 2 samples are removed from the training
        part of every fold and get no column in the output.

        Parameters
        ----------
        estimator : estimator object implementing 'fit'
            The object to use to fit the data. It must also implement
            'predict_proba'.

        df : pandas DataFrame
            The data to fit.

        y : pandas Serie
            The target variable to try to predict in the case of
            supervised learning.

        cv : a STRATIFIED cross-validation generator
            Object whose ``split(df, y)`` yields (train, test) positions.

        Returns
        -------
        y_pred : array-like of shape = (n_samples, n_kept_classes)
            The out-of-fold predicted class probabilities, where
            n_kept_classes is the number of classes with at least 2 samples.
        """
        classes = y.value_counts()
        classes_to_drop = classes[classes < 2].index
        indexes_to_drop = y[y.isin(classes_to_drop)].index

        y_pred = np.zeros((len(y), len(classes) - len(classes_to_drop)))

        for train_index, test_index in cv.split(df, y):

            # defining train and validation sets for each fold
            df_train, df_test = df.iloc[train_index], df.iloc[test_index]
            y_train = y.iloc[train_index]

            # Rare-class rows may sit in the test part of this fold, so
            # labels missing from the training part are skipped rather than
            # aborting the whole drop (which would leave rare classes in
            # the training data).
            df_train = df_train.drop(indexes_to_drop, errors="ignore")
            y_train = y_train.drop(indexes_to_drop, errors="ignore")

            # learning the model
            estimator.fit(df_train, y_train)

            # predicting the probability
            y_pred[test_index] = estimator.predict_proba(df_test)

        return y_pred

    def fit_transform(self, df_train, y_train):
        """Creates meta-features for the training dataset.

        Each base estimator is cross-validated to build out-of-fold
        probabilities, then refitted on the training set without the rows
        whose class has fewer than 2 samples.

        Parameters
        ----------
        df_train : pandas dataframe of shape = (n_samples, n_features)
            The training dataset.

        y_train : pandas series of shape = (n_samples, )
            The target.

        Returns
        -------
        pandas dataframe of shape = (n_samples, n_features*int(copy)+n_metafeatures)
            The transformed training dataset.

        Raises
        ------
        ValueError
            If df_train is not a DataFrame or y_train is not a Series.
        """
        # sanity checks
        if not isinstance(df_train, pd.DataFrame):
            raise ValueError("df_train must be a DataFrame")

        if not isinstance(y_train, pd.Series):
            raise ValueError("y_train must be a Series")

        # stratified k fold
        cv = StratifiedKFold(n_splits=self.n_folds, shuffle=True,
                             random_state=self.random_state)

        preds = pd.DataFrame([], index=y_train.index)

        classes = y_train.value_counts()
        classes_to_drop = classes[classes < 2].index
        indexes_to_drop = y_train[y_train.isin(classes_to_drop)].index

        if self.verbose:
            print("")
            print("[=========================================================="
                  "===================] LAYER [==============================="
                  "====================================================]")
            print("")

        for c, clf in enumerate(self.base_estimators):

            if self.verbose:
                print("> fitting estimator n°" + str(c+1) + " : " +
                      str(clf.get_params()) + " ...")
                print("")

            # for each base estimator, we create the meta feature on train set
            y_pred = self.__cross_val_predict_proba(clf, df_train, y_train, cv)
            for i in range(0, y_pred.shape[1] - int(self.drop_first)):
                preds["est" + str(c+1) + "_class" + str(i)] = y_pred[:, i]

            # and we refit the base estimator on entire train set
            clf.fit(df_train.drop(indexes_to_drop),
                    y_train.drop(indexes_to_drop))

        # the prefix is chosen so that no meta feature shadows an input column
        preds.columns = self._layer_column_names(df_train.columns,
                                                 preds.columns)

        self.__fittransformOK = True

        if self.copy:
            # we keep also the initial features
            return pd.concat([df_train, preds], axis=1)

        # we keep only the meta features
        return preds

    def transform(self, df_test):
        """Creates meta-features for the test dataset.

        Parameters
        ----------
        df_test : pandas dataframe of shape = (n_samples_test, n_features)
            The test dataset.

        Returns
        -------
        pandas dataframe of shape = (n_samples_test, n_features*int(copy)+n_metafeatures)
            The transformed test dataset.

        Raises
        ------
        ValueError
            If df_test is not a DataFrame, or if fit_transform has not been
            called since construction or the last set_params.
        """
        # sanity checks
        if not isinstance(df_test, pd.DataFrame):
            raise ValueError("df_test must be a DataFrame")

        if not self.__fittransformOK:
            raise ValueError("Call fit_transform before !")

        preds_test = pd.DataFrame([], index=df_test.index)

        # for each base estimator, we predict the meta feature on test set
        for c, clf in enumerate(self.base_estimators):
            y_pred_test = clf.predict_proba(df_test)

            for i in range(0, y_pred_test.shape[1] - int(self.drop_first)):
                idx_name = "est" + str(c+1) + "_class" + str(i)
                preds_test[idx_name] = y_pred_test[:, i]

        # the prefix is chosen so that no meta feature shadows an input column
        preds_test.columns = self._layer_column_names(df_test.columns,
                                                      preds_test.columns)

        if self.copy:
            # we keep also the initial features
            return pd.concat([df_test, preds_test], axis=1)

        # we keep only the meta features
        return preds_test

    def fit(self, df_train, y_train):
        """Fits the first level estimators and the second level estimator on X.

        Parameters
        ----------
        df_train : pandas dataframe of shape (n_samples, n_features)
            Input data

        y_train : pandas series of shape = (n_samples, )
            The target

        Returns
        -------
        object
            self.
        """
        # we fit the base estimators
        df_train = self.fit_transform(df_train, y_train)

        if self.verbose:
            print("")
            print("[=========================================================="
                  "===============] PREDICTION LAYER [========================"
                  "====================================================]")
            print("")
            print("> fitting estimator : ")
            print(str(self.level_estimator.get_params()) + " ...")
            print("")

        # we fit the second level estimator
        self.level_estimator.fit(df_train.values, y_train.values)

        self.__fitOK = True

        return self

    def predict_proba(self, df_test):
        """Predicts class probabilities for the test set using the meta-features.

        Parameters
        ----------
        df_test : pandas DataFrame of shape = (n_samples_test, n_features)
            The testing samples

        Returns
        -------
        array of shape = (n_samples_test, n_classes)
            The class probabilities of the testing samples, as returned by
            the second level estimator.

        Raises
        ------
        ValueError
            If fit has not been called since construction or the last
            set_params.
        """
        if not self.__fitOK:
            raise ValueError("Call fit before !")

        # we predict the meta features on test set
        df_test = self.transform(df_test)

        # the second level estimator was fitted on raw values (see fit), so
        # it is fed raw values here as well
        return self.level_estimator.predict_proba(df_test.values)

    def predict(self, df_test):
        """Predicts class for the test set using the meta-features.

        Parameters
        ----------
        df_test : pandas DataFrame of shape = (n_samples_test, n_features)
            The testing samples

        Returns
        -------
        array of shape = (n_samples_test,)
            The predicted classes, as returned by the second level
            estimator.

        Raises
        ------
        ValueError
            If fit has not been called since construction or the last
            set_params.
        """
        if not self.__fitOK:
            raise ValueError("Call fit before !")

        # we predict the meta features on test set
        df_test = self.transform(df_test)

        # the second level estimator was fitted on raw values (see fit), so
        # it is fed raw values here as well
        return self.level_estimator.predict(df_test.values)