Source code for mlbox.model.regression.regressor

# coding: utf-8
# Author: Axel ARONIO DE ROMBLAY <axelderomblay@gmail.com>
# License: BSD 3 clause

import warnings
from copy import copy

import numpy as np
import pandas as pd
from sklearn.ensemble import (AdaBoostRegressor, BaggingRegressor,
                              ExtraTreesRegressor, RandomForestRegressor)
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor


class Regressor():
    """Wrap scikit-learn regressors.

    Parameters
    ----------
    strategy : str, default = "LightGBM"
        The choice for the regressor.
        Available strategies = {"LightGBM", "RandomForest", "ExtraTrees",
        "Tree", "Bagging", "AdaBoost" or "Linear"}

    **params : default = None
        Parameters of the corresponding regressor.
        Examples : n_estimators, max_depth...
    """

    def __init__(self, **params):
        """Init Regressor object where user can pass a strategy."""
        if ("strategy" in params):
            self.__strategy = params["strategy"]
        else:
            self.__strategy = "LightGBM"

        self.__regress_params = {}

        self.__regressor = None
        self.__set_regressor(self.__strategy)
        self.__col = None

        self.set_params(**params)
        self.__fitOK = False
    def get_params(self, deep=True):
        """Get parameters of Regressor object."""
        params = {}
        params["strategy"] = self.__strategy
        params.update(self.__regress_params)

        return params
    def set_params(self, **params):
        """Set parameters of Regressor object."""
        self.__fitOK = False

        if 'strategy' in params.keys():
            self.__set_regressor(params['strategy'])

            for k, v in self.__regress_params.items():
                if k not in self.get_params().keys():
                    warnings.warn("Invalid parameter for regressor "
                                  + str(self.__strategy)
                                  + ". Parameter IGNORED. Check the list of "
                                  "available parameters with "
                                  "`regressor.get_params().keys()`")
                else:
                    setattr(self.__regressor, k, v)

        for k, v in params.items():
            if(k == "strategy"):
                pass
            else:
                if k not in self.__regressor.get_params().keys():
                    warnings.warn("Invalid parameter for regressor "
                                  + str(self.__strategy)
                                  + ". Parameter IGNORED. Check the list of "
                                  "available parameters with "
                                  "`regressor.get_params().keys()`")
                else:
                    setattr(self.__regressor, k, v)
                    self.__regress_params[k] = v
    def __set_regressor(self, strategy):
        """Set strategy of a regressor object."""
        self.__strategy = strategy

        if(strategy == 'RandomForest'):
            self.__regressor = RandomForestRegressor(
                n_estimators=400, max_depth=10, max_features='sqrt',
                bootstrap=True, n_jobs=-1, random_state=0)

        elif(strategy == "LightGBM"):
            self.__regressor = LGBMRegressor(
                n_estimators=500, learning_rate=0.05, colsample_bytree=0.8,
                subsample=0.9, nthread=-1, seed=0)

        elif(strategy == 'ExtraTrees'):
            self.__regressor = ExtraTreesRegressor(
                n_estimators=400, max_depth=10, max_features='sqrt',
                bootstrap=True, n_jobs=-1, random_state=0)

        elif(strategy == 'Tree'):
            self.__regressor = DecisionTreeRegressor(
                criterion='mse', splitter='best', max_depth=None,
                min_samples_split=2, min_samples_leaf=1,
                min_weight_fraction_leaf=0.0, max_features=None,
                random_state=0, max_leaf_nodes=None, presort=False)

        elif(strategy == "Bagging"):
            self.__regressor = BaggingRegressor(
                base_estimator=None, n_estimators=500, max_samples=.9,
                max_features=.85, bootstrap=False, bootstrap_features=False,
                n_jobs=-1, random_state=0)

        elif(strategy == "AdaBoost"):
            self.__regressor = AdaBoostRegressor(
                base_estimator=None, n_estimators=400, learning_rate=.05,
                random_state=0)

        elif(strategy == "Linear"):
            self.__regressor = Ridge(
                alpha=1.0, fit_intercept=True, normalize=False, copy_X=True,
                max_iter=None, tol=0.001, solver='auto', random_state=0)

        else:
            raise ValueError(
                "Strategy invalid. Please choose between 'LightGBM'"
                ", 'RandomForest', 'ExtraTrees', "
                "'Tree', 'Bagging', 'AdaBoost' or 'Linear'")
    def fit(self, df_train, y_train):
        """Fits Regressor.

        Parameters
        ----------
        df_train : pandas dataframe of shape = (n_train, n_features)
            The train dataset with numerical features.

        y_train : pandas series of shape = (n_train, )
            The target for regression tasks.

        Returns
        -------
        object
            self
        """
        # sanity checks
        if((type(df_train) != pd.SparseDataFrame) and
           (type(df_train) != pd.DataFrame)):
            raise ValueError("df_train must be a DataFrame")

        if (type(y_train) != pd.core.series.Series):
            raise ValueError("y_train must be a Series")

        self.__regressor.fit(df_train.values, y_train)
        self.__col = df_train.columns
        self.__fitOK = True

        return self
    def feature_importances(self):
        """Computes feature importances.

        Regressor must be fitted before.

        Returns
        -------
        dict
            Dictionary containing a measure of feature importance (value)
            for each feature (key).
        """
        if self.__fitOK:

            if (self.get_params()["strategy"] in ["Linear"]):

                importance = {}
                f = np.abs(self.get_estimator().coef_)

                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif (self.get_params()["strategy"] in ["LightGBM", "RandomForest",
                                                    "ExtraTrees", "Tree"]):

                importance = {}
                f = self.get_estimator().feature_importances_

                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif (self.get_params()["strategy"] in ["AdaBoost"]):

                importance = {}
                norm = self.get_estimator().estimator_weights_.sum()

                try:
                    # TODO: Refactor this part
                    # tree-based base estimators expose feature_importances_
                    f = sum(weight * est.feature_importances_
                            for weight, est in zip(self.get_estimator().estimator_weights_,  # noqa
                                                   self.get_estimator().estimators_)) / norm  # noqa
                except Exception:
                    # linear base estimators expose coef_ instead
                    f = sum(weight * np.abs(est.coef_)
                            for weight, est in zip(self.get_estimator().estimator_weights_,  # noqa
                                                   self.get_estimator().estimators_)) / norm  # noqa

                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif (self.get_params()["strategy"] in ["Bagging"]):

                importance = {}
                importance_bag = []

                for i, b in enumerate(self.get_estimator().estimators_):

                    d = {}

                    try:
                        # LGB, RF, ET, Tree and AdaBoost
                        f = b.feature_importances_
                    except Exception:
                        f = np.abs(b.coef_)  # Linear

                    estimator = self.get_estimator()
                    items = enumerate(estimator.estimators_features_[i])
                    for j, c in items:
                        d[self.__col[c]] = f[j]

                    importance_bag.append(d.copy())

                for i, col in enumerate(self.__col):
                    list_filtered = filter(lambda x: x != 0,
                                           [k[col] if col in k else 0
                                            for k in importance_bag])
                    importance[col] = np.mean(list(list_filtered))

            else:

                importance = {}

            return importance

        else:

            raise ValueError("You must call the fit function before !")
    def predict(self, df):
        """Predicts the target.

        Parameters
        ----------
        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        Returns
        -------
        array of shape = (n, )
            The target to be predicted.
        """
        try:
            if not callable(getattr(self.__regressor, "predict")):
                raise ValueError("predict attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:

            # sanity checks
            if ((type(df) != pd.SparseDataFrame) &
                    (type(df) != pd.DataFrame)):
                raise ValueError("df must be a DataFrame")

            return self.__regressor.predict(df.values)

        else:
            raise ValueError("You must call the fit function before !")
    def transform(self, df):
        """Transform dataframe df.

        Parameters
        ----------
        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        Returns
        -------
        pandas dataframe of shape = (n, n_selected_features)
            The transformed dataset with its most important features.
        """
        try:
            if not callable(getattr(self.__regressor, "transform")):
                raise ValueError("transform attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:

            # sanity checks
            if ((type(df) != pd.SparseDataFrame) &
                    (type(df) != pd.DataFrame)):
                raise ValueError("df must be a DataFrame")

            return self.__regressor.transform(df.values)

        else:
            raise ValueError("You must call the fit function before !")
    def score(self, df, y, sample_weight=None):
        """Return R^2 coefficient of determination of the prediction.

        Parameters
        ----------
        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        y : pandas series of shape = (n,)
            The target for regression tasks.

        Returns
        -------
        float
            R^2 of self.predict(df) wrt. y.
        """
        try:
            if not callable(getattr(self.__regressor, "score")):
                raise ValueError("score attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:

            # sanity checks
            if((type(df) != pd.SparseDataFrame) and
               (type(df) != pd.DataFrame)):
                raise ValueError("df must be a DataFrame")

            if (type(y) != pd.core.series.Series):
                raise ValueError("y must be a Series")

            return self.__regressor.score(df.values, y, sample_weight)

        else:
            raise ValueError("You must call the fit function before !")
    def get_estimator(self):
        """Return a copy of the regressor."""
        return copy(self.__regressor)
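
A minimal usage sketch (not part of the module above): the DataFrame X and Series y are toy data made up for illustration, and the "RandomForest" strategy is chosen only to avoid the LightGBM dependency.

# Minimal usage sketch -- illustrative only, not part of the module above.
# X and y are toy data; any purely numerical DataFrame / Series pair would do.
import pandas as pd

from mlbox.model.regression.regressor import Regressor

X = pd.DataFrame({"f1": [1.0, 2.0, 3.0, 4.0, 5.0],
                  "f2": [0.5, 0.1, 0.8, 0.3, 0.9]})
y = pd.Series([1.2, 2.4, 3.1, 4.8, 5.5])

# Extra keyword arguments are forwarded to the underlying
# scikit-learn estimator through set_params.
reg = Regressor(strategy="RandomForest", n_estimators=50)
reg.fit(X, y)

print(reg.predict(X))             # array of shape (5,)
print(reg.score(X, y))            # R^2 on the training data
print(reg.feature_importances())  # {"f1": ..., "f2": ...}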