# coding: utf-8
# coding: utf-8
# Author: Axel ARONIO DE ROMBLAY <axelderomblay@gmail.com>
# License: BSD 3 clause
import warnings
from copy import copy
import numpy as np
import pandas as pd
from sklearn.ensemble import (AdaBoostRegressor, BaggingRegressor,
ExtraTreesRegressor, RandomForestRegressor)
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
[docs]class Regressor():
"""Wrap scikitlearn regressors.
Parameters
----------
strategy : str, default = "LightGBM"
The choice for the regressor.
Available strategies = {"LightGBM", "RandomForest", "ExtraTrees",
"Tree", "Bagging", "AdaBoost" or "Linear"}
**params : default = None
Parameters of the corresponding regressor.
Examples : n_estimators, max_depth...
"""
def __init__(self, **params):
"""Init Regressor object where user can pass a strategy."""
if ("strategy" in params):
self.__strategy = params["strategy"]
else:
self.__strategy = "LightGBM"
self.__regress_params = {}
self.__regressor = None
self.__set_regressor(self.__strategy)
self.__col = None
self.set_params(**params)
self.__fitOK = False
[docs] def get_params(self, deep=True):
"""Get parameters of Regressor object."""
params = {}
params["strategy"] = self.__strategy
params.update(self.__regress_params)
return params
[docs] def set_params(self, **params):
"""Set parameters of Regressor object."""
self.__fitOK = False
if 'strategy' in params.keys():
self.__set_regressor(params['strategy'])
for k, v in self.__regress_params.items():
if k not in self.get_params().keys():
warnings.warn("Invalid parameter for regressor "
+ str(self.__strategy)
+ ". Parameter IGNORED. Check the list of "
"available parameters with "
"`regressor.get_params().keys()`")
else:
setattr(self.__regressor, k, v)
for k, v in params.items():
if(k == "strategy"):
pass
else:
if k not in self.__regressor.get_params().keys():
warnings.warn("Invalid parameter for regressor "
+ str(self.__strategy)
+ ". Parameter IGNORED. Check the list of "
"available parameters with "
"`regressor.get_params().keys()`")
else:
setattr(self.__regressor, k, v)
self.__regress_params[k] = v
def __set_regressor(self, strategy):
"""Set strategy of a regressor object."""
self.__strategy = strategy
if(strategy == 'RandomForest'):
self.__regressor = RandomForestRegressor(
n_estimators=400, max_depth=10, max_features='sqrt',
bootstrap=True, n_jobs=-1, random_state=0)
elif(strategy == "LightGBM"):
self.__regressor = LGBMRegressor(
n_estimators=500, learning_rate=0.05,
colsample_bytree=0.8, subsample=0.9, nthread=-1, seed=0)
elif(strategy == 'ExtraTrees'):
self.__regressor = ExtraTreesRegressor(
n_estimators=400, max_depth=10, max_features='sqrt',
bootstrap=True, n_jobs=-1, random_state=0)
elif(strategy == 'Tree'):
self.__regressor = DecisionTreeRegressor(
criterion='mse', splitter='best', max_depth=None,
min_samples_split=2, min_samples_leaf=1,
min_weight_fraction_leaf=0.0, max_features=None,
random_state=0, max_leaf_nodes=None, presort=False)
elif(strategy == "Bagging"):
self.__regressor = BaggingRegressor(
base_estimator=None, n_estimators=500, max_samples=.9,
max_features=.85, bootstrap=False, bootstrap_features=False,
n_jobs=-1, random_state=0)
elif(strategy == "AdaBoost"):
self.__regressor = AdaBoostRegressor(
base_estimator=None, n_estimators=400, learning_rate=.05,
random_state=0)
elif(strategy == "Linear"):
self.__regressor = Ridge(
alpha=1.0, fit_intercept=True, normalize=False, copy_X=True,
max_iter=None, tol=0.001, solver='auto', random_state=0)
else:
raise ValueError(
"Strategy invalid. Please choose between 'LightGBM'"
", 'RandomForest', 'ExtraTrees', "
"'Tree', 'Bagging', 'AdaBoost' or 'Linear'")
[docs] def fit(self, df_train, y_train):
"""Fits Regressor.
Parameters
----------
df_train : pandas dataframe of shape = (n_train, n_features)
The train dataset with numerical features.
y_train : pandas series of shape = (n_train, )
The target for regression tasks.
Returns
-------
object
self
"""
# sanity checks
if((type(df_train) != pd.SparseDataFrame) and
(type(df_train) != pd.DataFrame)):
raise ValueError("df_train must be a DataFrame")
if (type(y_train) != pd.core.series.Series):
raise ValueError("y_train must be a Series")
self.__regressor.fit(df_train.values, y_train)
self.__col = df_train.columns
self.__fitOK = True
return self
[docs] def feature_importances(self):
"""Computes feature importances.
Regressor must be fitted before.
Returns
-------
dict
Dictionnary containing a measure of feature importance (value)
for each feature (key).
"""
if self.__fitOK:
if (self.get_params()["strategy"] in ["Linear"]):
importance = {}
f = np.abs(self.get_estimator().coef_)
for i, col in enumerate(self.__col):
importance[col] = f[i]
elif (self.get_params()["strategy"] in ["LightGBM", "RandomForest",
"ExtraTrees", "Tree"]):
importance = {}
f = self.get_estimator().feature_importances_
for i, col in enumerate(self.__col):
importance[col] = f[i]
elif (self.get_params()["strategy"] in ["AdaBoost"]):
importance = {}
norm = self.get_estimator().estimator_weights_.sum()
try:
# LGB, RF, ET, Tree and AdaBoost
# TODO: Refactor this part
f = sum(weight * est.feature_importances_ for weight, est in zip(self.get_estimator().estimator_weights_, self.get_estimator().estimators_)) / norm # noqa
except Exception:
f = sum(weight * np.abs(est.coef_) for weight, est in zip(self.get_estimator().estimator_weights_, self.get_estimator().estimators_)) / norm # noqa
for i, col in enumerate(self.__col):
importance[col] = f[i]
elif (self.get_params()["strategy"] in ["Bagging"]):
importance = {}
importance_bag = []
for i, b in enumerate(self.get_estimator().estimators_):
d = {}
try:
# LGB, RF, ET, Tree and AdaBoost
f = b.feature_importances_
except Exception:
f = np.abs(b.coef_) # Linear
estimator = self.get_estimator()
items = enumerate(estimator.estimators_features_[i])
for j, c in items:
d[self.__col[c]] = f[j]
importance_bag.append(d.copy())
for i, col in enumerate(self.__col):
list_filtered = filter(lambda x: x != 0,
[k[col] if col in k else 0
for k in importance_bag])
importance[col] = np.mean(list(list_filtered))
else:
importance = {}
return importance
else:
raise ValueError("You must call the fit function before !")
[docs] def predict(self, df):
"""Predicts the target.
Parameters
----------
df : pandas dataframe of shape = (n, n_features)
The dataset with numerical features.
Returns
-------
array of shape = (n, )
The target to be predicted.
"""
try:
if not callable(getattr(self.__regressor, "predict")):
raise ValueError("predict attribute is not callable")
except Exception as e:
raise e
if self.__fitOK:
# sanity checks
if ((type(df) != pd.SparseDataFrame) & (type(df) != pd.DataFrame)):
raise ValueError("df must be a DataFrame")
return self.__regressor.predict(df.values)
else:
raise ValueError("You must call the fit function before !")
[docs] def score(self, df, y, sample_weight=None):
"""Return R^2 coefficient of determination of the prediction.
Parameters
----------
df : pandas dataframe of shape = (n, n_features)
The dataset with numerical features.
y : pandas series of shape = (n,)
The numerical encoded target for classification tasks.
Returns
-------
float
R^2 of self.predict(df) wrt. y.
"""
try:
if not callable(getattr(self.__regressor, "score")):
raise ValueError("score attribute is not callable")
except Exception as e:
raise e
if self.__fitOK:
# sanity checks
if((type(df) != pd.SparseDataFrame) and
(type(df) != pd.DataFrame)):
raise ValueError("df must be a DataFrame")
if (type(y) != pd.core.series.Series):
raise ValueError("y must be a Series")
return self.__regressor.score(df.values, y, sample_weight)
else:
raise ValueError("You must call the fit function before !")
[docs] def get_estimator(self):
"""Return classfier."""
return copy(self.__regressor)