# Source code for mlbox.model.classification.classifier

# coding: utf-8
# Author: Axel ARONIO DE ROMBLAY <axelderomblay@gmail.com>
# License: BSD 3 clause

import warnings
from copy import copy

import numpy as np
import pandas as pd
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, RandomForestClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier


class Classifier():
    """Wraps scikitlearn classifiers.

    Parameters
    ----------
    strategy : str, default = "LightGBM"
        The choice for the classifier.
        Available strategies = {"LightGBM", "RandomForest", "ExtraTrees",
        "Tree", "Bagging", "AdaBoost" or "Linear"}.

    **params : default = None
        Parameters of the corresponding classifier.
        Examples : n_estimators, max_depth...
    """

    def __init__(self, **params):
        """Init Classifier object. User can define strategy parameters.

        Parameters
        ----------
        strategy : str, default = "LightGBM"
            The choice of the classifier.
            Available strategies = {"LightGBM", "RandomForest", "ExtraTrees",
            "Tree", "Bagging", "AdaBoost" or "Linear"}.
        """
        # Default to LightGBM unless the caller explicitly picked a strategy.
        self.__strategy = params.get("strategy", "LightGBM")

        # Parameters the user has successfully set on the estimator so far.
        self.__classif_params = {}

        # Build the underlying scikit-learn / LightGBM estimator.
        self.__classifier = None
        self.__set_classifier(self.__strategy)

        # Column names seen at fit time (set by fit()).
        self.__col = None

        # Apply any remaining user parameters (invalid ones are warned about).
        self.set_params(**params)
        self.__fitOK = False
[docs] def get_params(self, deep=True): """Get strategy parameters of Classifier object.""" params = {} params["strategy"] = self.__strategy params.update(self.__classif_params) return params
[docs] def set_params(self, **params): """Set strategy parameters of Classifier object.""" self.__fitOK = False if 'strategy' in params.keys(): self.__set_classifier(params['strategy']) for k, v in self.__classif_params.items(): if k not in self.get_params().keys(): warnings.warn("Invalid parameter for classifier " + str(self.__strategy) + ". Parameter IGNORED. Check the list of " "available parameters with " "`classifier.get_params().keys()`") else: setattr(self.__classifier, k, v) for k, v in params.items(): if(k == "strategy"): pass else: if k not in self.__classifier.get_params().keys(): warnings.warn("Invalid parameter for classifier " + str(self.__strategy) + ". Parameter IGNORED. Check the list of " "available parameters with " "`classifier.get_params().keys()`") else: setattr(self.__classifier, k, v) self.__classif_params[k] = v
def __set_classifier(self, strategy): """Set the classifier using scikitlearn Classifier.""" self.__strategy = strategy if(strategy == 'RandomForest'): self.__classifier = RandomForestClassifier( n_estimators=400, max_depth=10, max_features='sqrt', bootstrap=True, n_jobs=-1, random_state=0) elif(strategy == "LightGBM"): self.__classifier = LGBMClassifier( n_estimators=500, learning_rate=0.05, colsample_bytree=0.8, subsample=0.9, nthread=-1, seed=0) elif(strategy == 'ExtraTrees'): self.__classifier = ExtraTreesClassifier( n_estimators=400, max_depth=10, max_features='sqrt', bootstrap=True, n_jobs=-1, random_state=0) elif(strategy == 'Tree'): self.__classifier = DecisionTreeClassifier( criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=0, max_leaf_nodes=None, class_weight=None, presort=False) elif(strategy == "Bagging"): self.__classifier = BaggingClassifier( base_estimator=None, n_estimators=500, max_samples=.9, max_features=.85, bootstrap=False, bootstrap_features=False, n_jobs=-1, random_state=0) elif(strategy == "AdaBoost"): self.__classifier = AdaBoostClassifier( base_estimator=None, n_estimators=400, learning_rate=.05, algorithm='SAMME.R', random_state=0) elif(strategy == "Linear"): self.__classifier = LogisticRegression( penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=0, solver='lbfgs', max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=-1) else: raise ValueError( "Strategy invalid. Please choose between 'LightGBM'" ", 'RandomForest', 'ExtraTrees', " "'Tree', 'Bagging', 'AdaBoost' or 'Linear'")
[docs] def fit(self, df_train, y_train): """Fits Classifier. Parameters ---------- df_train : pandas dataframe of shape = (n_train, n_features) The train dataset with numerical features. y_train : pandas series of shape = (n_train,) The numerical encoded target for classification tasks. Returns ------- object self """ # sanity checks if((type(df_train) != pd.SparseDataFrame) and (type(df_train) != pd.DataFrame)): raise ValueError("df_train must be a DataFrame") if (type(y_train) != pd.core.series.Series): raise ValueError("y_train must be a Series") self.__classifier.fit(df_train.values, y_train) self.__col = df_train.columns self.__fitOK = True return self
    def feature_importances(self):
        """Compute feature importances.

        Classifier must be fitted before.

        Returns
        -------
        dict
            Dictionnary containing a measure of feature importance (value)
            for each feature (key).
        """
        if self.__fitOK:

            if (self.get_params()["strategy"] in ["Linear"]):

                # Linear model: importance = mean absolute coefficient
                # across the classes.
                importance = {}
                f = np.mean(np.abs(self.get_estimator().coef_), axis=0)

                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif (self.get_params()["strategy"] in ["LightGBM", "RandomForest",
                                                    "ExtraTrees", "Tree"]):

                # Tree-based models expose feature_importances_ directly.
                importance = {}
                f = self.get_estimator().feature_importances_

                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif(self.get_params()["strategy"] in ["AdaBoost"]):

                # AdaBoost: weighted average of each weak learner's
                # importance, normalised by the total estimator weight.
                importance = {}
                norm = self.get_estimator().estimator_weights_.sum()

                try:
                    # LGB, RF, ET, Tree and AdaBoost
                    f = sum(weight * est.feature_importances_ for weight, est in zip(self.get_estimator().estimator_weights_, self.get_estimator().estimators_)) / norm  # noqa

                except:  # noqa
                    # Linear
                    f = sum(weight * np.mean(np.abs(est.coef_), axis=0) for weight, est in zip(self.get_estimator().estimator_weights_, self.get_estimator().estimators_)) / norm  # noqa

                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif (self.get_params()["strategy"] in ["Bagging"]):

                # Bagging: each base estimator only sees a feature subset
                # (estimators_features_), so collect per-estimator dicts
                # first, then average the non-zero contributions per column.
                importance = {}
                importance_bag = []

                for i, b in enumerate(self.get_estimator().estimators_):

                    d = {}

                    try:
                        # LGB, RF, ET, Tree and AdaBoost
                        f = b.feature_importances_
                    except:  # noqa
                        # Linear
                        f = np.mean(np.abs(b.coef_), axis=0)

                    # Map the estimator-local feature index j back to the
                    # original column via estimators_features_[i].
                    for j, c in enumerate(self.get_estimator().estimators_features_[i]):  # noqa
                        d[self.__col[c]] = f[j]

                    importance_bag.append(d.copy())

                for i, col in enumerate(self.__col):
                    # Average only the estimators that actually used (and
                    # weighted) this column; missing entries count as 0 and
                    # are filtered out.
                    list_filtered = filter(lambda x: x != 0,
                                           [k[col] if col in k else 0
                                            for k in importance_bag])
                    importance[col] = np.mean(list(list_filtered))  # noqa

            else:

                # Unknown strategy: no importances available.
                importance = {}

            return importance

        else:

            raise ValueError("You must call the fit function before !")
[docs] def predict(self, df): """Predicts the target. Parameters ---------- df : pandas dataframe of shape = (n, n_features) The dataset with numerical features. Returns ------- array of shape = (n, ) The encoded classes to be predicted. """ try: if not callable(getattr(self.__classifier, "predict")): raise ValueError("predict attribute is not callable") except Exception as e: raise e if self.__fitOK: # sanity checks if((type(df) != pd.SparseDataFrame) and (type(df) != pd.DataFrame)): raise ValueError("df must be a DataFrame") return self.__classifier.predict(df.values) else: raise ValueError("You must call the fit function before !")
[docs] def predict_log_proba(self, df): """Predicts class log-probabilities for df. Parameters ---------- df : pandas dataframe of shape = (n, n_features) The dataset with numerical features. Returns ------- y : array of shape = (n, n_classes) The log-probabilities for each class """ try: if not callable(getattr(self.__classifier, "predict_log_proba")): raise ValueError("predict_log_proba attribute is not callable") except Exception as e: raise e if self.__fitOK: # sanity checks if((type(df) != pd.SparseDataFrame) and (type(df) != pd.DataFrame)): raise ValueError("df must be a DataFrame") return self.__classifier.predict_log_proba(df.values) else: raise ValueError("You must call the fit function before !")
[docs] def predict_proba(self, df): """Predicts class probabilities for df. Parameters ---------- df : pandas dataframe of shape = (n, n_features) The dataset with numerical features. Returns ------- array of shape = (n, n_classes) The probabilities for each class """ try: if not callable(getattr(self.__classifier, "predict_proba")): raise ValueError("predict_proba attribute is not callable") except Exception as e: raise e if self.__fitOK: # sanity checks if((type(df) != pd.SparseDataFrame) and (type(df) != pd.DataFrame)): raise ValueError("df must be a DataFrame") return self.__classifier.predict_proba(df.values) else: raise ValueError("You must call the fit function before !")
[docs] def score(self, df, y, sample_weight=None): """Return the mean accuracy. Parameters ---------- df : pandas dataframe of shape = (n, n_features) The dataset with numerical features. y : pandas series of shape = (n,) The numerical encoded target for classification tasks. Returns ------- float Mean accuracy of self.predict(df) wrt. y. """ try: if not callable(getattr(self.__classifier, "score")): raise ValueError("score attribute is not callable") except Exception as e: raise e if self.__fitOK: # sanity checks if((type(df) != pd.SparseDataFrame) and (type(df) != pd.DataFrame)): raise ValueError("df must be a DataFrame") if(type(y) != pd.core.series.Series): raise ValueError("y must be a Series") return self.__classifier.score(df.values, y, sample_weight) else: raise ValueError("You must call the fit function before !")
[docs] def get_estimator(self): """Return classfier.""" return copy(self.__classifier)