# Source code for mlbox.model.classification.classifier

# coding: utf-8
# Author: Axel ARONIO DE ROMBLAY <axelderomblay@gmail.com>
# License: BSD 3 clause

import warnings
from copy import copy

import numpy as np
import pandas as pd
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, RandomForestClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier


class Classifier():
    """Wraps scikitlearn classifiers.

    Parameters
    ----------
    strategy : str, default = "LightGBM"
        The choice for the classifier.
        Available strategies = {"LightGBM", "RandomForest", "ExtraTrees",
        "Tree", "Bagging", "AdaBoost" or "Linear"}.

    **params : default = None
        Parameters of the corresponding classifier.
        Examples : n_estimators, max_depth...
    """

    def __init__(self, **params):
        """Init Classifier object. User can define strategy parameters.

        Parameters
        ----------
        strategy : str, default = "LightGBM"
            The choice of the classifier.
            Available strategies = {"LightGBM", "RandomForest", "ExtraTrees",
            "Tree", "Bagging", "AdaBoost" or "Linear"}.
        """
        # Default to LightGBM unless the caller explicitly picked a strategy.
        self.__strategy = params.get("strategy", "LightGBM")

        # Parameters the user has successfully set on the estimator so far.
        self.__classif_params = {}

        # Build the underlying scikit-learn / LightGBM estimator.
        self.__classifier = None
        self.__set_classifier(self.__strategy)

        # Column names seen at fit time (set by fit()).
        self.__col = None

        # Apply any remaining user parameters (invalid ones are warned about).
        self.set_params(**params)
        self.__fitOK = False
[docs] def get_params(self, deep=True): """Get strategy parameters of Classifier object.""" params = {} params["strategy"] = self.__strategy params.update(self.__classif_params) return params
[docs] def set_params(self, **params): """Set strategy parameters of Classifier object.""" self.__fitOK = False if 'strategy' in params.keys(): self.__set_classifier(params['strategy']) for k, v in self.__classif_params.items(): if k not in self.get_params().keys(): warnings.warn("Invalid parameter for classifier " + str(self.__strategy) + ". Parameter IGNORED. Check the list of " "available parameters with " "`classifier.get_params().keys()`") else: setattr(self.__classifier, k, v) for k, v in params.items(): if(k == "strategy"): pass else: if k not in self.__classifier.get_params().keys(): warnings.warn("Invalid parameter for classifier " + str(self.__strategy) + ". Parameter IGNORED. Check the list of " "available parameters with " "`classifier.get_params().keys()`") else: setattr(self.__classifier, k, v) self.__classif_params[k] = v
def __set_classifier(self, strategy): """Set the classifier using scikitlearn Classifier.""" self.__strategy = strategy if(strategy == 'RandomForest'): self.__classifier = RandomForestClassifier( n_estimators=400, max_depth=10, max_features='sqrt', bootstrap=True, n_jobs=-1, random_state=0) elif(strategy == "LightGBM"): self.__classifier = LGBMClassifier( n_estimators=500, learning_rate=0.05, colsample_bytree=0.8, subsample=0.9, nthread=-1, seed=0) elif(strategy == 'ExtraTrees'): self.__classifier = ExtraTreesClassifier( n_estimators=400, max_depth=10, max_features='sqrt', bootstrap=True, n_jobs=-1, random_state=0) elif(strategy == 'Tree'): self.__classifier = DecisionTreeClassifier( criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=0, max_leaf_nodes=None, class_weight=None, presort=False) elif(strategy == "Bagging"): self.__classifier = BaggingClassifier( base_estimator=None, n_estimators=500, max_samples=.9, max_features=.85, bootstrap=False, bootstrap_features=False, n_jobs=-1, random_state=0) elif(strategy == "AdaBoost"): self.__classifier = AdaBoostClassifier( base_estimator=None, n_estimators=400, learning_rate=.05, algorithm='SAMME.R', random_state=0) elif(strategy == "Linear"): self.__classifier = LogisticRegression( penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=0, solver='lbfgs', max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=-1) else: raise ValueError( "Strategy invalid. Please choose between 'LightGBM'" ", 'RandomForest', 'ExtraTrees', " "'Tree', 'Bagging', 'AdaBoost' or 'Linear'")
[docs] def fit(self, df_train, y_train): """Fits Classifier. Parameters ---------- df_train : pandas dataframe of shape = (n_train, n_features) The train dataset with numerical features. y_train : pandas series of shape = (n_train,) The numerical encoded target for classification tasks. Returns ------- object self """ # sanity checks if((type(df_train) != pd.SparseDataFrame) and (type(df_train) != pd.DataFrame)): raise ValueError("df_train must be a DataFrame") if (type(y_train) != pd.core.series.Series): raise ValueError("y_train must be a Series") self.__classifier.fit(df_train.values, y_train) self.__col = df_train.columns self.__fitOK = True return self
    def feature_importances(self):
        """Compute feature importances.

        Classifier must be fitted before.

        Returns
        -------
        dict
            Dictionnary containing a measure of feature importance (value)
            for each feature (key).
        """
        if self.__fitOK:

            if (self.get_params()["strategy"] in ["Linear"]):

                # Linear model: importance = mean absolute coefficient
                # across the classes.
                importance = {}
                f = np.mean(np.abs(self.get_estimator().coef_), axis=0)

                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif (self.get_params()["strategy"] in ["LightGBM", "RandomForest",
                                                    "ExtraTrees", "Tree"]):

                # Tree-based models expose feature_importances_ directly.
                importance = {}
                f = self.get_estimator().feature_importances_

                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif(self.get_params()["strategy"] in ["AdaBoost"]):

                # AdaBoost: weighted average of each weak learner's
                # importance, normalised by the total estimator weight.
                importance = {}
                norm = self.get_estimator().estimator_weights_.sum()

                try:
                    # LGB, RF, ET, Tree and AdaBoost
                    f = sum(weight * est.feature_importances_ for weight, est in zip(self.get_estimator().estimator_weights_, self.get_estimator().estimators_)) / norm  # noqa

                except:  # noqa
                    # Linear
                    f = sum(weight * np.mean(np.abs(est.coef_), axis=0) for weight, est in zip(self.get_estimator().estimator_weights_, self.get_estimator().estimators_)) / norm  # noqa

                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif (self.get_params()["strategy"] in ["Bagging"]):

                # Bagging: each base estimator only sees a feature subset
                # (estimators_features_), so collect per-estimator dicts
                # first, then average the non-zero contributions per column.
                importance = {}
                importance_bag = []

                for i, b in enumerate(self.get_estimator().estimators_):

                    d = {}

                    try:
                        # LGB, RF, ET, Tree and AdaBoost
                        f = b.feature_importances_
                    except:  # noqa
                        # Linear
                        f = np.mean(np.abs(b.coef_), axis=0)

                    # Map the estimator-local feature index j back to the
                    # original column via estimators_features_[i].
                    for j, c in enumerate(self.get_estimator().estimators_features_[i]):  # noqa
                        d[self.__col[c]] = f[j]

                    importance_bag.append(d.copy())

                for i, col in enumerate(self.__col):
                    # Average only the estimators that actually used (and
                    # weighted) this column; missing entries count as 0 and
                    # are filtered out.
                    list_filtered = filter(lambda x: x != 0,
                                           [k[col] if col in k else 0
                                            for k in importance_bag])
                    importance[col] = np.mean(list(list_filtered))  # noqa

            else:

                # Unknown strategy: no importances available.
                importance = {}

            return importance

        else:

            raise ValueError("You must call the fit function before !")
[docs] def predict(self, df): """Predicts the target. Parameters ---------- df : pandas dataframe of shape = (n, n_features) The dataset with numerical features. Returns ------- array of shape = (n, ) The encoded classes to be predicted. """ try: if not callable(getattr(self.__classifier, "predict")): raise ValueError("predict attribute is not callable") except Exception as e: raise e if self.__fitOK: # sanity checks if((type(df) != pd.SparseDataFrame) and (type(df) != pd.DataFrame)): raise ValueError("df must be a DataFrame") return self.__classifier.predict(df.values) else: raise ValueError("You must call the fit function before !")
[docs] def predict_log_proba(self, df): """Predicts class log-probabilities for df. Parameters ---------- df : pandas dataframe of shape = (n, n_features) The dataset with numerical features. Returns ------- y : array of shape = (n, n_classes) The log-probabilities for each class """ try: if not callable(getattr(self.__classifier, "predict_log_proba")): raise ValueError("predict_log_proba attribute is not callable") except Exception as e: raise e if self.__fitOK: # sanity checks if((type(df) != pd.SparseDataFrame) and (type(df) != pd.DataFrame)): raise ValueError("df must be a DataFrame") return self.__classifier.predict_log_proba(df.values) else: raise ValueError("You must call the fit function before !")
[docs] def predict_proba(self, df): """Predicts class probabilities for df. Parameters ---------- df : pandas dataframe of shape = (n, n_features) The dataset with numerical features. Returns ------- array of shape = (n, n_classes) The probabilities for each class """ try: if not callable(getattr(self.__classifier, "predict_proba")): raise ValueError("predict_proba attribute is not callable") except Exception as e: raise e if self.__fitOK: # sanity checks if((type(df) != pd.SparseDataFrame) and (type(df) != pd.DataFrame)): raise ValueError("df must be a DataFrame") return self.__classifier.predict_proba(df.values) else: raise ValueError("You must call the fit function before !")
[docs] def score(self, df, y, sample_weight=None): """Return the mean accuracy. Parameters ---------- df : pandas dataframe of shape = (n, n_features) The dataset with numerical features. y : pandas series of shape = (n,) The numerical encoded target for classification tasks. Returns ------- float Mean accuracy of self.predict(df) wrt. y. """ try: if not callable(getattr(self.__classifier, "score")): raise ValueError("score attribute is not callable") except Exception as e: raise e if self.__fitOK: # sanity checks if((type(df) != pd.SparseDataFrame) and (type(df) != pd.DataFrame)): raise ValueError("df must be a DataFrame") if(type(y) != pd.core.series.Series): raise ValueError("y must be a Series") return self.__classifier.score(df.values, y, sample_weight) else: raise ValueError("You must call the fit function before !")
[docs] def get_estimator(self): """Return classfier.""" return copy(self.__classifier)