Source code for mlbox.prediction.predictor

# coding: utf-8
# Author: Axel ARONIO DE ROMBLAY <axelderomblay@gmail.com>
# License: BSD 3 clause


import pandas as pd
import numpy as np
import os
import pickle
import warnings
import time
import operator
import matplotlib
# The following line allow to use matplolib <= 3.0 versions on mac os
try:
    matplotlib.use('TkAgg')
except:
    pass
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline

from ..encoding.na_encoder import NA_encoder
from ..encoding.categorical_encoder import Categorical_encoder
from ..model.classification.feature_selector import Clf_feature_selector
from ..model.regression.feature_selector import Reg_feature_selector
from ..model.classification.stacking_classifier import StackingClassifier
from ..model.regression.stacking_regressor import StackingRegressor
from ..model.classification.classifier import Classifier
from ..model.regression.regressor import Regressor


[docs]class Predictor():

    """Fits and predicts the target on the test dataset.

    The test dataset must not contain the target values.

    Parameters
    ----------
    to_path : str, default = "save"
        Name of the folder where feature importances and
        predictions are saved (.png and .csv formats).
        Must contain target encoder object (for classification task only).

    verbose : bool, default = True
        Verbose mode
    """

    def __init__(self, to_path="save", verbose=True):

        self.to_path = to_path
        self.verbose = verbose

    def get_params(self, deep=True):

        return {'to_path': self.to_path,
                'verbose': self.verbose
                }

    def set_params(self, **params):

        self.__fitOK = False

        for k, v in params.items():
            if k not in self.get_params():
                warnings.warn("Invalid parameter a for predictor Predictor. "
                              "Parameter IGNORED. "
                              "Check the list of available parameters with "
                              "`predictor.get_params().keys()`")
            else:
                setattr(self,k,v)

    def __save_feature_importances(self, importance, fig_name="feature_importance.png"):

        """Saves feature importances plot

        Parameters
        ----------
        importance : dict
            Dictionary with features (key) and importances (values)

        fig_name : str, default = "feature_importance.png"
            figure name

        Returns
        -------
        NoneType
            None
        """

        if (len(importance) > 0):

            # Generates plot of feature importances

            importance_sum = np.sum(list(importance.values()))
            tuples = [(k, np.round(importance[k] * 100. / importance_sum, 2))
                      for k in importance]
            tuples = sorted(tuples, key=lambda x: x[1])
            labels, values = zip(*tuples)
            plt.figure(figsize=(20, int(len(importance) * 0.3) + 1))

            ylocs = np.arange(len(values))
            plt.barh(ylocs, values, align='center')

            for x, y in zip(values, ylocs):
                plt.text(x + 1, y, x, va='center')

            plt.yticks(ylocs, labels)
            plt.title("Feature importance (%)")
            plt.grid(True)
            plt.savefig(fig_name)
            plt.close()

            # Leak Detection

            leak = sorted(dict(tuples).items(), key=operator.itemgetter(1))[-1]
            if((leak[-1] > 70) & (len(importance) > 1)):
                warnings.warn("WARNING : "
                              + str(leak[0])
                              + " is probably a leak ! "
                                "Please check and delete it...")

        else:
            pass


    def __plot_feature_importances(self, importance, top = 10):

        """Plots top 10 feature importances

        Parameters
        ----------
        importance : dict
            Dictionary with features (key) and importances (values)

        top : int
            Number of top features to display.

        Returns
        -------
        NoneType
            None
        """

        if (len(importance) > 0):

            # Plot feature importances

            importance_sum = np.sum(list(importance.values()))
            tuples = [(k, np.round(importance[k] * 100. / importance_sum, 2))
                      for k in importance]
            tuples = sorted(tuples, key=lambda x: x[1])[-top:]
            labels, values = zip(*tuples)
            plt.figure(figsize=(20, top * 0.3 + 1))

            ylocs = np.arange(len(values))
            plt.barh(ylocs, values, align='center')

            for x, y in zip(values, ylocs):
                plt.text(x + 1, y, x, va='center')

            plt.yticks(ylocs, labels)
            plt.title("Top " + str(top) + " feature importance (%)")
            plt.grid(True)
            plt.show()
            plt.close()

        else:
            pass


[docs]    def fit_predict(self, params, df):


        """Fits the model and predicts on the test set.

        Also outputs feature importances and the submission file
        (.png and .csv format).

        Parameters
        ----------
        params : dict, default = None.
            Hyper-parameters dictionary for the whole pipeline.

            - The keys must respect the following syntax : "enc__param".

                - "enc" = "ne" for na encoder
                - "enc" = "ce" for categorical encoder
                - "enc" = "fs" for feature selector [OPTIONAL]
                - "enc" = "stck"+str(i) to add layer n°i of meta-features [OPTIONAL]
                - "enc" = "est" for the final estimator

                - "param" : a correct associated parameter for each step. Ex: "max_depth" for "enc"="est", ...

            - The values are those of the parameters. Ex: 4 for key = "est__max_depth", ...

        df : dict, default = None
            Dataset dictionary. Must contain keys and values:

            - "train": pandas DataFrame for the train set.
            - "test" : pandas DataFrame for the test set.
            - "target" : encoded pandas Serie for the target on train set (with dtype='float' for a regression or dtype='int' for a classification). Indexes should match the train set.

        Returns
        -------
        object
            self.
        """

        if(self.to_path is None):
            raise ValueError("You must specify a path to save your model "
                             "and your predictions")

        else:

            ne = NA_encoder()
            ce = Categorical_encoder()

            ##########################################
            #    Automatically checking the task
            ##########################################

            ##########################################
            #             Classification
            ##########################################

            if (df['target'].dtype == 'int'):

                # Estimator

                est = Classifier()

                # Feature selection if specified

                fs = None
                if(params is not None):
                    for p in params.keys():
                        if(p.startswith("fs__")):
                            fs = Clf_feature_selector()
                        else:
                            pass

                # Stacking if specified

                STCK = {}
                if(params is not None):
                    for p in params.keys():
                        if(p.startswith("stck")):
                            STCK[p.split("__")[0]] = StackingClassifier()
                        else:
                            pass

        ##########################################
        #               Regression
        ##########################################

            elif (df['target'].dtype == 'float'):

                # Estimator

                est = Regressor()

                # Feature selection if specified

                fs = None
                if(params is not None):
                    for p in params.keys():
                        if(p.startswith("fs__")):
                            fs = Reg_feature_selector()
                        else:
                            pass

                # Stacking if specified

                STCK = {}
                if(params is not None):
                    for p in params.keys():
                        if(p.startswith("stck")):
                            STCK[p.split("__")[0]] = StackingRegressor()
                        else:
                            pass

            else:
                raise ValueError("Impossible to determine the task. "
                                 "Please check that your target is encoded.")

            ##########################################
            #          Creating the Pipeline
            ##########################################

            pipe = [("ne", ne), ("ce", ce)]

            # Do we need to cache transformers?

            cache = False

            if (params is not None):
                if("ce__strategy" in params):
                    if(params["ce__strategy"] == "entity_embedding"):
                        cache = True
                    else:
                        pass
                else:
                    pass

            if (fs is not None):
                if ("fs__strategy" in params):
                    if(params["fs__strategy"] != "variance"):
                        cache = True
                    else:
                        pass
            else:
                pass

            if (len(STCK) != 0):
                cache = True
            else:
                pass

            # Pipeline creation

            if (fs is not None):
                pipe.append(("fs", fs))
            else:
                pass

            for stck in np.sort(list(STCK)):
                pipe.append((stck, STCK[stck]))

            pipe.append(("est", est))

            if(cache):
                pp = Pipeline(pipe, memory=self.to_path)
            else:
                pp = Pipeline(pipe)

            ##########################################
            #          Fitting the Pipeline
            ##########################################

            start_time = time.time()

            # No params : default configuration

            if(params is None):
                print("")
                print('> No parameters set. Default configuration is tested')
                set_params = True

            else:
                try:
                    pp = pp.set_params(**params)
                    set_params = True
                except:
                    set_params = False

            if(set_params):

                try:
                    if(self.verbose):
                        print("")
                        print("fitting the pipeline ...")

                    pp.fit(df['train'], df['target'])

                    if(self.verbose):
                        print("CPU time: %s seconds"%(time.time() - start_time))

                    try:
                        os.mkdir(self.to_path)
                    except OSError:
                        pass

                    # Feature importances

                    try:

                        importance = est.feature_importances()
                        self.__save_feature_importances(importance,
                                                        self.to_path
                                                        + "/"
                                                        + est.get_params()["strategy"]
                                                        + "_feature_importance.png")

                        if(self.verbose):
                            self.__plot_feature_importances(importance, 10)
                            print("")
                            print("> Feature importances dumped into directory : " + self.to_path)

                    except:
                        warnings.warn("Unable to get feature importances !")

                except:
                    raise ValueError("Pipeline cannot be fitted")
            else:
                raise ValueError("Pipeline cannot be set with these parameters."
                                 " Check the name of your stages.")

            ##########################################
            #               Predicting
            ##########################################

            if (df["test"].shape[0] == 0):
                warnings.warn("You have no test dataset. Cannot predict !")

            else:

                start_time = time.time()

                ##########################################
                #             Classification
                ##########################################

                if (df['target'].dtype == 'int'):

                    enc_name = "target_encoder.obj"

                    try:

                        fhand = open(self.to_path + "/" + enc_name, 'rb')
                        enc = pickle.load(fhand)
                        fhand.close()

                    except:
                        raise ValueError("Unable to load '" + enc_name +
                                         "' from directory : " + self.to_path)

                    try:
                        if(self.verbose):
                            print("")
                            print("predicting ...")

                        pred = pd.DataFrame(pp.predict_proba(df['test']),
                                            columns=enc.inverse_transform(range(len(enc.classes_))),
                                            index=df['test'].index)
                        pred[df['target'].name + "_predicted"] = pred.idxmax(axis=1)  # noqa

                        try:
                            pred[df['target'].name + "_predicted"] = pred[df['target'].name + "_predicted"].apply(int)  # noqa
                        except:
                            pass

                    except:
                        raise ValueError("Can not predict")

                ##########################################
                #               Regression
                ##########################################

                elif (df['target'].dtype == 'float'):

                    pred = pd.DataFrame([],
                                        columns=[df['target'].name + "_predicted"],
                                        index=df['test'].index)

                    try:
                        if(self.verbose):
                            print("")
                            print("predicting...")

                        pred[df['target'].name + "_predicted"] = pp.predict(df['test'])  # noqa

                    except:
                        raise ValueError("Can not predict")

                else:
                    pass

                if(self.verbose):
                    print("CPU time: %s seconds" % (time.time() - start_time))

                ##########################################
                #               Displaying
                ##########################################

                if(self.verbose):
                    print("")
                    print("> Overview on predictions : ")
                    print("")
                    print(pred.head(10))

                ##########################################
                #           Dumping predictions
                ##########################################

                if(self.verbose):
                    print("")
                    print("dumping predictions into directory : "+self.to_path + " ...")

                pred.to_csv(self.to_path
                            + "/"
                            + df['target'].name
                            + "_predictions.csv",
                            index=True)

        return self