Source code for mlbox.prediction.predictor

# coding: utf-8
# Author: Axel ARONIO DE ROMBLAY <axelderomblay@gmail.com>
# License: BSD 3 clause


import pandas as pd
import numpy as np
import os
import pickle
import warnings
import time
import operator
import matplotlib
# The following line allow to use matplolib <= 3.0 versions on mac os
try:
    matplotlib.use('TkAgg')
except:
    pass
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline

from ..encoding.na_encoder import NA_encoder
from ..encoding.categorical_encoder import Categorical_encoder
from ..model.classification.feature_selector import Clf_feature_selector
from ..model.regression.feature_selector import Reg_feature_selector
from ..model.classification.stacking_classifier import StackingClassifier
from ..model.regression.stacking_regressor import StackingRegressor
from ..model.classification.classifier import Classifier
from ..model.regression.regressor import Regressor


[docs]class Predictor(): """Fits and predicts the target on the test dataset. The test dataset must not contain the target values. Parameters ---------- to_path : str, default = "save" Name of the folder where feature importances and predictions are saved (.png and .csv formats). Must contain target encoder object (for classification task only). verbose : bool, default = True Verbose mode """ def __init__(self, to_path="save", verbose=True): self.to_path = to_path self.verbose = verbose def get_params(self, deep=True): return {'to_path': self.to_path, 'verbose': self.verbose } def set_params(self, **params): self.__fitOK = False for k, v in params.items(): if k not in self.get_params(): warnings.warn("Invalid parameter a for predictor Predictor. " "Parameter IGNORED. " "Check the list of available parameters with " "`predictor.get_params().keys()`") else: setattr(self,k,v) def __save_feature_importances(self, importance, fig_name="feature_importance.png"): """Saves feature importances plot Parameters ---------- importance : dict Dictionary with features (key) and importances (values) fig_name : str, default = "feature_importance.png" figure name Returns ------- NoneType None """ if (len(importance) > 0): # Generates plot of feature importances importance_sum = np.sum(list(importance.values())) tuples = [(k, np.round(importance[k] * 100. / importance_sum, 2)) for k in importance] tuples = sorted(tuples, key=lambda x: x[1]) labels, values = zip(*tuples) plt.figure(figsize=(20, int(len(importance) * 0.3) + 1)) ylocs = np.arange(len(values)) plt.barh(ylocs, values, align='center') for x, y in zip(values, ylocs): plt.text(x + 1, y, x, va='center') plt.yticks(ylocs, labels) plt.title("Feature importance (%)") plt.grid(True) plt.savefig(fig_name) plt.close() # Leak Detection leak = sorted(dict(tuples).items(), key=operator.itemgetter(1))[-1] if((leak[-1] > 70) & (len(importance) > 1)): warnings.warn("WARNING : " + str(leak[0]) + " is probably a leak ! " "Please check and delete it...") else: pass def __plot_feature_importances(self, importance, top = 10): """Plots top 10 feature importances Parameters ---------- importance : dict Dictionary with features (key) and importances (values) top : int Number of top features to display. Returns ------- NoneType None """ if (len(importance) > 0): # Plot feature importances importance_sum = np.sum(list(importance.values())) tuples = [(k, np.round(importance[k] * 100. / importance_sum, 2)) for k in importance] tuples = sorted(tuples, key=lambda x: x[1])[-top:] labels, values = zip(*tuples) plt.figure(figsize=(20, top * 0.3 + 1)) ylocs = np.arange(len(values)) plt.barh(ylocs, values, align='center') for x, y in zip(values, ylocs): plt.text(x + 1, y, x, va='center') plt.yticks(ylocs, labels) plt.title("Top " + str(top) + " feature importance (%)") plt.grid(True) plt.show() plt.close() else: pass
[docs] def fit_predict(self, params, df): """Fits the model and predicts on the test set. Also outputs feature importances and the submission file (.png and .csv format). Parameters ---------- params : dict, default = None. Hyper-parameters dictionary for the whole pipeline. - The keys must respect the following syntax : "enc__param". - "enc" = "ne" for na encoder - "enc" = "ce" for categorical encoder - "enc" = "fs" for feature selector [OPTIONAL] - "enc" = "stck"+str(i) to add layer n°i of meta-features [OPTIONAL] - "enc" = "est" for the final estimator - "param" : a correct associated parameter for each step. Ex: "max_depth" for "enc"="est", ... - The values are those of the parameters. Ex: 4 for key = "est__max_depth", ... df : dict, default = None Dataset dictionary. Must contain keys and values: - "train": pandas DataFrame for the train set. - "test" : pandas DataFrame for the test set. - "target" : encoded pandas Serie for the target on train set (with dtype='float' for a regression or dtype='int' for a classification). Indexes should match the train set. Returns ------- object self. """ if(self.to_path is None): raise ValueError("You must specify a path to save your model " "and your predictions") else: ne = NA_encoder() ce = Categorical_encoder() ########################################## # Automatically checking the task ########################################## ########################################## # Classification ########################################## if (df['target'].dtype == 'int'): # Estimator est = Classifier() # Feature selection if specified fs = None if(params is not None): for p in params.keys(): if(p.startswith("fs__")): fs = Clf_feature_selector() else: pass # Stacking if specified STCK = {} if(params is not None): for p in params.keys(): if(p.startswith("stck")): STCK[p.split("__")[0]] = StackingClassifier() else: pass ########################################## # Regression ########################################## elif (df['target'].dtype == 'float'): # Estimator est = Regressor() # Feature selection if specified fs = None if(params is not None): for p in params.keys(): if(p.startswith("fs__")): fs = Reg_feature_selector() else: pass # Stacking if specified STCK = {} if(params is not None): for p in params.keys(): if(p.startswith("stck")): STCK[p.split("__")[0]] = StackingRegressor() else: pass else: raise ValueError("Impossible to determine the task. " "Please check that your target is encoded.") ########################################## # Creating the Pipeline ########################################## pipe = [("ne", ne), ("ce", ce)] # Do we need to cache transformers? cache = False if (params is not None): if("ce__strategy" in params): if(params["ce__strategy"] == "entity_embedding"): cache = True else: pass else: pass if (fs is not None): if ("fs__strategy" in params): if(params["fs__strategy"] != "variance"): cache = True else: pass else: pass if (len(STCK) != 0): cache = True else: pass # Pipeline creation if (fs is not None): pipe.append(("fs", fs)) else: pass for stck in np.sort(list(STCK)): pipe.append((stck, STCK[stck])) pipe.append(("est", est)) if(cache): pp = Pipeline(pipe, memory=self.to_path) else: pp = Pipeline(pipe) ########################################## # Fitting the Pipeline ########################################## start_time = time.time() # No params : default configuration if(params is None): print("") print('> No parameters set. Default configuration is tested') set_params = True else: try: pp = pp.set_params(**params) set_params = True except: set_params = False if(set_params): try: if(self.verbose): print("") print("fitting the pipeline ...") pp.fit(df['train'], df['target']) if(self.verbose): print("CPU time: %s seconds"%(time.time() - start_time)) try: os.mkdir(self.to_path) except OSError: pass # Feature importances try: importance = est.feature_importances() self.__save_feature_importances(importance, self.to_path + "/" + est.get_params()["strategy"] + "_feature_importance.png") if(self.verbose): self.__plot_feature_importances(importance, 10) print("") print("> Feature importances dumped into directory : " + self.to_path) except: warnings.warn("Unable to get feature importances !") except: raise ValueError("Pipeline cannot be fitted") else: raise ValueError("Pipeline cannot be set with these parameters." " Check the name of your stages.") ########################################## # Predicting ########################################## if (df["test"].shape[0] == 0): warnings.warn("You have no test dataset. Cannot predict !") else: start_time = time.time() ########################################## # Classification ########################################## if (df['target'].dtype == 'int'): enc_name = "target_encoder.obj" try: fhand = open(self.to_path + "/" + enc_name, 'rb') enc = pickle.load(fhand) fhand.close() except: raise ValueError("Unable to load '" + enc_name + "' from directory : " + self.to_path) try: if(self.verbose): print("") print("predicting ...") pred = pd.DataFrame(pp.predict_proba(df['test']), columns=enc.inverse_transform(range(len(enc.classes_))), index=df['test'].index) pred[df['target'].name + "_predicted"] = pred.idxmax(axis=1) # noqa try: pred[df['target'].name + "_predicted"] = pred[df['target'].name + "_predicted"].apply(int) # noqa except: pass except: raise ValueError("Can not predict") ########################################## # Regression ########################################## elif (df['target'].dtype == 'float'): pred = pd.DataFrame([], columns=[df['target'].name + "_predicted"], index=df['test'].index) try: if(self.verbose): print("") print("predicting...") pred[df['target'].name + "_predicted"] = pp.predict(df['test']) # noqa except: raise ValueError("Can not predict") else: pass if(self.verbose): print("CPU time: %s seconds" % (time.time() - start_time)) ########################################## # Displaying ########################################## if(self.verbose): print("") print("> Overview on predictions : ") print("") print(pred.head(10)) ########################################## # Dumping predictions ########################################## if(self.verbose): print("") print("dumping predictions into directory : "+self.to_path + " ...") pred.to_csv(self.to_path + "/" + df['target'].name + "_predictions.csv", index=True) return self