Source code for mlbox.model.regression.feature_selector

# coding: utf-8
# Author: Axel ARONIO DE ROMBLAY <axelderomblay@gmail.com>
# License: BSD 3 clause

import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
import warnings


[docs]class Reg_feature_selector(): """Selects useful features. Several strategies are possible (filter and wrapper methods). Works for regression problems only. Parameters ---------- strategy : str, defaut = "l1" The strategy to select features. Available strategies = {"variance", "l1", "rf_feature_importance"} threshold : float, defaut = 0.3 The percentage of variable to discard according the strategy. Must be between 0. and 1. """ def __init__(self, strategy='l1', threshold=0.3): self.strategy = strategy self.threshold = threshold self.__fitOK = False self.__to_discard = [] def get_params(self, deep=True): return {'strategy': self.strategy, 'threshold': self.threshold} def set_params(self, **params): self.__fitOK = False for k, v in params.items(): if k not in self.get_params(): warnings.warn("Invalid parameter a for feature selector" "Reg_feature_selector. Parameter IGNORED. Check " "the list of available parameters with " "`feature_selector.get_params().keys()`") else: setattr(self, k, v)
[docs] def fit(self, df_train, y_train): """Fits Reg_feature_selector. Parameters ---------- df_train : pandas dataframe of shape = (n_train, n_features) The train dataset with numerical features and no NA y_train : pandas series of shape = (n_train, ). The target for regression task. Returns ------- sobject self """ # sanity checks if((type(df_train) != pd.SparseDataFrame) and (type(df_train) != pd.DataFrame)): raise ValueError("df_train must be a DataFrame") if (type(y_train) != pd.core.series.Series): raise ValueError("y_train must be a Series") if(self.strategy == 'variance'): coef = df_train.std() abstract_threshold = np.percentile(coef, 100. * self.threshold) self.__to_discard = coef[coef < abstract_threshold].index self.__fitOK = True elif(self.strategy == 'l1'): model = Lasso(alpha=100.0, random_state=0) # to be tuned model.fit(df_train, y_train) coef = np.abs(model.coef_) abstract_threshold = np.percentile(coef, 100. * self.threshold) self.__to_discard = df_train.columns[coef < abstract_threshold] self.__fitOK = True elif(self.strategy == 'rf_feature_importance'): model = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=0) # to be tuned model.fit(df_train, y_train) coef = model.feature_importances_ abstract_threshold = np.percentile(coef, 100. * self.threshold) self.__to_discard = df_train.columns[coef < abstract_threshold] self.__fitOK = True else: raise ValueError("Strategy invalid. Please choose between " "'variance', 'l1' or 'rf_feature_importance'") return self
[docs] def transform(self, df): """Transforms the dataset Parameters ---------- df : pandas dataframe of shape = (n, n_features) The dataset with numerical features and no NA Returns ------- pandas dataframe of shape = (n_train, n_features*(1-threshold)) The train dataset with relevant features """ if(self.__fitOK): # sanity checks if ((type(df) != pd.SparseDataFrame) & (type(df) != pd.DataFrame)): raise ValueError("df must be a DataFrame") return df.drop(self.__to_discard, axis=1) else: raise ValueError("call fit or fit_transform function before")
[docs] def fit_transform(self, df_train, y_train): """Fits Reg_feature_selector and transforms the dataset Parameters ---------- df_train : pandas dataframe of shape = (n_train, n_features) The train dataset with numerical features and no NA y_train : pandas series of shape = (n_train, ). The target for regression task. Returns ------- pandas dataframe of shape = (n_train, n_features*(1-threshold)) The train dataset with relevant features """ self.fit(df_train, y_train) return self.transform(df_train)