Source code for mlbox.model.classification.feature_selector

# coding: utf-8
# Author: Axel ARONIO DE ROMBLAY <axelderomblay@gmail.com>
# License: BSD 3 clause

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import warnings


class Clf_feature_selector():
    """Selects useful features.

    Several strategies are possible (filter and wrapper methods).
    Works for classification problems only (multiclass or binary).

    Parameters
    ----------
    strategy : str, default = "l1"
        The strategy to select features.
        Available strategies = {"variance", "l1", "rf_feature_importance"}

    threshold : float, default = 0.3
        The percentage of variable to discard according to the strategy.
        Must be between 0. and 1.
    """

    def __init__(self, strategy='l1', threshold=0.3):
        # One of 'variance', 'l1', 'rf_feature_importance'.
        self.strategy = strategy
        # A float between 0. and 1. (0.3 means we drop 0.3 of the features).
        self.threshold = threshold
        # Set to True only once fit() has completed successfully.
        self.__fitOK = False
        # Column labels that transform() removes; filled in by fit().
        self.__to_discard = []

    def get_params(self, deep=True):
        """Return the parameters of the selector as a dict.

        Parameters
        ----------
        deep : bool, default = True
            Unused by this implementation; accepted so the signature matches
            the usual estimator ``get_params(deep)`` call.

        Returns
        -------
        dict
            ``{'strategy': ..., 'threshold': ...}``
        """
        return {'strategy': self.strategy,
                'threshold': self.threshold}

    def set_params(self, **params):
        """Set parameters of the selector.

        Any call invalidates a previous fit: ``transform`` raises until
        ``fit`` is called again. Unknown parameter names are ignored and
        reported with a warning.

        Parameters
        ----------
        **params : keyword arguments
            Parameter names mapped to their new values. Only the names
            returned by ``get_params()`` are accepted.
        """
        self.__fitOK = False
        valid_params = self.get_params()

        for k, v in params.items():
            if k not in valid_params:
                # Name the offending parameter so the warning is actionable.
                warnings.warn("Invalid parameter {!r} for feature selector "
                              "Clf_feature_selector. Parameter IGNORED. Check "
                              "the list of available parameters with "
                              "`feature_selector.get_params().keys()`"
                              .format(k))
            else:
                setattr(self, k, v)

    def fit(self, df_train, y_train):
        """Fits Clf_feature_selector

        Each strategy computes one score per feature, then discards the
        features whose score is strictly below the ``100 * threshold``-th
        percentile of all scores. Features whose score equals that percentile
        are kept.

        Parameters
        ----------
        df_train : pandas dataframe of shape = (n_train, n_features)
            The train dataset with numerical features and no NA

        y_train : pandas series of shape = (n_train, )
            The target for classification task. Must be encoded.

        Returns
        -------
        object
            self

        Raises
        ------
        ValueError
            If ``df_train`` is not a DataFrame, ``y_train`` is not a Series,
            or ``strategy`` is not one of the available strategies.
        """
        # Sanity checks. isinstance() accepts DataFrame subclasses, which
        # includes the sparse frames of pandas < 1.0, and avoids referencing
        # pd.SparseDataFrame, an attribute that no longer exists in
        # pandas >= 1.0.
        if not isinstance(df_train, pd.DataFrame):
            raise ValueError("df_train must be a DataFrame")

        if not isinstance(y_train, pd.Series):
            raise ValueError("y_train must be a Series")

        if self.strategy == 'variance':
            # Score = standard deviation of each column.
            coef = df_train.std()
            abstract_threshold = np.percentile(coef, 100. * self.threshold)
            self.__to_discard = coef[coef < abstract_threshold].index

        elif self.strategy == 'l1':
            model = LogisticRegression(C=0.01,
                                       penalty='l1',
                                       solver="saga",
                                       n_jobs=-1,
                                       random_state=0)  # to be tuned
            model.fit(df_train, y_train)
            # Score = mean absolute coefficient over the rows of coef_.
            coef = np.mean(np.abs(model.coef_), axis=0)
            abstract_threshold = np.percentile(coef, 100. * self.threshold)
            self.__to_discard = df_train.columns[coef < abstract_threshold]

        elif self.strategy == 'rf_feature_importance':
            model = RandomForestClassifier(n_estimators=50,
                                           n_jobs=-1,
                                           random_state=0)  # to be tuned
            model.fit(df_train, y_train)
            # Score = feature importance reported by the fitted forest.
            coef = model.feature_importances_
            abstract_threshold = np.percentile(coef, 100. * self.threshold)
            self.__to_discard = df_train.columns[coef < abstract_threshold]

        else:
            raise ValueError("Strategy invalid. Please choose between "
                             "'variance', 'l1' or 'rf_feature_importance'")

        # Only reached when one of the strategies above succeeded.
        self.__fitOK = True

        return self

    def transform(self, df):
        """Transforms the dataset

        Parameters
        ----------
        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features and no NA

        Returns
        -------
        pandas dataframe
            A copy of ``df`` without the columns discarded during ``fit``.

        Raises
        ------
        ValueError
            If the selector has not been fitted, or ``df`` is not a DataFrame.
        """
        if not self.__fitOK:
            raise ValueError("call fit or fit_transform function before")

        # Sanity check (see fit() for why isinstance is used).
        if not isinstance(df, pd.DataFrame):
            raise ValueError("df must be a DataFrame")

        return df.drop(self.__to_discard, axis=1)

    def fit_transform(self, df_train, y_train):
        """Fits Clf_feature_selector and transforms the dataset

        Parameters
        ----------
        df_train : pandas dataframe of shape = (n_train, n_features)
            The train dataset with numerical features and no NA

        y_train : pandas series of shape = (n_train, ).
            The target for classification task. Must be encoded.

        Returns
        -------
        pandas dataframe
            The train dataset without the columns discarded by the strategy.
        """
        self.fit(df_train, y_train)

        return self.transform(df_train)