Source code for mlbox.encoding.na_encoder
# coding: utf-8
# Author: Axel ARONIO DE ROMBLAY <axelderomblay@gmail.com>
# License: BSD 3 clause
import pandas as pd
import warnings
from sklearn.impute import SimpleImputer
class NA_encoder():
    """Encodes missing values for both numerical and categorical features.

    Columns whose dtype is ``object`` are treated as categorical; every
    other column is treated as numerical.

    Parameters
    ----------
    numerical_strategy : str or float or int. default = "mean"
        The strategy to encode NA for numerical features.
        Available strategies = "mean", "median",
        "most_frequent" or a float/int value

    categorical_strategy : str, default = '<NULL>'
        The strategy to encode NA for categorical features.
        Available strategies = a string or "most_frequent"

    Notes
    -----
    With ``categorical_strategy="most_frequent"`` a mode is only learnt for
    the categorical columns that contain at least one missing value in the
    training set. Missing values that show up at transform time in any
    other categorical column are left untouched.
    """

    # Strategies that are delegated to sklearn's SimpleImputer.
    _IMPUTER_STRATEGIES = ("mean", "median", "most_frequent")

    def __init__(self,
                 numerical_strategy='mean',
                 categorical_strategy='<NULL>'):
        """Init a NA_encoder.

        User can choose numerical strategy and categorical strategy.

        Parameters
        ----------
        numerical_strategy : str or float or int. default = "mean"
            The strategy to encode NA for numerical features.

        categorical_strategy : str, default = '<NULL>'
            The strategy to encode NA for categorical features.
        """
        self.numerical_strategy = numerical_strategy
        self.categorical_strategy = categorical_strategy
        self.__Lcat = []       # categorical (object dtype) column labels
        self.__Lnum = []       # numerical (non-object dtype) column labels
        self.__imp = None      # SimpleImputer used for the numerical columns
        self.__mode = dict()   # column label -> most frequent category
        self.__fitOK = False   # True once fit() has completed successfully

    def get_params(self, deep=True):
        """Get parameters of a NA_encoder object.

        Parameters
        ----------
        deep : bool, default = True
            Unused; accepted so the signature matches the usual
            estimator interface.

        Returns
        -------
        dict
            The current numerical and categorical strategies.
        """
        return {'numerical_strategy': self.numerical_strategy,
                'categorical_strategy': self.categorical_strategy}

    def set_params(self, **params):
        """Set parameters for a NA_encoder object.

        Set numerical strategy and categorical strategy. Calling this
        method always marks the encoder as not fitted. Unknown parameter
        names trigger a warning and are ignored.

        Parameters
        ----------
        numerical_strategy : str or float or int. default = "mean"
            The strategy to encode NA for numerical features.

        categorical_strategy : str, default = '<NULL>'
            The strategy to encode NA for categorical features.
        """
        self.__fitOK = False
        valid_names = self.get_params()

        for name, value in params.items():
            if name not in valid_names:
                warnings.warn("Invalid parameter(s) for encoder NA_encoder. "
                              "Parameter(s) IGNORED. "
                              "Check the list of available parameters with "
                              "`encoder.get_params().keys()`")
            else:
                setattr(self, name, value)

    @staticmethod
    def _is_constant(value):
        """Tell whether ``value`` is a usable numerical fill constant."""
        # bool is a subclass of int but was never an accepted strategy.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    def fit(self, df_train, y_train=None):
        """Fits NA Encoder.

        Parameters
        ----------
        df_train : pandas dataframe of shape = (n_train, n_features)
            The train dataset with numerical and categorical features.

        y_train : pandas series of shape = (n_train, ), default = None
            The target for classification or regression tasks. Unused.

        Returns
        -------
        object
            self

        Raises
        ------
        ValueError
            If the numerical or the categorical strategy is not valid.
        """
        # Drop everything learnt by a previous fit so that a refit (or a
        # failed fit) never leaves stale state behind.
        self.__fitOK = False
        self.__mode = dict()

        self.__Lcat = df_train.dtypes[df_train.dtypes == 'object'].index
        self.__Lnum = df_train.dtypes[df_train.dtypes != 'object'].index

        # Dealing with numerical features
        if self.numerical_strategy in self._IMPUTER_STRATEGIES:
            self.__imp = SimpleImputer(strategy=self.numerical_strategy)
            if len(self.__Lnum) != 0:
                self.__imp.fit(df_train[self.__Lnum])
        elif not self._is_constant(self.numerical_strategy):
            raise ValueError("Numerical strategy for NA encoding is not valid")
        # A constant fill value needs no fitting.

        # Dealing with categorical features
        if not isinstance(self.categorical_strategy, str):
            raise ValueError(
                "Categorical strategy for NA encoding is not valid")

        if self.categorical_strategy == "most_frequent":
            na_count = df_train[self.__Lcat].isnull().sum()
            for col in na_count[na_count > 0].index:
                try:
                    self.__mode[col] = df_train[col].mode().iloc[0]
                except (IndexError, TypeError):
                    # IndexError: the column holds only missing values, so
                    # there is no mode. TypeError: the values cannot be
                    # counted (e.g. unhashable objects). Best effort: fall
                    # back to the default placeholder.
                    self.__mode[col] = "<NULL>"

        self.__fitOK = True
        return self

    def fit_transform(self, df_train, y_train=None):
        """Fits NA Encoder and transforms the dataset.

        Parameters
        ----------
        df_train : pandas.Dataframe of shape = (n_train, n_features)
            The train dataset with numerical and categorical features.

        y_train : pandas.Series of shape = (n_train, ), default = None
            The target for classification or regression tasks.

        Returns
        -------
        pandas.Dataframe of shape = (n_train, n_features)
            The train dataset with missing values encoded.
        """
        self.fit(df_train, y_train)
        return self.transform(df_train)

    def _impute_numerical(self, df):
        """Return the numerical columns of ``df`` with NA values encoded."""
        if self.numerical_strategy in self._IMPUTER_STRATEGIES:
            return pd.DataFrame(self.__imp.transform(df[self.__Lnum]),
                                columns=self.__Lnum,
                                index=df.index)
        if self._is_constant(self.numerical_strategy):
            return df[self.__Lnum].fillna(self.numerical_strategy)
        # Only reachable when the attribute was overwritten after fit().
        raise ValueError("Numerical strategy for NA encoding is not valid")

    def _impute_categorical(self, df):
        """Return the categorical columns of ``df`` with NA values encoded."""
        if self.categorical_strategy == "most_frequent":
            fill_value = self.__mode    # per-column modes learnt in fit()
        else:
            fill_value = self.categorical_strategy
        return df[self.__Lcat].fillna(fill_value)

    def transform(self, df):
        """Transform the dataset.

        Parameters
        ----------
        df : pandas.Dataframe of shape = (n, n_features)
            The dataset with numerical and categorical features.

        Returns
        -------
        pandas.Dataframe of shape = (n, n_features)
            The dataset with missing values encoded.

        Raises
        ------
        ValueError
            If the encoder has not been fitted, or if the numerical
            strategy is not valid.
        """
        if not self.__fitOK:
            raise ValueError("Call fit or fit_transform function before")

        if len(self.__Lnum) == 0:
            return self._impute_categorical(df)

        numerical_part = self._impute_numerical(df)
        if len(self.__Lcat) == 0:
            return numerical_part

        # Both kinds of columns: glue them back together and restore the
        # column order of the input frame.
        return pd.concat((numerical_part, self._impute_categorical(df)),
                         axis=1)[df.columns]