I am building a machine learning model pipeline. I have a custom function that changes the values of a specific column. I have defined a custom transformer, and it works fine on its own, but when I call it from a pipeline it throws an error.
Sample Dataframe
# Toy frame: 'y' is used as the target below, 'a' and 'b' as the features.
df = pd.DataFrame(
    {
        "y": [4, 5, 6],
        "a": [3, 2, 3],
        "b": [2, 3, 4],
    }
)
import numpy as np
import pandas as pd
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
class Extractor(BaseEstimator, TransformerMixin):
    """Bucket numeric codes into "Class A" .. "Class D" or "Other".

    NOTE(review): kept exactly as posted in the question. ``transform``
    returns the estimator itself rather than transformed data, and
    ``fit_transform`` does not accept a ``y`` argument.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, x, y=None):
        """Learn nothing and return the estimator."""
        return self

    def map_values(self, x):
        """Return the label of the first value list that contains ``x``."""
        buckets = (
            ([1.0, 2.0, 3.0], "Class A"),
            ([4.0, 5.0, 6.0], "Class B"),
            ([7.0, 8.0], "Class C"),
            ([9.0, 10.0], "Class D"),
        )
        for members, label in buckets:
            if x in members:
                return label
        return "Other"

    def transform(self, X):
        """Return the estimator itself (``X`` is ignored)."""
        return self

    def fit_transform(self, X):
        """Pass every item of a copy of ``X`` through ``map_values``."""
        duplicate = X.copy()
        return duplicate.apply(self.map_values)
# Stand-alone check: call the transformer directly on the single column 'a'.
e = Extractor()
e.fit_transform(df['a'])
0 Class A
1 Class C
2 Other
3 Class B
Name: a, dtype: object
Pipeline
# Column groups: 'a' goes to the custom mapper, 'b' to the imputer.
features = ["a"]
numeric_features = ["b"]

# NOTE(review): SimpleImputer is not imported in the lines shown above.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
    ]
)

# Single-step pipeline wrapping the custom Extractor.
custom_transformer = Pipeline(
    steps=[
        ("map_value", Extractor()),
    ]
)

# Route each column group to its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("time", custom_transformer, features),
    ]
)

X_new = df[["a", "b"]]
y_new = df["y"]
X_transform = preprocessor.fit_transform(X_new, y_new)
TypeError: All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. 'Pipeline(steps=[('map_value', Extractor())])' (type <class 'sklearn.pipeline.Pipeline'>) doesn't.
I want to make the custom transformer work inside the pipeline.
So I tried working with your code and found some issues. Below are the updated code and some remarks.
First of all, after copy pasting your code and adding the missing import for SimpleImputer, I could not reproduce your error. Instead it showed the error: "TypeError: fit_transform() takes 2 positional arguments but 3 were given". After some research, I found this fix here and adjusted your method.
But now it returned the error: "ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()."
The problem is that your Extractor expects a pandas Series in which each entry is a number, so that it can be mapped to one of your classes. That means it is one-dimensional, like a list. This works well with df['a'], which is basically [3,2,3].
But when you are trying to use df[['a','b']] with it, you use two columns, which means there are two lists, one is [3,2,3] and the other for b is [2,3,4].
So here you need to decide what you actually want your Extractor to do. My first thought was, that you could put a and b into a list, so that it forms [3,2,3,2,3,4], but then you will end up with 6 classes, which does not match your three y entries.
Therefore I believe you want to implement some method, which takes a list of classes and perhaps picks the most represented class or something.
For example you need to map a[0] & b[0] to y[0], so Class A & Class A = 4 (to match with y[0]).
import numpy as np
import pandas as pd
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# Added import
from sklearn.impute import SimpleImputer
class Extractor(BaseEstimator, TransformerMixin):
    """Map numeric codes to coarse class labels, one cell at a time.

    Accepts a ``pandas.Series`` as well as a ``pandas.DataFrame``, so it can
    be called directly on one column or used as a step inside a
    ``Pipeline`` / ``ColumnTransformer`` (which hands over a DataFrame when
    the columns are selected with a list).
    """

    # (members, label) pairs, checked in order; anything else is "Other".
    _BUCKETS = (
        ((1.0, 2.0, 3.0), "Class A"),
        ((4.0, 5.0, 6.0), "Class B"),
        ((7.0, 8.0), "Class C"),
        ((9.0, 10.0), "Class D"),
    )

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so calls can be chained."""
        return self

    def map_values(self, x):
        """Return the class label for a single scalar value ``x``."""
        for members, label in self._BUCKETS:
            if x in members:
                return label
        return "Other"

    def transform(self, X):
        """Return a new object shaped like ``X`` with every cell replaced by its label.

        A Series yields a Series and a DataFrame yields a DataFrame; any
        other array-like is wrapped in a DataFrame first. ``X`` itself is
        not modified.
        """
        if isinstance(X, pd.Series):
            return X.map(self.map_values)
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        # DataFrame.apply passes whole columns, so map inside each column;
        # calling map_values on a column directly raises "truth value of a
        # Series is ambiguous".
        return frame.apply(lambda column: column.map(self.map_values))

    def fit_transform(self, X, y=None, **fit_params):
        """Fit (a no-op) and transform ``X``; ``y`` is accepted and ignored."""
        return self.fit(X, y).transform(X)
df = pd.DataFrame({"y": [4, 5, 6], "a": [3, 2, 3], "b": [2, 3, 4]})

# Stand-alone use on a single column.
e = Extractor()
e.fit_transform(df["a"])

features = ["a"]
numeric_features = ["b"]

numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
    ]
)
custom_transformer = Pipeline(
    steps=[
        ("map_value", Extractor()),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("time", custom_transformer, features),
    ]
)

X_new = df[["a", "b"]]
y_new = df["y"]
# The ColumnTransformer selects 'a' and 'b' by name, so it must receive the
# two-column frame itself. A flattened Series has no such columns and holds
# six values against the three entries of y_new.
X_transform = preprocessor.fit_transform(X_new, y_new)
Is there a method to find the correlation between multiple categorical variables when you have a very big data set with a lot of categorical variables?
As @arpitrathi mentioned, you typically need to use Cramér's V.
As I remember, there are ready-made code snippets for it on the internet.
I will leave you the one I typically use; maybe it will help you.
You will need to import some libs to use it.
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
def cramers_v(x, y):
    '''
    Returns Cramer's V (with the small-sample correction terms below)
    for 2 categorical features.

    x, y: equally long categorical sequences, cross-tabulated with pd.crosstab.

    Returns a float:
    - nan when fewer than two observations are available,
    - 0.0 when the corrected table leaves nothing to divide by
      (e.g. one of the features has a single category),
    - otherwise sqrt(phi2corr / min(kcorr - 1, rcorr - 1)).

    chi2_contingency is called with its defaults, so SciPy applies Yates'
    continuity correction to 2x2 tables.
    '''
    confusion_matrix = pd.crosstab(x, y)
    n = confusion_matrix.sum().sum()
    # (n - 1) is a divisor below; with n < 2 the statistic is undefined.
    if n < 2:
        return float('nan')
    chi2 = stats.chi2_contingency(confusion_matrix)[0]
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    denominator = min(kcorr - 1, rcorr - 1)
    # A constant feature (r == 1 or k == 1) makes the denominator zero;
    # report "no association" instead of dividing by zero.
    if denominator <= 0:
        return 0.0
    return float((phi2corr / denominator) ** 0.5)
def heatmap_categorical_columns_w_dependant_categorical(df, dependent_variable, columns):
    '''
    Takes df, the name of the dependent variable as str, and the names of
    the categorical columns to compare it with.
    Returns the seaborn heatmap Axes showing cramers_v of each column
    against the dependent variable.
    '''
    # `columns` is needed twice (scores and index), so materialise it once.
    columns = list(columns)
    plt.figure(figsize=(8, 10))
    scores = [cramers_v(df[dependent_variable], df[column]) for column in columns]
    corr = pd.DataFrame(scores, index=columns, columns=[dependent_variable])
    # Cramer's V is never negative, so the colour scale starts at 0.
    ax = sns.heatmap(
        corr,
        annot=True,
        cmap='coolwarm',
        vmin=0,
        vmax=1,
    )
    ax.set_title("Cramer V Correlation between Variables")
    return ax
The result will be something like:
You can find a usage example in this notebook:
https://github.com/OzmundSedler/IBM-advanced-DS-coursera/blob/master/4%20Capstone%20/ML-project-draft.ipynb
You can use this code to get correlations among multiple variables using plt.matshow. Where 'bikesharing_data' is the pandas DataFrame.
# Compute the matrix once; it drives both the image and the tick labels.
corr = bikesharing_data.corr()
plt.figure(figsize=[12, 8])
plt.matshow(corr,
            fignum=False,
            aspect='equal')
# Take the labels from the correlation matrix, not from the frame: if the
# matrix covers fewer columns than the frame, frame-based labels would no
# longer line up with the cells.
columns = len(corr.columns)
plt.xticks(range(columns), corr.columns, rotation=90)
plt.yticks(range(columns), corr.columns)
plt.colorbar()
plt.title('Correlation', y=1.2)
plt.show()
Where you can drop the numerical variables using:
# 'x1' ... 'xn' are placeholders for the names of the columns to drop.
features = bikesharing_data.drop(['x1', 'x2', ... ,'xn'], axis=1)
The resultant output is as follows;
Correlation Matrix
Given the following data:
DC,Mode,Mod,Ven,TY1,TY2,TY3,TY4,TY5,TY6,TY7,TY8
Intra,S,Dir,C1,False,False,False,False,False,True,True,False
Intra,S,Co,C1,False,False,False,False,False,False,False,False
Intra,M,Dir,C1,False,False,False,False,False,False,True,False
Inter,S,Co,C1,False,False,False,False,False,False,False,False
Intra,S,Dir,C2,False,True,True,True,True,True,True,False
Intra,S,Co,C2,False,False,False,False,False,False,False,False
Intra,M,Dir,C2,False,False,False,False,False,False,False,False
Inter,S,Co,C2,False,False,False,False,False,False,False,False
Intra,S,Dir,C3,False,False,False,False,True,True,False,False
Intra,S,Co,C3,False,False,False,False,False,False,False,False
Intra,M,Dir,C3,False,False,False,False,False,False,False,False
Inter,S,Co,C3,False,False,False,False,False,False,False,False
Intra,S,Dir,C4,False,False,False,False,False,True,False,True
Intra,S,Co,C4,True,True,True,True,False,True,False,True
Intra,M,Dir,C4,False,False,False,False,False,True,False,True
Inter,S,Co,C4,True,True,True,False,False,True,False,True
Intra,S,Dir,C5,True,True,False,False,False,False,False,False
Intra,S,Co,C5,False,False,False,False,False,False,False,False
Intra,M,Dir,C5,True,True,False,False,False,False,False,False
Inter,S,Co,C5,False,False,False,False,False,False,False,False
Imports:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
To reproduce my DataFrame, copy the data then use:
# Parse the comma-separated text currently on the clipboard into a DataFrame.
df = pd.read_clipboard(sep=',')
I'd like to create a plot conveying the same information as my example, but not necessarily with the same shape (I'm open to suggestions). I'd also like to hover over a color and have the appropriate Ven displayed (e.g. C1, not 1):
Edit 2018-10-17:
The two solutions provided so far, are helpful and each accomplish a different aspect of what I'm looking for. However, the key issue I'd like to resolve, which wasn't explicitly stated prior to this edit, is the following:
I would like to perform the plotting without converting Ven to an int; this numeric transformation isn't practical with the real data. So the actual scope of the question is to plot all categorical data with two categorical axes.
The issue I'm experiencing is the data is categorical and the y-axis is multi-indexed.
I've done the following to transform the DataFrame:
# replace False with NaN
df = df.replace(False, np.nan)
# replace True with the number embedded in Ven (e.g. C1 -> 1, C12 -> 12)
def rep_ven(row):
    """Return the row's cells from position 4 onward with True replaced by the Ven number.

    ``row`` must carry a ``Ven`` label shaped like one prefix character
    followed by digits. Everything after the prefix is parsed, so
    multi-digit identifiers are not cut down to their first digit.
    """
    ven_number = int(row.Ven[1:])
    return row.iloc[4:].replace(True, ven_number)
# Write each row's rep_ven result back into the columns from position 4 on;
# this has to run while the Ven column is still in place.
df.iloc[:, 4:] = df.apply(rep_ven, axis=1)
# drop the Ven column
df = df.drop(columns=['Ven'])
# set multi-index
df_m = df.set_index(['DC', 'Mode', 'Mod'])
Plotting the transformed DataFrame produces:
# Render the indexed frame as an image and label both axes from it.
plt.figure(figsize=(20, 10))
heatmap = plt.imshow(df_m)

column_labels = df_m.columns.values
plt.xticks(range(len(column_labels)), column_labels)

row_labels = df_m.index
plt.yticks(range(len(row_labels)), row_labels)

plt.show()
This plot isn't very streamlined: there are four axis values for each Ven. This is only a subset of the data, so the graph would be very long with all of it.
Here's my solution. Instead of plotting I just apply a style to the DataFrame, see https://pandas.pydata.org/pandas-docs/stable/style.html
# Transform Ven values from "C1", "C2", ..., "C12" to 1, 2, ..., 12.
# Everything after the prefix is kept so multi-digit ids survive, and the
# level is numeric so it sorts 2 before 12.
df['Ven'] = df['Ven'].str[1:].astype(int)
# Given a specific combination of dc, mode, mod, ven,
# do we have any True cells?
g = df.groupby(['DC', 'Mode', 'Mod', 'Ven']).any()
# Let's drop any rows with only False values
g = g[g.any(axis=1)]
# Convert True, False to 1, 0
g = g.astype(int)
# Get the values of the ven index as an int array
# Note: we don't want to drop the ven index!!
# Otherwise styling won't work
ven = g.index.get_level_values('Ven').values.astype(int)
# Multiply 1 and 0 with Ven value
g = g.mul(ven, axis=0)
# Sort the index
g.sort_index(ascending=False, inplace=True)
# Now display the dataframe with styling
# first we get a color map
import matplotlib
try:
    # Colormap registry; matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
    cmap = matplotlib.colormaps['tab10']
except AttributeError:
    # Older Matplotlib without the `colormaps` registry.
    cmap = matplotlib.cm.get_cmap('tab10')
def apply_color_map(val):
    """Return the CSS for one cell: white on white for 0, otherwise a colormap colour."""
    # Zero cells are hidden by painting text and background white.
    if val == 0:
        return 'color: white; background-color: white'
    # Non-zero: look the value up in the colormap and convert to a hex code.
    hex_code = matplotlib.colors.rgb2hex(cmap(val))
    return "color:white; background-color: " + hex_code
g
# Styler.applymap was renamed to Styler.map in pandas 2.1 and the old name
# is deprecated; use whichever this pandas version provides.
styler = g.style
color_cells = styler.map if hasattr(styler, "map") else styler.applymap
color_cells(apply_color_map)
The available matplotlib colormaps can be seen here: Colormap reference, with some additional explanation here: Choosing a colormap
Explanation: Remove rows where TY1-TY8 are all nan to create your plot. Refer to this answer as a starting point for creating interactive annotations to display Ven.
The below code should work:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Parse the comma-separated text currently on the clipboard into a DataFrame.
df = pd.read_clipboard(sep=',')
# replace False with NaN
df = df.replace(False, np.nan)
# replace True with the number embedded in Ven (e.g. C1 -> 1, C12 -> 12)
def rep_ven(row):
    """Swap True for the numeric part of ``row.Ven`` in the cells from position 4 on.

    The whole suffix after the one-character prefix is converted, so an
    identifier such as "C12" yields 12 rather than 1.
    """
    tail = row.iloc[4:]
    return tail.replace(True, int(row.Ven[1:]))
# Write each row's rep_ven result back into the columns from position 4 on;
# this has to run while the Ven column is still in place.
df.iloc[:, 4:] = df.apply(rep_ven, axis=1)
# drop the Ven column
df = df.drop(columns=['Ven'])
# Index labels of the rows with at least one non-NaN value among TY1..TY8.
idx = df[['TY1','TY2', 'TY3', 'TY4','TY5','TY6','TY7','TY8']].dropna(thresh=1).index.values
# Keep only those rows, sorted in descending order of DC, Mode and Mod.
df = df.loc[idx,:].sort_values(by=['DC', 'Mode','Mod'], ascending=False)
# set multi-index
df_m = df.set_index(['DC', 'Mode', 'Mod'])
# Draw the filtered, indexed frame as an image with labelled axes.
plt.figure(figsize=(20, 10))
heatmap = plt.imshow(df_m)

x_labels = df_m.columns.values
y_labels = df_m.index
plt.xticks(range(len(x_labels)), x_labels)
plt.yticks(range(len(y_labels)), y_labels)

plt.show()