Custom preprocessor in Sklearn pipeline - python-3.x

I am building a machine learning model pipeline. I have a custom function that changes the values of a specific column. I have defined a custom transformer, and it works fine on its own, but if I call it from a pipeline, it throws an error.
Sample DataFrame
df = pd.DataFrame({'y': [4,5,6], 'a':[3,2,3], 'b' : [2,3,4]})
import numpy as np
import pandas as pd
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
class Extractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None

    def fit(self, x, y=None):
        return self

    def map_values(self, x):
        if x in [1.0, 2.0, 3.0]:
            return "Class A"
        if x in [4.0, 5.0, 6.0]:
            return "Class B"
        if x in [7.0, 8.0]:
            return "Class C"
        if x in [9.0, 10.0]:
            return "Class D"
        else:
            return "Other"

    def transform(self, X):
        return self

    def fit_transform(self, X):
        X = X.copy()
        X = X.apply(lambda x: self.map_values(x))
        return X
e = Extractor()
e.fit_transform(df['a'])
0    Class A
1    Class A
2    Class A
Name: a, dtype: object
Pipeline
features = ['a']
numeric_features = ['b']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))])
custom_transformer = Pipeline(steps=[
    ('map_value', Extractor())])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('time', custom_transformer, features)])
X_new = df[['a','b']]
y_new = df['y']
X_transform = preprocessor.fit_transform(X_new,y_new)
TypeError: All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. 'Pipeline(steps=[('map_value', Extractor())])' (type <class 'sklearn.pipeline.Pipeline'>) doesn't.
I want to make the custom preprocessor work in the pipeline.

So I tried working with your code and found some issues. Below is the updated code with some remarks.
First of all, after copy-pasting your code and adding the missing import for SimpleImputer, I could not reproduce your error. Instead it raised "TypeError: fit_transform() takes 2 positional arguments but 3 were given". After some research I found a fix (see the link in the code comments below) and adjusted your method.
But then it raised "ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()."
The problem is that your Extractor expects a pandas Series in which each entry is a number that can be mapped to one of your classes, i.e. something one-dimensional, like a list. This works well with df['a'], which is basically [3, 2, 3].
But when you try to use df[['a','b']] with it, you pass two columns, which means two lists: [3, 2, 3] for a and [2, 3, 4] for b.
So here you need to decide what you actually want your Extractor to do. My first thought was that you could concatenate a and b into one list, [3, 2, 3, 2, 3, 4], but then you end up with six classes, which does not match your three y entries.
Therefore I believe you want to implement some method that takes a list of classes per row and, for example, picks the most represented class.
For example, you need to map a[0] & b[0] to y[0], so Class A & Class A = 4 (to match y[0]); a rough sketch of that idea follows.
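A hedged sketch of such a per-row reducer (the helper name is hypothetical, not part of your code; statistics.mode is a simple, tie-unaware choice):
from statistics import mode

def most_represented_class(row, mapper):
    # map every value in the row to its class, then keep the most frequent one
    return mode(mapper(value) for value in row)

# e.g. most_represented_class([3, 2], e.map_values) -> "Class A"
With that said, here is the full updated code: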
import numpy as np
import pandas as pd
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# Added import
from sklearn.impute import SimpleImputer

class Extractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None

    def fit(self, x, y=None):
        return self

    def map_values(self, x):
        if x in [1.0, 2.0, 3.0]:
            return "Class A"
        if x in [4.0, 5.0, 6.0]:
            return "Class B"
        if x in [7.0, 8.0]:
            return "Class C"
        if x in [9.0, 10.0]:
            return "Class D"
        else:
            return "Other"

    def transform(self, X):
        return self

    def fit_transform(self, X, y=0):
        # TypeError: fit_transform() takes 2 positional arguments but 3 were given
        # Adjusted: https://intellipaat.com/community/2966/fittransform-takes-2-positional-arguments-but-3-were-given-with-labelbinarizer
        # ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
        # -> compare df['a'].shape and X_new.shape. df['a'] is basically [3,2,3] and X_new is
        #    [[3,2,3],[2,3,4]]. Using X_new['a'] or X_new['b'] works. But with both columns,
        #    it's not clear which should be mapped -> therefore ambiguous.
        X = X.copy()
        X = X.apply(lambda x: self.map_values(x))
        return X
df = pd.DataFrame({'y': [4,5,6], 'a':[3,2,3], 'b' : [2,3,4]})
e = Extractor()
e.fit_transform(df['a'])
features = ['a']
numeric_features = ['b']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))])
custom_transformer = Pipeline(steps=[
    ('map_value', Extractor())])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('time', custom_transformer, features)])
X_new = df[['a','b']]
y_new = df['y']
# Tried pd.Series(X_new.values.flatten().tolist()), but got "tuple index out of range",
# because of course there are 6 x and only 3 y values now.
X_transform = preprocessor.fit_transform(pd.Series(X_new.values.flatten().tolist()), y_new)
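For completeness, here is a minimal sketch of an Extractor that works inside ColumnTransformer without overriding fit_transform at all. The assumptions: ColumnTransformer hands each transformer a DataFrame of the selected columns, so transform should map every cell and return the mapped data (not self); fit_transform then comes for free from TransformerMixin. This is a sketch, not a drop-in from the original post:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class Extractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def map_values(self, x):
        if x in [1.0, 2.0, 3.0]:
            return "Class A"
        if x in [4.0, 5.0, 6.0]:
            return "Class B"
        if x in [7.0, 8.0]:
            return "Class C"
        if x in [9.0, 10.0]:
            return "Class D"
        return "Other"

    def transform(self, X):
        # ColumnTransformer passes a DataFrame of the selected columns,
        # so map cell by cell and return the result instead of self
        return pd.DataFrame(X).applymap(self.map_values)
With this version, preprocessor.fit_transform(X_new, y_new) runs without the errors above.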

Related

TypeError: this constructor takes no arguments. __init__() takes 1 positional argument but 4 were given

Here is how I call it:
unscaled_inputs.columns.values
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]
absenteeism_scaler = CustomScaler(columns_to_scale)
When I run the last line of code I get "TypeError: __init__() takes 1 positional argument but 4 were given".
This may be a dumb question, but I am having a difficult time figuring out the error. I created a class called CustomScaler, but when I try running it, it gives me a TypeError. I tried changing __init__ to a different number of underscores, changed the class, the function, etc., but I keep getting "TypeError: this constructor takes no arguments".
# import the libraries needed to create the Custom Scaler
# note that the scaler-related ones are a part of the sklearn package
# moreover, one of them is actually the StandardScaler module,
# so you can imagine that the Custom Scaler is built on it
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class
class CustomScaler(BaseEstimator, TransformerMixin):

    # init, or what information we need to declare a CustomScaler object
    # and what is calculated/declared as we do
    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # scaler is nothing but a StandardScaler object
        self.scaler = StandardScaler(copy, with_mean, with_std)
        # with some columns 'twist'
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    # the fit method, which, again, is based on StandardScaler
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self

    # the transform method which does the actual scaling
    def transform(self, X, y=None, copy=None):
        # record the initial order of the columns
        init_col_order = X.columns
        # scale all features that you chose when creating the instance of the class
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        # declare a variable containing all information that was not scaled
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        # return a data frame which contains all scaled features and all 'not scaled' features
        # use the original order (that you recorded in the beginning)
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

BERT - The truth value of a DataFrame is ambiguous

I am getting into deep learning for some of my models and I am running into issues. I wanted to get it working simply, without any adjustments to the data, but I got
Graph execution error:
followed by a bunch of lines like
File "C:\Users\rober\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
I also tried oversampling in order to get this to work but still no luck. Instead I get
The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Anyways here is my code for this part of my project:
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.metrics import classification_report
import pandas as pd
import time
from sklearn.model_selection import train_test_split
start = time.perf_counter()
df = pd.read_excel(r'F:\Documents\Graduate Research\RMGmail2YrsClassified.xlsx')
pd.set_option('display.max_columns', None)
print("Lets start by looking at the top of the dataframe.")
print(df.head(10))
print(df.groupby('Classification').describe())
print(type(df['Classification']))
#independent
#join into 1 column to do analysis
df['Text']= df['Subject'].astype(str)+ ' ' +df['Body'].astype(str)
#1D array required for vectorizer for DF1 (not in the scope of this)
X = df['Text']
df = df.drop(['Subject','Body','Email Frequency', 'Recieved_Time','Account','Sender_Email'], axis=1)
#DF 2 for Deep Learning
df2 =df
#classification needs to be int
df2['Classification']=df2['Classification'].astype(int)
X1 = df2['Text']
y1 = df2['Classification']
print("df2 is:")
print(df2)
df_Primary = df2[df2['Classification']==1]
print(df_Primary.shape)
df_Secondary = df2[df2['Classification']==2]
print(df_Secondary.shape)
df_Social = df2[df2['Classification']==3]
print(df_Social.shape)
df_Promotional = df2[df2['Classification']==4]
print(df_Promotional.shape)
#df_Primary = df2[df2['Classification']==1]
#print(df_Primary.shape)
df_Primary_oversampled = df_Primary.sample(df_Promotional, replace=True)
print(df_Primary_oversampled.shape)
df_Secondary_oversampled = df_Secondary.sample(df_Promotional, replace=True)
print(df_Primary_oversampled.shape)
df_Social_oversampled = df_Social.sample(df_Promotional, replace=True)
print(df_Primary_oversampled.shape)
df_balanced = pd.concat([df_Primary_oversampled, df_Secondary_oversampled,
                         df_Social_oversampled, df_Promotional])
print(df_balanced.shape)
#change to preprocessed when removing stop words
X2 = df_balanced['Text']
y2 = df_balanced['Category']
X_trainDLT, X_testDLT, y_trainDLT, y_testDLT = train_test_split(
    X2, y2, test_size=0.3, stratify=df['Classification'], random_state=50)
X_trainDLV, X_testDLV, y_trainDLV, y_testDLV = train_test_split(
    X1, y1, test_size=0.3, stratify=df['Classification'], random_state=50)
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)
# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)
# Use inputs and outputs to construct a final model
BERT = tf.keras.Model(inputs=[text_input], outputs = [l])
print(BERT.summary())
print(len(X_trainDLT))
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall')
]
BERT.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=METRICS)
BERT.fit(X_trainDLT, y_trainDLT, epochs=10)
y_predBERT = BERT.predict(X_testDLV)
y_predBERT = y_predBERT.flatten()
print("Bert")
print(classification_report(y_testDLV, y_predBERT))
I also read that I shouldn't use oversampled or synthetically created samples in testing, hence the two train/test splits. In addition, my y value "Classification" is represented as 1-4 for the respective mail categories (primary, secondary, social, promotional).

How to use a Pytorch DataLoader for a dataset with multiple labels

I'm wondering how to create a DataLoader that supports multiple types of labels in Pytorch. How do I do this?
You can return a dict of labels for each item in the dataset, and the DataLoader is smart enough to collate them for you. That is, if you provide a dict for each item, the DataLoader will return a dict whose keys are the label types, and accessing a key of that dict returns a collated tensor of that label type.
See below:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

class M(Dataset):
    def __init__(self):
        super().__init__()
        self.data = np.random.randn(20, 2)
        print(self.data)

    def __getitem__(self, i):
        return self.data[i], {'label_1': self.data[i], 'label_2': self.data[i]}

    def __len__(self):
        return len(self.data)

ds = M()
dl = DataLoader(ds, batch_size=6)
for x, y in dl:
    print(x, '\n', y)
    print(type(x), type(y))
print(type(x), type(y))
[[-0.33029911 0.36632142]
[-0.25303721 -0.11872778]
[-0.35955625 -1.41633132]
[ 1.28814629 0.38238357]
[ 0.72908184 -0.09222787]
[-0.01777293 -1.81824167]
[-0.85346074 -1.0319562 ]
[-0.4144832 0.12125039]
[-1.29546792 -1.56314292]
[ 1.22566887 -0.71523568]]
tensor([[-0.3303, 0.3663],
[-0.2530, -0.1187],
[-0.3596, -1.4163]], dtype=torch.float64)
{'label_1': tensor([[-0.3303,  0.3663],
        [-0.2530, -0.1187],
        [-0.3596, -1.4163]], dtype=torch.float64), 'label_2': tensor([[-0.3303,  0.3663],
        [-0.2530, -0.1187],
        [-0.3596, -1.4163]], dtype=torch.float64)}
<class 'torch.Tensor'> <class 'dict'>
...

Sklearn method in class

I would like to create a class that uses sklearn transformation methods. I found this article and I am using it as an example.
from sklearn import preprocessing
from sklearn.base import TransformerMixin
def minmax(dataframe):
    minmax_transformer = preprocessing.MinMaxScaler()
    return minmax_transformer

class FunctionFeaturizer(TransformerMixin):
    def __init__(self, scaler):
        self.scaler = scaler

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        fv = self.scaler(X)
        return fv

if __name__ == "__main__":
    scaling = FunctionFeaturizer(minmax)
    df = pd.DataFrame({'feature': np.arange(10)})
    df_scaled = scaling.fit(df).transform(df)
    print(df_scaled)
The output is StandardScaler(copy=True, with_mean=True, with_std=True), which is actually the result of preprocessing.StandardScaler().fit(df) when I use it outside the class.
What I am expecting is:
array([[0. ],
[0.11111111],
[0.22222222],
[0.33333333],
[0.44444444],
[0.55555556],
[0.66666667],
[0.77777778],
[0.88888889],
[1. ]])
I feel that I am mixing a few things up here, but I do not know what.
Update
I did some modifications:
def minmax():
    return preprocessing.MinMaxScaler()

class FunctionFeaturizer(TransformerMixin):
    def __init__(self, scaler):
        self.scaler = scaler

    def fit(self, X, y=None):
        return self

    def fit_transform(self, X):
        self.scaler.fit(X)
        return self.scaler.transform(X)

if __name__ == "__main__":
    scaling = FunctionFeaturizer(minmax)
    df = pd.DataFrame({'feature': np.arange(10)})
    df_scaled = scaling.fit_transform(df)
    print(df_scaled)
But now I am receiving the following error:
Traceback (most recent call last):
  File "C:/my_file.py", line 33, in <module>
    test_scale = scaling.fit_transform(df)
  File "C:/my_file.py", line 26, in fit_transform
    self.scaler.fit(X)
AttributeError: 'function' object has no attribute 'fit'
Solving your error
In your code you have:
if __name__ == "__main__":
    scaling = FunctionFeaturizer(minmax)
    df = pd.DataFrame({'feature': np.arange(10)})
    df_scaled = scaling.fit_transform(df)
    print(df_scaled)
Change the line
scaling = FunctionFeaturizer(minmax)
to
scaling = FunctionFeaturizer(minmax())
You need to call the function so that the MinMaxScaler instance is returned to you.
Suggestion
Instead of implementing fit and fit_transform, implement fit and transform, unless you can optimize both into a single fit_transform. This way, it is clearer what you are doing.
If you implement only fit and transform, you can still call fit_transform, because you extend the TransformerMixin class; it will just call both methods in a row, as the sketch below shows.
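A minimal sketch of that suggestion (assuming MinMaxScaler is the scaler you want; note that fit must actually fit the wrapped scaler so that transform has something to work with):
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler

class FunctionFeaturizer(BaseEstimator, TransformerMixin):
    def __init__(self, scaler):
        self.scaler = scaler

    def fit(self, X, y=None):
        # fit the wrapped scaler here so transform can use it later
        self.scaler.fit(X)
        return self

    def transform(self, X):
        return self.scaler.transform(X)

# fit_transform is inherited from TransformerMixin: it calls fit, then transform
scaling = FunctionFeaturizer(MinMaxScaler())
print(scaling.fit_transform([[0], [5], [10]]))  # -> [[0.], [0.5], [1.]]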
Getting your expected results
Your transformer looks at every column of your dataset and distributes the values linearly between 0 and 1.
So, whether you get your expected results really depends on what your df looks like. However, you did not share that with us, so it is difficult to tell.
That said, if you have df = [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]], you will see your expected result.
if __name__ == "__main__":
    scaling = FunctionFeaturizer(minmax())
    df = [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]]
    df_scaled = scaling.fit_transform(df)
    print(df_scaled)
> [[0.        ]
>  [0.11111111]
>  [0.22222222]
>  [0.33333333]
>  [0.44444444]
>  [0.55555556]
>  [0.66666667]
>  [0.77777778]
>  [0.88888889]
>  [1.        ]]

Outputting coefficients when running linear regression using sklearn

I'm attempting to run a simple linear regression on a data set and retrieve the coefficients. The data, which is from a .csv file, looks like:
"","time","LakeHuron"
"1",1875,580.38
"2",1876,581.86
"3",1877,580.97
"4",1878,580.8
...
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model

def Main():
    location = r"~/Documents/Time Series/LakeHuron.csv"
    ts = pd.read_csv(location, sep=",", parse_dates=[0], header=None)
    ts.drop(ts.columns[[0]], axis=1, inplace=True)
    length = len(ts)
    x = ts[1].values
    y = ts[2].values
    x = x.reshape(length, 1)
    y = y.reshape(length, 1)
    regr = linear_model.LinearRegression()
    regr.fit(x, y)
    print(regr.coef_)

if __name__ == "__main__":
    Main()
Since this is a simple linear model, $Y_t = a_0 + a_1 t$, which in this case should be $Y_t = 580.202 - 0.0242t$. All that prints out when running the above code is [[-0.02420111]]. Is there any way to get the second coefficient, 580.202?
I've had a look at the documentation on http://scikit-learn.org/stable/modules/linear_model.html and it outputs two variables in the array.
Looks like you only have one X and one Y, so the output is correct.
Try this:
#coef_ : array, shape (n_features, ) or (n_targets, n_features)
print(regr.coef_)
#intercept_ : array Independent term in the linear model.
print(regr.intercept_)
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression
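For instance, a quick sketch using just the first four rows from the CSV excerpt in the question (not the full series, so the numbers will differ from 580.202 and -0.0242):
import numpy as np
from sklearn import linear_model

# first four rows of the LakeHuron excerpt above
x = np.array([1875, 1876, 1877, 1878]).reshape(-1, 1)
y = np.array([580.38, 581.86, 580.97, 580.80])

regr = linear_model.LinearRegression()
regr.fit(x, y)
print(regr.coef_)       # a_1, the slope
print(regr.intercept_)  # a_0, the constant term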
