Using sklearn feature transformers properly in a Pipeline - scikit-learn

I am trying to pass an email to my Pipeline and get back a probability based on the training. To do that I wrote a bunch of transformers that extract features from the email, like:
from collections import Counter
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class EmailLengthTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return np.array([len(e[0].split("#")[0]) for e in X]).reshape(-1, 1)

class DomainLengthTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return np.array([len(e[0].split("#")[-1]) for e in X]).reshape(-1, 1)

class NumberOfVoulsTransfomer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        vouls = "aeiouAEIOU"
        names = [e[0].split("#")[0] for e in X]
        return np.array(
            [sum(1 for char in name if char in vouls) for name in names]
        ).reshape(-1, 1)

class NumberOfCapitalsTransfomer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return np.array(
            [sum(1 for char in email[0] if char.isupper()) for email in X]
        ).reshape(-1, 1)

class NumberOfDigitsTransfomer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        digits = "0123456789"
        return np.array(
            [sum(1 for char in email[0] if char in digits) for email in X]
        ).reshape(-1, 1)
After this I package them inside another class and send them to a Pipeline like this:
import logging

from sklearn.pipeline import FeatureUnion, Pipeline
from xgboost import XGBClassifier

class EmailsSuspicionModel:
    def __init__(self, X_train, X_valid, y_train, y_valid, model_params):
        self.X_train = X_train
        self.X_valid = X_valid
        self.y_train = y_train
        self.y_valid = y_valid
        self.model_params = model_params
        self.preprocesser = FeatureUnion(
            [
                ("email_length", EmailLengthTransformer()),
                ("domain_length", DomainLengthTransformer()),
                ("number_of_vouls", NumberOfVoulsTransfomer()),
                ("number_of_capitals", NumberOfCapitalsTransfomer()),
                ("number_of_digits", NumberOfDigitsTransfomer()),
                ("highest_char_frequency", HighestCharFrequencyTransfomer()),
                ("number_of_different_chars", NumberOfDifferentChars()),
                (
                    "number_of_consecutive_or_identical_chars",
                    NumberOfConsecutiveOrIdenticalCharsTransfomer(),
                ),
            ]
        )

    def transform(self):
        logging.info("Transform validation data - Required for evaluation")
        valid_preprocesser = self.preprocesser.fit(self.X_train)
        return valid_preprocesser.transform(self.X_valid)

    def pipeline(self):
        logging.info("Build sklearn pipeline with XGBoost model")
        xgb_model = XGBClassifier(eval_metric="logloss", use_label_encoder=False)
        if self.model_params:
            logging.info(f"XGBoost model params: {self.model_params}")
            xgb_model = XGBClassifier(**self.model_params)
        return Pipeline([("preproc", self.preprocesser), ("classifier", xgb_model)])

    def fit(self):
        self.pipeline().fit(
            self.X_train, self.y_train,
            classifier__eval_set=[(self.transform(), self.y_valid)],
        )
So whenever I start using the classes in action:
X_valid_transformed = EmailsSuspicionModel(
    X_train.values, X_valid.values, y_train, y_valid, model_params=None
).transform()
pipeline = EmailsSuspicionModel(
    X_train, X_valid, y_train, y_valid, model_params=None
).pipeline()
pipeline.fit(
    X_train, y_train, classifier__eval_set=[(X_valid_transformed, y_valid)]
)
My model is not yielding the results I expect (I double-checked it against a notebook in which I don't use a Pipeline), and I think it's because X_train is not being trained on the proper feature set, since whenever I do
pipeline['preproc'].transform(['lucasdresl#gmail.com'])
([1, 1, 0, 0, 0])
Clearly that transformation is not being applied correctly, since the expected result from the functions above is ([10, 9, 3, 0, 0]), and I think the model is being trained with this same error.

It's a shape issue. If you instead transform [['lucasdresl#gmail.com']] you'll recover the expected values. You've written your transformers to expect 2D array inputs (otherwise the e[0] or email[0] in each of them selects the first character of the email rather than the whole address).
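A quick way to see the difference, reusing EmailLengthTransformer from the question:

import numpy as np

t = EmailLengthTransformer()
# 1D input: e[0] is the first character 'l', so the measured name length is 1
t.transform(['lucasdresl#gmail.com'])    # -> array([[1]])
# 2D input: e[0] is the whole address, so the name length is 10
t.transform([['lucasdresl#gmail.com']])  # -> array([[10]])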


How does `optimizer.step()` perform an in-place operation?

Here is a simple example that results in an in-place operation error.
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from torch import optim

torch.autograd.set_detect_anomaly(True)

class Loss(nn.Module):
    def __init__(self):
        super(Loss, self).__init__()

    def forward(self, x, target):
        return x[0, 0, 0, 0]

def block(in_channels, features, name):
    return nn.Conv2d(in_channels=in_channels,
                     out_channels=features,
                     kernel_size=3,
                     padding=1,
                     bias=False)

class SharedNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.shared_layer = block(in_channels=3, features=1, name="wow")

    def forward(self, x):
        x = self.shared_layer(x)
        return x

class Network1(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = block(in_channels=1, features=1, name="wow-1")

    def forward(self, x):
        return self.conv(x)

class Network2(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = block(in_channels=1, features=1, name="wow-2")

    def forward(self, x):
        return torch.sigmoid(self.conv(x))

shared_net = SharedNetwork()
net_1 = Network1()
segmentor = Network2()
optimizer = optim.Adam(list(shared_net.parameters()) + list(segmentor.parameters()), lr=1e-6)
optimizer_conf = optim.Adam(list(shared_net.parameters()), lr=1e-6)
loss_fn = Loss()

# 2. Run a forward pass
fake_data = torch.randint(0, 255, (1, 3, 256, 256))/255
target_data_1 = torch.randint(0, 255, (1, 3, 256, 256))/255
target_data_2 = torch.randint(0, 255, (1, 3, 256, 256))/255

optimizer.zero_grad()
optimizer_conf.zero_grad()
features = shared_net(fake_data)
segmented = segmentor(features)
s_loss = loss_fn(segmented, target_data_2)
s_loss.backward(retain_graph=True)
optimizer.step()
out_1 = net_1(features)
loss = loss_fn(out_1, target_data_1)
loss.backward(retain_graph=False)
optimizer_conf.step()
Error message:
UserWarning: Error detected in ConvolutionBackward0. No forward pass information available. Enable detect anomaly during forward pass for more information. (Triggered internally at C:\cb\pytorch_1000000000000\work\torch\csrc\autograd\python_anomaly_mode.cpp:97.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 3, 3, 3]] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
I was able to solve the problem by changing the order in which the optimizers' step functions run:
optimizer_conf.zero_grad()
optimizer.zero_grad()
features = shared_net(fake_data)
segmented = segmentor(features)
s_loss = loss_fn(segmented, target_data_2)
s_loss.backward(retain_graph=True)
out_1 = net_1(features)
loss = loss_fn(out_1, target_data_1)
loss.backward(retain_graph=False)
optimizer_conf.step()
optimizer.step()
The following questions, however, remain:
How does the step method cause an inplace operation in convolution?
Why does moving the steps to the end of the file resolve this error?
NOTE: This loss function is used for simplicity; using dice loss results in the same error!
Before answering the question, I have to mention that having multiple optimizers for one set of parameters seems to be an anti-pattern and is better avoided.
How does the step method cause an inplace operation in convolution?
A: The step method applies the gradient update to the weights in place; schematically it does something like the following:
param.data -= lr * param.grad
which is an in-place operation on the parameter tensor (Adam's actual update rule is more involved, but it still mutates the weights in place).
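You can watch this happen through the internal version counter autograd uses to detect stale saved tensors (_version is a private attribute, shown here purely for illustration):

import torch

w = torch.ones(3, requires_grad=True)
print(w._version)  # 0
with torch.no_grad():
    w -= 0.1       # in-place update, analogous to what optimizer.step() does
print(w._version)  # 1 -- any graph that saved version 0 of w is now considered stale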
Why does moving the steps to the end of the file resolve this error?
A: By moving the step calls after the second backward call, the weights are no longer modified between the forward pass and the backward passes that depend on them. As a result, the tensors saved for gradient computation are still at their expected version, and no error is raised.
To sum up, it's best to have only one optimizer per set of parameters; the previous example could be coded in the following way:
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from torch import optim

torch.autograd.set_detect_anomaly(True)

class Loss(nn.Module):
    def __init__(self):
        super(Loss, self).__init__()

    def forward(self, x, target):
        return x[0, 0, 0, 0]

def block(in_channels, features, name):
    return nn.Conv2d(in_channels=in_channels,
                     out_channels=features,
                     kernel_size=(3, 3),
                     padding=1,
                     bias=False)

class SharedNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.shared_layer = block(in_channels=3, features=1, name="wow")

    def forward(self, x):
        x = self.shared_layer(x)
        return x

class Network1(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = block(in_channels=1, features=1, name="wow-1")

    def forward(self, x):
        return self.conv(x)

class Network2(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = block(in_channels=1, features=1, name="wow-2")

    def forward(self, x):
        return torch.sigmoid(self.conv(x))

torch.manual_seed(0)
shared_net = SharedNetwork()
net_1 = Network1()
net_2 = Network2()
shared_optimizer = optim.Adam(list(shared_net.parameters()), lr=1e-6)
net_1_optimizer = optim.Adam(list(net_1.parameters()), lr=1e-6)
net_2_optimizer = optim.Adam(list(net_2.parameters()), lr=1e-6)  # was segmentor.parameters(), which is undefined here
loss_fn = Loss()

# 2. Run a forward pass
fake_data = torch.randint(0, 255, (1, 3, 256, 256))/255
target_data_1 = torch.randint(0, 255, (1, 3, 256, 256))/255
target_data_2 = torch.randint(0, 255, (1, 3, 256, 256))/255

net_2_optimizer.zero_grad()
features = shared_net(fake_data)
net_2_out = net_2(features)
s_loss = loss_fn(net_2_out, target_data_2)
s_loss.backward(retain_graph=True)
net_2_optimizer.step()

net_1_optimizer.zero_grad()
shared_optimizer.zero_grad()
out_1 = net_1(features)
loss = loss_fn(out_1, target_data_1)
loss.backward(retain_graph=False)
net_1_optimizer.step()
shared_optimizer.step()
Note: If you want to have two different learning rates for different losses applied to one set of parameters, you can scale each loss by a value reflecting its importance. For example, you can multiply loss_1 by 0.1 and loss_2 by 0.5. Or you can use backward hooks, as mentioned in this comment:
backward-hook
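For example, sticking with the names from the corrected code above (the 0.1 and 0.5 weights are illustrative):

# one optimizer on the shared parameters, two weighted losses
shared_optimizer.zero_grad()
features = shared_net(fake_data)
combined_loss = 0.1 * loss_fn(net_1(features), target_data_1) \
              + 0.5 * loss_fn(net_2(features), target_data_2)
combined_loss.backward()
shared_optimizer.step()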

How do I set the dimensions of Conv1d correctly?

This is a toy example as I'm learning PyTorch and using it on one-dimensional time series, in this case a sine wave.
I'm trying to use Conv1d, but I get the following error:
RuntimeError: Given groups=1, weight of size [5, 1, 2], expected input[1, 994, 5] to have 1 channels, but got 994 channels instead
My 'lookback' is 5 time steps, and the shape of my data batch is [994, 5].
What am I doing wrong?
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import pytorch_lightning as pl
import matplotlib.pyplot as plt
from torch import nn, tensor

class TsDs(torch.utils.data.Dataset):
    def __init__(self, s, l=5): super().__init__();self.l,self.s=l,s
    def __len__(self): return self.s.shape[0] - 1 - self.l
    def __getitem__(self, i): return self.s[i:i+self.l], torch.log(self.s[i+self.l+1]/self.s[i+self.l])
    def plt(self): plt.plot(self.s)

class TsDm(pl.LightningDataModule):
    def __init__(self, length=5000, batch_size=1000): super().__init__();self.batch_size=batch_size;self.s = torch.sin(torch.arange(length)*0.2) + 5
    def train_dataloader(self): return DataLoader(TsDs(self.s[:3999]), batch_size=self.batch_size, shuffle=False)
    def val_dataloader(self): return DataLoader(TsDs(self.s[4000:]), batch_size=self.batch_size)

dm = TsDm()

class MyModel(pl.LightningModule):
    def __init__(self, learning_rate=0.01):
        super().__init__();self.learning_rate = learning_rate
        self.network = nn.Sequential(nn.Conv1d(1,5,2),nn.ReLU(),nn.Linear(5,3),nn.ReLU(),nn.Linear(3,1), nn.Tanh())
        # self.network = nn.Sequential(nn.Linear(5,5),nn.ReLU(),nn.Linear(5,3),nn.ReLU(),nn.Linear(3,1), nn.Tanh())
    def forward(self, x): return self.network(x)
    def step(self, batch, batch_idx, stage):
        x, y = batch
        loss = -torch.mean(self(x)*y)
        print(loss)
        return loss
    def training_step(self, batch, batch_idx): return self.step(batch, batch_idx, "train")
    def validation_step(self, batch, batch_idx): return self.step(batch, batch_idx, "val")
    def configure_optimizers(self): return torch.optim.SGD(self.parameters(), lr=self.learning_rate)

mm = MyModel(0.01);trainer = pl.Trainer(max_epochs=10)
trainer.fit(mm, datamodule=dm)
There are two issues in your code:
Looking at the documentation of nn.Conv1d, your input shape should be (B, C, L). In your default case, you have L=5, the sequence length, but you need to create that extra dimension representing the feature size of a sequence element, here C=1. You can do so by changing TsDs's __getitem__ function to:
def __getitem__(self, i):
    x = self.s[i:i+self.l].unsqueeze(0)  # minibatch x shaped (1, self.l)
    y = torch.log(self.s[i+self.l+1]/self.s[i+self.l])  # scalar log-return target
    return x, y
Your convolutional layer has a stride of 1 and a kernel size of 2, which means its output will be shaped (B, 5, L-1=4). The following layer is a fully connected layer instantiated as nn.Linear(5, 3), which means it expects (*, H_in=5) and will output (*, H_out=3). You have a few options:
You can flatten the conv1d output with nn.Flatten and feed it to a bigger fully connected layer (for instance nn.Linear(20, 3)).
You can use a convolutional layer with a wider kernel; if you use a kernel of 5 (your sequence length), you will end up with a tensor shaped (B, 5, 1), which, after squeezing the last dimension, you can feed to nn.Linear(5, 3). This approach doesn't really scale when L is changed, though.
You could apply an nn.AvgPool1d to get an average representation of the sequence after the convolutional layers have been applied.
Those are just a few directions; a sketch of the first option follows.
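A minimal sketch of the flatten approach, assuming the lookback of 5 from the question:

import torch
from torch import nn

net = nn.Sequential(
    nn.Conv1d(1, 5, 2),  # (B, 1, 5) -> (B, 5, 4)
    nn.ReLU(),
    nn.Flatten(),        # (B, 5, 4) -> (B, 20)
    nn.Linear(20, 3),
    nn.ReLU(),
    nn.Linear(3, 1),
    nn.Tanh(),
)

x = torch.randn(8, 1, 5)  # batch of 8, one channel, lookback of 5
print(net(x).shape)       # torch.Size([8, 1])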

From two Tensors (labels, inputs) to DataLoader

I'm working with two tensors, inputs and labels, and I want to use them together to train a model. I'm using torch 1.7, but I can't use TensorDataset() and then apply DataLoader(), due to some incompatibilities with other packages when I use TensorDataset(). Is there another solution to my problem?
Summary:
2 Tensors --> DataLoader without using TensorDataset()
You can construct your own custom DataSet:
import torch

class MyDataSet(torch.utils.data.Dataset):
    def __init__(self, x, y):
        super(MyDataSet, self).__init__()
        # store the raw tensors
        self._x = x
        self._y = y

    def __len__(self):
        # a Dataset must know its size
        return self._x.shape[0]

    def __getitem__(self, index):
        x = self._x[index, :]
        y = self._y[index, :]
        return x, y
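Hypothetical usage, with made-up shapes, to get from the two tensors to a DataLoader (note that __getitem__ above indexes y with [index, :], so the labels tensor should be 2D):

inputs = torch.randn(100, 8)  # 100 samples, 8 features
labels = torch.randn(100, 1)  # 100 targets

loader = torch.utils.data.DataLoader(MyDataSet(inputs, labels), batch_size=16, shuffle=True)
for x_batch, y_batch in loader:
    pass  # x_batch: (16, 8), y_batch: (16, 1) for all full batches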

Change custom loss parameter and NN parameter with respect to epoch

I have a Keras model defined in the following manner (I tried to keep only the necessary parts):
temperature = 5.0

def knowledge_distillation_loss(y_true, y_pred, lambda_const):
    y_true, logits = y_true[:, :10], y_true[:, 10:]
    y_soft = K.softmax(logits/temperature)
    y_pred, y_pred_soft = y_pred[:, :10], y_pred[:, 10:]
    return lambda_const*logloss(y_true, y_pred) + logloss(y_soft, y_pred_soft)

def get_model(num_labels):
    # Some layers for model
    model.add(Dense(num_labels))
    logits = model.layers[-1].output
    probabilities = Activation('softmax')(logits)
    # softed probabilities
    logits_T = Lambda(lambda x: x/temperature)(logits)
    probabilities_T = Activation('softmax')(logits_T)
    output = concatenate([probabilities, probabilities_T])
    model = Model(model.input, output)
    lambda_const = 0.07
    model.compile(
        optimizer=optimizers.SGD(lr=1e-1, momentum=0.9, nesterov=True),
        loss=lambda y_true, y_pred: knowledge_distillation_loss(y_true, y_pred, lambda_const),
        metrics=[accuracy])
    return model
I am following this reference.
This is implemented using fit_generator() in Keras with the TF backend. Obviously, I will have trouble when loading the model, since temperature is hard-coded.
Also, I wish to update the temperature parameter with respect to the epoch number, in both the loss function and the model.
How do I define such a control signal?
I've turned this into a complete example of one way to do this.
You could make a class for the loss function.
class TemperatureLossFunction:
    def __init__(self, temperature):
        self.temperature = temperature

    def loss_fun(self, y_truth, y_pred):
        return self.temperature*keras.losses.mse(y_truth, y_pred)

    def setTemperature(self, t, session=None):
        if session:
            session.run(self.temperature.assign(t))
        elif tensorflow.get_default_session():
            tensorflow.get_default_session().run(self.temperature.assign(t))

class TemperatureLossCallback(keras.callbacks.Callback):
    def __init__(self, temp_lf):
        self.temp_lf = temp_lf

    def on_epoch_end(self, epoch, params):
        self.temp_lf.setTemperature(epoch)
I've created two methods for working with this; the first method creates and saves the model.
def init(session):
    global temperature # global for serialization issues
    temperature = tensorflow.Variable(5.0)
    tlo = TemperatureLossFunction(temperature)
    inp = keras.layers.Input((4, 4))
    l1 = keras.layers.Lambda(lambda x: temperature*x)
    op = l1(inp)
    m = keras.models.Model(inputs=[inp], outputs=[op])
    m.compile(optimizer=keras.optimizers.SGD(0.01), loss=tlo.loss_fun)
    # make sure the session is the one you're using!
    session.run(temperature.initializer)
The first test I run makes sure we are changing the value.
m.evaluate( numpy.ones((1, 4, 4)), numpy.zeros((1, 4, 4)) )
session.run(temperature.assign(1))
m.evaluate( numpy.ones((1, 4, 4)), numpy.zeros((1, 4, 4)) )
The second test I run makes sure we can change the values with a callback.
cb = TemperatureLossCallback(tlo)
def gen():
    for i in range(10):
        yield numpy.ones((1, 4, 4)), numpy.zeros((1, 4, 4))

m.fit_generator(
    gen(), steps_per_epoch=1, epochs=10, callbacks=[cb]
)
m.save("junk.h5")
Finally, to demonstrate reloading the file.
def restart(session):
    global temperature
    temperature = tensorflow.Variable(5.0)
    tlo = TemperatureLossFunction(temperature)
    loss_fun = tlo.loss_fun
    m = keras.models.load_model(
        "junk.h5",
        custom_objects={"loss_fun": tlo.loss_fun}
    )
    session.run(temperature.initializer)
    m.evaluate(numpy.ones((1, 4, 4)), numpy.zeros((1, 4, 4)))
    session.run(temperature.assign(1))
    m.evaluate(numpy.ones((1, 4, 4)), numpy.zeros((1, 4, 4)))
For completeness, this is just the code I use to start the program:
import sys
import tensorflow

if __name__=="__main__":
    sess = tensorflow.Session()
    with sess.as_default():
        if "restart" in sys.argv:
            restart(sess)
        else:
            init(sess)
One downside of this method: if you run this, you will see that the temperature variable does not get loaded from the model file; it takes on the value assigned in the code.
On the plus side, both the loss function and the layer reference the same Variable.
One way I found to save the variable value is to create a new layer and use the variable as the weight for the new layer.
class VLayer(keras.layers.Layer):
    def __init__(self, *args, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        self.v1 = self.add_weight(
            dtype="float32",
            shape=(),
            trainable=False,
            initializer="zeros"
        )

    def call(self, x):
        return x*self.v1

    def setValue(self, val):
        self.set_weights([numpy.array(val)])  # set_weights expects a list of arrays
Now when you load the model, the weight will be loaded. Unfortunately, I could not find a way to link the weight to a Variable on load, so there will be two variables: one for the loss function and one for the layer. Both of them can be set from a callback, though, so I feel this method is on a more robust path.
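A sketch of that idea (SyncTemperatureCallback is a hypothetical name; it assumes the TemperatureLossFunction and VLayer classes above):

class SyncTemperatureCallback(keras.callbacks.Callback):
    def __init__(self, temp_lf, vlayer):
        self.temp_lf = temp_lf  # TemperatureLossFunction holding the tf Variable
        self.vlayer = vlayer    # VLayer instance used inside the model

    def on_epoch_end(self, epoch, logs=None):
        # keep the loss-function variable and the layer weight in sync
        self.temp_lf.setTemperature(epoch)
        self.vlayer.setValue(epoch)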

AttributeError: 'numpy.ndarray' object has no attribute 'fit' when calling fit_transform on a pipeline

I'm getting the below error when I call pipeline.fit_transform(X_train, y_train).
AttributeError: 'numpy.ndarray' object has no attribute 'fit'
The individual transformers in the pipeline work fine, but when I combine them in the pipeline I get the error.
X, y = training_data.drop('Response', axis=1), training_data['Response']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)
class preprocess(TransformerMixin, BaseEstimator):
    def __init__():
        self.X = None

    def fit(self, X, y=None):
        self.X = X
        self.PI2 = 'Product_Info_2'
        self.PI2_categories = list(training_data[self.PI2].unique())
        return self

    def transform(self, X, y=None):
        Xt = X.copy()
        Xt = pd.concat([Xt, pd.get_dummies(Xt[self.PI2])], axis=1).drop(self.PI2, axis=1)
        Xt.drop('Id', axis=1, inplace=True)
        Xt.fillna(value=0, inplace=True)
        return np.array(Xt)
class apply_NB(TransformerMixin, BaseEstimator):
    def __init__(self):
        self.gridCV = None
        self.params = {"var_smoothing": [x*10**(-9) for x in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,
                                                              0.8, 0.9, 1, 1.5, 2, 2.5, 3, 3.5,
                                                              4, 4.5, 5]]}
        self.best_params = None

    def fit(self, X, y):
        self.gridCV = GridSearchCV(GaussianNB(), self.params, verbose=10, n_jobs=-1)
        self.gridCV.fit(X, y)
        self.best_params = self.gridCV.best_params_
        return self

    def transform(self, X, y=None):
        Xt = self.gridCV.predict(X)
        return Xt
nb_pipeline = Pipeline([('preprocess', preprocess),
                        ('fit_NB', apply_NB())])
nb_pipeline.fit_transform(X_train, y_train)
When I try the final line I just get:
AttributeError: 'numpy.ndarray' object has no attribute 'fit'
You forgot to put self in the __init__ of preprocess:
class preprocess(TransformerMixin, BaseEstimator):
    def __init__(self):
        self.X = None
and then you have to instantiate the preprocess class inside the Pipeline as well:
nb_pipeline = Pipeline([('preprocess', preprocess()),
                        ('fit_NB', apply_NB())])
Seems to work for me after making these changes!
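For what it's worth, the error message itself points at the missing instantiation: with the class (rather than an instance) in the pipeline, fit_transform becomes an unbound call and X_train gets bound as self. Roughly what Pipeline ended up running:

# preprocess is the class here, so self = X_train and X = y_train
preprocess.fit_transform(X_train, y_train)
# TransformerMixin.fit_transform calls self.fit(X, y).transform(X),
# i.e. X_train.fit(...) -> AttributeError: 'numpy.ndarray' object has no attribute 'fit'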
