Why is my AlexNet to train on CIFAR10 is not working? - pytorch

I'm trying to implement AlexNet and train on CIFAR10. However, the loss is not diminishing. Can you please tell me any problem?
class Alexnet(nn.Module):
def __init__(self):
super().__init__()
self.conv1=nn.Sequential(nn.Conv2d(3,64,3,2,1),nn.ReLU(),nn.MaxPool2d(2))
self.conv2=nn.Sequential(nn.Conv2d(64,192,3,1,padding=2),nn.MaxPool2d(2))
self.conv3=nn.Sequential(nn.Conv2d(192,384,3,1,padding=1),nn.ReLU())
self.conv4=nn.Sequential(nn.Conv2d(384,256,3,1,padding=1),nn.ReLU())
self.conv5=nn.Sequential(nn.Conv2d(256,256,3,1,padding=1),nn.ReLU(),nn.MaxPool2d(2))
self.dropout=nn.Dropout(0.3)
self.fc1=nn.Sequential(nn.Flatten(),nn.Linear(256*2*2,4096),nn.ReLU())
self.fc2=nn.Sequential(nn.Linear(4096,4096),nn.ReLU())
self.fc3=nn.Sequential(nn.Linear(4096,10),nn.ReLU())
def forward(self,x):
x=self.conv1(x)
x=self.conv2(x)
x=self.conv3(x)
x=self.conv4(x)
x=self.conv5(x)
x=self.dropout(x)
x=self.fc1(x)
x=self.dropout(x)
x=self.fc2(x)
x=self.fc3(x)
return x

Related

Saving the model architecture with activation functions in PyTorch

I use PyTorch for training neural networks. While saving the model, the weights of the network are saved, while the activation functions are not captured. Now, I reload the model from the saved weights with the activation functions changed, the model load still does not throw error. Further, the network outputs incorrect values (obviously). Is there a way to save the structure of the neural network along with the weights? An MWE is presented below.
import torch
from torch import nn
class Test(nn.Module):
def __init__(self):
super(Test, self).__init__()
self.fc1 = nn.Linear(10, 25)
self.fc2 = nn.Linear(25, 10)
self.relu = nn.ReLU()
self.tanh = nn.Tanh()
def forward(self, inputs):
return self.tanh(self.fc2(self.relu(self.fc1(inputs))))
To save
test = Test().float()
torch.save(test.state_dict(), "test.pt")
To load
import torch
from torch import nn
class Test1(nn.Module):
def __init__(self):
super(Test, self).__init__()
self.fc1 = nn.Linear(10, 25)
self.fc2 = nn.Linear(25, 10)
self.relu = nn.ReLU()
self.tanh = nn.Tanh()
def forward(self, inputs):
return self.relu(self.fc2(self.tanh(self.fc1(inputs))))
test1 = Test1().float()
test1.load_state_dict(torch.load("test.pt")) # Loads without error. However the activation functions, tanh and relu are interchanged, and the network outputs incorrect values.
Is there a way to also capture the activation functions, while saving? Thanks.

What can be the cause of the validation loss increasing and the accuracy remaining constant to zero while the train loss decreases?

I am trying to solve a multiclass text classification problem. Due to specific requirements from my project I am trying to use skorch (https://skorch.readthedocs.io/en/stable/index.html) to wrap pytorch for the sklearn pipeline. What I am trying to do is fine-tune a pretrained version of BERT from Huggingface (https://huggingface.co) with my dataset. I have tried, in the best of my knowledge, to follow the instructions from skorch on how I should input my data, structure the model etc. Still during the training the train loss decreases until the 8th epoch where it starts fluctuating, all while the validation loss increases from the beginning and the validation accuracy remains constant to zero. My pipeline setup is
from sklearn.pipeline import Pipeline
pipeline = Pipeline(
[
("tokenizer", Tokenizer()),
("classifier", _get_new_transformer())
]
in which I am using a tokenizer class to preprocess my dataset, tokenizing it for BERT and creating the attention masks. It looks like this
import torch
from transformers import AutoTokenizer, AutoModel
from torch import nn
import torch.nn.functional as F
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm
import numpy as np
class Tokenizer(BaseEstimator, TransformerMixin):
def __init__(self):
super(Tokenizer, self).__init__()
self.tokenizer = AutoTokenizer.from_pretrained(/path/to/model)
def _tokenize(self, X, y=None):
tokenized = self.tokenizer.encode_plus(X, max_length=20, add_special_tokens=True, pad_to_max_length=True)
tokenized_text = tokenized['input_ids']
attention_mask = tokenized['attention_mask']
return np.array(tokenized_text), np.array(attention_mask)
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
word_tokens, attention_tokens = np.array([self._tokenize(string)[0] for string in tqdm(X)]), \
np.array([self._tokenize(string)[1] for string in tqdm(X)])
X = word_tokens, attention_tokens
return X
def fit_transform(self, X, y=None, **fit_params):
self = self.fit(X, y)
return self.transform(X, y)
then I initialize the model I want to fine-tune as
class Transformer(nn.Module):
def __init__(self, num_labels=213, dropout_proba=.1):
super(Transformer, self).__init__()
self.num_labels = num_labels
self.model = AutoModel.from_pretrained(/path/to/model)
self.dropout = torch.nn.Dropout(dropout_proba)
self.classifier = torch.nn.Linear(768, num_labels)
def forward(self, X, **kwargs):
X_tokenized, attention_mask = torch.stack([x.unsqueeze(0) for x in X[0]]),\
torch.stack([x.unsqueeze(0) for x in X[1]])
_, X = self.model(X_tokenized.squeeze(), attention_mask.squeeze())
X = F.relu(X)
X = self.dropout(X)
X = self.classifier(X)
return X
I initialize the model and create the classifier with skorch as follows
from skorch import NeuralNetClassifier
from skorch.dataset import CVSplit
from skorch.callbacks import ProgressBar
import torch
from transformers import AdamW
def _get_new_transformer() -> NeuralNetClassifier:
transformer = Transformer()
net = NeuralNetClassifier(
transformer,
lr=2e-5,
max_epochs=10,
criterion=torch.nn.CrossEntropyLoss,
optimizer=AdamW,
callbacks=[ProgressBar(postfix_keys=['train_loss', 'valid_loss'])],
train_split=CVSplit(cv=2, random_state=0)
)
return net
and I use fit like that
pipeline.fit(X=dataset.training_samples, y=dataset.training_labels)
in which my training samples are lists of strings and my labels are the an array containing the indexes of each class, as pytorch requires.
This is a sample of what happens
training history
I have tried to keep train only the fully connected layer and not BERT but I have the same issue again. I also tested the train accuracy after the training process and it was only 0,16%. I would be grateful for any advice or insight on how to solve my problem! I am pretty new with skorch and not so comfortable with pytorch yet and I believe that I am missing something really simple. Thank you very much in advance!

How to implement some trainable parameters in the model of Keras like nn.Parameters() in Pytorch?

I just wanna to implement some trainable parameters in my model with Keras. In Pytorch, we can do it by using torch.nn.Parameter() like below:
self.a = nn.Parameter(torch.ones(8))
self.b = nn.Parameter(torch.zeros(16,8))
I think by doing this in pytorch it can add some trainable parameters into the model. And now I wanna to know, how to achieve similar operations in keras?
Any suggestions or advice are welcomed!
THX! :)
p.s. I just write a custom layer in Keras as below:
class Mylayer(Layer):
def __init__(self,input_dim,output_dim,**kwargs):
self.input_dim = input_dim
self.output_dim = output_dim
super(Mylayer,self).__init__(**kwargs)
def build(self):
self.kernel = self.add_weight(name='pi',
shape=(self.input_dim,self.output_dim),
initializer='zeros',
trainable=True)
self.kernel_2 = self.add_weight(name='mean',
shape=(self.input_dim,self.output_dim),
initializer='ones',
trainable=True)
super(Mylayer,self).build()
def call(self,x):
return x,self.kernel,self.kernel_2
and I wanna to know if I haven't change the tensor which pass through the layer, should I write the function def compute_output_shape() for necessary?
You need to create the trainable weights in a custom layer:
class MyLayer(Layer):
def __init__(self, my_args, **kwargs):
#do whatever you need with my_args
super(MyLayer, self).__init__(**kwargs)
#you create the weights in build:
def build(self, input_shape):
#use the input_shape to infer the necessary shapes for weights
#use self.whatever_you_registered_in_init to help you, like units, etc.
self.kernel = self.add_weight(name='kernel',
shape=the_shape_you_calculated,
initializer='uniform',
trainable=True)
#create as many weights as necessary for this layer
#build the layer - equivalent to self.built=True
super(MyLayer, self).build(input_shape)
#create the layer operation here
def call(self, inputs):
#do whatever operations are needed
#example:
return inputs * self.kernel #make sure the shapes are compatible
#tell keras about the output shape of your layer
def compute_output_shape(self, input_shape):
#calculate the output shape based on the input shape and your layer's rules
return calculated_output_shape
Now use your layer in the model.
If you are using eager execution on with tensorflow and creating a custom training loop, you can work pretty much the same way you do with PyTorch, and you can create weights outside layers with tf.Variable, passing them as parameters to the gradient calculation methods.

How to add BatchNormalization loss to gradient calculation in tensorflow 2.0 using keras subclass API

Using the keras subclass API it is easy enough to add a a batch normalization layer however the layer.losses list always appears empty. What is the correct method of including in the train loss when doing tape.gradient(loss, lossmodel.trainable_variables) where lossmodel is some separate keras subclass model defining a more complicated loss function that must include the gradient losses?
For example, this is minimal model with ONLY the batch norm layer. It has no loss AFAIK
class M(tf.keras.Model):
def __init__(self, axis):
super().__init__()
self.layer = tf.keras.layers.BatchNormalization(axis=axis, scale=False, center=True, virtual_batch_size=1, input_shape=(6,))
def call(self, x):
out = self.layer(x)
return out
m = M(1)
In [77]: m.layer.losses
Out[77]: []

Parallelization of multiples independent models in tensorflow-gpu / keras

I need to train a set of models but do not benefit from GPU acceleration using tensorflow-gpu / keras as time augments linearly with the number of models trained.
In
class Models(tf.keras.Model):
def __init__(self,N_MODELS=1):
super(Models, self).__init__()
self.block_i = [estimate_affine()
for node in range(N_MODELS)]
def call(self, inputs):
x = [self.block_i[i](input_i) for i,input_i in enumerate(inputs)]
return x
a list of N_MODELS layers are built and as are idenpendant should be parallelized. As it is not the case, even though output is what I expect, I guess my implementation is not optimal. Any idea how to make it parallelizable ?
Best
Paul
Here is a toynet of N_MODELS of linear regression
import tensorflow as tf
tf.enable_eager_execution()
from tensorflow.keras import layers
import numpy as np
from numpy import random
import time
class estimate_affine(layers.Layer):
def __init__(self):
'''
'''
super(estimate_affine, self).__init__()
self.a = tf.Variable(initial_value=[0.], dtype='float32',trainable=True,name='par1')
self.b = tf.Variable(initial_value=[0.], dtype='float32',trainable=True,name='par2')
def call(self, inputs):
return (self.a,self.b)
class Models(tf.keras.Model):
def __init__(self,N_MODELS=1):
super(Models, self).__init__()
self.block_i = [estimate_affine()
for node in range(N_MODELS)]
def call(self, inputs):
x = [self.block_i[i](input_i) for i,input_i in enumerate(inputs)]
return x
N_ITERATIONS=100
N_POINTS=100
ls_t=[]
for N_MODELS in [5,10,50,100,1000]:
t=time.time()
### Aim is to fit N_MODELS on N_POINTS which are basically N_MODELS of ax+b
a=np.random.randint(0,10,N_MODELS)
b=np.random.randint(0,10,N_MODELS)
noise=np.random.rand(N_POINTS) * 1
x=np.linspace(0,1,N_POINTS)
dataset=np.array([a_i *( x + noise) + b_i for a_i,b_i in zip(a,b)])
model=Models(N_MODELS=N_MODELS)
optimizer=tf.keras.optimizers.SGD(learning_rate=5e-3)
for i in range(N_ITERATIONS):
with tf.GradientTape() as tape:
outputs=model(dataset)
L=tf.reduce_sum([((outputs[idx][0]*x+outputs[idx][1])
- dataset[idx,:])**2 for idx in range(N_MODELS)])
grads = tape.gradient(L, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
t_diff=time.time()-t
print('N_MODEL : {}, time : {}'.format(N_MODELS,t_diff))
ls_t.append(t_diff)

Resources