Implementing Maxout Activation in Theano - theano

The only example of maxout implementation in Theano is on this link. My understanding is that I use any activation function and then maxout is just a post processing of the hidden layer outputs.
I tried to apply this to my own HiddenLayer class. Below is the class before maxout:
class HiddenLayer(object):
def __init__(self, rng, input, n_in, n_out, W=None, b=None, activation=T.tanh):
'''
Initialise the Hidden Layer
Parameters:
rng - random number generator
input - input values from the preceding layer
n_in - number of input nodes (number of nodes of the preceding layer)
n_out - number of output nodes (number of nodes of this hidden layer)
W - the Weights of the layer
b - the bias of the layer
activation - the activation function: T.tanh(), relu()
'''
self.input = input
W, b = self.init_weights(rng, n_in, n_out, W, b, activation) # initialise the wrights of a hidden layer
self.W = W; self.b = b;
lin_output = T.dot(input, self.W) + self.b
self.output = (lin_output if activation is None else activation(lin_output))
# parameters of the model
self.params = [self.W, self.b]
If I understood the link correctly, the class after maxout implementation should look as below. Is this correct? If not, could you point out which part I misunderstood?
class HiddenLayer(object):
def __init__(self, rng, input, n_in, n_out, W=None, b=None, activation=T.tanh, maxout=False):
'''
maxout - whether to apply maxout after the activation function
'''
self.input = input
W, b = self.init_weights(rng, n_in, n_out, W, b, activation) # initialise the wrights of a hidden layer
self.W = W; self.b = b;
lin_output = T.dot(input, self.W) + self.b
self.output = (lin_output if activation is None else activation(lin_output))
if maxout: #apply maxout to the 'activated' hidden layer output
maxout_out = None
maxoutsize = n_out
for i in xrange(maxoutsize):
t = self.output[:,i::maxoutsize]
if maxout_out is None:
maxout_out = t
else:
maxout_out = T.maximum(maxout_out, t)
self.output = maxout_out
# parameters of the model
self.params = [self.W, self.b]

Related

Normalizing multivariate time-series data with different sequence length

I have a multivariate time-series dataset with different sequence lengths. I filled the missing values in the sequences with zeros. I am trying to use a recurrent neural network model for forcasting with Time Series. I noticed my results of the model degrade when the range of the data is outside -1 and 1. I wrote the following normalization class using MinMaxScaler. However, I don't know how to exclude the missing values in the sequences during computation of MinMaxScaler. Here is my code
from sklearn.preprocessing import MinMaxScaler
from collections import OrderedDict
import numpy as np
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def get_mask_from_sequence_lengths(
sequence_lengths: torch.Tensor, max_length: int
) -> torch.BoolTensor:
# (batch_size, max_length)
ones = sequence_lengths.new_ones(sequence_lengths.size(0), max_length)
range_tensor = ones.cumsum(dim=1)
return sequence_lengths.unsqueeze(1) >= range_tensor
class Normalizer1D(nn.Module):
# Data size of (batch_size, seq_len, input_size)
def __init__(self, input_dim, inputs ):
super(Normalizer1D, self).__init__()
self.input_dim = input_dim
self.to(device)
self._norm = self.build_normalizers(inputs)
max_len=inputs.shape[-1]
data = torch.from_numpy(inputs)
length = torch.LongTensor([torch.max((data[i,0,:]!=0).nonzero()).item()+1 for i in range(data.shape[0])])
mask = get_mask_from_sequence_lengths( length, max_len)
def build_normalizers(self, x):
normalizers = OrderedDict()
for i in range(self.input_dim):
if np.min(x[:,i,:])<0:
scaler = MinMaxScaler(feature_range=(-1, 1))
else:
scaler = MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit(x[:,i,:])
normalizers[str(i)] = scaler
return normalizers
def normalize(self, x):
#(B, D, T)
d = x.cpu().detach().numpy()
n_x=[]
for i in range(x.shape[1]):
n_x.append(self._norm[str(i)].fit_transform(d[:,i,:]))
x =np.stack(n_x, axis=1)
return torch.from_numpy(x).to(device)
def unnormalize(self, x):
#(T, B, D)==>(B, T, D)
d = x.cpu().detach().numpy()
n_x=[]
for i in range(x.shape[1]):
n_x.append(self._norm[str(i)].inverse_transform(d[:,i,:]))
x =np.stack(n_x, axis=1)
return torch.from_numpy(x).to(device)
#property
def min_(self):
#(T, B, D)
min_ = []
for i in range(len(self._norm)):
min_.append(self._norm[str(i)].min_)
return torch.from_numpy(np.stack(min_, axis=1))
#property
def scale_(self):
#(T, B, D)
scale_ = []
for i in range(len(self._norm)):
scale_.append(self._norm[str(i)].scale_)
return torch.from_numpy(np.stack(scale_, axis=1))
def unnormalize_mean(self, x_mu):
Xscale = self.scale_()
Xmin = self.min_()
normX = x_mu.mul_(Xscale)
return normX.add_(Xmin)
def unnormalize_sigma(self, x_sigma):
Xscale =self.scale_()
return x_sigma.mul_(Xscale)
# compute the normalizers
def compute_normalizer(loader_train):
##batch_size, input_dim, seq_len
for i, (u, y) in enumerate(loader_train):
if i ==0:
#input u torch.Size([B, D, T])
inputs = u
outputs = y
else:
inputs = torch.cat([inputs,u], dim=0)
outputs = torch.cat([outputs,y], dim=0)
inputs = inputs.cpu().detach().numpy()
outputs = outputs.cpu().detach().numpy()
# initialization
u_normalizer = Normalizer1D(inputs.shape[1], inputs)
y_normalizer = Normalizer1D(outputs.shape[1], outputs)
return u_normalizer, y_normalizer
I will appreciate if someone could suggest a way to exclude the missing values from the normalization process.

nn.CrossEntropyLoss attribute Error while trying to develop video to caption generator in pytorch

I am getting
AttributeError: 'CrossEntropyLoss' object has no attribute 'dim'
This is the code block which I run and got the error
hidden_size=256
encoder1=VideoEncoderGRU(hidden_size)
encoder1=accelerator.prepare(encoder1)
decoder1 =DecoderRNN(hidden_size, vcd.lang_object.n_words).to(device)
decoder1=accelerator.prepare(decoder1)
encoder_hidden=encoder1.initHidden()
trainIters_modified(encoder1, decoder1,encoder_hidden)
A more detailed code of each layer.
trainIters_modified - It just a function that trains for a multiple number of times.
def trainIters_modified(encoder,decoder,encoder_hidden,print_every=10,plot_every=10):
start=time.time()
plot_losses=[]
print_loss_total=0 # Reset every print
plot_loss_total=0 #plot every
encoder_optimizer = optim.Adagrad(encoder.parameters())
decoder_optimizer = optim.Adagrad(decoder.parameters())
encoder_optimizer=accelerator.prepare(encoder_optimizer)
decoder_optimizer=accelerator.prepare(decoder_optimizer)
criterion = nn.CrossEntropyLoss()
for ep in range(1):
for vid,lab in train_loader:
n_iters=vid.shape[0]
for iter in range(1, n_iters + 1):
input_tensor = vid[iter-1]
target_tensor = lab[iter-1]
loss,encoder_hidden = train(input_tensor, target_tensor, encoder,decoder, encoder_optimizer, decoder_optimizer, criterion,encoder_hidden)
print_loss_total += loss
plot_loss_total += loss
if iter % print_every == 0:
print_loss_avg = print_loss_total / print_every
print_loss_total = 0
print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),iter, iter / n_iters * 100, print_loss_avg))
if iter % plot_every == 0:
plot_loss_avg = plot_loss_total / plot_every
plot_losses.append(plot_loss_avg)
plot_loss_total = 0
#break
showPlot(plot_losses)
train function
teacher_forcing_ratio=0.5
def train(input_tensor,target_tensor,encoder,decoder,encoder_optimizer,decoder_optimizer,encoder_hidden,criterion,max_length=MAX_LENGTH):
encoder_hidden=encoder_hidden
encoder_optimizer.zero_grad() #set's encoder gradients to zero
decoder_optimizer.zero_grad() #set's decoder gradients to zero
input_length = input_tensor.size(0) #no_of_wordsin*
target_length = target_tensor.size(0)
#print(f"input.shape:{input_tensor.shape},input_length:{input_length}")
#print(f"target.shape{target_tensor.shape},target_length:{target_length}")
encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
#print(f"encoder_outputs:{encoder_outputs.shape},max_length:{max_length},encoder.hidden_size={encoder.hidden_size}")
loss = 0
for ei in range(input_length):
in_tensor=torch.permute(input_tensor[ei],(2,1,0))
#print("passing input tensor.shape",in_tensor.shape) #torch.Size([1])
#print("encoder_hidden.shape:",encoder_hidden.shape) #torch.Size([1, 1, 256])
encoder_output, encoder_hidden = encoder(in_tensor, encoder_hidden) #
#print("encoder_output:",encoder_output.shape)
#print("encoder_hidden:",encoder_hidden.shape)
#the outputs are being stored in encoder_outputs matrix
encoder_outputs[ei]=encoder_output[0,0]
#after we have trained encoder for one epoch
decoder_input = torch.tensor([[SOS_token]], device=device) #SOS_token and EOS_token where defined above.
#set the encoder hidden as decoder hidden state
decoder_hidden = encoder_output
use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
if use_teacher_forcing:
#teacher forcing , feed the target as next input
for di in range(target_length):
decoder_output,decoder_hidden = decoder(decoder_input,decoder_hidden)
loss += criterion(decoder_output, target_tensor[di])
decoder_input = target_tensor[di]
else:
# Without teacher forcing: use its own predictions as the next input
for di in range(target_length):
decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
#print("decoder_output.shape:",decoder_output.shape,"decoder_output:",decoder_output)
topv, topi = decoder_output.topk(1)
#print("tov,topi:",topv,topi)
decoder_input = topi.squeeze().detach() # detach from history as input
#print("decoder_input:",decoder_input.shape,"decoder_input:",decoder_input)
loss += criterion(decoder_output, target_tensor[di])
#print(loss)
if decoder_input.item() == EOS_token:
break
#loss.backward()
accelerator.backward(loss)
encoder_optimizer.step()
decoder_optimizer.step()
return (loss.item() / target_length) ,encoder_hidden
EncoderGRU
class VideoEncoderGRU(nn.Module):
def __init__(self,hidden_size):
super(VideoEncoderGRU,self).__init__()
self.vgg=vgg16(weights=VGG16_Weights.IMAGENET1K_V1)
self.vgg.classifier=nn.Sequential(*list(self.vgg.classifier.children())[0:0])
self.vegru_classifier=nn.Sequential(
nn.Linear(512 * 7 * 7, 1024)
)
self.gru=nn.GRU(1024,hidden_size)
self.hidden_size=hidden_size
def initHidden(self):
return torch.rand(1, 1, self.hidden_size, device=device)
def forward(self,input,hidden):
out=self.vgg(input)
out=torch.reshape(out,(-1,))
out=self.vegru_classifier(out)
out=out.view(1,1,-1)
out,hidden=self.gru(out,hidden)
return out,hidden
DecoderGRU
class DecoderRNN(nn.Module):
def __init__(self, hidden_size, output_size):
super(DecoderRNN, self).__init__()
self.hidden_size = hidden_size
self.embedding = nn.Embedding(output_size, hidden_size)
self.gru = nn.GRU(hidden_size, hidden_size)
self.out = nn.Linear(hidden_size, output_size)
self.softmax = nn.LogSoftmax(dim=1)
def forward(self, input, hidden):
output = self.embedding(input).view(1, 1, -1)
output = F.relu(output)
output, hidden = self.gru(output, hidden)
output = self.softmax(self.out(output[0]))
return output, hidden
def initHidden(self):
return torch.rand(1, 1, self.hidden_size, device=device)
My dataset consists of videos stored in a folder and csv file containing names of video along with thier captions. I think I have loaded my data correctly and convert it into a video of frame size =8 and each caption is changed to a tensor of max_length =8 containing indices of each wod in caption.
You can look at my whole source code in this notebook file
There might be a simple logical error I just knew the theory behind this problem and so I coded it
I tried to implement an LSTM encoder which takes frames of video as input. These frames are first directly passed into vgg16 network which encodes it and encoded representation is passed into each frame and final output of encoder is passed as initial hidden state to decoder along with SOS_token as first input to decoder. I wanted this model to work and predict output

tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [100,200] vs. [100,10,200]

The shape of the tensor input to my model is(None, 10, 256),after processing by the attention layer, the shape becomes(None, 256),How should I modify layercompute_output_shape(self, input_shape) so that the shape of the model does not change?
attention layer
class Attention_layer(Layer):
def __init__(self,
W_regularizer=None, b_regularizer=None,
W_constraint=None, b_constraint=None,
bias=True, **kwargs):
self.supports_masking = True
self.init = initializers.get('glorot_uniform')
self.W_regularizer = regularizers.get(W_regularizer)
self.b_regularizer = regularizers.get(b_regularizer)
self.W_constraint = constraints.get(W_constraint)
self.b_constraint = constraints.get(b_constraint)
self.bias = bias
super(Attention_layer, self).__init__(**kwargs)
def build(self, input_shape):
assert len(input_shape) == 3
self.W = self.add_weight(name='att_weight',shape=(input_shape[-1], input_shape[-1],),
initializer=self.init,
regularizer=self.W_regularizer,
constraint=self.W_constraint
)
if self.bias:
self.b = self.add_weight((input_shape[-1],),
initializer='zero',
name='{}_b'.format(self.name),
regularizer=self.b_regularizer,
constraint=self.b_constraint)
super(Attention_layer, self).build(input_shape)
def compute_mask(self, input, input_mask=None):#build(input_shape):
# do not pass the mask to the next layers
return None
def call(self, x, mask=None):#call(x):
uit = K.dot(x, self.W)
if self.bias:
uit += self.b
uit = K.tanh(uit)
a = K.exp(uit)
# apply mask after the exp. will be re-normalized next
if mask is not None:
# Cast the mask to floatX to avoid float64 upcasting in theano
a *= K.cast(mask, K.floatx())
# in some cases especially in the early stages of training the sum may be almost zero
# and this results in NaN's. A workaround is to add a very small positive number to the sum.
# a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
# a = K.expand_dims(a)
weighted_input = x * a
print(weighted_input)
return K.sum(weighted_input, axis=1)#output.shape = (batch_size, embedding_size)
def compute_output_shape(self, input_shape):
return input_shape[0], input_shape[-1]

Why the GAN Discriminator shows wrong classes?

I created a GAN for text and using VanillaGAN training approach, but the main problem is the sequences that are created are good, but the main problem is that when I use nn.sigmoid to see the discriminator labels it shows [0] for data that is created which are completely real, and it is not correct.
Here is my Discriminator code:
class Classifier(nn.Module):
def __init__(self, hidden_size, hidden_size2, dropout):
super().__init__()
self.FC1 = nn.Sequential(
nn.Linear(512, hidden_size),
nn.LeakyReLU(0.1),
nn.Dropout(dropout))
self.FC2 = nn.Sequential(
nn.Linear(hidden_size, hidden_size2),
nn.LeakyReLU(0.1),
nn.Dropout(dropout)
)
self.FC3 = nn.Linear(hidden_size2, 1)
self.dropout = nn.Dropout(dropout)
self.bach = nn.BatchNorm1d(512)
self.bach2 = nn.BatchNorm1d(hidden_size)
self.bach3 = nn.BatchNorm1d(64)
def forward(self, x):
z = self.dropout(x)
z = self.bach(z)
z = self.FC1(z)
z = self.bach2(z)
z = self.FC2(z)
z = self.bach3(z)
out = self.FC3(z)
return out
As input to this Classifier, hidden states of real and fake sequences are feed into this network.
My loss function is BCEWithlogits and this is the Train class
# to create real labels (1s)
def label_real(size):
data = torch.ones(size, 1)
return data.to(device)
# to create fake labels (0s)
def label_fake(size):
data = torch.zeros(size, 1)
return data.to(device)
# function to train the discriminator network
def train_discriminator(optimizer, data_real, data_fake):
b_size = data_real.size(1)
real_label = label_real(b_size)
fake_label = label_fake(b_size)
optimizer.zero_grad()
output_real = discriminator(data_real)
loss_real = criterion(output_real, real_label)
output_fake = discriminator(data_fake)
loss_fake = criterion(output_fake, fake_label)
loss_real.backward()
loss_fake.backward()
optimizer.step()
return loss_real + loss_fake
I use nn.sigmoid after training and on testing the model. Please help me to know what is wrong with my neural network?

Implementing one to many LSTM/RNN, PyTorch

I have a matrix sized m x n, and want to predict by 1 x n vector (x at the picture with the network structure) the whole next (m-1) x n matrix (y^{i} at the picture), using RNN or LSTM, I don't understand how to implement feeding each
1 x n vector to the next hidden state and get all the
(m-1) x n vectors simultaneously and how to compute error over all y^{i}
I have this vanilla RNN-model and don't know how to modify it
class RNNModel(nn.Module):
def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
super(RNNModel, self).__init__()
self.hidden_dim = hidden_dim
self.layer_dim = layer_dim
# (batch_dim, seq_dim, feature_dim)
self.RNN = nn.RNN(input_dim, hidden_dim, layer_dim, batch_first=True, nonlinearity='tanh')
self.fc = nn.Linear(hidden_dim, output_dim)
def forward(self, x):
# Initialize hidden state with zeros
h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()
out, h_t = self.RNN(x, h0)
#out = self.fc(h_t[:, -1, :])
out = self.fc(out[:, -1, :])
return out
Stainley, try this:
you initiate hidden state only if no other hidden state is passed. then you return the hidden state and pass it to forward() at the next iteration.
def forward(self, x, h=None):
if h is None: # if no hidden state is passed
h = torch.zeros( # Initialize hidden state with zeros
self.layer_dim, x.size(0),
self.hidden_dim).requires_grad_()
out, h_t = self.RNN(x, h)
out = self.fc(out[:, -1, :])
return out, h_t
in training code you run the cycle like this like this:
x = seed
h = None
for i in range (...)
optimizer.zero_grad()
...
x, h = model.forward (x, h)
...
loss = ...
loss.backward()
optimizer.step()

Resources