Fluctuations and overfitting in first epochs - pytorch

I am training a CNN network on the DVS gesture dataset using PyTorch. However, the training is not progressing in a soft way, the accuracies of both training and validation fluctuate a lot, they are both progressing, but there is a big difference between them (5~6% up to 10%) as if there is overfitting in 3/4 epoch. I have tried L2 regularization as well as a dropout with high values, the difference disappears in the first iterations but reappears strongly afterward, and I am sure that datasets are perfectly merged and split randomly, changed several times the batch size but didn't impact, normalization make it worse.
PS: May this be an underfit, how to identify an underfit ?
Thanks in advance!
CODE (Using snntorch library) :
spike_grad = surrogate.fast_sigmoid(slope=5.4)
beta = 0.72
num_epochs = 200
class Net(nn.Module):
def __init__(self):
super().__init__()
# Initialize layers
self.conv1 = nn.Conv2d(2, 16, kernel_size=5, bias=False)
self.pool1 = nn.AvgPool2d(2)
self.lif1 = snn.Leaky(beta=beta, spike_grad=spike_grad, threshold=2.5)#, threshold_p=2.5, threshold_n=-2.5)
self.conv2 = nn.Conv2d(16, 32, kernel_size=5, bias=False)
self.pool2 = nn.AvgPool2d(2)
self.lif2 = snn.Leaky(beta=beta, spike_grad=spike_grad, threshold=2.5)#, threshold_p=2.5, threshold_n=-2.5)
self.fc1 = nn.Linear(800, 11)
self.drop1 = nn.Dropout(0.93)
self.lif3 = snn.Leaky(beta=beta, spike_grad=spike_grad, threshold=2.5)#, threshold_p=2.5, threshold_n=-2.5)
self.flatten = nn.Flatten()
def forward(self, x):
mem1 = self.lif1.init_leaky()
mem2 = self.lif2.init_leaky()
mem3 = self.lif3.init_leaky()
spk_rec = []
mem_rec = []
for step in range(x.size(1)):
cur1 = self.pool1(self.conv1((x.permute(1,0,2,3,4))[step]))
spk1, mem1 = self.lif1(cur1, mem1)
cur2 = self.pool1(self.conv2(spk1))
spk2, mem2 = self.lif2(cur2, mem2)
cur3 = self.drop1(self.fc1(self.flatten(spk2)))
spk3, mem3 = self.lif3(cur3, mem3)
spk_rec.append(spk3)
mem_rec.append(mem3)
return torch.stack(spk_rec), torch.stack(mem_rec)
net_9 = Net().to(device)
optimizer = torch.optim.Adam(net_9.parameters(), lr=7.5e-3, betas=(0.9, 0.999))#, weight_decay=1e-2)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=735, eta_min=0, last_epoch=-1)
loss = SF.mse_count_loss() # spk mse
train_loss_hist_9 = []
valid_loss_hist_9 = []
train_acc_hist_9 = []
valid_acc_hist_9 = []
path_9 = "1-DVS\net_9_"
for epoch in range(num_epochs):
batch_train = batch_valid = 0
# Minibatch training loop
net_9.train()
for data_train, targets_train in iter(train_loader):
data_train = data_train.to(device)
targets_train = targets_train.to(device)
spk_train, mem_train = net_9.forward(data_train)
loss_train = loss(spk_train, targets_train)
optimizer.zero_grad()
loss_train.backward()
optimizer.step()
scheduler.step()
_, idx = spk_train.sum(dim=0).max(1)
acc_train = np.mean((targets_train == idx).detach().cpu().numpy())
train_acc_hist_9.append(acc_train.item())
train_loss_hist_9.append(loss_train.item())
batch_train += 1
# Minibatch validation loop
net_9.eval()
with torch.no_grad():
for data_valid, targets_valid in iter(valid_loader):
data_valid = data_valid.to(device)
targets_valid = targets_valid.to(device)
spk_valid, mem_valid = net_9.forward(data_valid)
loss_valid = loss(spk_valid, targets_valid)
_, idx = spk_valid.sum(dim=0).max(1)
acc_valid = np.mean((targets_valid == idx).detach().cpu().numpy())
valid_acc_hist_9.append(acc_valid.item())
valid_loss_hist_9.append(loss_valid.item())
batch_valid += 1
scheduler.step(loss_valid)
torch.save({'model_state_dict': net_9.state_dict()}, path_9 + str(epoch))
print("----------------------------------------------------------------------")
print_epoch_accuracy(train_acc_hist_9, valid_acc_hist_9, batch_train, batch_valid)
print("----------------------------------------------------------------------")
print("\n")

Related

Multivariate Encoder-Decoder Model approaches value

I am creating my first multivariate multistep encoder-decoder LSTM to forecast revenues.
As you can see, the values move towards a value and then stop at that value. The aim is to create a forecast for a longer period, but there is no deviation at all from this standard value after the first week.
What is wrong and what can I do? To me it doesn't look like it is working at all.
code:
class ModelTrainer:
def __init__(self, prediction_length=30, offset=1):
self.prediction_length = prediction_length
self.offset = offset
self._setup_values()
self.use_scaling = True
self.__prepare_data()
def _setup_values(self):
# Model configuration
self.additional_metrics = ['accuracy']
self.embedding_output_dims = 15
self.max_sequence_length = 300
self.num_distinct_words = 5000
self.verbosity_mode = 1
# DATA
self.WINDOW_LENGTH = 70 # ! SHOULD BE ADJUSTED TO THE AMOUNT OF FORECASTING DAYS
self.SAMPLING_RATE = 1
self.BATCH_SIZE = 128
# MODEL
self.DROPOUT = 0.3
self.NODES_PER_LAYER = 256
self.NUMBER_OF_LAYERS = 3
# TRAINING
self.LEARNING_RATE = 0.001
self.OPTIMIZER = Adam(learning_rate=self.LEARNING_RATE)
self.VALIDATION_SPLIT = 0.20
self.NUMBER_OF_EPOCHS = 10
self.TEST_SIZE = 0.1
self.RANDOM_STATE = 123
self.LOSS_FUNCTION = MeanSquaredError()
def __import_data(self):
self.series = DataOrganizer().df
def __prepare_data(self):
self.__import_data()
self.scaler = preprocessing.MinMaxScaler()
data_scaled = self.scaler.fit_transform(self.series)
self.features, self.target = self._create_feature_target_values_window(
data_scaled)
def _create_feature_target_values_window(self, data):
self.number_of_output_columns = 4
feature_data = data
target_data = data[:, :self.number_of_output_columns]
features, target = list(), list()
in_start = 0
for _ in range(len(data)):
in_end = in_start + self.WINDOW_LENGTH
out_end = in_end + self.prediction_length
if out_end <= len(data):
features.append(feature_data[in_start:in_end, :])
target.append(
target_data[in_end:out_end, 0:self.number_of_output_columns])
in_start += 1
return np.array(features), np.array(target)
def __create_LSTM_model(self):
num_feature_columns = self.features.shape[2]
num_output_columns = self.target.shape[2]
model = Sequential()
model.add(LSTM(self.NODES_PER_LAYER, input_shape=(
self.WINDOW_LENGTH, num_feature_columns)))
model.add(Dropout(self.DROPOUT))
model.add(RepeatVector(self.prediction_length))
model.add(LSTM(self.NODES_PER_LAYER, return_sequences=True))
model.add(Dropout(self.DROPOUT))
model.add(TimeDistributed(Dense(self.NODES_PER_LAYER)))
model.add(Dropout(self.DROPOUT))
model.add(TimeDistributed(Dense(num_output_columns)))
model.summary()
return model
def train_model(self, callbacks=[]):
model = self.__create_LSTM_model()
model.compile(loss=self.LOSS_FUNCTION,
optimizer=self.OPTIMIZER,
metrics=['accuracy', MeanAbsoluteError()]
)
model.fit(
x=self.features,
y=self.target,
epochs=self.NUMBER_OF_EPOCHS,
validation_split=self.TEST_SIZE,
shuffle=False,
callbacks=callbacks
)
self.model = model
def create_forecast(self):
prediction = self.model.predict(self.features[-1:])
# prediction = self.model.predict(self.features[-30:-29]) # Show forecast from a month old
test_X = self.features.copy()
test_X = test_X[:self.prediction_length,
:1, self.number_of_output_columns:]
test_X = test_X.reshape(
self.prediction_length, self.series.shape[1] - self.number_of_output_columns)
prediction = prediction.reshape(self.prediction_length,
self.number_of_output_columns)
inv_yhat = np.concatenate((prediction, test_X), axis=1)
inv_yhat = self.scaler.inverse_transform(inv_yhat)
prediction_df = pd.DataFrame(
inv_yhat, columns=self.scaler.feature_names_in_)
first_date = self.series.last_valid_index() + timedelta(days=1)
last_date = first_date + timedelta(days=self.prediction_length-1)
days = pd.date_range(first_date, last_date, freq='D')
prediction_df.set_index(days, inplace=True)
prediction_df = prediction_df[self.series.columns[0:4]]
Actual
Forecast:
(I know the x-axis description is incorrect. Don't worry about it)

GPU memory increasing at each batch (PyTorch)

I am trying to build a convolutionnal network using ConvLSTM layer (LSTM cell but with convolutions instead of matrix multiplications), but the problem is that my GPU memory increases at each batch, even if I'm deleting variables, and getting the true value for the loss (and not the graph) for each iteration. I may be doing something wrong but that exact same script ran without issues with another model (with more parameters and also using ConvLSTM layer).
Each batch is composed of num_batch x 3 images (grayscale) and I'm trying to predict the difference |Im(t+1)-Im(t)| with the input Im(t)
def main():
config = Config()
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=config.batch_size, num_workers=0, shuffle=True, drop_last=True)
nb_img = len(train_dataset)
util.clear_progress_dir()
step_tensorboard = 0
###################################
# Model Setup #
###################################
model = fully_convLSTM()
if torch.cuda.is_available():
model = model.float().cuda()
lr = 0.001
optimizer = torch.optim.Adam(model.parameters(),lr=lr)
util.enumerate_params([model])
###################################
# Training Loop #
###################################
model.train() #Put model in training mode
train_loss_recon = []
train_loss_recon2 = []
for epoch in tqdm(range(config.num_epochs)):
running_loss1 = 0.0
running_loss2 = 0.0
for i, (inputs, outputs) in enumerate(train_dataloader, 0):
print(i)
torch.cuda.empty_cache()
gc.collect()
# if torch.cuda.is_available():
inputs = autograd.Variable(inputs.float()).cuda()
outputs = autograd.Variable(outputs.float()).cuda()
im1 = inputs[:,0,:,:,:]
im2 = inputs[:,1,:,:,:]
im3 = inputs[:,2,:,:,:]
diff1 = torch.abs(im2 - im1).cuda().float()
diff2 = torch.abs(im3 - im2).cuda().float()
model.initialize_hidden()
optimizer.zero_grad()
pred1 = model.forward(im1)
loss = reconstruction_loss(diff1, pred1)
loss.backward()
# optimizer.step()
model.update_hidden()
optimizer.zero_grad()
pred2 = model.forward(im2)
loss2 = reconstruction_loss(diff2, pred2)
loss2.backward()
optimizer.step()
model.update_hidden()
## print statistics
running_loss1 += loss.detach().data
running_loss2 += loss2.detach().data
if i==0:
with torch.no_grad():
img_grid_diff_true = (diff2).cpu()
img_grid_diff_pred = (pred2).cpu()
f, axes = plt.subplots(2, 4, figsize=(48,48))
for l in range(4):
axes[0, l].imshow(img_grid_diff_true[l].squeeze(0).squeeze(0), cmap='gray')
axes[1, l].imshow(img_grid_diff_pred[l].squeeze(0).squeeze(0), cmap='gray')
plt.show()
plt.close()
writer_recon_loss.add_scalar('Reconstruction loss', running_loss1, step_tensorboard)
writer_recon_loss2.add_scalar('Reconstruction loss2', running_loss2, step_tensorboard)
step_tensorboard += 1
del pred1
del pred2
del im1
del im2
del im3
del diff1
del diff2#, im1_noised, im2_noised
del inputs
del outputs
del loss
del loss2
for obj in gc.get_objects():
if torch.is_tensor(obj) :
del obj
torch.cuda.empty_cache()
gc.collect()
epoch_loss = running_loss1 / len(train_dataloader.dataset)
epoch_loss2 = running_loss2/ len(train_dataloader.dataset)
print(f"Epoch {epoch} loss reconstruction1: {epoch_loss:.6f}")
print(f"Epoch {epoch} loss reconstruction2: {epoch_loss2:.6f}")
train_loss_recon.append(epoch_loss)
train_loss_recon2.append(epoch_loss2)
del running_loss1, running_loss2, epoch_loss, epoch_loss2
Here is the model used :
class ConvLSTMCell(nn.Module):
def __init__(self, input_channels, hidden_channels, kernel_size):
super(ConvLSTMCell, self).__init__()
# assert hidden_channels % 2 == 0
self.input_channels = input_channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
# self.num_features = 4
self.padding = 1
self.Wxi = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
self.Whi = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
self.Wxf = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
self.Whf = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
self.Wxc = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
self.Whc = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
self.Wxo = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
self.Who = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
self.Wci = None
self.Wcf = None
self.Wco = None
def forward(self, x, h, c): ## Equation (3) dans Convolutional LSTM Network: A Machine Learning Approach for Precipitation Nowcasting
ci = torch.sigmoid(self.Wxi(x) + self.Whi(h) + c * self.Wci)
cf = torch.sigmoid(self.Wxf(x) + self.Whf(h) + c * self.Wcf)
cc = cf * c + ci * torch.tanh(self.Wxc(x) + self.Whc(h)) ###gt= tanh(cc)
co = torch.sigmoid(self.Wxo(x) + self.Who(h) + cc * self.Wco) ##channel out = hidden channel
ch = co * torch.tanh(cc)
return ch, cc #short memory, long memory
def init_hidden(self, batch_size, hidden, shape):
if self.Wci is None:
self.Wci = nn.Parameter(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
self.Wcf = nn.Parameter(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
self.Wco = nn.Parameter(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
else:
assert shape[0] == self.Wci.size()[2], 'Input Height Mismatched!'
assert shape[1] == self.Wci.size()[3], 'Input Width Mismatched!'
return (autograd.Variable(torch.zeros(batch_size, hidden, shape[0], shape[1])).cuda(),
autograd.Variable(torch.zeros(batch_size, hidden, shape[0], shape[1])).cuda())
class fully_convLSTM(nn.Module):
def __init__(self):
super(fully_convLSTM, self).__init__()
layers = []
self.hidden_list = [1,32,32,1]#,32,64,32,
for k in range(len(self.hidden_list)-1): # Define blocks of [ConvLSTM,BatchNorm,Relu]
name_conv = "self.convLSTM" +str(k)
cell_conv = ConvLSTMCell(self.hidden_list[k],self.hidden_list[k+1],3)
setattr(self, name_conv, cell_conv)
name_batchnorm = "self.batchnorm"+str(k)
batchnorm=nn.BatchNorm2d(self.hidden_list[k+1])
setattr(self, name_batchnorm, batchnorm)
name_relu =" self.relu"+str(k)
relu=nn.ReLU()
setattr(self, name_relu, relu)
self.sigmoid = nn.Sigmoid()
self.internal_state=[]
def initialize_hidden(self):
for k in range(len(self.hidden_list)-1):
name_conv = "self.convLSTM" +str(k)
(h,c) = getattr(self,name_conv).init_hidden(config.batch_size, self.hidden_list[k+1],(256,256))
self.internal_state.append((h,c))
self.internal_state_new=[]
def update_hidden(self):
for i, hidden in enumerate(self.internal_state_new):
self.internal_state[i] = (hidden[0].detach(), hidden[1].detach())
self.internal_state_new = []
def forward(self, input):
x = input
for k in range(len(self.hidden_list)-1):
name_conv = "self.convLSTM" +str(k)
name_batchnorm = "self.batchnorm"+str(k)
name_relu =" self.relu"+str(k)
x, c = getattr(self,name_conv)(x, self.internal_state[k][1], self.internal_state[k][0])
self.internal_state_new.append((x.detach(),c.detach()))
x = getattr(self,name_batchnorm)(x)
if k!= len(self.hidden_list)-2:
x = getattr(self,name_relu)(x)
else :
x = self.sigmoid(x)
return x
So my question is, what in my code is causing memory to accumulate during the training phase?
A few quick notes about training code:
torch.Variable is deprecated since at least 8 minor versions (see here), don't use it
gc.collect() has no point, PyTorch does the garbage collector on it's own
Don't use torch.cuda.empty_cache() for each batch, as PyTorch reserves some GPU memory (doesn't give it back to OS) so it doesn't have to allocate it for each batch once again. It will make your code slow, don't use this function at all tbh, PyTorch handles this.
Don't spam random memory cleaning, that's most probably not where the error is
Model
Yes, this is probably the case (although it's hard to read this model's code).
Take notice of self.internal_state list and self.internal_state_new list also.
Each time you call model.initialize_hidden() a new set of tensor is added to this list (and never cleaned as far as I can tell)
self.internal_state_new seems to be cleaned in update_hidden, maybe self.internal_state should be also?
In essence, check out this self.internal_state property of your model, the list grows indefinitely from what I see. Initializing with zeros everywhere is quite strange, there is probably no need to do that (e.g. PyTorch's RNN is initialized with zeros by default, this is probably similar).

Pytorch on Windows : Dataloader problems with numworkers

I have just got a new computer running Windows 10 which has a GPU so I wanted to see if I could sensibly use it for machine learning.
So I tried running an old model which I previously trained on Google Colab.
The answer is that it does do quite well, but I discovered that I could not use more than one worker in the Dataloader. Googling found that this is a known issue with PyTorch on Windows in Jupyter Notebooks so I tried running it in a normal Python program. I found that it did work but that the creation of the DataIterator took a very long time. Below are the times in seconds for 1, 2 and 6 workers, each done twice:
I note that 2 workers seems to be the fastest and there seems to be quite a lot of variation which surprised me as the machine was doing nothing else.
So the first question is:
Is there a way to let PyTorch choose the most efficient number of workers to use?
The second question is:
If I install a version of Linux will I be able to use Jupyter Notebooks with multiple workers which is what I would prefer to do in an ideal world.
The code I ran is below the relevant part is after if __name__ == "__main__":
# -*- coding: utf-8 -*-
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
import os
import numpy as np
#import gym
import pickle
import matplotlib.pyplot as plt
import time
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, in_planes, planes, stride=1):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(
in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
stride=1, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion*planes:
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes, self.expansion*planes,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(self.expansion*planes)
)
def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = self.bn2(self.conv2(out))
out += self.shortcut(x)
out = F.relu(out)
return out
# create dataset
class C4Dataset(Dataset):
'''
the data for the first 12moves is held in a pickles list
as (key,val)
The key has to be converted to the pos amd mask which can then be converted to the ones,twos and zeros
Val is the value for the player playing so needs to be changed for wnen moves i odd to minus val
'''
fileName = r'C:\Users\alan\Desktop\Python\Python36\connect4\Layers\ListAllKeyVal19'
bottom_mask = 4432676798593
board_mask = bottom_mask * ((1 << 6) - 1)
bNos = 2**np.arange(49, dtype = np.uint64)
def getData(fileName):
with open(fileName,'rb') as inFile:
dict = pickle.load(inFile)
return dict
def oneHot(x):
return np.eye(37,dtype = np.float32)[x]
def getNoMoves(ones,twos) :
return np.sum(ones+twos)
def getPosMask(key):
binary = ('{:049b}'.format(key))[::-1]
arr = np.frombuffer(binary.encode(encoding='utf-8', errors='strict'),'u1') - ord('0')
outArr = np.reshape(arr,(7,7),order = 'F')
arr = np.flipud(outArr)
pos = arr.copy()
mask =arr.copy()
for col in range(7):
res = np.where(arr[:,col]==1)
topPos = res[0][0]
pos[topPos,col] = 0
mask[topPos,col] = 0
if topPos<6:
mask[topPos+1:,col] = 1
msk = np.flipud(mask)
msk = np.reshape(msk,(49),order = 'F')
maskNo = np.array(msk.dot(C4Dataset.bNos),dtype = np.uint64).item()
return pos.astype('float32'),(pos ^ mask).astype('float32'),(np.logical_not(mask)).astype('float32'),maskNo
def possible(mask) :
poss = (mask + C4Dataset.bottom_mask) & C4Dataset.board_mask
binary = ('{:049b}'.format(poss))[::-1]
arr = np.frombuffer(binary.encode(encoding='utf-8', errors='strict'),'u1') - ord('0')
outArr = np.reshape(arr,(7,7),order = 'F')
arr = np.flipud(outArr)
return arr
def __init__(self):
self.lst = C4Dataset.getData(C4Dataset.fileName)
def __len__(self):
return len(self.lst)
def __getitem__(self, idx):
key,val = self.lst[idx]
val = int(val)
ones,twos,zeros,mask = C4Dataset.getPosMask(key)
arr = np.zeros((5,7,7),dtype = np.float32)
arr[0,:6,:7] = ones[1:,:]
arr[1,:6,:7] = twos[1:,:]
arr[2,:6,:7] = zeros[1:,:]
moves = int(C4Dataset.getNoMoves(ones,twos))
p = (moves % 2) + 3
arr[p,:6,:7] = C4Dataset.possible(mask)[1:,:]
return arr,val+18 #C4Dataset.oneHot(val+18)
class C4Net(nn.Module):
def __init__(self, inFilters,outFilters):
super(C4Net, self).__init__()
self.conv1 = nn.Conv2d(inFilters, 32, kernel_size=3,
stride=1, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(32)
self.layer1 = BasicBlock(32,32)
self.layer2 = BasicBlock(32,32)
self.layer3 = BasicBlock(32,32)
self.layer4 = BasicBlock(32,32)
self.layer5 = BasicBlock(32,32)
self.layer6 = BasicBlock(32,32)
self.layer7 = BasicBlock(32,32)
self.layer8 = BasicBlock(32,32)
self.linear = nn.Linear(32*7*7,outFilters)#1568
def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = self.layer1(out)
out = self.layer2(out)
out = self.layer3(out)
out = self.layer4(out)
out = self.layer5(out)
out = self.layer6(out)
out = self.layer7(out)
out = self.layer8(out)
#out = F.avg_pool2d(out, 2)
out = out.view(out.size(0), -1)
out = self.linear(out)
return out
# show some images
def show(img):
npimg = img.numpy()[:3,:,:]
plt.imshow(np.transpose(npimg, (1,2,0)), interpolation='nearest')
# get some random training images
if __name__ == "__main__":
dirName =r'C:\Users\alan\Desktop\Python\Python36\connect4\Layers'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Assuming that we are on a CUDA machine, this should print a CUDA device:
print(device)
# create dataloader
max_epochs = 1
batchSize = 1024#512#256
learningRate = .00003
# Parameters
params = {'batch_size': batchSize,'shuffle': True,'num_workers': 2}
# Generators
dataset = C4Dataset()
start = time.time()
dataloader = DataLoader(dataset, **params)
middle = time.time()
print('create dataloader',middle-start)
dataiter = iter(dataloader)
end = time.time()
print('create data iterator',end-middle)
images, labels = dataiter.next()
final = time.time()
print('get one batch',final-end)
# show images
show(torchvision.utils.make_grid(images[:16]))
#create the weights
wts =np.array([59, 963, 12406, 148920, 62551, 47281, 55136, 54312, 44465, 31688,
27912, 37907, 114778, 242800, 394530, 495237, 582174, 163370, 480850,
201152, 690905, 633937, 721340, 372479, 193375, 84648, 76576, 91087, 130428,
154184, 157339, 156453, 227696, 1705325, 548155, 44315, 2082],dtype = np.float32)
maxwt = wts.max()
weights = wts/maxwt
weights = torch.from_numpy(weights)
weights.to(device)
# create the network
net = C4Net(5,37)
net.to(device)
PATH = r'C:\Users\alan\Desktop\Python\connectX\c4Net37Weights00003.pth'
net.load_state_dict(torch.load(PATH,map_location=torch.device(device)))
#create the loss function and optimiser
criterion = nn.CrossEntropyLoss(weight = weights.to(device) )
optimizer = optim.Adam(net.parameters(), lr=learningRate)
#train the network
import time
start = time.time()
for epoch in range(max_epochs): # loop over the dataset multiple times
running_loss = 0.0
for i, data in enumerate(dataloader, 0):
# get the inputs; data is a list of [inputs, labels]
inputs, labels = inputs, labels = data[0].to(device), data[1].to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
if i % 2000 == 1999: # print every 2000 mini-batches
print('[%d, %5d] loss: %.3f' %
(epoch + 1, i + 1, running_loss / 2000))
running_loss = 0.0
torch.save(net.state_dict(),r'C:\Users\alan\Desktop\Python\connectX\tempWeights')
print('Finished Training')
# save the weights
PATH = r'C:\Users\alan\Desktop\Python\connectX\c4Net37Weights00004.pth'
torch.save(net.state_dict(), PATH)
end = time.time()
print('elapsed time',end-start)
PS the machine is a Dell XPS 17 with Intel Core i9-10885H with 8 cores and the GPU is a NVIDIA GeForce RTX 2060 with Max-Q. In this one test it runs 4 times faster than on Google Colab but I do not know what GPU I was allocated.

Must the input height of a 1D CNN be constant?

I'm currently doing my honours research project on online/dynamic signature verification. I am using the SVC 2004 dataset (Task 2). I have done the following data processing:
def load_dataset_normalized(path):
file_names = os.listdir(path)
num_of_persons = len(file_names)
initial_starting_point = np.zeros(np.shape([7]))
x_dataset = []
y_dataset = []
for infile in file_names:
full_file_name = os.path.join(path, infile)
file = open(full_file_name, "r")
file_lines = file.readlines()
num_of_points = int(file_lines[0])
x = []
y = []
time_stamp = []
button_status = []
azimuth_angles = []
altitude = []
pressure = []
for idx, line in enumerate(file_lines[1:]):
idx+=1
nums = line.split(' ')
if idx == 1:
nums[2] = 0
initial_starting_point = nums
x.append(int(nums[0]))
y.append(int(nums[1]))
time_stamp.append(0)
button_status.append(int(nums[3]))
azimuth_angles.append(int(nums[4]))
altitude.append(int(nums[5]))
pressure.append(int(nums[6]))
else:
x.append(int(nums[0]))
y.append(int(nums[1]))
time_stamp.append(10)
button_status.append(int(nums[3]))
azimuth_angles.append(int(nums[4]))
altitude.append(int(nums[5]))
pressure.append(int(nums[6]))
max_x = max(x)
max_y = max(y)
max_azimuth_angle = max(azimuth_angles)
max_altitude = max(altitude)
max_pressure = max(pressure)
min_x = min(x)
min_y = min(y)
min_azimuth_angle = min(azimuth_angles)
min_altitude = min(altitude)
min_pressure = min(pressure)
#Alignment normalization:
for i in range(num_of_points):
x[i] -= int(initial_starting_point[0])
y[i] -= int(initial_starting_point[1])
azimuth_angles[i] -= int(initial_starting_point[4])
altitude[i] -= int(initial_starting_point[5])
pressure[i] -= int(initial_starting_point[6])
#Size normalization
for i in range(num_of_points):
x[i] = ((x[i] - max_x) / (min_x - max_x))
y[i] = ((y[i] - max_y) / (min_y - max_y))
azimuth_angles[i] = ((azimuth_angles[i] - max_azimuth_angle) / (min_azimuth_angle - max_azimuth_angle))
altitude[i] = ((altitude[i] - max_altitude) / (min_altitude - max_altitude))
pressure[i] = ((pressure[i] - max_pressure) / (min_pressure - max_pressure))
#data points to dataset
x_line = []
for i in range (num_of_points):
x_line.append([x[i], y[i], time_stamp[i], button_status[i], azimuth_angles[i], altitude[i], pressure[i]])
if i == num_of_points-1:
x_dataset.append(x_line)
infile_without_extension = infile.replace('.TXT','')
index_of_s = infile_without_extension.find("S")
index_of_num = index_of_s + 1
sig_ID = int(infile_without_extension[index_of_num:])
if sig_ID < 21:
y_dataset.append([1,0])
else:
y_dataset.append([0,1])
x_dataset = np.asarray(x_dataset)
y_dataset = np.asarray(y_dataset)
return x_dataset, y_dataset
I also have another method that takes the values as they are in the text file and created an "original" dataset.
Now, the aim of my research is to create a CRNN (convolutional recurrent neural network) that can identify if a signature is authentic or forged. Here is the code for the model:
class crnn_model:
def __init__(self, trainX, trainy, testX, testy, optimizer_method):
self.trainX = trainX
self.trainy = trainy
self.testX = testX
self.testy = testy
self.evaluate_model(optimizer_method)
def evaluate_model(self, optimizer_method):
verbose, epochs, batch_size = 0, 40, 10
n_timesteps, n_features, n_outputs = len(self.trainX), 7, 2
print(n_timesteps)
model = keras.Sequential()
model.add(keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(n_timesteps, n_features), use_bias=True))
model.add(keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.MaxPooling1D(pool_size=2))
model.add(keras.layers.Flatten())
model.add(keras.layers.LSTM(2, input_shape=[30592,1], return_sequences=True))
model.summary()
# Compile the model
model.compile(optimizer=optimizer_method, loss='categorical_crossentropy', metrics=['accuracy'])
#fit model
model.fit(self.trainX, self.trainy, epochs=epochs, batch_size=batch_size, verbose=verbose)
# evaluate model
_, accuracy = model.evaluate(self.testX, self.testy, batch_size=batch_size, verbose=0)
return accuracy
Here is the problem I am having: the number of points used to store each signature is different, hence making the input height of the input matrix vary from one signature to the next. Must I now force the dataset to some uniform/constant number of points?
Much appreciated for your time.

How to generate sequence correctly with encoder-decoder lstm?

I am implementing some code to generate labeled data for Natural Language Understanding (NLU) from the article "Labeled Data Generation with Encoder-decoder LSTM for Semantic Slot Filling" (https://pdfs.semanticscholar.org/7ffe/83d7dd3a474e15ccc2aef412009f100a5802.pdf). My architecture is a simple encoder-decoder LSTM, but since my generated sentences (for words and labels) are not correct, I am trying to generate exactly the same sentence (only words) I give as input. Unfortunately, this is not working correctly.
I am using a vord2vec for word-embedding and the dimension of the embeddings is set to 64 (as suggested in the article). The encoder LSTM is receiving the sequence in reversed order and with a dropout rate of 0.5. The decoder LSTM also has a dropout rate of 0.5 and a softmax layer for each output of the sequence to map the most probable word. The inputs are exactly the same that the targets (same sentences) since first I want to produce exactly the same sentence.
For training, I used Adam optimizer and categorical_crossentropy for loss. For inference, I used a beam search (B=3) when generating sequences.
My training code:
def pretrained_embedding_layer(emb):
vocab_len = len(emb)
emb_dim = len(emb[0])
emb_layer = Embedding(vocab_len, emb_dim, trainable = False)
emb_layer.build((None,))
emb_layer.set_weights([emb])
return emb_layer
LSTM_encoder = LSTM(1024, dropout=0.5, return_state=True, go_backwards=True, name='lstm_encoder')
LSTM_decoder = LSTM(1024, dropout=0.5, return_sequences=True, return_state=True, name='lstm_decoder')
dense_w = Dense(vocab_w_size, activation='softmax', name="word_output")
K.set_learning_phase(1)
def model1_enc_dec(input_shape, w_emb):
words_indices = Input(shape=input_shape, dtype='int32')
wemb_layer = pretrained_embedding_layer(w_emb)
wemb = wemb_layer(words_indices)
enc_out, enc_state_h, enc_state_c = LSTM_encoder(wemb)
encoder_states = [enc_state_h, enc_state_c]
dec_out, dec_state_h, dec_state_c = LSTM_decoder(wemb,
initial_state=encoder_states)
dec_out = dense_w(dec_out)
model1 = Model(inputs=[words_indices], outputs=[dec_out])
return model1
model = model1_enc_dec((maxlen,), w_emb, s_emb)
model.summary()
model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])
model.fit(train_w, train_lab_w, validation_data=(val_w, val_lab_w), epochs=epochs, verbose=1, shuffle=True)
My inference code:
wemb_layer = Embedding(len(w_emb), len(w_emb[0]), trainable=False)
wemb_layer.build((None,))
LSTM_encoder = LSTM(1024, return_state=True, go_backwards=True, name='lstm_encoder')
LSTM_decoder = LSTM(1024, return_sequences=True, return_state=True, name='lstm_decoder')
dense_w = Dense(vocab_w_size, activation='softmax', name="word_output")
def target_model(input_shape):
words_indices = Input(shape=input_shape, dtype='int32')
wemb = wemb_layer(words_indices)
enc_out, enc_state_h, enc_state_c = LSTM_encoder(wemb)
encoder_states = [enc_state_h, enc_state_c]
dec_out, dec_state_h, dec_state_c = LSTM_decoder(wemb,
initial_state=encoder_states)
dec_out = dense_w(dec_out)
model = Model(inputs=[words_indices], outputs=[dec_out])
return model
target_model = target_model((maxlen,))
wemb_layer.set_weights(model1.layers[1].get_weights()) # layer 0: input
LSTM_encoder.set_weights(model1.layers[2].get_weights())
LSTM_decoder.set_weights(model1.layers[3].get_weights())
dense_w.set_weights(model1.layers[4].get_weights())
def model1_enco_infe(input_shape):
words_indices = Input(shape=input_shape, dtype='int32')
wemb = wemb_layer(words_indices)
enc_out, enc_state_h, enc_state_c = LSTM_encoder(wemb)
encoder_model = Model(inputs=[words_indices], outputs=[enc_state_h,
enc_state_c])
return encoder_model
def model1_deco_infe(input_shape):
dec_word_input = Input(shape=input_shape, dtype='int32')
dec_state_input_h = Input(shape=(1024,))
dec_state_input_c = Input(shape=(1024,))
wemb = wemb_layer(dec_word_input)
dec_states_input = [dec_state_input_h, dec_state_input_c]
dec_out, dec_state_h, dec_state_c = LSTM_decoder(wemb,
initial_state=dec_states_input)
dec_states_output = [dec_state_h, dec_state_c]
deco_out = dense_w(dec_out)
decoder_model = Model(inputs=[dec_word_input] + dec_states_input, outputs=
[deco_out] + dec_states_output)
return decoder_model
encoder_model = model1_enco_infe((maxlen,))
decoder_model = model1_deco_infe((1,))
def beamsearch_B(deco_w_out, beam):
words_index = []
dw = deco_w_out.copy()
for i in range(beam):
word_index = np.argmax(dw, axis=-1)
dw[0][0][word_index[0][0]] = 0
words_index.append(word_index[0][0])
return words_index
def generate_model1_add(word_seq, encoder_model, decoder_model, dec_word_input, id2word, beam):
[enc_state_h, enc_state_c] = encoder_model.predict(word_seq)
states = [enc_state_h, enc_state_c]
word_sentence = ''
probs_word = []
word_sentences = []
dec_word_inputs = []
states_beam = []
stop_condition = False
[dec_w_out, dec_state_h, dec_state_c] =
decoder_model.predict([dec_word_input] + states)
words_index, _ = beamsearch_B(dec_w_out, [], beam)
for i in range(beam):
probs_word.append(-log(dec_w_out[0][0][words_index[i]]))
word_sentences.append(id2word[words_index[i]])
dec_word_inputs.append([words_index[i]])
states_beam.append([dec_state_h, dec_state_c])
n_words = 1
endgame = []
while not stop_condition:
words_indexes, words_sentences, probs_words, states_b = [], [],
[], []
for k in range(beam):
[dec_w_out, dec_state_h, dec_state_c] =
decoder_model.predict([dec_word_inputs[k]] + states_beam[k])
words_index, _ = beamsearch_B(dec_w_out, [], beam)
states = [dec_state_h, dec_state_c]
for j in range(beam):
words_indexes.append(words_index[j])
probs_words.append(probs_word[k] * -log(dec_w_out[0][0]
[words_index[j]]) + 1e-7)
words_sentences.append(word_sentences[k] + ' ' +
id2word[words_index[j]])
states_b.append(states)
probs = []
for i in range(len(probs_words)):
probs.append(1 / (probs_words[i]))
indexes = []
for i in range(beam):
index = np.argmax(probs, axis=-1)
probs[index] = 0
indexes.append(index)
for i in range(beam):
probs_word[i] = probs_words[indexes[i]]
word_sentences[i] = words_sentences[indexes[i]]
dec_word_inputs[i] = [words_indexes[indexes[i]]]
states_beam[i] = states_b[indexes[i]]
if (id2word[words_indexes[indexes[i]]] == 'EOS'):
endgame.append(i)
if len(endgame) == 1:
word_sentence = word_sentences[endgame]
stop_condition = True
elif len(endgame) > 1:
word_sentence = word_sentences[np.min(endgame)]
stop_condition = True
n_words += 1
if n_words > 50:
word_sentence = word_sentences[0]
stop_condition = True
return word_sentence
word_sentence = generate_model1_add(np.reshape(train_w[i], (1, maxlen)),
encoder_model, 0, decoder_model, [w2i['BOS']], i2w, 3)
An example of my generated sequences:
Input sentence: BOS i 'm fourth in flying from boston to atlanta EOS PAD PAD PAD ...
Generated sentence: BOS from from from from from from from from from from from from from from from from from from from ...
It seems that the training weights are not correct but I got loss: 0.0032 - acc: 0.9990 - val_loss: 0.0794 - val_acc: 0.9888 during training.
What I want is just to generate exactly the same sentence of the input. Hope you can help me guys. Thank you in advance!

Resources