Torch throws a RuntimeError: element 0 of tensors does not require grad... but can't find where computational graph is severed - pytorch

I am getting the above error:
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
I looked this up and it looks like the computational graph is not connected for some reason. However, I cannot find the location where the graph is severed.
My code is a reproduction of the arjovsky WGAN: https://github.com/martinarjovsky/WassersteinGAN
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import torch
import torch.nn as nn
from __future__ import print_function
import random
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
from torch.autograd import Variable
import os
import json
class MLP_G(nn.Module):
def __init__(self, isize, nz, ngf, ngpu):
super(MLP_G, self).__init__()
self.ngpu = ngpu
main = nn.Sequential(
# Z goes into a linear of size: ngf
nn.Linear(nz, ngf),
nn.ReLU(True),
nn.Linear(ngf, ngf),
nn.ReLU(True),
nn.Linear(ngf, ngf),
nn.ReLU(True),
nn.Linear(ngf, isize),
)
self.main = main
self.isize = isize
self.nz = nz
def forward(self, input):
input = input.view(input.size(0), input.size(1))
if isinstance(input.data, torch.cuda.FloatTensor) and self.ngpu > 1:
output = nn.parallel.data_parallel(self.main, input, range(self.ngpu))
else:
output = self.main(input)
return output.view(output.size(0), self.isize)
class MLP_D(nn.Module):
def __init__(self, isize, nz, ndf, ngpu):
super(MLP_D, self).__init__()
self.ngpu = ngpu
main = nn.Sequential(
# Z goes into a linear of size: ndf
nn.Linear(isize, ndf),
nn.ReLU(True),
nn.Linear(ndf, ndf),
nn.ReLU(True),
nn.Linear(ndf, ndf),
nn.ReLU(True),
nn.Linear(ndf, 1),
)
self.main = main
self.isize = isize
self.nz = nz
def forward(self, input):
input = input.view(input.size(0),input.size(1))
if isinstance(input.data, torch.cuda.FloatTensor) and self.ngpu > 1:
output = nn.parallel.data_parallel(self.main, input, range(self.ngpu))
else:
output = self.main(input)
output = output.mean(0)
return output.view(1)
netG = None #path to saved generator
netD = None #discriminator path
batchSize = 1000 #size of batch (which is size of data)
cuda = False
lrD = lrG = .00005
beta1 = .5
niter = 25
experiment = '/content/drive/MyDrive/savefolder'
clamp_upper = .01
clamp_lower = -clamp_upper
manualSeed = random.randint(1, 10000) # fix seed
print("Random Seed: ", manualSeed)
random.seed(manualSeed)
torch.manual_seed(manualSeed)
cudnn.benchmark = True
dataset = torch.tensor(np.stack([x,y, instrument], axis = 1)).float().reshape(-1,3)
ngpu = 1
nz = 4 #three latents and the instrument
ngf = 128
ndf = 128
# custom weights initialization called on netG and netD
def weights_init(m):
classname = m.__class__.__name__
if classname.find('Conv') != -1:
m.weight.data.normal_(0.0, 0.02)
elif classname.find('BatchNorm') != -1:
m.weight.data.normal_(1.0, 0.02)
m.bias.data.fill_(0)
netG = MLP_G(2, nz, ngf, ngpu)
netG.apply(weights_init)
print(netG)
netD = MLP_D(3, nz, ndf, ngpu)
print(netD)
input = torch.FloatTensor(batchSize, 2)
noise = torch.FloatTensor(batchSize, nz-1)
fixed_noise = torch.FloatTensor(batchSize, nz-1).normal_(0, 1)
one = torch.FloatTensor([1])
mone = one * -1
# setup optimizer
optimizerD = optim.Adam(netD.parameters(), lr=lrD, betas=(beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=lrG, betas=(beta1, 0.999))
real_cpu = data = dataset
gen_iterations = 0
for epoch in range(niter):
#data_iter = iter(dataloader)
############################
# (1) Update D network
###########################
for p in netD.parameters(): # reset requires_grad
p.requires_grad = True # they are set to False below in netG update
# train the discriminator Diters times
if gen_iterations < 25 or gen_iterations % 500 == 0:
Diters = 100
else:
Diters = 5
j = 0
while j < Diters:
j += 1
# clamp parameters to a cube
for p in netD.parameters():
p.data.clamp_(clamp_lower, clamp_upper)
# train with real
netD.zero_grad()
if cuda:
real_cpu = real_cpu.cuda()
input.resize_as_(real_cpu).copy_(real_cpu)
inputv = Variable(input, requires_grad=False)
errD_real = netD(inputv)
errD_real.backward(one)#Error Occurs here
# train with fake
noise.resize_(batchSize, nz-1).normal_(0, 1)
noisev = torch.cat([Variable(noise, requires_grad=False), dataset[:,2].reshape(-1,1)], 1)# totally freeze netG
fake = torch.cat([Variable(netG(noisev).data), dataset[:,2].view(-1,1)], 1)
inputv = fake
errD_fake = netD(inputv)
errD_fake.backward(mone)
errD = errD_real - errD_fake
optimizerD.step()
############################
# (2) Update G network
###########################
for p in netD.parameters():
p.requires_grad = False # to avoid computation
netG.zero_grad()
# in case our last batch was the tail batch of the dataloader,
# make sure we feed a full batch of noise
noise.resize_(batchSize, nz-1).normal_(0, 1)
noisev = torch.cat([Variable(noise), dataset[:,2].view(-1,1)], 1)
fake = torch.cat([netG(noisev), dataset[:,2].view(-1,1)], 1)
errG = netD(fake)
errG.backward(one)
optimizerG.step()
gen_iterations += 1
i = 0
print('[%d/%d][%d] Loss_D: %f Loss_G: %f Loss_D_real: %f Loss_D_fake %f'
% (epoch, niter, gen_iterations,
errD.data[0], errG.data[0], errD_real.data[0], errD_fake.data[0]))
# if gen_iterations % 500 == 0:
# real_cpu = real_cpu.mul(0.5).add(0.5)
# vutils.save_image(real_cpu, '{0}/real_samples.png'.format(opt.experiment))
# fake = netG(Variable(fixed_noise, volatile=True))
# fake.data = fake.data.mul(0.5).add(0.5)
# vutils.save_image(fake.data, '{0}/fake_samples_{1}.png'.format(opt.experiment, gen_iterations))
# do checkpointing
torch.save(netG.state_dict(), '{0}/netG_epoch_{1}.pth'.format(experiment, epoch))
torch.save(netD.state_dict(), '{0}/netD_epoch_{1}.pth'.format(experiment, epoch))
Error occurs on the line: errD_real.backward(one). The error might be something regarding zeroing out the computational graph as the code runs for one iteration then throws an error. Thanks for your help.

You most certainly need to add require_grad=True on one. You could define it as:
one = torch.tensor([1], dtype=torch.float16, requires_grad=True)

Related

REINFORCE for Cartpole: Training Unstable

I am implementing REINFORCE for Cartpole-V0. However, the training process is very unstable. I have not implemented `early-stopping' for the environment and allow training to continue for a fixed (high) number of episodes. After a few thousand iterations, the training reward seems to go down again. Is this due to overfitting and early-stopping is essential, or have I implemented something incorrectly?
Here is my code:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import os
def running_average(x, n):
N = n
kernel = np.ones(N)
conv_len = x.shape[0]-N
y = np.zeros(conv_len)
for i in range(conv_len):
y[i] = kernel # x[i:i+N] # matrix multiplication operator: np.mul
y[i] /= N
return y
class PolicyNetwork(nn.Module):
def __init__(self, state_dim, n_actions):
super().__init__()
self.n_actions = n_actions
self.model = nn.Sequential(
nn.Linear(state_dim, 64),
nn.ReLU(),
nn.Linear(64, 32),
nn.ReLU(),
nn.Linear(32, n_actions),
nn.Softmax(dim=1)
).float()
def forward(self, X):
return self.model(X)
def train_reinforce_agent(env, episode_length, max_episodes, gamma, visualize_step, learning_rate=0.003):
model = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
EPISODE_LENGTH = episode_length
MAX_EPISODES = max_episodes
GAMMA = gamma
VISUALIZE_STEP = max(1, visualize_step)
score = []
for episode in range(MAX_EPISODES):
curr_state = env.reset()
done = False
all_episode_t = []
score_episode = 0
for t in range(EPISODE_LENGTH):
act_prob = model(torch.from_numpy(curr_state).unsqueeze(0).float())
action = np.random.choice(np.array(list(range(env.action_space.n))), p=act_prob.squeeze(0).data.numpy())
prev_state = curr_state
curr_state, reward, done, info = env.step(action)
score_episode += reward
e_t = {'state': prev_state, 'action':action, 'reward': reward, 'returns':0}
all_episode_t.append(e_t)
if done:
break
score.append(score_episode)
G = 0
max_G = 0
for t in range(len(all_episode_t)-1, -1, -1):
G = GAMMA*G + all_episode_t[t]['reward']
all_episode_t[t]['returns'] = G
if G > max_G:
max_G = G
episode_returns = np.array([all_episode_t[t]['returns'] for t in range(len(all_episode_t))])
# normalize the returns
for t in range(len(all_episode_t)):
all_episode_t[t]['returns'] = (all_episode_t[t]['returns'] - np.mean(episode_returns))/(max_G + 10**(-6))
episode_returns = torch.FloatTensor(episode_returns)
state_batch = torch.Tensor(np.array([all_episode_t[t]['state'] for t in range(len(all_episode_t))]))
action_batch = torch.Tensor(np.array([all_episode_t[t]['action'] for t in range(len(all_episode_t))]))
pred_batch = model(state_batch)
prob_batch = pred_batch.gather(dim=1, index=action_batch.long().view(-1, 1)).squeeze()
loss_tensor = torch.log(prob_batch) * episode_returns
loss = -torch.sum(loss_tensor)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if episode % VISUALIZE_STEP == 0 and episode > 0:
print('Episode {}\tAverage Score: {:.2f}'.format(episode, np.mean(score[-VISUALIZE_STEP:-1])))
# # EARLY-STOPPING: if the average score across last 100 episodes is greater than 195, game is solved
# if np.mean(score[-100:-1]) > 195:
# break
# Training plot
score = np.array(score)
avg_score = running_average(score, visualize_step)
plt.figure(figsize=(15, 7))
plt.ylabel("Episodic Reward", fontsize=12)
plt.xlabel("Training Episodes", fontsize=12)
plt.plot(score, color='gray', linewidth=1)
plt.plot(avg_score, color='blue', linewidth=3)
plt.scatter(np.arange(score.shape[0]), score, color='green', linewidth=0.3)
plt.savefig("cartpole_reinforce_training_plot.pdf")
def main():
env = gym.make('CartPole-v0')
episode_length = 300
n_episodes = 5000
gamma = 0.99
vis_steps = 100
learning_rate = 0.003
train_reinforce_agent(env, episode_length, n_episodes, gamma, vis_steps, learning_rate=learning_rate)
if __name__ == "__main__":
main()

keras BatchGenerator(keras.utils.Sequence) is too slow

I'm using a custom batch generator with large dataframe. but the Generator takes too much time to generate a batch, it takes 127s to generate a batch of 1024. I've tried Dask but still, the processing is slow. is there any way to integrate multiprocessing with inside the generator. knowing that I've tried use_multiprocessing=True with workers=12
import keras
from random import randint
import glob
import warnings
import numpy as np
import math
import pandas as pd
import dask.dataframe as dd
class BatchGenerator(keras.utils.Sequence):
'Generates data for Keras'
def __init__(self, labels=None, batch_size=8, n_classes=4, shuffle=True,
seq_len=6, data_path=None, meta_path=None,list_IDs=None):
'Initialization'
self.batch_size = batch_size
self.labels = labels
self.n_classes = n_classes
self.shuffle = shuffle
self.seq_len = seq_len
self.meta_df = meta_path
self.data_df = data_path
self.data_df = self.data_df.astype({"mjd": int})
self.list_IDs = list_IDs
if self.list_IDs==None:
self.list_IDs = list(self.meta_df['object_id'].unique())
self.on_epoch_end()
def __len__(self):
'Denotes the number of batches per epoch'
return int(np.floor(len(self.list_IDs) / self.batch_size))
def __getitem__(self, index):
'Generate one batch of data'
# Generate indexes of the batch
indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
# Find list of IDs
list_IDs_temp = [self.list_IDs[k] for k in indexes]
# Generate data
X, y = self.__data_generation(list_IDs_temp)
return X, y
def on_epoch_end(self):
'Updates indexes after each epoch'
self.indexes = np.arange(len(self.list_IDs))
if self.shuffle == True:
np.random.shuffle(self.indexes)
def __data_generation(self, list_IDs_temp):
X_dat = np.zeros((self.batch_size, self.seq_len,6,1))
Y_mask = np.zeros((self.batch_size, self.seq_len,6,1))
# Y_dat = np.empty((self.batch_size,1), dtype=int)
X_length= np.empty((self.batch_size,1), dtype=int)
for i, trans_id in enumerate(list_IDs_temp):
curve = self.data_df[self.data_df.object_id==trans_id]
mjdlist = list(curve['mjd'].unique())
ts_length = len(mjdlist)
if ts_length <= self.seq_len :
start_ind = 0
else :
start_ind = randint(0, ts_length - self.seq_len)
ts_length = self.seq_len
for j in range(ts_length):
if j+start_ind < len(mjdlist):
step = curve[curve.mjd==mjdlist[j+start_ind]]
for k in range(len(step.mjd)):
obs = step[step.passband==k]
if len(obs) == 0 :
# print('here is one')
continue
else:
if k == 0:
X_dat[i,j,0,0] =obs.flux.iloc[0]
Y_mask[i,j,0,0] = 1
if k == 1:
X_dat[i,j,1,0] = obs.flux.iloc[0]
Y_mask[i,j,1,0] = 1
if k == 2:
X_dat[i,j,2,0] = obs.flux.iloc[0]
Y_mask[i,j,2,0] = 1
if k == 3:
X_dat[i,j,3,0] = obs.flux.iloc[0]
Y_mask[i,j,3,0] = 1
if k == 4:
X_dat[i,j,4,0] = obs.flux.iloc[0]
Y_mask[i,j,4,0] = 1
if k == 5:
X_dat[i,j,5,0] = obs.flux.iloc[0]
Y_mask[i,j,5,0] = 1
# meta = self.meta_df[self.meta_df['object_id'] == trans_id]
# Y_dat[i] = self.labels[int(meta['target'])]
X_length[i,0] = ts_length
flux_max = np.max(X_dat[i])
flux_min = np.min(X_dat[i])
flux_pow = math.log2(flux_max - flux_min)
X_dat[i] /= flux_pow
X_noised = X_dat + np.random.uniform(low=0, high=0.5, size=X_dat.shape)
return [X_noised, X_length, np.reshape(Y_mask,(self.batch_size, self.seq_len*6))], np.reshape(X_dat,(self.batch_size, self.seq_len*6))
To make it faster, the for loop in the function __data_generation should be parallelized. Using the joblib package may help.

Pytorch on Windows : Dataloader problems with numworkers

I have just got a new computer running Windows 10 which has a GPU so I wanted to see if I could sensibly use it for machine learning.
So I tried running an old model which I previously trained on Google Colab.
The answer is that it does do quite well, but I discovered that I could not use more than one worker in the Dataloader. Googling found that this is a known issue with PyTorch on Windows in Jupyter Notebooks so I tried running it in a normal Python program. I found that it did work but that the creation of the DataIterator took a very long time. Below are the times in seconds for 1, 2 and 6 workers, each done twice:
I note that 2 workers seems to be the fastest and there seems to be quite a lot of variation which surprised me as the machine was doing nothing else.
So the first question is:
Is there a way to let PyTorch choose the most efficient number of workers to use?
The second question is:
If I install a version of Linux will I be able to use Jupyter Notebooks with multiple workers which is what I would prefer to do in an ideal world.
The code I ran is below the relevant part is after if __name__ == "__main__":
# -*- coding: utf-8 -*-
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
import os
import numpy as np
#import gym
import pickle
import matplotlib.pyplot as plt
import time
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, in_planes, planes, stride=1):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(
in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
stride=1, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion*planes:
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes, self.expansion*planes,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(self.expansion*planes)
)
def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = self.bn2(self.conv2(out))
out += self.shortcut(x)
out = F.relu(out)
return out
# create dataset
class C4Dataset(Dataset):
'''
the data for the first 12moves is held in a pickles list
as (key,val)
The key has to be converted to the pos amd mask which can then be converted to the ones,twos and zeros
Val is the value for the player playing so needs to be changed for wnen moves i odd to minus val
'''
fileName = r'C:\Users\alan\Desktop\Python\Python36\connect4\Layers\ListAllKeyVal19'
bottom_mask = 4432676798593
board_mask = bottom_mask * ((1 << 6) - 1)
bNos = 2**np.arange(49, dtype = np.uint64)
def getData(fileName):
with open(fileName,'rb') as inFile:
dict = pickle.load(inFile)
return dict
def oneHot(x):
return np.eye(37,dtype = np.float32)[x]
def getNoMoves(ones,twos) :
return np.sum(ones+twos)
def getPosMask(key):
binary = ('{:049b}'.format(key))[::-1]
arr = np.frombuffer(binary.encode(encoding='utf-8', errors='strict'),'u1') - ord('0')
outArr = np.reshape(arr,(7,7),order = 'F')
arr = np.flipud(outArr)
pos = arr.copy()
mask =arr.copy()
for col in range(7):
res = np.where(arr[:,col]==1)
topPos = res[0][0]
pos[topPos,col] = 0
mask[topPos,col] = 0
if topPos<6:
mask[topPos+1:,col] = 1
msk = np.flipud(mask)
msk = np.reshape(msk,(49),order = 'F')
maskNo = np.array(msk.dot(C4Dataset.bNos),dtype = np.uint64).item()
return pos.astype('float32'),(pos ^ mask).astype('float32'),(np.logical_not(mask)).astype('float32'),maskNo
def possible(mask) :
poss = (mask + C4Dataset.bottom_mask) & C4Dataset.board_mask
binary = ('{:049b}'.format(poss))[::-1]
arr = np.frombuffer(binary.encode(encoding='utf-8', errors='strict'),'u1') - ord('0')
outArr = np.reshape(arr,(7,7),order = 'F')
arr = np.flipud(outArr)
return arr
def __init__(self):
self.lst = C4Dataset.getData(C4Dataset.fileName)
def __len__(self):
return len(self.lst)
def __getitem__(self, idx):
key,val = self.lst[idx]
val = int(val)
ones,twos,zeros,mask = C4Dataset.getPosMask(key)
arr = np.zeros((5,7,7),dtype = np.float32)
arr[0,:6,:7] = ones[1:,:]
arr[1,:6,:7] = twos[1:,:]
arr[2,:6,:7] = zeros[1:,:]
moves = int(C4Dataset.getNoMoves(ones,twos))
p = (moves % 2) + 3
arr[p,:6,:7] = C4Dataset.possible(mask)[1:,:]
return arr,val+18 #C4Dataset.oneHot(val+18)
class C4Net(nn.Module):
def __init__(self, inFilters,outFilters):
super(C4Net, self).__init__()
self.conv1 = nn.Conv2d(inFilters, 32, kernel_size=3,
stride=1, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(32)
self.layer1 = BasicBlock(32,32)
self.layer2 = BasicBlock(32,32)
self.layer3 = BasicBlock(32,32)
self.layer4 = BasicBlock(32,32)
self.layer5 = BasicBlock(32,32)
self.layer6 = BasicBlock(32,32)
self.layer7 = BasicBlock(32,32)
self.layer8 = BasicBlock(32,32)
self.linear = nn.Linear(32*7*7,outFilters)#1568
def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = self.layer1(out)
out = self.layer2(out)
out = self.layer3(out)
out = self.layer4(out)
out = self.layer5(out)
out = self.layer6(out)
out = self.layer7(out)
out = self.layer8(out)
#out = F.avg_pool2d(out, 2)
out = out.view(out.size(0), -1)
out = self.linear(out)
return out
# show some images
def show(img):
npimg = img.numpy()[:3,:,:]
plt.imshow(np.transpose(npimg, (1,2,0)), interpolation='nearest')
# get some random training images
if __name__ == "__main__":
dirName =r'C:\Users\alan\Desktop\Python\Python36\connect4\Layers'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Assuming that we are on a CUDA machine, this should print a CUDA device:
print(device)
# create dataloader
max_epochs = 1
batchSize = 1024#512#256
learningRate = .00003
# Parameters
params = {'batch_size': batchSize,'shuffle': True,'num_workers': 2}
# Generators
dataset = C4Dataset()
start = time.time()
dataloader = DataLoader(dataset, **params)
middle = time.time()
print('create dataloader',middle-start)
dataiter = iter(dataloader)
end = time.time()
print('create data iterator',end-middle)
images, labels = dataiter.next()
final = time.time()
print('get one batch',final-end)
# show images
show(torchvision.utils.make_grid(images[:16]))
#create the weights
wts =np.array([59, 963, 12406, 148920, 62551, 47281, 55136, 54312, 44465, 31688,
27912, 37907, 114778, 242800, 394530, 495237, 582174, 163370, 480850,
201152, 690905, 633937, 721340, 372479, 193375, 84648, 76576, 91087, 130428,
154184, 157339, 156453, 227696, 1705325, 548155, 44315, 2082],dtype = np.float32)
maxwt = wts.max()
weights = wts/maxwt
weights = torch.from_numpy(weights)
weights.to(device)
# create the network
net = C4Net(5,37)
net.to(device)
PATH = r'C:\Users\alan\Desktop\Python\connectX\c4Net37Weights00003.pth'
net.load_state_dict(torch.load(PATH,map_location=torch.device(device)))
#create the loss function and optimiser
criterion = nn.CrossEntropyLoss(weight = weights.to(device) )
optimizer = optim.Adam(net.parameters(), lr=learningRate)
#train the network
import time
start = time.time()
for epoch in range(max_epochs): # loop over the dataset multiple times
running_loss = 0.0
for i, data in enumerate(dataloader, 0):
# get the inputs; data is a list of [inputs, labels]
inputs, labels = inputs, labels = data[0].to(device), data[1].to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
if i % 2000 == 1999: # print every 2000 mini-batches
print('[%d, %5d] loss: %.3f' %
(epoch + 1, i + 1, running_loss / 2000))
running_loss = 0.0
torch.save(net.state_dict(),r'C:\Users\alan\Desktop\Python\connectX\tempWeights')
print('Finished Training')
# save the weights
PATH = r'C:\Users\alan\Desktop\Python\connectX\c4Net37Weights00004.pth'
torch.save(net.state_dict(), PATH)
end = time.time()
print('elapsed time',end-start)
PS the machine is a Dell XPS 17 with Intel Core i9-10885H with 8 cores and the GPU is a NVIDIA GeForce RTX 2060 with Max-Q. In this one test it runs 4 times faster than on Google Colab but I do not know what GPU I was allocated.

ImportError: Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training

cannot install apex for distributed and fp16 training of bert model
i have tried to install by cloning the apex from github and tried to install packages using pip
i have tried to install apex by cloning from git hub using following command:
git clone https://github.com/NVIDIA/apex.git
and cd apex to goto apex directory and tried to install package using following pip command:
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext"
full code is:
def main(server_ip,server_port,local_rank,no_cuda,fp16,train_batch_size,gradient_accumulation_steps,seed,do_train,do_eval,output_dir,task_name,data_dir,do_lower_case,bert_model,num_train_epochs,cache_dir,learning_rate,warmup_proportion,loss_scale,max_seq_length):
if server_ip and server_port:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import ptvsd
print("Waiting for debugger attach")
ptvsd.enable_attach(address=(server_ip, server_port), redirect_output=True)
ptvsd.wait_for_attach()
processors = {"ner":NerProcessor}
print(processors)
if local_rank == -1 or no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of sychronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, n_gpu, bool(local_rank != -1), fp16))
if gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
train_batch_size = train_batch_size // gradient_accumulation_steps
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if not do_train and not do_eval:
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
if os.path.exists(output_dir) and os.listdir(output_dir) and do_train:
raise ValueError("Output directory ({}) already exists and is not empty.".format(output_dir))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
task_name = task_name.lower()
if task_name not in processors:
raise ValueError("Task not found: %s" % (task_name))
processor = processors[task_name]()
label_list = processor.get_labels()
num_labels = len(label_list) + 1
tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)
train_examples = None
num_train_optimization_steps = None
if do_train:
train_examples = processor.get_train_examples(data_dir)
num_train_optimization_steps = int(
len(train_examples) / train_batch_size / gradient_accumulation_steps) * num_train_epochs
if local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# # Prepare model
cache_dir = cache_dir if cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(local_rank))
model = Ner.from_pretrained(bert_model,
cache_dir=cache_dir,
num_labels = num_labels)
if fp16:
model.half()
# model.cuda()
model.to(device)
if local_rank != -1:
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
if fp16:
try:
from apex.optimizers import FP16_Optimizer
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=learning_rate,
bias_correction=False,
max_grad_norm=1.0)
if loss_scale == 0:
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=learning_rate,
warmup=warmup_proportion,
t_total=num_train_optimization_steps)
global_step = 0
nb_tr_steps = 0
tr_loss = 0
label_map = {i : label for i, label in enumerate(label_list,1)}
if do_train:
train_features = convert_examples_to_features(
train_examples, label_list, max_seq_length, tokenizer)
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_examples))
logger.info(" Batch size = %d", train_batch_size)
logger.info(" Num steps = %d", num_train_optimization_steps)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long)
all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,all_valid_ids,all_lmask_ids)
if local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)
model.train()
for _ in trange(int(num_train_epochs), desc="Epoch"):
tr_loss = 0
nb_tr_examples, nb_tr_steps = 0, 0
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
batch = tuple(t.to(device) for t in batch)
input_ids, input_mask, segment_ids, label_ids, valid_ids,l_mask = batch
loss = model(input_ids, segment_ids, input_mask, label_ids,valid_ids,l_mask)
del loss
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if gradient_accumulation_steps > 1:
loss = loss / gradient_accumulation_steps
if fp16:
optimizer.backward(loss)
else:
loss.backward()
tr_loss += loss.item()
nb_tr_examples += input_ids.size(0)
nb_tr_steps += 1
if (step + 1) % gradient_accumulation_steps == 0:
if fp16:
# modify learning rate with special warm up BERT uses
# if args.fp16 is False, BertAdam is used that handles this automatically
lr_this_step = learning_rate * warmup_linear(global_step/num_train_optimization_steps, warmup_proportion)
for param_group in optimizer.param_groups:
param_group['lr'] = lr_this_step
optimizer.step()
optimizer.zero_grad()
global_step += 1
main('','',-1,True,True,8,1,42,True,True,'jpt','ner','data/',True,'bert-base-cased',5,'cache_dir',5e-5,0.4,0,128)
This worked for me:
import os, sys, shutil
import time
import gc
from contextlib import contextmanager
from pathlib import Path
import random
import numpy as np, pandas as pd
from tqdm import tqdm, tqdm_notebook
#contextmanager
def timer(name):
t0 = time.time()
yield
print(f'[{name}] done in {time.time() - t0:.0f} s')
USE_APEX = True
if USE_APEX:
with timer('install Nvidia apex'):
# Installing Nvidia Apex
os.system('git clone https://github.com/NVIDIA/apex; cd apex; pip install -v --no-cache-dir' +
' --global-option="--cpp_ext" --global-option="--cuda_ext" ./')
os.system('rm -rf apex/.git') # too many files, Kaggle fails
from apex import amp

Input shape in Keras

I am creating a deep neural network using Keras using images from the Gym library from Open AI.
I tried to reshape the images using the following code:
def reshape_dimensions(observation):
processed = np.mean(observation,2,keepdims = False)
cropped = processed[35:195]
result = cropped[::2,::2]
return result
This gives me an image of shape (80,80) but every time I try to input that shape in the first layer of the Keras network it doesn't work.
What should be the shape I should use so I can further develop the network?
Attached the whole code:
PART I retrieves the training data
import gym
import random
import numpy as np
from statistics import mean, median
from collections import Counter
### GAME VARIABLE SETTINGS ###
env = gym.make('MsPacman-v0')
env.reset()
goal_steps = 2000
score_requirement = 250
initial_games = 200
print('Options to play: ',env.unwrapped.get_action_meanings())
### DEFINE FUNCTIONS ####
def reshape_dimensions(observation):
processed = np.mean(observation,2,keepdims = False)
cropped = processed[35:195]
result = cropped[::2,::2]
return result
def initial_population():
training_data = []
scores = []
accepted_scores = []
for _ in range(initial_games):
score = 0
game_memory = []
prev_obvservation = []
for _ in range(goal_steps):
#env.render()
action = env.action_space.sample() #Take random action in the env
observation, reward, done, info = env.step(action)
reshape_observation = reshape_dimensions(observation)
if len(prev_obvservation) > 0:
game_memory.append([prev_obvservation, action])
prev_obvservation = reshape_observation
score = score + reward
if done:
break
if score >= score_requirement:
accepted_scores.append(score)
for data in game_memory:
if data[1] == 0:
output = [1,0,0,0,0,0,0,0,0]
elif data[1] == 1:
output = [0,1,0,0,0,0,0,0,0]
elif data[1] == 2:
output = [0,0,1,0,0,0,0,0,0]
elif data[1] == 3:
output = [0,0,0,1,0,0,0,0,0]
elif data[1] == 4:
output = [0,0,0,0,1,0,0,0,0]
elif data[1] == 5:
output = [0,0,0,0,0,1,0,0,0]
elif data[1] == 6:
output = [0,0,0,0,0,0,1,0,0]
elif data[1] == 7:
output = [0,0,0,0,0,0,0,1,0]
elif data[1] == 8:
output = [0,0,0,0,0,0,0,0,1]
training_data.append([data[0],output])
env.reset()
scores.append(score)
print('Average accepted scores:', mean(accepted_scores))
print('Median accepted scores:', median(accepted_scores))
print(Counter(accepted_scores))
return training_data
### RUN CODE ###
training_data = initial_population()
np.save('data_for_training_200.npy', training_data)
PART II trains the model
import gym
import random
import numpy as np
import keras
from statistics import mean, median
from collections import Counter
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
### LOAD DATA ###
raw_training_data = np.load("data_for_training_200.npy")
training_data = [i[0:2] for i in raw_training_data]
print(np.shape(training_data))
### DEFINE FUNCTIONS ###
def neural_network_model():
network = Sequential()
network.add(Dense(100, activation = 'relu', input_shape = (80,80)))
network.add(Dense(9,activation = 'softmax'))
optimizer = Adam(lr = 0.001)
network.compile(optimizer = optimizer, loss = 'categorical_crossentropy', metrics=['accuracy'])
return network
def train_model(training_data):
X = [i[0] for i in training_data]
y = [i[1] for i in training_data]
#X = np.array([i[0] for i in training_data])
#y = np.array([i[1] for i in training_data])
print('shape of X: ', np.shape(X))
print('shape of y: ', np.shape(y))
early_stopping_monitor = EarlyStopping(patience = 3)
model = neural_network_model()
model.fit(X, y, epochs = 20, callbacks = [early_stopping_monitor])
return model
train_model(training_data = training_data)
It seems like you are pre-processing individual images correctly but putting them inside a list instead of an input tensor. From the error message you have a list of 36859 (80,80) arrays while you would like to have a single array of shape (36859, 80, 80). You have the code that does this commented out X = np.array([i[0] for i in training_data]), you have to ensure that every i[0] is of same shape (80,80) for this to work.

Resources