why the gpu memory is increasing when I set the parameters of the network? PyTorch - memory-leaks

I'm conducting an experiment, but the GPU memory keeps increasing. By inspecting the code line by line, I found that the following function causes the memory leak. Can anyone help me find the reason and a solution? It would be appreciated! Note that the function in question is "set_param".
class MyModule(nn.Module):
    """Module whose parameters can be overwritten by a manual gradient step.

    NOTE(review): ``named_params`` and ``to_var`` are not defined in this
    snippet; they are assumed to be provided by the surrounding project.
    """

    def update_params(self, lr_inner, first_order=False, source_params=None, detach=False):
        """Apply ``param <- param - lr_inner * grad`` to every named parameter.

        Args:
            lr_inner: Step size of the update.
            first_order: If true, the gradient is detached and re-wrapped
                with ``to_var`` before it is used.
            source_params: Optional iterable of gradients, consumed in the
                same order as ``self.named_params(self)``. When omitted, each
                parameter's own ``.grad`` is used.
            detach: Only read when ``source_params`` is None. If true, no
                step is taken; every parameter is detached in place and
                stored back.

        Entries whose gradient is ``None`` are left untouched.

        NOTE(review): the stored tensor is the result of an arithmetic
        expression on the previous parameter, so with autograd enabled each
        call extends the history of the previous one. That history is only
        cut by the ``detach=True`` path -- presumably the reason GPU memory
        keeps growing across iterations; confirm with a memory profile.
        """
        if source_params is not None:
            for (name_t, param_t), grad in zip(self.named_params(self), source_params):
                if grad is None:
                    continue
                if first_order:
                    grad = to_var(grad.detach().data)
                self.set_param(self, name_t, param_t - lr_inner * grad)
            return

        for name, param in self.named_params(self):
            if detach:
                self.set_param(self, name, param.detach_())
                continue
            grad = param.grad
            if grad is None:
                # Nothing to step with (e.g. the parameter was not used).
                continue
            if first_order:
                grad = to_var(grad.detach().data)
            self.set_param(self, name, param - lr_inner * grad)

    def set_param(self, curr_mod, name, param):
        """Assign ``param`` to the attribute addressed by the dotted ``name``.

        ``"a.b.w"`` descends into child ``a`` of ``curr_mod``, then child
        ``b``, and finally calls ``setattr(<b>, "w", param)``. If a child on
        the path does not exist the call is a silent no-op.
        """
        if '.' in name:
            module_name, rest = name.split('.', 1)
            for child_name, child in curr_mod.named_children():
                if child_name == module_name:
                    self.set_param(child, rest, param)
                    break
        else:
            setattr(curr_mod, name, param)

Related

How can I speed up using Pytorch DataLoader?

I had a dataset including about a million of rows. Before, I read the rows, preprocessed data and created a list of rows to be trained. Then I defined a Dataloader over this data like:
train_dataloader = torch.utils.data.DataLoader(mydata['train'],
batch_size=node_batch_size,shuffle=shuffle,collate_fn=data_collator)
Preprocessing could be time consuming, so I thought to define an IterableDataSet with __iter__ function. Then I could define my Dataloader like:
train_dataloader = torch.utils.data.DataLoader(myds['train'],
batch_size=node_batch_size,shuffle=shuffle,collate_fn=data_collator)
However, it still seems to call my preprocessing function and build the whole iterator before training begins, so I didn't gain much speed-up.
Please guide me on how I could speed things up in this case.
Here is my part of my class:
def __iter__(self):
    """Return an iterator over this worker's share of the samples.

    Without DataLoader workers the whole range ``[self.start,
    self.num_samples)`` is used. Inside a worker process the range is split
    into ``num_workers`` contiguous slices and this worker takes the slice
    matching its id.
    """
    worker_info = torch.utils.data.get_worker_info()
    if worker_info is None:
        # Single-process data loading: iterate over everything.
        iter_start, iter_end = self.start, self.num_samples
    else:
        # Split the workload evenly; the last slice may be shorter.
        per_worker = int(math.ceil((self.num_samples - self.start) / float(worker_info.num_workers)))
        iter_start = self.start + worker_info.id * per_worker
        iter_end = min(iter_start + per_worker, self.num_samples)
    if self.flat_data:
        # NOTE(review): the cached list is returned whole, so with several
        # workers every worker replays all of it -- confirm this is intended.
        return iter(self.flat_data)
    return iter(self.fill_data(iter_start, iter_end))
def fill_data(self, iter_start, iter_end, show_progress=False):
flat_data = []
if iter_end < 0:
iter_end = self.num_samples
kk = 0
dlog.info("========================== SPLIT: %s", self.split_name)
dlog.info("get data from %s to %s", iter_start, iter_end)
dlog.info("total rows: %s", len(self.split_df))
if show_progress:
pbar = tqdm(total = self.num_samples)
for index, d in self.split_df.iterrows():
if kk < iter_start:
dlog.info("!!!!!!!!! before start %s", iter_start)
kk += 1
continue
rel = d["prefix"]
...
# preprocessing and adding to returned list
I was doing the preprocessing in the body of fill_data or __iter__. Instead, I can apply the preprocessing with map. Then the preprocessing is called lazily during training, for every batch, and not before training starts.
import pandas as pd
import torch
class MyDataset(torch.utils.data.IterableDataset):
    """Iterable dataset that preprocesses rows lazily while they are consumed."""

    def __init__(self, fname, until=10):
        """Load the table ``"atomic/" + fname``.

        ``until`` is stored but not read anywhere in this class.
        """
        self.df = pd.read_table("atomic/" + fname)
        self.until = until

    def preproc(self, t):
        """Turn a ``(prefix, input_text)`` pair into one formatted string."""
        prefix, data = t
        text = "Preproc: " + prefix + "|" + data
        print(text)  # to check when it is called
        return text

    def __iter__(self):
        # map() is lazy: preproc runs only when the consumer pulls an item.
        return map(self.preproc, self.df_iter())

    def df_iter(self):
        """Yield ``(prefix, input_text)`` pairs in row order.

        The pairs are produced lazily from the two columns instead of being
        collected into a list first, so nothing is materialised up front.
        """
        return zip(self.df["prefix"], self.df["input_text"])

"for tokens_tensor, segments_tensors, att_mask, pos_id, trg in data_loader: NameError: name 'data_loader' is not defined"

I am trying to implement question answering model with a BERT transformer implemented by jugapuff.
Link to the code: https://github.com/jugapuff/BERT-for-bAbi-task
After executing the main.py file (reproduced below), I'm getting this error: "for tokens_tensor, segments_tensors, att_mask, pos_id, trg in data_loader: NameError: name 'data_loader' is not defined"
import torch
import torch.nn as nn
from pytorch_transformers import AdamW
from torch.utils.data import DataLoader

from dataloader import bAbi_Dataset, build_trg_dics, collate_fn
from model import model

EPOCHS = 10
BATCH_SIZE = 16  # NOTE(review): arbitrary choice; tune to the available memory.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print("GPU:" + str(torch.cuda.get_device_name(0)))

# Build the training data pipeline. ``data_loader`` was never defined in the
# original script, which is what raised the NameError in the training loop.
trg_word2id, trg_id2word = build_trg_dics()
train_dataset = bAbi_Dataset(trg_word2id)
data_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    # collate_fn pads every sequence of a batch to a common length.
    collate_fn=collate_fn,
)

my_model = model()
my_model.to(device)
optimizer = AdamW(my_model.parameters())
criterion = nn.NLLLoss()

for epoch in range(1, EPOCHS+1):
    my_model.train()
    train_loss = 0
    length = 0
    for tokens_tensor, segments_tensors, att_mask, pos_id, trg in data_loader:
        output = my_model(tokens_tensor.to(device), segments_tensors.to(device), att_mask.to(device), pos_id.to(device))
        loss = criterion(output, trg.to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        length+=1
        train_loss += loss.item()
        if length % 10 == 0:
            # Running mean of the loss over the batches seen so far.
            print("\t\t{:3}/25000 : {}".format(length, train_loss / length))
    epoch_loss = train_loss / length
    print("##################")
    print("{} epoch Loss : {:.4f}".format(epoch, epoch_loss))
and data_loader.py is as
import os
import torch
import torch.utils.data as data
from pytorch_transformers import BertTokenizer
def _parse(file, only_supporting=False):
    """Parse bAbI-format lines into ``(story, query, answer)`` samples.

    Every line is ``"<id> <text>"``. An id of ``1`` starts a new story. Text
    ending in ``.`` is a statement and is appended to the running story; any
    other line is a question of the form ``query<TAB>answer<TAB>supporting``.

    Args:
        file: Iterable of lines (an open text file works).
        only_supporting: If true, a sample's story holds only the statements
            whose 1-based line ids are listed in the supporting field;
            otherwise it holds every statement seen so far in the story.

    Returns:
        List of ``(substory, query, answer)`` tuples, where ``substory`` is
        a list of statements and ``query`` has its last character (the
        question mark) removed.
    """
    data, story = [], []
    for line in file:
        if not line.strip():
            # Blank lines carry no "<id> <text>" pair; unpacking would fail.
            continue
        tid, text = line.rstrip('\n').split(' ', 1)
        if tid == '1':
            story = []
        if text.endswith('.'):
            story.append(text)
        else:
            query, answer, supporting = (x.strip() for x in text.split('\t'))
            if only_supporting:
                substory = [story[int(i) - 1] for i in supporting.split()]
            else:
                substory = [x for x in story if x]
            data.append((substory, query[:-1], answer))
            # Placeholder keeps later line ids aligned with list positions.
            story.append("")
    return data
def build_trg_dics(tenK=True, path="tasks_1-20_v1-2", train=True):
    """Build the answer vocabulary of the selected bAbI split.

    Args:
        tenK: Read the ``en-10k`` sub-directory of ``path`` when true,
            otherwise ``en``.
        path: Root directory of the extracted task files.
        train: Use the files whose name contains ``train.txt`` when true,
            otherwise those containing ``test.txt``.

    Returns:
        ``(trg_word2id, trg_id2word)``: two dicts mapping each distinct
        answer string to an integer id and back. Ids follow the sorted
        order of the answers, so the mapping is identical across runs.

    Raises:
        FileNotFoundError: if the selected directory does not exist.
    """
    dirname = os.path.join(path, 'en-10k' if tenK else 'en')
    if not os.path.isdir(dirname):
        raise FileNotFoundError("bAbI task directory not found: " + dirname)

    wanted = "train.txt" if train else "test.txt"
    filenames = sorted(
        name for name in os.listdir(dirname)
        if wanted in name and os.path.isfile(os.path.join(dirname, name))
    )

    answers = set()
    for filename in filenames:
        # The context manager closes the handle even if parsing fails.
        with open(os.path.join(dirname, filename), 'r') as f:
            answers.update(d[2] for d in _parse(f))

    # Sorting is essential: iteration order of a set of strings changes from
    # one interpreter run to the next, which would scramble the ids.
    vocab = sorted(answers)
    trg_word2id = {word: i for i, word in enumerate(vocab)}
    trg_id2word = dict(enumerate(vocab))
    return trg_word2id, trg_id2word
class bAbi_Dataset(data.Dataset):
    """Map-style dataset over bAbI question-answering samples, tokenised for BERT."""

    # Id that ``convert_tokens_to_ids`` is expected to return for "[SEP]".
    # NOTE(review): 102 matches the 'bert-base-uncased' vocabulary loaded in
    # __init__ -- confirm if the tokenizer is ever changed.
    SEP_TOKEN_ID = 102

    def __init__(self, trg_word2id, tenK=True, path = "tasks_1-20_v1-2", train=True):
        """Read and parse every task file of the selected split.

        Args:
            trg_word2id: Mapping from answer string to class id.
            tenK: Read the ``en-10k`` sub-directory when true, else ``en``.
            path: Root directory of the extracted task files.
            train: Use ``*train.txt`` files when true, else ``*test.txt``.
        """
        # joint is Default
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        dirname = os.path.join(path, 'en-10k' if tenK else 'en')
        wanted = "train.txt" if train else "test.txt"
        # Sorted so that the sample order does not depend on the file system.
        filenames = sorted(
            name for name in os.listdir(dirname)
            if wanted in name and os.path.isfile(os.path.join(dirname, name))
        )
        self.src = []
        self.trg = []
        for filename in filenames:
            # The context manager closes the handle even if parsing fails.
            with open(os.path.join(dirname, filename), 'r') as f:
                parsed = _parse(f)
            self.src.extend([d[:2] for d in parsed])
            self.trg.extend([trg_word2id[d[2]] for d in parsed])
        self.trg = torch.tensor(self.trg)

    def __getitem__(self, index):
        """Return ``(token_ids, segment_ids, attention_mask, position_ids, target)``."""
        src_seq, seg_seq, att_mask, pos_id = self.preprocess_sequence(self.src[index])
        return src_seq, seg_seq, att_mask, pos_id, self.trg[index]

    def __len__(self):
        return len(self.trg)

    def preprocess_sequence(self, seq):
        """Encode one ``(story, query)`` pair as four equally long 1-D tensors.

        The text is laid out as ``[CLS] story... [SEP] query [SEP]``. Segment
        ids are 0 up to and including the first ``[SEP]`` and 1 afterwards,
        the attention mask is all ones and position ids count from 0.
        """
        text = ["[CLS]"] + list(seq[0]) + ["[SEP]"] + [seq[1]] + ["[SEP]"]
        tokenized_text = self.tokenizer.tokenize(" ".join(text))
        indexed_text = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        n_tokens = len(indexed_text)
        where_is_sep = indexed_text.index(self.SEP_TOKEN_ID) + 1
        segment_ids = [0] * where_is_sep + [1] * (n_tokens - where_is_sep)
        attention_mask = [1] * n_tokens
        pos_id = list(range(n_tokens))
        return torch.tensor(indexed_text), torch.tensor(segment_ids), torch.tensor(attention_mask), torch.tensor(pos_id)
def collate_fn(data, max_len=512):
    """Collate dataset samples into fixed-width batch tensors.

    Args:
        data: Sequence of ``(token_ids, segment_ids, attention_mask,
            position_ids, target)`` samples; the first four are 1-D tensors.
        max_len: Width of the padded batch (512 by default).

    Returns:
        ``(src_seqs, seg_seqs, att_mask, pos_id, trgs)``. The first four are
        long tensors of shape ``(len(data), max_len)``; ``trgs`` is a 1-D
        tensor of targets.
    """
    def merge(sequences):
        # Right-pad short sequences with zeros; keep the LAST max_len items
        # of sequences that are too long.
        padded_seqs = torch.zeros(len(sequences), max_len).long()
        for row, seq in enumerate(sequences):
            end = len(seq)
            if end <= max_len:
                padded_seqs[row, :end] = seq[:end]
            else:
                padded_seqs[row] = seq[-max_len:]
        return padded_seqs

    def pos_merge(sequences):
        # Every row is simply 0..max_len-1, whatever the input positions were.
        return torch.arange(max_len).repeat(len(sequences), 1)

    src_seqs, seg_seqs, att_mask, pos_id, trgs = zip(*data)
    src_seqs = merge(src_seqs)
    seg_seqs = merge(seg_seqs)
    att_mask = merge(att_mask)
    pos_id = pos_merge(pos_id)
    trgs = torch.tensor([int(trg) for trg in trgs])
    return src_seqs, seg_seqs, att_mask, pos_id, trgs
data_loader variable declaration in main.py is missing. So I tried to load data_loader as
for tokens_tensor, segments_tensors, att_mask, pos_id, trg in dataloader.collate_fn(bAbi_Dataset):
use collate_fn() function in data_loader.py, but it did not work. When I change it as above, it gives the following error:
Traceback (most recent call last):
File "main.py", line 27, in <module>
File "/content/BERT-for-bAbi-task/dataloader.py", line 133, in collate_fn
src_seqs, seg_seqs, att_mask, pos_id, trgs = zip(*data)
File "/usr/lib/python3.6/typing.py", line 682, in inner
return func(*args, **kwds)
File "/usr/lib/python3.6/typing.py", line 1107, in __getitem__
params = tuple(_type_check(p, msg) for p in params)
File "/usr/lib/python3.6/typing.py", line 1107, in <genexpr>
params = tuple(_type_check(p, msg) for p in params)
File "/usr/lib/python3.6/typing.py", line 374, in _type_check
raise TypeError(msg + " Got %.100r." % (arg,))
TypeError: Parameters to generic types must be types. Got 0.
Could anyone please help me how to correct the error?
I will just give you some pointers:
collate_fn is not meant to be called with a dataset as argument. It is a special callback function given to a dataloader and used to collate batch elements into a batch.
Since bAbi_Dataset in /dataloader.py is defined as a torch.utils.data.Dataset I would guess you are meant to initialize it instead. It is defined here as:
def __init__(self, trg_word2id, tenK=True, path = "tasks_1-20_v1-2", train=True)
There is another function, build_trg_dics, in /dataloader.py, which is used to parse the content of the task files and build the answer vocabulary. You should take a look at both before setting the right arguments for bAbi_Dataset.
Lastly, when you have your dataset initialized, you can attach a dataloader on it using torch.utils.data.DataLoader. This would look like:
data_loader = DataLoader(dataset, batch_size=16)
At this point, you might even need to plug in the collate function provided in /dataloader.py.
If you don't really know what you are doing, I would suggest you start with a working repository and work your way from there. Good luck!

Gradient is equal to 'None'

I have two networks. The output of the first network is the input to the other. In order to calculate the loss for the second network, I use vanilla policy gradient. I want to backpropagate this loss into the first network. After checking whether the gradients have changed, I see that they are all None.
I first load the first network (a pre-trained autoencoder) this way:
def load_checkpoint(filepath, model, map_location=None):
    """Load weights into ``model`` and make it fully trainable.

    Args:
        filepath: Path of a checkpoint saved with ``torch.save``; it must
            hold a dict with a ``'state_dict'`` entry.
        model: Module that receives the weights.
        map_location: Forwarded to ``torch.load``; lets a checkpoint saved
            on one device be opened on another. ``None`` keeps the default.

    Returns:
        The same ``model`` object, in training mode, with ``requires_grad``
        enabled on every parameter.
    """
    checkpoint = torch.load(filepath, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    for parameter in model.parameters():
        parameter.requires_grad = True
    model.train()
    return model
Then I define the optimizers for both networks this way:
class MultipleOptimizer(object):
    """Drive several optimizers as if they were a single one."""

    def __init__(self, *op):
        # Tuple of the wrapped optimizers, in the order they were given.
        self.optimizers = op

    def zero_grad(self):
        """Call ``zero_grad()`` on every wrapped optimizer."""
        for op in self.optimizers:
            op.zero_grad()

    def step(self):
        """Call ``step()`` on every wrapped optimizer, in order."""
        for op in self.optimizers:
            op.step()
opt = MultipleOptimizer(SGD(model.parameters(), lr=1, momentum=0.9), Adam(logits_net.parameters(), lr=lr))
the reward function is:
#Reward function
def reward(x, act):
    """Score a clustering of ``x`` into ``act`` clusters.

    The features are detached and moved to the CPU first, so no gradient
    flows through the reward.

    Args:
        x: Tensor of features, one row per sample.
        act: Number of clusters requested from k-means.

    Returns:
        The value returned by ``sil`` for the k-means labelling.
        NOTE(review): ``sil`` is presumably sklearn's ``silhouette_score``
        -- confirm against the import.
    """
    # Convert once; both the clustering and the score read the same array.
    features = x.detach().cpu().numpy()
    # ``n_jobs`` is deliberately not passed: recent scikit-learn releases no
    # longer accept it for KMeans.
    km = KMeans(n_clusters=act, n_init=20)
    y_pred = km.fit_predict(features)
    return sil(features, y_pred)
The architecture of the second neural net and an alternative to avoid (logits=logits.mean(0)):
def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity):
    """Build a feed-forward network whose outputs are the logits.

    Args:
        sizes: Layer widths, input first; consecutive pairs become
            ``nn.Linear`` layers.
        activation: Module class instantiated after every hidden layer.
        output_activation: Module class instantiated after the last layer.

    Returns:
        ``nn.Sequential`` alternating linear layers and activations; it is
        empty when ``sizes`` has fewer than two entries.
    """
    layers = []
    last = len(sizes) - 2
    for j, (n_in, n_out) in enumerate(zip(sizes[:-1], sizes[1:])):
        act = activation if j < last else output_activation
        layers += [nn.Linear(n_in, n_out), act()]
    return nn.Sequential(*layers)
class mlp2(torch.nn.Module):
    """Fixed-size network that maps a whole batch of features to 20 logits."""

    def __init__(self):
        super(mlp2, self).__init__()
        self.linear1 = nn.Linear(10,100)
        # One in-place ReLU instance is reused after several layers.
        self.relu1 = nn.ReLU(inplace=True)
        self.linear2 = torch.nn.Linear(100,100)
        self.linear3 = torch.nn.Linear(100,20)
        self.linear4 = torch.nn.Linear(2000,100)
        self.ident = nn.Identity()

    def forward(self, x):
        """Return a 1-D tensor of 20 logits for the whole input batch.

        The per-sample outputs are flattened into one vector before
        ``linear4``, whose input width is 2000; with 20 values per sample
        that fixes the batch at 100 rows of 10 features.
        """
        a = self.linear1(x)
        a = self.relu1(a)
        a = self.linear2(a)
        a = self.relu1(a)
        a = self.linear3(a)
        # Collapses the batch dimension too: (100, 20) -> (2000,).
        a = torch.flatten(a)
        a = self.linear4(a)
        a = self.relu1(a)
        # NOTE(review): linear3 is applied a second time here, sharing its
        # weights with the earlier call -- confirm this is intended.
        a = self.linear3(a)
        out = self.ident(a)
        return out
Loss is calculated as in the following order:
def get_policy(obs):
    """Return the categorical action distribution for ``obs``.

    ``logits_net`` is evaluated on ``obs`` and its output is averaged over
    dimension 0, so one distribution is produced for the whole batch.
    """
    logits = logits_net(obs)
    return Categorical(logits=logits.mean(0))
def get_action(obs):
    """Sample one action from the policy for ``obs`` and return it as a Python int."""
    policy = get_policy(obs)
    return policy.sample().item()
def Logp(obs, act):
    """Return the log-probability of ``act`` under the policy for ``obs``.

    ``act`` is moved to the device of the policy's logits, so the call works
    on CPU-only machines as well as on a GPU.
    """
    policy = get_policy(obs)
    logp = policy.log_prob(act.to(policy.logits.device))
    return logp
def compute_loss(logp, weights):
    """Return the policy-gradient loss ``-mean(logp * weights)``.

    Args:
        logp: Log-probabilities of the actions taken.
        weights: Weight of each action (here: its reward).
    """
    weighted = logp * weights
    return -weighted.mean()
def train_one_epoch():
    """Run one pass over ``train_loader`` and take one policy-gradient step.

    For every batch the first network (``model``) encodes the inputs, an
    action is sampled from the policy, and the action's log-probability and
    reward are recorded. After the pass, a single loss over all recorded
    steps is back-propagated and ``opt`` is stepped.

    Relies on the module-level names ``train_loader``, ``model``,
    ``logits_net``, ``opt``, ``get_action``, ``Logp``, ``reward`` and
    ``compute_loss``.
    """
    # Lists for logging and for building the loss.
    batch_obs = []      # for observations
    batch_acts = []     # for actions
    batch_weights = []  # for R(tau) weighting in policy gradient
    batch_logp = []

    for data in train_loader:
        x, label = data
        x = model(x.cuda())  # torch.Size([100, 10])
        # Detached numpy copy kept for logging only; the graph stays on ``x``.
        obs = x.detach().cpu().numpy()  # [100, 10] - a trajectory with only one state
        batch_obs.append(obs.copy())

        # Act in the environment. ``x`` is passed as-is so that the
        # log-probability below remains connected to ``model``.
        act = get_action(x)
        print('action type', type(act))
        logp = Logp(x, act = torch.as_tensor(act, dtype=torch.int32))
        # The reward is computed on detached data and carries no gradient.
        rew = reward(x, act+2)

        # Save action, reward and log-probability.
        batch_acts.append(act)
        batch_weights.append(rew)  # episode rewards
        batch_logp.append(logp)

    opt.zero_grad()
    batch_logp = torch.stack(batch_logp, dim=0)
    batch_loss = compute_loss(logp = torch.as_tensor(batch_logp, dtype=torch.float32),
                              weights = torch.as_tensor(batch_weights, dtype=torch.float32))
    batch_loss.backward()
    opt.step()
    # Debug output: gradients of the policy network after the step.
    for name, param in logits_net.named_parameters():
        print(name, param.grad)
I applied some changes on the assumption that re-creating some of the tensors may be the issue:
Originally, the output of the first network was converted with obs = x.data.cpu().numpy() and then passed to get_action as act = get_action(torch.as_tensor(obs, dtype=torch.float32)). I changed this to act = get_action(x), so x is passed directly to the function. I also changed the arguments of Logp to logp = Logp(x, act = torch.as_tensor(act, dtype=torch.int32)).
After these changes, I still get the none value for the gradient. Is there anyway possible to backpropagate the gradient when loss is calculated this way? any changes that I can apply?
any help is appreciated.

Inference time varies over different GPUs using Torch

I get a bug when running the below inference code. In the function recognize(), it takes 0.4s to finish prediction. It takes another 3s to return the result preds_str to the caller function. I found that if I set gpu_id=0 in file config, it returns instantly. How can I fix this bug? Thanks in advance.
def recognize(imgs, model, demo_loader):
    """Run text recognition over ``demo_loader`` and return decoded strings.

    Args:
        imgs: Not read by this function; kept for caller compatibility.
        model: Recognition model, called as ``model(image, text, is_train=False)``.
        demo_loader: Iterable yielding ``(image_tensors, image_path_list)``.

    Returns:
        The decoded predictions of the LAST batch, or an empty list when the
        loader yields nothing.
        NOTE(review): earlier batches are overwritten -- confirm that the
        loader is expected to yield a single batch.
    """
    t = time()
    model.eval()
    preds_str = []
    with torch.no_grad():
        for image_tensors, image_path_list in demo_loader:
            batch_size = image_tensors.size(0)
            image = image_tensors.to(config.device)
            # For max length prediction
            length_for_pred = torch.IntTensor([config.batch_max_length] * batch_size).to(config.device)
            text_for_pred = torch.LongTensor(batch_size, config.batch_max_length + 1).fill_(0).to(config.device)
            preds = model(image, text_for_pred, is_train=False)
            # Highest-scoring class at every position.
            _, preds_index = preds.max(2)
            preds_str = converter.decode(preds_index, length_for_pred)
    print('time elapsed before return:', time()-t) #0.4s
    return preds_str
def main(imgs=()):
    """Load the recognition model and run it on ``imgs``.

    Args:
        imgs: Images to recognise, e.g. ``[img1, img2, ...]``. The original
            snippet used this name without defining it.
    """
    model = Model()
    # There is no ``self`` in a plain function; move the local model instead.
    model.to(config.device)
    # ``output_device`` takes a single device, not a list.
    model = torch.nn.DataParallel(model, device_ids=[config.device], output_device=config.device).to(config.device)
    model.load_state_dict(torch.load(config.saved_model, map_location=config.device))
    AlignCollate_demo = AlignCollate(imgH=config.imgH, imgW=config.imgW, keep_ratio_with_pad=config.PAD)
    imgs_dataset = ImageDataset(imgs)
    demo_loader = torch.utils.data.DataLoader(imgs_dataset, batch_size=config.batch_size,shuffle=False,num_workers=int(config.workers),collate_fn=AlignCollate_demo, pin_memory=True)
    start_time = time()
    preds_str = recognize(imgs, model, demo_loader)
    print('time elapsed after return', time()-start_time) #3.4s
Config file:
class ConfigWordRecognizer:
    """Static settings of the word recogniser, read as class attributes."""

    # Index of the CUDA device to use (the "troublesome line" of the question).
    gpu_id = 1
    # Falls back to the CPU when CUDA is not available.
    device = torch.device('cuda:{}'.format(gpu_id) if torch.cuda.is_available() else 'cpu')
    # Image height and width passed to the collate function.
    imgH = 32
    imgW = 100
    # DataLoader batch size and number of worker processes.
    batch_size = 80
    workers = 8
    # Maximum number of characters predicted per image.
    batch_max_length = 25
I found the solution from this post.
I set CUDA_VISIBLE_DEVICES=1, gpu_id=0. Then, I remove
model = torch.nn.DataParallel(model, device_ids=[config.device], output_device=[config.device]).to(config.device)
and change
model.load_state_dict(torch.load(config.saved_model, map_location=config.device))
to
model.load_state_dict(self.copyStateDict(torch.load(self.config.saved_model, map_location=self.config.device)))
Copy stateDict function:
def copyStateDict(self, state_dict):
    """Return a copy of ``state_dict`` without the ``DataParallel`` prefix.

    Checkpoints saved from a model wrapped in ``torch.nn.DataParallel`` have
    every key prefixed with ``"module."``. When the first key carries that
    prefix it is removed from every key that has it; otherwise the keys are
    copied unchanged. An empty dict yields an empty dict.

    Args:
        state_dict: Mapping from parameter name to value.

    Returns:
        A new ``OrderedDict`` with the same values in the same order.
    """
    prefix = "module."
    # The trailing dot matters: a key such as "module_list.0.weight" is an
    # ordinary key, not a DataParallel one.
    strip = bool(state_dict) and next(iter(state_dict)).startswith(prefix)
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        name = k[len(prefix):] if strip and k.startswith(prefix) else k
        new_state_dict[name] = v
    return new_state_dict
The model works well on gpu1. But I still don't understand why if I set 'gpu_id=0', it works well on gpu0 without copyStateDict

I define a loss function but backward present error to me could someone tell me how to fix it

def _stack_blocks(a, b):
    """Return the batched block matrix ``[[a, b], [-b, a]]``."""
    top = t.cat((a, b), dim=-1)
    bottom = t.cat((-b, a), dim=-1)
    return t.cat((top, bottom), dim=-2)


def _mean_log_loss(x, INPUT):
    """Differentiable loss value: ``mean_i log10(1 + H_hat_i Q_i H_hat_i^T)``.

    Column layout read from each row of ``INPUT`` (48 values):
    ``0:8`` R, ``8:16`` H, ``16:32`` T_r (4x4), ``32:48`` T_i (4x4).
    Column layout read from each row of ``x`` (20 values):
    ``0:4`` diagonal of phi_r (phi_i uses ``1 - value**2``),
    ``4:10`` strict upper triangle of the symmetric Q_r, ``10:14`` its
    diagonal, ``14:20`` strict upper triangle of the antisymmetric Q_i.
    The computation runs in float64 and returns a 0-dim tensor.
    """
    x = x.double()
    INPUT = INPUT.double()
    batch_size = x.shape[0]

    R = INPUT[:, 0:8].unsqueeze(1)   # (B, 1, 8)
    H = INPUT[:, 8:16].unsqueeze(1)  # (B, 1, 8)
    T = _stack_blocks(INPUT[:, 16:32].reshape(batch_size, 4, 4),
                      INPUT[:, 32:48].reshape(batch_size, 4, 4))

    phi = _stack_blocks(t.diag_embed(x[:, 0:4]),
                        t.diag_embed(1 - x[:, 0:4] ** 2))
    H_hat = H + R @ phi @ T          # (B, 1, 8)

    # Row-major strict upper-triangle indices of a 4x4 matrix.
    rows, cols = t.triu_indices(4, 4, offset=1, device=x.device)
    upper_r = x.new_zeros(batch_size, 4, 4)
    upper_r[:, rows, cols] = x[:, 4:10]
    Q_r = upper_r + upper_r.transpose(1, 2) + t.diag_embed(x[:, 10:14])
    upper_i = x.new_zeros(batch_size, 4, 4)
    upper_i[:, rows, cols] = x[:, 14:20]
    Q_i = upper_i - upper_i.transpose(1, 2)
    Q = _stack_blocks(Q_r, Q_i)

    quad = (H_hat @ Q @ H_hat.transpose(1, 2)).reshape(batch_size)
    return t.log10(1 + quad).sum() / batch_size


class loss(Function):
    """Custom autograd function computing the mean log10 loss.

    ``forward`` receives two inputs, so ``backward`` returns two gradients:
    the gradient with respect to ``x`` and ``None`` for ``INPUT``, which is
    treated as constant data.
    """

    @staticmethod
    def forward(ctx, x, INPUT):
        # Saved so that backward can rebuild the computation.
        ctx.save_for_backward(x, INPUT)
        return _mean_log_loss(x.detach(), INPUT.detach())

    @staticmethod
    def backward(ctx, grad_output):
        x, INPUT = ctx.saved_tensors
        # Replay the forward computation with autograd enabled and let it
        # derive d(loss)/dx instead of hand-writing the derivative.
        with t.enable_grad():
            x_req = x.detach().requires_grad_(True)
            value = _mean_log_loss(x_req, INPUT.detach())
            (grad_x,) = t.autograd.grad(value, x_req)
        return (grad_x * grad_output).to(x.dtype), None
def criterion(output,input):
    """Evaluate the custom ``loss`` function on ``(output, input)``.

    Custom autograd functions are invoked through ``.apply`` rather than by
    instantiating the class.
    """
    return loss.apply(output,input)
This is my loss function, but it raises the following error:
Traceback (most recent call last):
File "/Users/mrfang/channel_capacity/training.py", line 24, in
loss.backward() File "/Users/mrfang/anaconda3/lib/python3.6/site-packages/torch/tensor.py",
line 150, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph) File
"/Users/mrfang/anaconda3/lib/python3.6/site-packages/torch/autograd/init.py",
line 99, in backward
allow_unreachable=True) # allow_unreachable flag RuntimeError: function lossBackward returned an incorrect number of gradients
(expected 2, got 1)
How could I fix it. Thanks very much
Your forward(ctx,x,INPUT) takes two inputs, x and INPUT, thus backward should output two gradients as well, grad_x and grad_INPUT.
In addition, in your snippet, you're not really computing a custom gradient, so you could compute that with Pytorch's autograd, without having to define a special Function.
If this is working code and you're going to define the custom loss, here's a quick boilerplate of what backward should comprise:
@staticmethod
def forward(ctx, x, INPUT):
    """Skeleton forward pass: stash the inputs for the backward call."""
    # this is required so they're available during the backwards call
    ctx.save_for_backward(x, INPUT)
    # custom forward


@staticmethod
def backward(ctx, grad_output):
    """Skeleton backward pass: return one gradient per forward input."""
    x, INPUT = ctx.saved_tensors
    grad_x = grad_INPUT = None
    # compute grad here
    return grad_x, grad_INPUT
You don't need to return gradients for inputs that don't require it, thus you can return None for them.
More info here and here.

Resources