RuntimeError: stack expects a non-empty TensorList - python-3.x

first of all I thank , I tried to train model with pytorch but I got the following error:
RuntimeError: stack expects a non-empty TensorList .I am trying to model a extract features point cloud using deep learning in pytorch.
I get the following error . Could anyone help on this? ************** ***************
Thanks!
def training_loop(gpu, training_dataloader, model, loss_fn, optimizer):
losses = []
correct = 0
batch_results = dict()
conf_mat = np.zeros((10,10))
for batch_n, batch in enumerate(training_dataloader): #batch[batch, pos, ptr, y]
batch_size = int(batch.batch.size()[0] / sample_points)
if dimensionality == 3:
# Input dim [:,3] for your geometry x,y,z
X = batch.pos.cuda(non_blocking=True).view(batch_size, sample_points, -1) + torch.normal(
torch.zeros(batch_size, sample_points, dimensionality), torch.full((batch_size, sample_points,
dimensionality), fill_value=0.1)).cuda(gpu)
else:
# Input dim [:,6] for your geometry x,y,z and normals nx,ny,nz
X = torch.cat((batch.pos.cuda(non_blocking=True), batch.normal.cuda(non_blocking=True)), 1).view(batch_size, sample_points, -1) + torch.normal(
torch.zeros(batch_size, sample_points, dimensionality), torch.full((batch_size, sample_points,
dimensionality), fill_value=0.1)).cuda(gpu)
y = batch.y.cuda(non_blocking=True).flatten() #size (batch_size) --> torch.Size([8])
# Compute predictions
pred = model(None, X) #size (batch_size,classes) --> torch.Size([8, 10])
if overall_classes_loss:
# weighted CE Loss over all classes
loss = loss_fn(pred, y)
else:
# weighted batchwise Loss
sample_count = np.array([[x, batch.y.tolist().count(x)] for x in batch.y])[:,1]
batch_weights = 1. / sample_count
batch_weights = torch.from_numpy(batch_weights)
batch_weights = batch_weights.double()
loss = element_weighted_loss(pred, batch.y, batch_weights, gpu)
correct += (pred.argmax(1) == y).type(torch.float).sum().item()
print(f"Loss: {loss}")
tensor_list_y = [torch.ones_like(y) for _ in range(dist.get_world_size())]
tensor_list_pred = [torch.ones_like(y) for _ in range(dist.get_world_size())]
torch.distributed.all_gather(tensor_list_y, y, group=None, async_op=False)
torch.distributed.all_gather(tensor_list_pred, pred.argmax(1), group=None, async_op=False)
tensor_list_y = torch.cat(tensor_list_y)
tensor_list_pred = torch.cat(tensor_list_pred)
# Confusion Matrix
conf_mat += confusion_matrix(tensor_list_y.cpu().detach().numpy(), tensor_list_pred.cpu().detach().numpy(), labels=np.arange(0,10))
# Backpropagation
optimizer.zero_grad()
loss.backward()
optimizer.step()
losses.append(loss.item())
# Save batch predictions
batch_results[batch_n] = {'true':tensor_list_y, 'pred':tensor_list_pred}
if verbosity == True:
print(f"\n\nTRAIN on GPU:{gpu}: True Label {y} - Prediction {pred.argmax(1)} - Loss {loss}")
truevalue = '\t\t'.join(classes[items] for items in y.tolist())
predvalues = '\t\t'.join(classes[items] for items in pred.argmax(1).tolist())
print(f"INFO on GPU:{gpu}: TRAIN - True Value\t {truevalue}")
print(f"INFO on GPU:{gpu}: TRAIN - Predictions\t {predvalues}")
if batch_n % 25 == 0:
torch.distributed.reduce(loss, 0)
"""if gpu == 0:
# print predictions and true values
#truevalue = '\t\t'.join(classes[items] for items in y.tolist())
#predvalues = '\t\t'.join(classes[items] for items in pred.argmax(1).tolist())
#print(f"\n\nINFO on GPU{gpu}: TRAIN - True Value\t {truevalue}")
#print(f"INFO on GPU{gpu}: TRAIN - Predictions\t {predvalues}")
#print("INFO: TRAIN - Correctness\t", pred.argmax(1) == y)
#print(f"INFO: TRAIN - Single Batch Test Accuracy {correct * 100 / batch_size}\n\n")
loss, current = loss.item(), batch_n * len(X)
#print(f"loss: {loss:>7f}")"""
#print(f"conf_mat: {conf_mat}")
#print(f"batch_results: {batch_results}")
return torch.tensor(losses, device=f"cuda:{gpu}"), torch.tensor(correct, device=f"cuda:{gpu}"), batch_results, conf_mat
def train_optimisation(gpu, gpus, training_dataloader, test_dataloader, model, loss_fn, optimizer, scheduler, dir_path, initial_epoch):
epoch_losses = []
training_accuracies = []
test_losses = []
test_accuracies = []
learning_rates = []
counter = 0 #early stopping counter
batchwise_results = dict()
# Learning Rate Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=20)
for i in range(initial_epoch, initial_epoch + epochs):
if gpu == 0:
if initial_epoch > 0:
print(f"\n\nEpoch {i}\n-------------------------------")
else:
print(f"\n\nEpoch {i + 1}\n-------------------------------")
# TRAIN
losses, training_accuracy, train_batch_result, train_conf_mat = training_loop(gpu, training_dataloader, model, loss_fn, optimizer)
average_loss = torch.mean(losses)
torch.distributed.reduce(average_loss, 0, torch.distributed.ReduceOp.SUM)
torch.distributed.reduce(training_accuracy, 0, torch.distributed.ReduceOp.SUM)
# TEST
test_accuracy, test_loss, test_batch_result, test_conf_mat = test_loop(gpu, test_dataloader, model, loss_fn)
torch.distributed.reduce(test_accuracy, 0, torch.distributed.ReduceOp.SUM)
torch.distributed.reduce(test_loss, 0, torch.distributed.ReduceOp.SUM)
# save results
batchwise_results[i] = {'train':train_batch_result, 'test':test_batch_result}
if gpu == 0: # the following operations are performed only by the process running in the first gpu
average_loss = average_loss / torch.tensor(gpus, dtype=torch.float) # average loss among all gpus
test_accuracy = test_accuracy / torch.tensor(len(test_dataloader.dataset),
dtype=torch.float) * torch.tensor(100.0)
training_accuracy = training_accuracy / torch.tensor(len(training_dataloader.dataset),
dtype=torch.float) * torch.tensor(100.0)
test_loss = test_loss / torch.tensor(gpus, dtype=torch.float)
epoch_losses.append(average_loss.item())
training_accuracies.append(training_accuracy.item())
test_losses.append(test_loss.item())
test_accuracies.append(test_accuracy.item())
learning_rates.append((optimizer.param_groups[0])["lr"])
print(f"\nBatch size: {batch_size * int(gpus)}")
print(f"average Training Loss: {average_loss.item():.6f}")
print(f"average Test Loss: {test_loss.item():.6f}")
print(f"\naverage Training Acc: {training_accuracy.item():.6f}")
print(f"average Test Acc: {test_accuracy.item():.6f}")
printLearningRate(optimizer)
scheduler.step(test_loss)
"""# stepwise learning rate decay
if average_loss.item() <= 0.35:
for param_group in optimizer.param_groups:
print("Learning rate changed to 0.007")
param_group['lr'] = 0.007
if average_loss.item() <= 0.30:
for param_group in optimizer.param_groups:
print("Learning rate changed to 0.004")
param_group['lr'] = 0.004"""
# saving model checkpoint
save_checkpoint(model, optimizer, scheduler, i, epoch_losses, training_accuracies, test_losses, test_accuracies, learning_rates,
os.path.join(dir_path, f"epoch{i}.pth"), {key: value for key, value in batchwise_results[i].items() if key == 'train'}, {key: value for key, value in batchwise_results[i].items() if key == 'test'}, train_conf_mat, test_conf_mat)
#TODO: implement ONNX Export
# early stopping scheduler
if early_stopping(test_losses) == True:
counter += 1
print(f"Early Stopping counter: {counter} of {patience}")
else:
counter += 0
if counter < patience:
pass
else:
print("\n\nEarly Stopping activated")
print(f"Training stopped at Epoch{i + 1}")
dist.destroy_process_group()
exit()
class DistributedWeightedSampler(Sampler):
def __init__(self, dataset, data_source: Optional[Sized], num_replicas: Optional[int] = None,
rank: Optional[int] = None, shuffle: bool = True, seed: int = 0, replacement: bool = True):
super().__init__(data_source)
if num_replicas is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
num_replicas = dist.get_world_size()
if rank is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
rank = dist.get_rank()
if rank >= num_replicas or rank < 0:
raise ValueError("Invalid rank {}, rank should be in the interval [0, {}]".format(rank, num_replicas - 1))
self.dataset = dataset
self.num_replicas = num_replicas
self.rank = rank
self.epoch = 0
self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
self.total_size = self.num_samples * self.num_replicas
self.shuffle = shuffle
self.seed = seed
self.replacement = replacement #sample can be drown again in that row if True
def calculate_weights(self, targets):
class_sample_count = np.array([len(np.where(self.dataset.data.y == t)[0]) for t in np.unique(self.dataset.data.y)])
weight = 1. / class_sample_count
samples_weight = np.array([weight[t] for t in self.dataset.data.y])
samples_weight = torch.from_numpy(samples_weight)
samples_weigth = samples_weight.double()
return samples_weigth
def __iter__(self):
# deterministically shuffle based on epoch
if self.shuffle:
g = torch.Generator()
g.manual_seed(self.seed + self.epoch)
indices = torch.randperm(len(self.dataset), generator=g).tolist()
else:
indices = list(range(len(self.dataset)))
# add extra samples to make it evenly divisible
indices += indices[:(self.total_size - len(indices))]
assert len(indices) == self.total_size
# subsample
indices = indices[self.rank:self.total_size:self.num_replicas]
assert len(indices) == self.num_samples
# get targets (you can alternatively pass them in __init__, if this op is expensive)
# data.data.y == labels
targets = self.dataset.data.y
targets = targets[self.rank:self.total_size:self.num_replicas]
#assert len(targets) == self.num_samples
weights = self.calculate_weights(targets)
weighted_indices = torch.multinomial(weights, self.num_samples, self.replacement).tolist()
return iter(weighted_indices)
def __len__(self):
return self.num_samples
def set_epoch(self, epoch):
self.epoch = epoch
def train(gpu, gpus, world_size):
torch.manual_seed(0)
torch.cuda.set_device(gpu)
try:
#dist.init_process_group(backend='nccl', world_size=world_size, rank=gpu) #for distributed GPU training
dist.init_process_group(backend='gloo', world_size=world_size, rank=gpu) #as a fallback option
except RuntimeError:
print("\n\nINFO:RuntimeError is raised >> Used gloo backend instead of nccl!\n")
dist.init_process_group(backend='gloo', world_size=world_size, rank=gpu) #as a fallback option
dir_path = None
if gpu == 0:
dir_path = "stackgraphConvPool3DPnet"
createdir(dir_path)
training_number = next_training_number(dir_path)
dir_path = os.path.join(dir_path, f"train{training_number}")
createdir(dir_path)
#save hyper-parameters in txt protocol file
save_hyperparameters(dir_path, 'hyperparameters.txt')
print("\nINFO: Protocol File saved successfully . . .")
#copy crucial py-files in current train folder
#shutil.copy2(os.path.basename('__file__'), dir_path)
#shutil.copy2('stackGraphConvPool3DPnet.py', dir_path)
#shutil.copy2('shrinkingunit.py', dir_path)
#shutil.copy2('utilities.py', dir_path)
#print("\nINFO: Script Files copied successfully . . .")
model = Classifier(shrinkingLayers, mlpClassifier)
torch.cuda.set_device(gpu)
model.cuda(gpu)
#setting up optimizer
if optimizer_str == "SGD":
optimizer = torch.optim.SGD(model.parameters(), learning_rate, momentum=momentum, weight_decay=weight_decay)
elif optimizer_str == "RMSprop":
optimizer = torch.optim.RMSprop(model.parameters(), learning_rate, weight_decay=weight_decay)
else:
optimizer = torch.optim.Adam(model.parameters(), learning_rate, weight_decay=weight_decay)
# single-program multiple-data training paradigm (Distributed Data-Parallel Training)
model = DDP(model, device_ids=[gpu])
if dimensionality == 3:
training_data = ModelNet("ModelNet10_train_data", transform=lambda x: NormalizeScale()(SamplePoints(num=sample_points)(x)))
else:
training_data = ModelNet("ModelNet10_train_data", transform=lambda x: NormalizeScale()(NormalizeRotation()(SamplePoints(num=sample_points, remove_faces=True, include_normals=True)(x))))
training_sampler = DistributedWeightedSampler(training_data, data_source=None, num_replicas=world_size) #weight unbalanced classes by 1/cls_count
training_dataloader = DataLoader(dataset=training_data, batch_size=batch_size, shuffle=data_shuffle, num_workers=0,
pin_memory=True, sampler=training_sampler)
if dimensionality == 3:
test_data = ModelNet("ModelNet10_test_data", train=False, transform=lambda x: NormalizeScale()(SamplePoints(num=sample_points)(x)))
else:
test_data = ModelNet("ModelNet10_test_data", train=False, transform=lambda x: NormalizeScale()(NormalizeRotation()(SamplePoints(num=sample_points, remove_faces=True, include_normals=True)(x))))
test_sampler = DistributedWeightedSampler(test_data,data_source=None, num_replicas=world_size) #weight unbalanced classes by 1/cls_count
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=data_shuffle, num_workers=0,
pin_memory=True, sampler=test_sampler)
"""# save sampled data for later result visualisation
try:
#export_path = os.path.join("stackgraphConvPool3DPnet", "train" + str(next_training_number("stackgraphConvPool3DPnet")-1))
#export_sampled_data(training_dataloader, os.path.join(export_path, "train_sampledPoints.pth"))
#export_sampled_data(test_dataloader, os.path.join(export_path, "test_sampledPoints.pth"))
print("\nINFO: Sampled 3D data successfully saved . . .")
except Exception as e:
print(f"\nERROR: Sampled 3D data could not saved successfully . . . - this process does not executed - caused by {e}")"""
# weighted CE Loss over all Classes C
class_sample_count = np.array([len(np.where(training_data.data.y == t)[0]) for t in np.unique(training_data.data.y)])
weight = 1. / class_sample_count
weight = torch.from_numpy(weight)
weight = weight.float()
loss_fn = nn.CrossEntropyLoss(weight=weight).cuda(gpu)
# continue training from certain checkpoint
continue_from_scratch = True if args.resume is None else False
if continue_from_scratch:
if gpu == 0:
print("\nINFO: Train from scratch has started . . .")
train_optimisation(gpu, gpus, training_dataloader, test_dataloader, model, loss_fn, optimizer, None, dir_path, 0)
else:
checkpoint_path = "stackgraphConvPool3DPnet/" + args.resume
if gpu == 0:
print(f"\nINFO: Train has started from certain checkpoint {checkpoint_path.split('/')[2].split('.')[0]} in {checkpoint_path.split('/')[1]} . . .")
model.load_state_dict(torch.load(checkpoint_path)['model_state_dict'], strict=False)
optimizer.load_state_dict(torch.load(checkpoint_path)['optimizer_state_dict'])
final_epoch = (torch.load("stackgraphConvPool3DPnet/" + args.resume)['epoch'])+1
train_optimisation(gpu, gpus, training_dataloader, test_dataloader, model, loss_fn, optimizer, None, dir_path, final_epoch)
INFO: Train from scratch has started . . .
Epoch 1
-------------------------------
Exception in thread Exception in thread Thread-8:
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\threading.py", line 973, in _bootstrap_inner
Thread-7:
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\threading.py", line 973, in _bootstrap_inner
self.run()
File "C:\ProgramData\Anaconda3\lib\threading.py", line 910, in run
self.run()
File "C:\ProgramData\Anaconda3\lib\threading.py", line 910, in run
self._target(*self._args, **self._kwargs)
File ~\Desktop\Forum\unit.py", line 615, in kmeansAppender
self._target(*self._args, **self._kwargs)
File ~\Desktop\Forum\unit.py", line 615, in kmeansAppender
x, y, z = module(input)
File "C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py", line 1130, in _call_impl
x, y, z = module(input)
File "C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File ~\Desktop\Forum\unit.py", line 148, in forward
labels = np.apply_along_axis(lambda x: x + (i*self.k), axis=0, arr=kmeans.labels_)
AttributeError: 'KMeans' object has no attribute 'labels_'
return forward_call(*input, **kwargs)
File ~\Desktop\Forum\unit.py", line 148, in forward
labels = np.apply_along_axis(lambda x: x + (i*self.k), axis=0, arr=kmeans.labels_)
AttributeError: 'KMeans' object has no attribute 'labels_'
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Input In [1], in <cell line: 720>()
734 #change state
735 if args.train_state == True:
--> 736 train(args.local_rank, gpus, world_size)
737 else:
738 infer(args.local_rank, gpus, world_size, args.checkpoint, args.data)
Input In [1], in train(gpu, gpus, world_size)
672 if gpu == 0:
673 print("\nINFO: Train from scratch has started . . .")
--> 674 train_optimisation(gpu, gpus, training_dataloader, test_dataloader, model, loss_fn, optimizer, None, dir_path, 0)
675 else:
676 checkpoint_path = "stackgraphConvPool3DPnet/" + args.resume
Input In [1], in train_optimisation(gpu, gpus, training_dataloader, test_dataloader, model, loss_fn, optimizer, scheduler, dir_path, initial_epoch)
454 print(f"\n\nEpoch {i + 1}\n-------------------------------")
456 # TRAIN
--> 457 losses, training_accuracy, train_batch_result, train_conf_mat = training_loop(gpu, training_dataloader, model, loss_fn, optimizer)
458 average_loss = torch.mean(losses)
459 torch.distributed.reduce(average_loss, 0, torch.distributed.ReduceOp.SUM)
Input In [1], in training_loop(gpu, training_dataloader, model, loss_fn, optimizer)
249 y = batch.y.cuda(non_blocking=True).flatten() #size (batch_size) --> torch.Size([8])
251 # Compute predictions
--> 252 pred = model(None, X) #size (batch_size,classes) --> torch.Size([8, 10])
254 if overall_classes_loss:
255 # weighted CE Loss over all classes
256 loss = loss_fn(pred, y)
File C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\parallel\distributed.py:1008, in DistributedDataParallel.forward(self, *inputs, **kwargs)
1004 if self._join_config.enable:
1005 # Notify joined ranks whether they should sync in backwards pass or not.
1006 self._check_global_requires_backward_grad_sync(is_joined_rank=False)
-> 1008 output = self._run_ddp_forward(*inputs, **kwargs)
1010 # sync params according to location (before/after forward) user
1011 # specified as part of hook, if hook was specified.
1012 if self._check_sync_bufs_post_fwd():
File C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\parallel\distributed.py:969, in DistributedDataParallel._run_ddp_forward(self, *inputs, **kwargs)
962 if self.device_ids:
963 inputs, kwargs = _to_kwargs(
964 inputs,
965 kwargs,
966 self.device_ids[0],
967 self.use_side_stream_for_tensor_copies
968 )
--> 969 return module_to_run(*inputs[0], **kwargs[0])
970 else:
971 return module_to_run(*inputs, **kwargs)
File C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File ~\Desktop\Forum\unit.py:657, in Classifier.forward(self, x, pos)
655 feature_matrix_batch = pos.unsqueeze(0)
656 # feature_matrix_batch size = (1,N,I,D) where N=batch number, I=members, D=member dimensionality
--> 657 output = self.neuralNet(feature_matrix_batch)
658 # output size = (S,N,D) where S= stack size, N=batch number, D'=member dimensionality
659 output = torch.mean(output, dim=0)
File C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\container.py:139, in Sequential.forward(self, input)
137 def forward(self, input):
138 for module in self:
--> 139 input = module(input)
140 return input
File C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File ~\Desktop\Forum\unit.py:448, in ShrinkingUnitStack.forward(self, feature_matrix_batch)
446 feature_matrix_batch = self.selfCorrStack(feature_matrix_batch)
447 # feature_matrix_batch size = (S',N,I,D) where S'=stack_size, N=batch number, I=members, D=member dimensionality
--> 448 feature_matrix_batch_, conv_feature_matrix_batch, cluster_index = self.kmeansConvStack(feature_matrix_batch)
449 feature_matrix_batch = self.localAdaptFeaAggreStack(feature_matrix_batch, conv_feature_matrix_batch)
450 output = self.graphMaxPoolStack(feature_matrix_batch, cluster_index)
File C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File ~\Desktop\Forum\unit.py:519, in KMeansConvStack.forward(self, feature_matrix_batch)
517 def forward(self, feature_matrix_batch: torch.Tensor):
518 # feature_matrix_batch size = (S,N,I,D) where S=stack size, N=batch number, I=members, D=member dimensionality
--> 519 feature_matrix_batch, conv_feature_matrix_batch, cluster_index = kmeansConvThreader(self.kmeansConvStack,
520 feature_matrix_batch)
521 # feature_matrix_batch size = (S,N,I,D) where where S=stack_size, N=batch number, I=members, D=member dimensionality
522 # conv_feature_matrix_batch size = (S,N,I,D) where where S=stack_size, N=batch number, I=members, D=member dimensionality
523 # cluster_index size = (S,M) where S=stack_size, M=N*I
524 return feature_matrix_batch, conv_feature_matrix_batch, cluster_index
File ~\Desktop\Forum\unit.py:611, in kmeansConvThreader(modules, input_tensor)
609 list2_append = list(map(lambda x: x[1], list2_append))
610 list3_append = list(map(lambda x: x[1], list3_append))
--> 611 return torch.stack(list1_append), torch.stack(list2_append), torch.stack(list3_append)
RuntimeError: stack expects a non-empty TensorList
def forward(self, feature_matrix_batch):
# feature_matrix_batch size = (N,I,D) where N=batch number, I=members, D=member dimensionality
N, I, D = feature_matrix_batch.size()
clusters = []
for i, feature_matrix in enumerate(feature_matrix_batch):
kmeans = KMeans(n_clusters=self.k, init=self.kmeansInit, n_init=self.n_init)
labels = np.apply_along_axis(lambda x: x + (i*self.k), axis=0, arr=kmeans.labels_)
def kmeansConvThreader(modules, input_tensor):
list1_append = []
list2_append = []
list3_append = []
threads = []
for i, t in enumerate(input_tensor):
threads.append(
Thread(target=kmeansAppender, args=(modules[i], t, list1_append, list2_append, list3_append, i)))
[t.start() for t in threads]
[t.join() for t in threads]
list1_append.sort()
list2_append.sort()
list3_append.sort()
list1_append = list(map(lambda x: x[1], list1_append))
list2_append = list(map(lambda x: x[1], list2_append))
list3_append = list(map(lambda x: x[1], list3_append))
return torch.stack(list1_append), torch.stack(list2_append), torch.stack(list3_append)
AttributeError: 'KMeans' object has no attribute 'labels_'
RuntimeError: stack expects a non-empty TensorList
Thanks for your help

Related

How to make a mlflow model predict?

I recently made a GNN model using TransformerConv and TopKPooling, it is smooth while training, but I have problems when I want to use it to predict, it kept telling me that the TransformerConv doesn't have the 'aggr_module' attribute
This is my network:
class GNN(torch.nn.Module):
def __init__(self, feature_size, model_params):
super(GNN, self).__init__()
embedding_size = model_params["model_embedding_size"]
n_heads = model_params["model_attention_heads"]
self.n_layers = model_params["model_layers"]
dropout_rate = model_params["model_dropout_rate"]
top_k_ratio = model_params["model_top_k_ratio"]
self.top_k_every_n = model_params["model_top_k_every_n"]
dense_neurons = model_params["model_dense_neurons"]
self.conv_layers = ModuleList([])
self.transf_layers = ModuleList([])
self.pooling_layers = ModuleList([])
self.bn_layers = ModuleList([])
# Transformation layer: transform original node features to embedding vector(size: embedding_size(defined in config.py))
self.conv1 = TransformerConv(feature_size,
embedding_size,
heads=n_heads,
dropout=dropout_rate,
#edge_dim=edge_dim,
beta=True)
self.transf1 = Linear(embedding_size*n_heads, embedding_size)
self.bn1 = BatchNorm1d(embedding_size)
# Other layers: message passing and pooling
for i in range(self.n_layers):
self.conv_layers.append(TransformerConv(embedding_size,
embedding_size,
heads=n_heads,
dropout=dropout_rate,
#edge_dim=edge_dim,
beta=True))
# map conv_layer output size back to emgedding_size(embedding_size*n_heads -> embedding_size)
self.transf_layers.append(Linear(embedding_size*n_heads, embedding_size))
# Batch normalization
self.bn_layers.append(BatchNorm1d(embedding_size))
# Top-k pooling to reduce the size of the graph
if i % self.top_k_every_n == 0:
self.pooling_layers.append(TopKPooling(embedding_size, ratio=top_k_ratio))
# Linear output layers: feed graph representation in & reduce until single value left
self.linear1 = Linear(embedding_size*2, dense_neurons)
self.linear2 = Linear(dense_neurons, int(dense_neurons/2))
self.linear3 = Linear(int(dense_neurons/2), 3) # same as the general form
def forward(self, x, edge_index, batch_index):
# Initial transformation
x = self.conv1(x, edge_index)
x = torch.relu(self.transf1(x))
x = torch.relu((x))
x = self.bn1(x)
# Holds the intermediate graph representations
global_representation = []
for i in range(self.n_layers):
x = self.conv_layers[i](x, edge_index)
x = torch.relu(self.transf_layers[i](x))
x = torch.relu((x))
x = self.bn_layers[i](x)
# Always aggregate last layer
if i % self.top_k_every_n == 0 or i == self.n_layers:
x , edge_index, edge_attr, batch_index, _, _ = self.pooling_layers[int(i/self.top_k_every_n)]( x,
edge_index,
None,
batch_index)
# Add current representation
global_representation.append(torch.cat([gmp(x, batch_index), gap(x, batch_index)], dim=1))
x = sum(global_representation)
# Output block
x = torch.relu(self.linear1(x))
x = F.dropout(x, p=0.8, training=self.training)
x = torch.relu(self.linear2(x))
x = F.dropout(x, p=0.8, training=self.training)
x = self.linear3(x)
return x
One training:
def run_one_training(params):
params = params[0]
with mlflow.start_run() as run:
# Log parameters used in this experiment
for key in params.keys():
mlflow.log_param(key, params[key])
# Loading the dataset
print("Loading dataset...")
full_dataset = ProcessedDataset(root = "data/", filename = "fixtures_full.csv")
full_dataset.shuffle()
train_dataset = full_dataset[:3400] # around 80% of the full dataset
test_dataset = full_dataset[3400:3800]
# Prepare training
print("Preparing Training")
batch_size=params["batch_size"]
train_loader = DataLoader(train_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
# Loading the model
print("Loading model...")
model_params = {k: v for k, v in params.items() if k.startswith("model_")}
model = GNN(feature_size=train_dataset[0].x.shape[1], model_params=model_params)
print(model)
model = model.to(device)
print(f"Number of parameters: {count_parameters(model)}")
mlflow.log_param("num_params", count_parameters(model))
class_weights = [1.0239, 1.2753, 0.8070]
class_weights= torch.tensor(class_weights,dtype=torch.float)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(),
lr=params["learning_rate"],
momentum=params["sgd_momentum"],
weight_decay=params["weight_decay"])
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=params["scheduler_gamma"])
# Start training
best_loss = 1000
early_stopping_counter = 0
for epoch in range(300):
if early_stopping_counter <= 10: # = x * 5
# Training
model.train()
loss = train_one_epoch(epoch, model, train_loader, optimizer, loss_fn)
print(f"Epoch {epoch} | Train Loss {loss}")
mlflow.log_metric(key="Train loss", value=float(loss), step=epoch)
# Testing
model.eval()
if epoch % 5 == 0:
loss = test(epoch, model, test_loader, loss_fn)
print(f"Epoch {epoch} | Test Loss {loss}")
mlflow.log_metric(key="Test loss", value=float(loss), step=epoch)
# Update best loss
if float(loss) < best_loss:
best_loss = loss
# Save the currently best model
mlflow.pytorch.log_model(model, "model", signature=SIGNATURE)
early_stopping_counter = 0
else:
early_stopping_counter += 1
scheduler.step()
else:
print("Early stopping due to no improvement.")
return [best_loss]
print(f"Finishing training with best test loss: {best_loss}")
return [best_loss]
Train and Test
def train_one_epoch(epoch, model, train_loader, optimizer, loss_fn):
# Enumerate over the data
all_preds = []
all_labels = []
running_loss = 0.0
step = 0
for _, batch in enumerate(tqdm(train_loader)):
batch.x = torch.tensor(batch.x)
batch.x = batch.x.reshape((-1, *batch.x.shape[2:]))
# Use GPU
batch.to(device)
# Reset gradients
optimizer.zero_grad()
# Passing the node features and the connection info
pred = model(torch.tensor(batch.x).float(),
#batch.edge_attr.float(),
batch.edge_index,
batch.batch)
# Calculating the loss and gradients
loss = torch.sqrt(loss_fn(pred, batch.y.long()))
loss.backward()
optimizer.step()
# Update tracking
running_loss += loss.item()
step += 1
all_preds.append(np.argmax(pred.cpu().detach().numpy(), axis=1))
all_labels.append(batch.y.cpu().detach().numpy())
all_preds = np.concatenate(all_preds).ravel()
all_labels = np.concatenate(all_labels).ravel()
calculate_metrics(all_preds, all_labels, epoch, "train")
return running_loss/step
def test(epoch, model, test_loader, loss_fn):
all_preds = []
all_preds_raw = []
all_labels = []
running_loss = 0.0
step = 0
for batch in test_loader:
batch.x = torch.tensor(batch.x)
batch.x = batch.x.reshape((-1, *batch.x.shape[2:]))
batch.to(device)
pred = model(torch.tensor(batch.x).float(),
#batch.edge_attr.float(),
batch.edge_index,
batch.batch)
loss = torch.sqrt(loss_fn(pred, batch.y.long()))
# Update tracking
running_loss += loss.item()
step += 1
all_preds.append(np.argmax(pred.cpu().detach().numpy(), axis=1))
all_preds_raw.append(torch.sigmoid(pred).cpu().detach().numpy())
all_labels.append(batch.y.cpu().detach().numpy())
all_preds = np.concatenate(all_preds).ravel()
all_labels = np.concatenate(all_labels).ravel()
print(all_preds_raw[0][:10])
print(all_preds[:10])
print(all_labels[:10])
calculate_metrics(all_preds, all_labels, epoch, "test")
log_conf_matrix(all_preds, all_labels, epoch)
return running_loss/step
Predict:
def predict(model, test_loader):
all_preds = []
all_preds_raw = []
all_labels = []
for batch in test_loader:
batch.x = torch.tensor(batch.x)
batch.x = batch.x.reshape((-1, *batch.x.shape[2:]))
batch.to(device)
pred = model(torch.tensor(batch.x).float(),
#batch.edge_attr.float(),
batch.edge_index,
batch.batch)
all_preds.append(np.argmax(pred.cpu().detach().numpy(), axis=1))
all_preds_raw.append(torch.sigmoid(pred).cpu().detach().numpy())
all_labels.append(batch.y.cpu().detach().numpy())
all_preds = np.concatenate(all_preds).ravel()
all_labels = np.concatenate(all_labels).ravel()
return all_preds, all_preds_raw, all_labels
I was using mlflow to load my model and this is what I did:
import mlflow
logged_model = 'runs:/b18929aa871047f9892aa3c84a998d28/model'
# Load model
loaded_model = mlflow.pytorch.load_model(logged_model)
loaded_model = loaded_model.to(device)
loaded_model.eval()
loader = DataLoader(dataset, batch_size=len(dataset))
all_pred, all_pred_raw, all_label = predict(loaded_model, loader)
This is the error message
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Input In [23], in <cell line: 7>()
3 dataset = full_dataset[3800:]
5 loader = DataLoader(dataset, batch_size=len(dataset))
----> 7 all_pred, all_pred_raw, all_label = predict(loaded_model, loader)
Input In [20], in predict(epoch, model, test_loader, loss_fn)
143 batch.x = batch.x.reshape((-1, *batch.x.shape[2:]))
144 batch.to(device)
--> 145 pred = model(torch.tensor(batch.x).float(),
146 #batch.edge_attr.float(),
147 batch.edge_index,
148 batch.batch)
150 all_preds.append(np.argmax(pred.cpu().detach().numpy(), axis=1))
151 all_preds_raw.append(torch.sigmoid(pred).cpu().detach().numpy())
File ~\anaconda3\lib\site-packages\torch\nn\modules\module.py:1190, in Module._call_impl(self, *input, **kwargs)
1186 # If we don't have any hooks, we want to skip the rest of the logic in
1187 # this function, and just call forward.
1188 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1189 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1190 return forward_call(*input, **kwargs)
1191 # Do not call functions when jit is used
1192 full_backward_hooks, non_full_backward_hooks = [], []
File ~\FIFA_PROJECT\model.py:67, in GNN.forward(self, x, edge_index, batch_index)
63 def forward(self, x, edge_index, batch_index):
64 #def forward(self, x, edge_attr=None, edge_index, batch_index):
65 # Initial transformation
66 #x = self.conv1(x, edge_index, edge_attr)
---> 67 x = self.conv1(x, edge_index)
68 x = torch.relu(self.transf1(x))
69 x = torch.relu((x))
File ~\anaconda3\lib\site-packages\torch\nn\modules\module.py:1190, in Module._call_impl(self, *input, **kwargs)
1186 # If we don't have any hooks, we want to skip the rest of the logic in
1187 # this function, and just call forward.
1188 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1189 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1190 return forward_call(*input, **kwargs)
1191 # Do not call functions when jit is used
1192 full_backward_hooks, non_full_backward_hooks = [], []
File ~\anaconda3\lib\site-packages\torch_geometric\nn\conv\gcn_conv.py:198, in GCNConv.forward(self, x, edge_index, edge_weight)
195 x = self.lin(x)
197 # propagate_type: (x: Tensor, edge_weight: OptTensor)
--> 198 out = self.propagate(edge_index, x=x, edge_weight=edge_weight,
199 size=None)
201 if self.bias is not None:
202 out = out + self.bias
File ~\anaconda3\lib\site-packages\torch_geometric\nn\conv\message_passing.py:454, in MessagePassing.propagate(self, edge_index, size, **kwargs)
451 if res is not None:
452 aggr_kwargs = res[0] if isinstance(res, tuple) else res
--> 454 out = self.aggregate(out, **aggr_kwargs)
456 for hook in self._aggregate_forward_hooks.values():
457 res = hook(self, (aggr_kwargs, ), out)
File ~\anaconda3\lib\site-packages\torch_geometric\nn\conv\message_passing.py:578, in MessagePassing.aggregate(self, inputs, index, ptr, dim_size)
565 def aggregate(self, inputs: Tensor, index: Tensor,
566 ptr: Optional[Tensor] = None,
567 dim_size: Optional[int] = None) -> Tensor:
568 r"""Aggregates messages from neighbors as
569 :math:`\square_{j \in \mathcal{N}(i)}`.
570
(...)
576 as specified in :meth:`__init__` by the :obj:`aggr` argument.
577 """
--> 578 return self.aggr_module(inputs, index, ptr=ptr, dim_size=dim_size,
579 dim=self.node_dim)
File ~\anaconda3\lib\site-packages\torch\nn\modules\module.py:1265, in Module.__getattr__(self, name)
1263 if name in modules:
1264 return modules[name]
-> 1265 raise AttributeError("'{}' object has no attribute '{}'".format(
1266 type(self).__name__, name))
AttributeError: 'TransformerConv' object has no attribute 'aggr_module'
Please I'm begging :(
I wrote the predict function but it didn't come out as expected.
Pls send help, would be grateful for any suggestions.
I’ve gotten the solution from pyg discussion here
So basically you can get around this by iterating over all `MessagePassing layers and setting:
loaded_model = mlflow.pytorch.load_model(logged_model)
for conv in loaded_model.conv_layers:
conv.aggr_module = SumAggregation()
This should fix the problem!

When I make pytorch attention Can you give me a idea to fix loss

Here is my model
class Build_Model(nn.Module):
def __init__(self,args) :
super(Build_Model, self).__init__()
self.hidden_size = args.dec_size
self.embedding = nn.Embedding(args.n_vocab, args.d_model)
self.enc_lstm = nn.LSTM(input_size =args.d_model, hidden_size=args.d_model,batch_first=True)
self.dec_lstm = nn.LSTM(input_size =args.d_model, hidden_size=args.d_model,batch_first=True)
self.soft_prob = nn.Softmax(dim=-1)
self.softmax_linear = nn.Linear(args.d_model*2,len(vocab))
self.softmax_linear_function = nn.Softmax(dim = -1)
def forward(self, enc_inputs, dec_inputs) :
enc_hidden = self.embedding(enc_inputs)
dec_hidden = self.embedding(dec_inputs)
enc_hidden , (enc_h_state,enc_c_state) = self.enc_lstm(enc_hidden)
dec_hidden,(dec_h_state,dec_c_state) = self.dec_lstm(dec_hidden,(enc_h_state,enc_c_state))
attn_score = torch.matmul(dec_hidden, torch.transpose(enc_hidden,2,1))
attn_prob = self.soft_prob(attn_score)
attn_out = torch.matmul(attn_prob,enc_hidden)
cat_hidden = torch.cat((attn_out, dec_hidden),-1)
y_pred = self.softmax_linear_function(self.softmax_linear(cat_hidden))
y_pred = torch.argmax(y_pred,dim =-1)
print('y_pred = ',y_pred.shape)
y_pred = y_pred.view(-1, 150)
print('2y_pred = ',y_pred.shape)
return y_pred
Here is the loss function
def lm_loss(y_true, y_pred):
print(y_pred.shape)
y_pred_argmax = y_pred
#y_pred_argmax = y_pred_argmax.view(-1,150)
print(y_true.shape, y_pred_argmax.shape)
criterion = nn.CrossEntropyLoss(reduction="none")
loss = criterion(y_true.float(), y_pred_argmax.float()[0])
#mask = tf.not_equal(y_true, 0)
mask = torch.not_equal(y_pred_argmax,0)
#mask = tf.cast(mask, tf.float32)
mask = mask.type(torch.FloatTensor).to(device)
loss *= mask
#loss = tf.reduce_sum(loss) / tf.maximum(tf.reduce_sum(mask), 1)
loss = torch.sum(loss) / torch.maximum(torch.sum(mask),1)
return loss
The last is evaluation
optimizer.zero_grad()
print(train_enc_inputs.shape,train_dec_inputs.shape, train_dec_labels.shape )
y_pred = model(train_enc_inputs,train_dec_inputs)
#y_pred = torch.argmax(y_pred,dim =-1)
print(y_pred.shape )
loss = lm_loss(train_dec_labels, y_pred)
The output is here:
torch.Size([32, 120]) torch.Size([32, 150]) torch.Size([32, 150])
y_pred = torch.Size([32, 150])
2y_pred = torch.Size([32, 150])
torch.Size([32, 150])
torch.Size([32, 150])
torch.Size([32, 150]) torch.Size([32, 150])
The error traceback:
ValueError Traceback (most recent call last)
<ipython-input-159-cc8976139dd5> in <module>()
9 #y_pred = torch.argmax(y_pred,dim =-1)
10 print(y_pred.shape )
---> 11 loss = lm_loss(train_dec_labels, y_pred)
12 n_step += 1
13 if n_step % 10 == 0:
3 frames
<ipython-input-158-39ba03042d04> in lm_loss(y_true, y_pred)
15 print(y_true.shape, y_pred_argmax.shape)
16 criterion = nn.CrossEntropyLoss(reduction="none")
---> 17 loss = criterion(y_true.float(), y_pred_argmax.float()[0])
18 #mask = tf.not_equal(y_true, 0)
19 mask = torch.not_equal(y_pred_argmax,0)
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/loss.py in forward(self, input, target)
1119 def forward(self, input: Tensor, target: Tensor) -> Tensor:
1120 return F.cross_entropy(input, target, weight=self.weight,
-> 1121 ignore_index=self.ignore_index, reduction=self.reduction)
1122
1123
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction)
2822 if size_average is not None or reduce is not None:
2823 reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2824 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
2825
2826
ValueError: Expected input batch_size (32) to match target batch_size (150).
How can I fix it?
There are a few issues with your usage of nn.CrossEntropyLoss:
You are supposed to call nn.CrossEntropyLoss with criterion(y_pred, y_true), you seem to have switched the two.
y_pred contains the output logits of your network i.e. it hasn't been passed through a softmax: you need to remove self.softmax_linear_function in your model)
Also y_pred should contain all components and not be the results of an argmax.
y_true is passed in dense format: it contains the true class labels and has one dimension less than the prediction y_pred.

RuntimeError: all elements of input should be between 0 and 1

I want to use an RNN with bilstm layers using pytorch on protein embeddings. It worked with Linear Layer but when i use Bilstm i have a Runtime error. Sorry if its not clear its my first publication and i will be grateful if someone can help me.
from collections import Counter, OrderedDict
from typing import Optional
import numpy as np
import pytorch_lightning as pl
import torch
import torch.nn.functional as F # noqa
from deepchain import log
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch import Tensor, nn
num_layers=2
hidden_size=256
from torch.utils.data import DataLoader, TensorDataset
def classification_dataloader_from_numpy(
x: np.ndarray, y: np.array, batch_size: int = 32
) -> DataLoader:
"""Build a dataloader from numpy for classification problem
This dataloader is use only for classification. It detects automatically the class of
the problem (binary or multiclass classification)
Args:
x (np.ndarray): [description]
y (np.array): [description]
batch_size (int, optional): [description]. Defaults to None.
Returns:
DataLoader: [description]
"""
n_class: int = len(np.unique(y))
if n_class > 2:
log.info("This is a classification problem with %s classes", n_class)
else:
log.info("This is a binary classification problem")
# y is float for binary classification, int for multiclass
y_tensor = torch.tensor(y).long() if len(np.unique(y)) > 2 else torch.tensor(y).float()
tensor_set = TensorDataset(torch.tensor(x).float(), y_tensor)
loader = DataLoader(tensor_set, batch_size=batch_size)
return loader
class RNN(pl.LightningModule):
"""A `pytorch` based deep learning model"""
def __init__(self, input_shape: int, n_class: int, num_layers, n_neurons: int = 128, lr: float = 1e-3):
super(RNN,self).__init__()
self.lr = lr
self.n_neurons=n_neurons
self.num_layers=num_layers
self.input_shape = input_shape
self.output_shape = 1 if n_class <= 2 else n_class
self.activation = nn.Sigmoid() if n_class <= 2 else nn.Softmax(dim=-1)
self.lstm = nn.LSTM(self.input_shape, self.n_neurons, num_layers, batch_first=True, bidirectional=True)
self.fc= nn.Linear(self.n_neurons, self.output_shape)
def forward(self, x):
h0=torch.zeros(self.num_layers, x_size(0), self.n_neurons).to(device)
c0=torch.zeros(self.num_layers, x_size(0), self.n_neurons).to(device)
out, _=self.lstm(x,(h0, c0))
out=self.fc(out[:, -1, :])
return self.fc(x)
def training_step(self, batch, batch_idx):
"""training_step defined the train loop. It is independent of forward"""
x, y = batch
y_hat = self.fc(x).squeeze()
y = y.squeeze()
if self.output_shape > 1:
y_hat = torch.log(y_hat)
loss = self.loss(y_hat, y)
self.log("train_loss", loss, on_epoch=True, on_step=False)
return {"loss": loss}
def validation_step(self, batch, batch_idx):
"""training_step defined the train loop. It is independent of forward"""
x, y = batch
y_hat = self.fc(x).squeeze()
y = y.squeeze()
if self.output_shape > 1:
y_hat = torch.log(y_hat)
loss = self.loss(y_hat, y)
self.log("val_loss", loss, on_epoch=True, on_step=False)
return {"val_loss": loss}
def configure_optimizers(self):
"""(Optional) Configure training optimizers."""
return torch.optim.Adam(self.parameters(),lr=self.lr)
def compute_class_weight(self, y: np.array, n_class: int):
"""Compute class weight for binary/multiple classification
If n_class=2, only compute weights for the positve class.
If n>2, compute for all classes.
Args:
y ([np.array]):vector of int represented the class
n_class (int) : number fo class to use
"""
if n_class == 2:
class_count: typing.Counter = Counter(y)
cond_binary = (0 in class_count) and (1 in class_count)
assert cond_binary, "Must have O and 1 class for binary classification"
weight = class_count[0] / class_count[1]
else:
weight = compute_class_weight(class_weight="balanced", classes=np.unique(y), y=y)
return torch.tensor(weight).float()
def fit(
self,
x: np.ndarray,
y: np.array,
epochs: int = 10,
batch_size: int = 32,
class_weight: Optional[str] = None,
validation_data: bool = True,
**kwargs
):
assert isinstance(x, np.ndarray), "X should be a numpy array"
assert isinstance(y, np.ndarray), "y should be a numpy array"
assert class_weight in (
None,
"balanced",
), "the only choice available for class_weight is 'balanced'"
n_class = len(np.unique(y))
weight = None
self.input_shape = x.shape[1]
self.output_shape = 1 if n_class <= 2 else n_class
self.activation = nn.Sigmoid() if n_class <= 2 else nn.Softmax(dim=-1)
if class_weight == "balanced":
weight = self.compute_class_weight(y, n_class)
self.loss = nn.NLLLoss(weight) if self.output_shape > 1 else nn.BCELoss(weight)
if validation_data:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)
train_loader = classification_dataloader_from_numpy(
x_train, y_train, batch_size=batch_size
)
val_loader = classification_dataloader_from_numpy(x_val, y_val, batch_size=batch_size)
else:
train_loader = classification_dataloader_from_numpy(x, y, batch_size=batch_size)
val_loader = None
self.trainer = pl.Trainer(max_epochs=epochs, **kwargs)
self.trainer.fit(self, train_loader, val_loader)
def predict(self, x):
"""Run inference on data."""
if self.output_shape is None:
log.warning("Model is not fitted. Can't do predict")
return
return self.forward(x).detach().numpy()
def save(self, path: str):
"""Save the state dict model with torch"""
torch.save(self.fc.state_dict(), path)
log.info("Save state_dict parameters in model.pt")
def load_state_dict(self, state_dict: "OrderedDict[str, Tensor]", strict: bool = False):
"""Load state_dict saved parameters
Args:
state_dict (OrderedDict[str, Tensor]): state_dict tensor
strict (bool, optional): [description]. Defaults to False.
"""
self.fc.load_state_dict(state_dict, strict=strict)
self.fc.eval()
mlp = RNN(input_shape=1024, n_neurons=1024, num_layers=2, n_class=2)
mlp.fit(embeddings_train, np.array(y_train),validation_data=(embeddings_test, np.array(y_test)), epochs=30)
mlp.save("model.pt")
These are the errors that are occured. I really need help and i remain at your disposal for further informations.
Error 1
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-154-e5fde11a675c> in <module>
1 # init MLP model, train it on the data, then save model
2 mlp = RNN(input_shape=1024, n_neurons=1024, num_layers=2, n_class=2)
----> 3 mlp.fit(embeddings_train, np.array(y_train),validation_data=(embeddings_test, np.array(y_test)), epochs=30)
4 mlp.save("model.pt")
<ipython-input-153-a8d51af53bb5> in fit(self, x, y, epochs, batch_size, class_weight, validation_data, **kwargs)
134 val_loader = None
135 self.trainer = pl.Trainer(max_epochs=epochs, **kwargs)
--> 136 self.trainer.fit(self, train_loader, val_loader)
137 def predict(self, x):
138 """Run inference on data."""
/opt/conda/envs/bio-transformers/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloader, val_dataloaders, datamodule)
456 )
457
--> 458 self._run(model)
459
460 assert self.state.stopped
/opt/conda/envs/bio-transformers/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py in _run(self, model)
754
755 # dispatch `start_training` or `start_evaluating` or `start_predicting`
--> 756 self.dispatch()
757
758 # plugin will finalized fitting (e.g. ddp_spawn will load trained model)
/opt/conda/envs/bio-transformers/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py in dispatch(self)
795 self.accelerator.start_predicting(self)
796 else:
--> 797 self.accelerator.start_training(self)
798
799 def run_stage(self):
/opt/conda/envs/bio-transformers/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py in start_training(self, trainer)
94
95 def start_training(self, trainer: 'pl.Trainer') -> None:
---> 96 self.training_type_plugin.start_training(trainer)
97
98 def start_evaluating(self, trainer: 'pl.Trainer') -> None:
/opt/conda/envs/bio-transformers/lib/python3.7/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in start_training(self, trainer)
142 def start_training(self, trainer: 'pl.Trainer') -> None:
143 # double dispatch to initiate the training loop
--> 144 self._results = trainer.run_stage()
145
146 def start_evaluating(self, trainer: 'pl.Trainer') -> None:
/opt/conda/envs/bio-transformers/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py in run_stage(self)
805 if self.predicting:
806 return self.run_predict()
--> 807 return self.run_train()
808
809 def _pre_training_routine(self):
/opt/conda/envs/bio-transformers/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py in run_train(self)
840 self.progress_bar_callback.disable()
841
--> 842 self.run_sanity_check(self.lightning_module)
843
844 self.checkpoint_connector.has_trained = False
/opt/conda/envs/bio-transformers/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py in run_sanity_check(self, ref_model)
1105
1106 # run eval step
-> 1107 self.run_evaluation()
1108
1109 self.on_sanity_check_end()
/opt/conda/envs/bio-transformers/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py in run_evaluation(self, on_epoch)
960 # lightning module methods
961 with self.profiler.profile("evaluation_step_and_end"):
--> 962 output = self.evaluation_loop.evaluation_step(batch, batch_idx, dataloader_idx)
963 output = self.evaluation_loop.evaluation_step_end(output)
964
/opt/conda/envs/bio-transformers/lib/python3.7/site-packages/pytorch_lightning/trainer/evaluation_loop.py in evaluation_step(self, batch, batch_idx, dataloader_idx)
172 model_ref._current_fx_name = "validation_step"
173 with self.trainer.profiler.profile("validation_step"):
--> 174 output = self.trainer.accelerator.validation_step(args)
175
176 # capture any logged information
/opt/conda/envs/bio-transformers/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py in validation_step(self, args)
224
225 with self.precision_plugin.val_step_context(), self.training_type_plugin.val_step_context():
--> 226 return self.training_type_plugin.validation_step(*args)
227
228 def test_step(self, args: List[Union[Any, int]]) -> Optional[STEP_OUTPUT]:
/opt/conda/envs/bio-transformers/lib/python3.7/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in validation_step(self, *args, **kwargs)
159
160 def validation_step(self, *args, **kwargs):
--> 161 return self.lightning_module.validation_step(*args, **kwargs)
162
163 def test_step(self, *args, **kwargs):
<ipython-input-153-a8d51af53bb5> in validation_step(self, batch, batch_idx)
78 if self.output_shape > 1:
79 y_hat = torch.log(y_hat)
---> 80 loss = self.loss(y_hat, y)
81 self.log("val_loss", loss, on_epoch=True, on_step=False)
82 return {"val_loss": loss}
/opt/conda/envs/bio-transformers/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
887 result = self._slow_forward(*input, **kwargs)
888 else:
--> 889 result = self.forward(*input, **kwargs)
890 for hook in itertools.chain(
891 _global_forward_hooks.values(),
/opt/conda/envs/bio-transformers/lib/python3.7/site-packages/torch/nn/modules/loss.py in forward(self, input, target)
611 def forward(self, input: Tensor, target: Tensor) -> Tensor:
612 assert self.weight is None or isinstance(self.weight, Tensor)
--> 613 return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
614
615
/opt/conda/envs/bio-transformers/lib/python3.7/site-packages/torch/nn/functional.py in binary_cross_entropy(input, target, weight, size_average, reduce, reduction)
2760 weight = weight.expand(new_size)
2761
-> 2762 return torch._C._nn.binary_cross_entropy(input, target, weight, reduction_enum)
2763
2764
RuntimeError: all elements of input should be between 0 and 1
Error 2
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-139-b7e8b13763ef> in <module>
1 # Model evaluation
----> 2 y_pred = mlp(embeddings_val).squeeze().detach().numpy()
3 model_evaluation_accuracy(np.array(y_val), y_pred)
/opt/conda/envs/bio-transformers/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
887 result = self._slow_forward(*input, **kwargs)
888 else:
--> 889 result = self.forward(*input, **kwargs)
890 for hook in itertools.chain(
891 _global_forward_hooks.values(),
<ipython-input-136-e2fc535640ab> in forward(self, x)
55 self.fc= nn.Linear(self.hidden_size, self.output_shape)
56 def forward(self, x):
---> 57 h0=torch.zeros(self.num_layers, x_size(0), self.hidden_size).to(device)
58 c0=torch.zeros(self.num_layers, x_size(0), self.hidden_size).to(device)
59 out, _=self.lstm(x,(h0, c0))
NameError: name 'x_size' is not defined
I am adding this as an answer because it would be too hard to put in comment.
The main problem that you have is about BCE loss. IIRC BCE loss expects p(y=1), so your output should be between 0 and 1. If you want to use logits (which is also more numerically stable), you should use BinaryCrossEntropyWithLogits.
As you mention in one of the comments, you are using the sigmoid activation but something about your forward function looks off to me. Mainly the last line of your forward function is
return self.fc(x)
This does not use sigmoid activation. Moreover you are only using input, x for producing the output. The LSTM outputs are just being discarded? I think, it would be a good idea to add some prints statements or breakpoints to make sure that the intermediate outputs are as you expect them to be.
I got the error RuntimeError: all elements of input should be between 0 and 1 because my x data had NaN entries.
I just bumped into this myself. It looks like both you and I missed adding a sigmoid function at the end of the forward function. This update should fix your problem.
def forward(self, x):
h0=torch.zeros(self.num_layers, x_size(0), self.n_neurons).to(device)
c0=torch.zeros(self.num_layers, x_size(0), self.n_neurons).to(device)
out, _=self.lstm(x,(h0, c0))
out=self.fc(out[:, -1, :])
return torch.sigmoid(out)

Why am I getting a Pytorch Runtime Error on Test Set

I have a model that is a binary image classification model with the resnext model. I keep getting a run time error when it gets to the test set. Error message is
RuntimeError: Expected object of backend CPU but got backend CUDA for argument #2 'weight'
I am sending my test set tensors to my GPU like my train model. I've looked at the following and I'm doing what was suggested here as stated above.
Here is my model code:
resnext = models.resnext50_32x4d(pretrained=True)
resnext = resnext.to(device)
for param in resnext.parameters():
param.requires_grad = True
resnext.classifier = nn.Sequential(nn.Linear(2048, 1000),
nn.ReLU(),
nn.Dropout(0.4),
nn.Linear(1000, 2),
nn.Softmax(dim = 1))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(resnext.classifier.parameters(), lr=0.001)
import time
start_time = time.time()
epochs = 1
max_trn_batch = 5
max_tst_batch = 156
y_val_list = []
policy_list = []
train_losses = []
test_losses = []
train_correct = []
test_correct = []
for i in range(epochs):
for i in tqdm(range(0, max_trn_batch)):
trn_corr = 0
tst_corr = 0
# Run the training batches
for b, (X_train, y_train, policy) in enumerate(train_loader):
#print(y_train, policy)
X_train = X_train.to(device)
y_train = y_train.to(device)
if b == max_trn_batch:
break
b+=1
# Apply the model
y_pred = resnext(X_train)
loss = criterion(y_pred, y_train)
# Tally the number of correct predictions
predicted = torch.max(y_pred.data, 1)[1]
batch_corr = (predicted == y_train).sum()
trn_corr += batch_corr
# Update parameters
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Print interim results
if b%1 == 0:
print(f'epoch: {i:2} batch: {b:4} [{100*b:6}/63610] loss: {loss.item():10.8f} \
accuracy: {trn_corr.item()/(100*b):7.3f}%')
train_losses.append(loss)
train_correct.append(trn_corr)
# Run the testing batches
with torch.no_grad():
for b, (X_test, y_test, policy) in enumerate(test_loader):
policy_list.append(policy)
X_test.to(device)
y_test.to(device)
if b == max_tst_batch:
break
# Apply the model
y_val = resnext(X_test)
y_val_list.append(y_val.data)
# Tally the number of correct predictions
predicted = torch.max(y_val.data, 1)[1]
tst_corr += (predicted == y_test).sum()
loss = criterion(y_val, y_test)
test_losses.append(loss)
test_correct.append(tst_corr)
print(f'\nDuration: {time.time() - start_time:.0f} seconds') # print the time elapsed
Here is the full traceback:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-84-48bce2e8d4fa> in <module>
60
61 # Apply the model
---> 62 y_val = resnext(X_test)
63 y_val_list.append(y_val.data)
64 # Tally the number of correct predictions
C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
545 result = self._slow_forward(*input, **kwargs)
546 else:
--> 547 result = self.forward(*input, **kwargs)
548 for hook in self._forward_hooks.values():
549 hook_result = hook(self, input, result)
C:\ProgramData\Anaconda3\lib\site-packages\torchvision\models\resnet.py in forward(self, x)
194
195 def forward(self, x):
--> 196 x = self.conv1(x)
197 x = self.bn1(x)
198 x = self.relu(x)
C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
545 result = self._slow_forward(*input, **kwargs)
546 else:
--> 547 result = self.forward(*input, **kwargs)
548 for hook in self._forward_hooks.values():
549 hook_result = hook(self, input, result)
C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\conv.py in forward(self, input)
341
342 def forward(self, input):
--> 343 return self.conv2d_forward(input, self.weight)
344
345 class Conv3d(_ConvNd):
C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\conv.py in conv2d_forward(self, input, weight)
338 _pair(0), self.dilation, self.groups)
339 return F.conv2d(input, weight, self.bias, self.stride,
--> 340 self.padding, self.dilation, self.groups)
341
342 def forward(self, input):
RuntimeError: Expected object of backend CPU but got backend CUDA for argument #2 'weight'
Again, my tensors and the model are sent to the GPU so I'm not sure what is going on. Does anyone see my mistake?
[...] my tensors and the model are sent to the GPU [...]
Not the test Tensors. It is a simple mistake:
X_test.to(device)
y_test.to(device)
should be
X_test = X_test.to(device)
y_test = y_test.to(device)

Unexpected data types when trying to train a pytorch model

I'm putting together a basic neural network to learn pytorch. Attempting to train it always fails with the message "Expected object of scalar type Float but got scalar type Double for argument #4 'mat1'". I suspect I'm doing something wrong with putting the data together, but I don't know what.
The data in question is a couple of one-dimensional lists of numbers that I've generated, which should be linearly separable.
I've pasted my code below.
class MyDataset(Dataset):
def __init__(self, xs, ys):
assert len(xs) == len(ys), "Input and output tensors must be the same length"
self.xs = np.array(xs, dtype=np.double)
self.ys = np.array(ys, dtype=np.double)
def __getitem__(self, idx):
return (self.xs[idx], self.ys[idx])
def __len__(self):
return len(self.xs)
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.layer1 = nn.Linear(1, 1)
def forward(self, x):
x = F.relu(self.layer1(x))
return x
def train(data, validation, net, epochs=100):
learning_rate = 0.01
optimizer = optim.SGD(net.parameters(), lr=learning_rate)
criterion = nn.MSELoss()
for epoch in range(0, epochs):
print('Beginning epoch ', epoch+1)
training_losses = []
validation_losses = []
for x_batch, y_batch in data:
optimizer.zero_grad()
yhat = net(x_batch)
loss = criterion(y_batch, yhat)
loss.backward()
optimizer.step()
optimizer.zero_grad()
training_losses.append(loss)
with torch.no_grad():
for x_batch, y_batch in validation:
net.eval()
yhat = net(x_batch)
loss = criterion(y_batch, yhat)
validation_losses.append(loss)
print('Ending epoch ', epoch+1, 'Training loss: ', np.mean(training_losses), 'Validation loss: ', np.mean(validation_losses))
And this is how I'm generating the data and attempting to train it:
num_samples = 10000
foos = [100 + np.random.normal(scale=20) for x in range(0, num_samples)]
bars = [200 + np.random.normal(scale=20) for x in range(0, num_samples)]
xs = foos + bars
xs = torch.tensor([[x] for x in xs])
ys = np.concatenate([np.zeros(num_samples), np.ones(num_samples)])
ys = torch.tensor([[y] for y in ys])
dataset = MyDataset(xs, ys)
train_dataset, val_dataset = random_split(dataset, [16000, 4000])
train_loader = DataLoader(dataset=train_dataset, batch_size=16)
val_loader = DataLoader(dataset=val_dataset, batch_size=20)
net = Net()
train(train_loader, val_loader, net)
Finally, here's the stack trace:
<ipython-input-114-ab674ae015a5> in train(data, validation, net, epochs)
13 print('x_batch: ', type(x_batch[0].item()))
14 print('y_batch: ', type(y_batch[0].item()))
---> 15 yhat = net(x_batch)
16 loss = criterion(y_batch, yhat)
17 loss.backward()
/usr/local/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
487 result = self._slow_forward(*input, **kwargs)
488 else:
--> 489 result = self.forward(*input, **kwargs)
490 for hook in self._forward_hooks.values():
491 hook_result = hook(self, input, result)
<ipython-input-58-ec2e6d981760> in forward(self, x)
5
6 def forward(self, x):
----> 7 x = F.relu(self.layer1(x))
8 return x
/usr/local/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
487 result = self._slow_forward(*input, **kwargs)
488 else:
--> 489 result = self.forward(*input, **kwargs)
490 for hook in self._forward_hooks.values():
491 hook_result = hook(self, input, result)
/usr/local/lib/python3.6/site-packages/torch/nn/modules/linear.py in forward(self, input)
65 #weak_script_method
66 def forward(self, input):
---> 67 return F.linear(input, self.weight, self.bias)
68
69 def extra_repr(self):
/usr/local/lib/python3.6/site-packages/torch/nn/functional.py in linear(input, weight, bias)
1350 if input.dim() == 2 and bias is not None:
1351 # fused op is marginally faster
-> 1352 ret = torch.addmm(torch.jit._unwrap_optional(bias), input, weight.t())
1353 else:
1354 output = input.matmul(weight.t())
RuntimeError: Expected object of scalar type Float but got scalar type Double for argument #4 'mat1'
I've attempted to debug by logging the types of x_batch and y_batch from within the train method, but they're both showing as float, so I'm stumped as to where the Double is coming from.
Any suggestions?
PyTorch uses single-precision floats by default.
In the lines:
self.xs = np.array(xs, dtype=np.double)
self.ys = np.array(ys, dtype=np.double)
Replace np.double with np.float32.

Resources