Related
First of all, thank you. I tried to train a model with PyTorch, but I got the following error:
RuntimeError: stack expects a non-empty TensorList. I am trying to build a model that extracts features from point clouds using deep learning in PyTorch.
The code and the full error output are shown below. Could anyone help with this? ************** ***************
Thanks!
# Run one pass over training_dataloader: for every batch build the (noisy) input, run the
# forward pass, compute the loss, gather labels/predictions from all ranks into a
# confusion matrix, then backpropagate and step the optimizer.
# Returns (losses tensor, correct-count tensor, batch_results dict, conf_mat); both tensors
# are created on cuda:{gpu}.
# NOTE(review): the indentation of this paste was lost; block nesting must be restored
# before the code can run.
# NOTE(review): sample_points, dimensionality, overall_classes_loss, verbosity, classes,
# element_weighted_loss and confusion_matrix are not defined in this block - presumably
# module-level globals/imports.
def training_loop(gpu, training_dataloader, model, loss_fn, optimizer):
losses = []
correct = 0
batch_results = dict()
# 10x10 confusion matrix, matching labels=np.arange(0,10) used further down
conf_mat = np.zeros((10,10))
for batch_n, batch in enumerate(training_dataloader): #batch[batch, pos, ptr, y]
# number of samples in this batch = total number of points / points per sample
batch_size = int(batch.batch.size()[0] / sample_points)
if dimensionality == 3:
# Input dim [:,3] for your geometry x,y,z
# Gaussian noise (mean 0, std 0.1) is drawn on the CPU, moved to the GPU and added to the positions
X = batch.pos.cuda(non_blocking=True).view(batch_size, sample_points, -1) + torch.normal(
torch.zeros(batch_size, sample_points, dimensionality), torch.full((batch_size, sample_points,
dimensionality), fill_value=0.1)).cuda(gpu)
else:
# Input dim [:,6] for your geometry x,y,z and normals nx,ny,nz
X = torch.cat((batch.pos.cuda(non_blocking=True), batch.normal.cuda(non_blocking=True)), 1).view(batch_size, sample_points, -1) + torch.normal(
torch.zeros(batch_size, sample_points, dimensionality), torch.full((batch_size, sample_points,
dimensionality), fill_value=0.1)).cuda(gpu)
y = batch.y.cuda(non_blocking=True).flatten() #size (batch_size) --> torch.Size([8])
# Compute predictions
pred = model(None, X) #size (batch_size,classes) --> torch.Size([8, 10])
if overall_classes_loss:
# weighted CE Loss over all classes
loss = loss_fn(pred, y)
else:
# weighted batchwise Loss
# for every label in the batch, count how often it occurs in this batch; weight = 1 / count
sample_count = np.array([[x, batch.y.tolist().count(x)] for x in batch.y])[:,1]
batch_weights = 1. / sample_count
batch_weights = torch.from_numpy(batch_weights)
batch_weights = batch_weights.double()
# NOTE(review): batch.y is passed here rather than the CUDA copy y - confirm that
# element_weighted_loss moves it to the right device
loss = element_weighted_loss(pred, batch.y, batch_weights, gpu)
correct += (pred.argmax(1) == y).type(torch.float).sum().item()
print(f"Loss: {loss}")
# receive buffers for all_gather: one tensor shaped like y per rank
tensor_list_y = [torch.ones_like(y) for _ in range(dist.get_world_size())]
tensor_list_pred = [torch.ones_like(y) for _ in range(dist.get_world_size())]
torch.distributed.all_gather(tensor_list_y, y, group=None, async_op=False)
torch.distributed.all_gather(tensor_list_pred, pred.argmax(1), group=None, async_op=False)
tensor_list_y = torch.cat(tensor_list_y)
tensor_list_pred = torch.cat(tensor_list_pred)
# Confusion Matrix
conf_mat += confusion_matrix(tensor_list_y.cpu().detach().numpy(), tensor_list_pred.cpu().detach().numpy(), labels=np.arange(0,10))
# Backpropagation
optimizer.zero_grad()
loss.backward()
optimizer.step()
losses.append(loss.item())
# Save batch predictions
batch_results[batch_n] = {'true':tensor_list_y, 'pred':tensor_list_pred}
if verbosity == True:
print(f"\n\nTRAIN on GPU:{gpu}: True Label {y} - Prediction {pred.argmax(1)} - Loss {loss}")
truevalue = '\t\t'.join(classes[items] for items in y.tolist())
predvalues = '\t\t'.join(classes[items] for items in pred.argmax(1).tolist())
print(f"INFO on GPU:{gpu}: TRAIN - True Value\t {truevalue}")
print(f"INFO on GPU:{gpu}: TRAIN - Predictions\t {predvalues}")
if batch_n % 25 == 0:
# NOTE(review): the reduced loss is not read again in the visible code (loss.item() was
# already appended above) - confirm this collective is still needed
torch.distributed.reduce(loss, 0)
# the string literal below is disabled debugging code
"""if gpu == 0:
# print predictions and true values
#truevalue = '\t\t'.join(classes[items] for items in y.tolist())
#predvalues = '\t\t'.join(classes[items] for items in pred.argmax(1).tolist())
#print(f"\n\nINFO on GPU{gpu}: TRAIN - True Value\t {truevalue}")
#print(f"INFO on GPU{gpu}: TRAIN - Predictions\t {predvalues}")
#print("INFO: TRAIN - Correctness\t", pred.argmax(1) == y)
#print(f"INFO: TRAIN - Single Batch Test Accuracy {correct * 100 / batch_size}\n\n")
loss, current = loss.item(), batch_n * len(X)
#print(f"loss: {loss:>7f}")"""
#print(f"conf_mat: {conf_mat}")
#print(f"batch_results: {batch_results}")
return torch.tensor(losses, device=f"cuda:{gpu}"), torch.tensor(correct, device=f"cuda:{gpu}"), batch_results, conf_mat
# Epoch loop: run training_loop and test_loop, sum their metrics onto rank 0, and on rank 0
# average/record them, step the LR scheduler, save a checkpoint and apply early stopping.
# NOTE(review): the indentation of this paste was lost; in particular it cannot be told
# from here which of the statements after "if gpu == 0:" are meant to run on rank 0 only.
# NOTE(review): epochs, batch_size, patience, test_loop, save_checkpoint, early_stopping and
# printLearningRate are not defined in this block - presumably module-level globals.
def train_optimisation(gpu, gpus, training_dataloader, test_dataloader, model, loss_fn, optimizer, scheduler, dir_path, initial_epoch):
epoch_losses = []
training_accuracies = []
test_losses = []
test_accuracies = []
learning_rates = []
counter = 0 #early stopping counter
batchwise_results = dict()
# Learning Rate Scheduler
# NOTE(review): this overwrites the scheduler argument unconditionally, so whatever the
# caller passed in is ignored
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=20)
for i in range(initial_epoch, initial_epoch + epochs):
if gpu == 0:
# the epoch is printed as i when resuming (initial_epoch > 0) and as i + 1 otherwise
if initial_epoch > 0:
print(f"\n\nEpoch {i}\n-------------------------------")
else:
print(f"\n\nEpoch {i + 1}\n-------------------------------")
# TRAIN
losses, training_accuracy, train_batch_result, train_conf_mat = training_loop(gpu, training_dataloader, model, loss_fn, optimizer)
average_loss = torch.mean(losses)
# sum the per-rank values onto rank 0; they are divided by gpus / dataset size below
torch.distributed.reduce(average_loss, 0, torch.distributed.ReduceOp.SUM)
torch.distributed.reduce(training_accuracy, 0, torch.distributed.ReduceOp.SUM)
# TEST
test_accuracy, test_loss, test_batch_result, test_conf_mat = test_loop(gpu, test_dataloader, model, loss_fn)
torch.distributed.reduce(test_accuracy, 0, torch.distributed.ReduceOp.SUM)
torch.distributed.reduce(test_loss, 0, torch.distributed.ReduceOp.SUM)
# save results
batchwise_results[i] = {'train':train_batch_result, 'test':test_batch_result}
if gpu == 0: # the following operations are performed only by the process running in the first gpu
average_loss = average_loss / torch.tensor(gpus, dtype=torch.float) # average loss among all gpus
# accuracy in percent = summed value / dataset size * 100
test_accuracy = test_accuracy / torch.tensor(len(test_dataloader.dataset),
dtype=torch.float) * torch.tensor(100.0)
training_accuracy = training_accuracy / torch.tensor(len(training_dataloader.dataset),
dtype=torch.float) * torch.tensor(100.0)
test_loss = test_loss / torch.tensor(gpus, dtype=torch.float)
epoch_losses.append(average_loss.item())
training_accuracies.append(training_accuracy.item())
test_losses.append(test_loss.item())
test_accuracies.append(test_accuracy.item())
learning_rates.append((optimizer.param_groups[0])["lr"])
print(f"\nBatch size: {batch_size * int(gpus)}")
print(f"average Training Loss: {average_loss.item():.6f}")
print(f"average Test Loss: {test_loss.item():.6f}")
print(f"\naverage Training Acc: {training_accuracy.item():.6f}")
print(f"average Test Acc: {test_accuracy.item():.6f}")
printLearningRate(optimizer)
# NOTE(review): if this step runs under "if gpu == 0" only, the learning rate would be
# reduced for rank 0's optimizer only - confirm the intended nesting
scheduler.step(test_loss)
# the string literal below is disabled code (manual stepwise learning-rate decay)
"""# stepwise learning rate decay
if average_loss.item() <= 0.35:
for param_group in optimizer.param_groups:
print("Learning rate changed to 0.007")
param_group['lr'] = 0.007
if average_loss.item() <= 0.30:
for param_group in optimizer.param_groups:
print("Learning rate changed to 0.004")
param_group['lr'] = 0.004"""
# saving model checkpoint
save_checkpoint(model, optimizer, scheduler, i, epoch_losses, training_accuracies, test_losses, test_accuracies, learning_rates,
os.path.join(dir_path, f"epoch{i}.pth"), {key: value for key, value in batchwise_results[i].items() if key == 'train'}, {key: value for key, value in batchwise_results[i].items() if key == 'test'}, train_conf_mat, test_conf_mat)
#TODO: implement ONNX Export
# early stopping scheduler
if early_stopping(test_losses) == True:
counter += 1
print(f"Early Stopping counter: {counter} of {patience}")
else:
# "counter += 0" is a no-op: the counter is never reset when early_stopping returns False
counter += 0
if counter < patience:
pass
else:
print("\n\nEarly Stopping activated")
print(f"Training stopped at Epoch{i + 1}")
# NOTE(review): if this branch is reached on rank 0 only, nothing in the visible code tells
# the other ranks to stop - confirm how they are expected to terminate
dist.destroy_process_group()
exit()
class DistributedWeightedSampler(Sampler):
    """Class-balanced sampler for distributed (multi-process) training.

    The dataset indices are split into ``num_replicas`` disjoint shards, one per
    rank (optionally after a shuffle seeded with ``seed + epoch``). Each rank then
    draws ``num_samples`` indices from *its own shard*, every sample being weighted
    by the inverse frequency of its class inside that shard.

    Labels are read from ``dataset.data.y``.

    Args:
        dataset: Dataset to sample from; must support ``len()`` and expose the
            labels as ``dataset.data.y``.
        data_source: Forwarded unchanged to ``Sampler.__init__``.
        num_replicas: Number of participating processes; defaults to the
            world size of the current process group.
        rank: Rank of this process; defaults to the rank in the current
            process group.
        shuffle: Shuffle the indices before sharding them.
        seed: Base seed of the shuffle (combined with the epoch).
        replacement: Whether a sample may be drawn more than once per epoch.

    Raises:
        RuntimeError: If ``num_replicas``/``rank`` must be inferred but the
            distributed package is not available.
        ValueError: If ``rank`` lies outside ``[0, num_replicas - 1]``.
    """

    def __init__(self, dataset, data_source: Optional[Sized], num_replicas: Optional[int] = None,
                 rank: Optional[int] = None, shuffle: bool = True, seed: int = 0, replacement: bool = True):
        super().__init__(data_source)
        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            num_replicas = dist.get_world_size()
        if rank is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            rank = dist.get_rank()
        if rank >= num_replicas or rank < 0:
            raise ValueError("Invalid rank {}, rank should be in the interval [0, {}]".format(rank, num_replicas - 1))
        self.dataset = dataset
        self.num_replicas = num_replicas
        self.rank = rank
        self.epoch = 0
        # every rank gets the same number of samples; the index list is padded to fit
        self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
        self.total_size = self.num_samples * self.num_replicas
        self.shuffle = shuffle
        self.seed = seed
        self.replacement = replacement  # a sample can be drawn again if True

    def calculate_weights(self, targets):
        """Return one inverse-class-frequency weight per entry of ``targets``.

        Args:
            targets: Labels to weight (tensor, array or sequence). ``None`` falls
                back to all labels of the dataset (``dataset.data.y``).

        Returns:
            A 1-D ``torch.float64`` tensor with ``1 / count(label)`` per entry,
            where the count is taken over ``targets`` itself.
        """
        if targets is None:
            targets = self.dataset.data.y
        labels = np.asarray(targets).reshape(-1)
        # `inverse` maps every label to its slot in the sorted unique labels, so the
        # lookup also works for labels that are not exactly 0..C-1
        _, inverse, counts = np.unique(labels, return_inverse=True, return_counts=True)
        samples_weight = 1.0 / counts[inverse.reshape(-1)]
        return torch.from_numpy(samples_weight).double()

    def __iter__(self):
        # deterministically shuffle based on epoch (same permutation on every rank)
        if self.shuffle:
            g = torch.Generator()
            g.manual_seed(self.seed + self.epoch)
            indices = torch.randperm(len(self.dataset), generator=g).tolist()
        else:
            indices = list(range(len(self.dataset)))
        # add extra samples to make it evenly divisible
        indices += indices[:(self.total_size - len(indices))]
        assert len(indices) == self.total_size
        # subsample: this rank's shard of dataset indices
        indices = indices[self.rank:self.total_size:self.num_replicas]
        assert len(indices) == self.num_samples
        # weight the samples of this shard by the labels that belong to them
        all_targets = np.asarray(self.dataset.data.y).reshape(-1)
        weights = self.calculate_weights(all_targets[indices])
        # multinomial returns positions inside the shard; map them back to dataset
        # indices so that ranks never sample from each other's shard
        drawn = torch.multinomial(weights, self.num_samples, self.replacement).tolist()
        return iter([indices[position] for position in drawn])

    def __len__(self):
        return self.num_samples

    def set_epoch(self, epoch):
        """Set the epoch that is mixed into the shuffle seed."""
        self.epoch = epoch
def train(gpu, gpus, world_size):
    """Set up one training process and start (or resume) the optimisation.

    Initialises the process group, creates the output folder (rank 0 only),
    builds model, optimizer, datasets, samplers, data loaders and the weighted
    cross-entropy loss, and finally calls ``train_optimisation``.

    Args:
        gpu: Index of the GPU used by this process; also used as its rank.
        gpus: Number of GPUs, forwarded to ``train_optimisation``.
        world_size: Total number of processes in the process group.

    Relies on module-level configuration that is not defined in this block
    (``args``, ``optimizer_str``, ``learning_rate``, ``momentum``,
    ``weight_decay``, ``dimensionality``, ``sample_points``, ``batch_size``,
    ``data_shuffle``, ``shrinkingLayers``, ``mlpClassifier`` and helpers).
    """
    torch.manual_seed(0)
    torch.cuda.set_device(gpu)
    # NOTE(review): with the nccl call commented out, both branches use gloo, so the
    # except branch only retries the same call - confirm whether nccl should be tried first
    try:
        #dist.init_process_group(backend='nccl', world_size=world_size, rank=gpu) #for distributed GPU training
        dist.init_process_group(backend='gloo', world_size=world_size, rank=gpu) #as a fallback option
    except RuntimeError:
        print("\n\nINFO:RuntimeError is raised >> Used gloo backend instead of nccl!\n")
        dist.init_process_group(backend='gloo', world_size=world_size, rank=gpu) #as a fallback option

    # only rank 0 creates the run folder, so dir_path stays None on the other ranks
    dir_path = None
    if gpu == 0:
        dir_path = "stackgraphConvPool3DPnet"
        createdir(dir_path)
        training_number = next_training_number(dir_path)
        dir_path = os.path.join(dir_path, f"train{training_number}")
        createdir(dir_path)
        #save hyper-parameters in txt protocol file
        save_hyperparameters(dir_path, 'hyperparameters.txt')
        print("\nINFO: Protocol File saved successfully . . .")
        # (copying the script files into the run folder was disabled in the original)

    model = Classifier(shrinkingLayers, mlpClassifier)
    torch.cuda.set_device(gpu)
    model.cuda(gpu)

    #setting up optimizer
    if optimizer_str == "SGD":
        optimizer = torch.optim.SGD(model.parameters(), learning_rate, momentum=momentum, weight_decay=weight_decay)
    elif optimizer_str == "RMSprop":
        optimizer = torch.optim.RMSprop(model.parameters(), learning_rate, weight_decay=weight_decay)
    else:
        optimizer = torch.optim.Adam(model.parameters(), learning_rate, weight_decay=weight_decay)

    # single-program multiple-data training paradigm (Distributed Data-Parallel Training)
    model = DDP(model, device_ids=[gpu])

    if dimensionality == 3:
        training_data = ModelNet("ModelNet10_train_data", transform=lambda x: NormalizeScale()(SamplePoints(num=sample_points)(x)))
    else:
        training_data = ModelNet("ModelNet10_train_data", transform=lambda x: NormalizeScale()(NormalizeRotation()(SamplePoints(num=sample_points, remove_faces=True, include_normals=True)(x))))
    training_sampler = DistributedWeightedSampler(training_data, data_source=None, num_replicas=world_size) #weight unbalanced classes by 1/cls_count
    # NOTE(review): a sampler is passed together with shuffle=data_shuffle - confirm data_shuffle is False
    training_dataloader = DataLoader(dataset=training_data, batch_size=batch_size, shuffle=data_shuffle, num_workers=0,
                                     pin_memory=True, sampler=training_sampler)

    if dimensionality == 3:
        test_data = ModelNet("ModelNet10_test_data", train=False, transform=lambda x: NormalizeScale()(SamplePoints(num=sample_points)(x)))
    else:
        test_data = ModelNet("ModelNet10_test_data", train=False, transform=lambda x: NormalizeScale()(NormalizeRotation()(SamplePoints(num=sample_points, remove_faces=True, include_normals=True)(x))))
    test_sampler = DistributedWeightedSampler(test_data,data_source=None, num_replicas=world_size) #weight unbalanced classes by 1/cls_count
    test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=data_shuffle, num_workers=0,
                                 pin_memory=True, sampler=test_sampler)
    # (exporting the sampled point clouds for later visualisation was disabled in the original)

    # weighted CE Loss over all Classes C: weight of a class = 1 / number of its training samples
    _, class_sample_count = np.unique(training_data.data.y, return_counts=True)
    weight = 1. / class_sample_count
    weight = torch.from_numpy(weight)
    weight = weight.float()
    loss_fn = nn.CrossEntropyLoss(weight=weight).cuda(gpu)

    # continue training from certain checkpoint
    continue_from_scratch = args.resume is None
    if continue_from_scratch:
        if gpu == 0:
            print("\nINFO: Train from scratch has started . . .")
        train_optimisation(gpu, gpus, training_dataloader, test_dataloader, model, loss_fn, optimizer, None, dir_path, 0)
    else:
        checkpoint_path = "stackgraphConvPool3DPnet/" + args.resume
        if gpu == 0:
            print(f"\nINFO: Train has started from certain checkpoint {checkpoint_path.split('/')[2].split('.')[0]} in {checkpoint_path.split('/')[1]} . . .")
        # read the checkpoint from disk once and map it onto this rank's GPU instead of
        # deserialising the same file three times onto the device it was saved from
        checkpoint = torch.load(checkpoint_path, map_location=f"cuda:{gpu}")
        model.load_state_dict(checkpoint['model_state_dict'], strict=False)
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        final_epoch = checkpoint['epoch'] + 1
        train_optimisation(gpu, gpus, training_dataloader, test_dataloader, model, loss_fn, optimizer, None, dir_path, final_epoch)
INFO: Train from scratch has started . . .
Epoch 1
-------------------------------
Exception in thread Exception in thread Thread-8:
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\threading.py", line 973, in _bootstrap_inner
Thread-7:
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\threading.py", line 973, in _bootstrap_inner
self.run()
File "C:\ProgramData\Anaconda3\lib\threading.py", line 910, in run
self.run()
File "C:\ProgramData\Anaconda3\lib\threading.py", line 910, in run
self._target(*self._args, **self._kwargs)
File ~\Desktop\Forum\unit.py", line 615, in kmeansAppender
self._target(*self._args, **self._kwargs)
File ~\Desktop\Forum\unit.py", line 615, in kmeansAppender
x, y, z = module(input)
File "C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py", line 1130, in _call_impl
x, y, z = module(input)
File "C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File ~\Desktop\Forum\unit.py", line 148, in forward
labels = np.apply_along_axis(lambda x: x + (i*self.k), axis=0, arr=kmeans.labels_)
AttributeError: 'KMeans' object has no attribute 'labels_'
return forward_call(*input, **kwargs)
File ~\Desktop\Forum\unit.py", line 148, in forward
labels = np.apply_along_axis(lambda x: x + (i*self.k), axis=0, arr=kmeans.labels_)
AttributeError: 'KMeans' object has no attribute 'labels_'
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Input In [1], in <cell line: 720>()
734 #change state
735 if args.train_state == True:
--> 736 train(args.local_rank, gpus, world_size)
737 else:
738 infer(args.local_rank, gpus, world_size, args.checkpoint, args.data)
Input In [1], in train(gpu, gpus, world_size)
672 if gpu == 0:
673 print("\nINFO: Train from scratch has started . . .")
--> 674 train_optimisation(gpu, gpus, training_dataloader, test_dataloader, model, loss_fn, optimizer, None, dir_path, 0)
675 else:
676 checkpoint_path = "stackgraphConvPool3DPnet/" + args.resume
Input In [1], in train_optimisation(gpu, gpus, training_dataloader, test_dataloader, model, loss_fn, optimizer, scheduler, dir_path, initial_epoch)
454 print(f"\n\nEpoch {i + 1}\n-------------------------------")
456 # TRAIN
--> 457 losses, training_accuracy, train_batch_result, train_conf_mat = training_loop(gpu, training_dataloader, model, loss_fn, optimizer)
458 average_loss = torch.mean(losses)
459 torch.distributed.reduce(average_loss, 0, torch.distributed.ReduceOp.SUM)
Input In [1], in training_loop(gpu, training_dataloader, model, loss_fn, optimizer)
249 y = batch.y.cuda(non_blocking=True).flatten() #size (batch_size) --> torch.Size([8])
251 # Compute predictions
--> 252 pred = model(None, X) #size (batch_size,classes) --> torch.Size([8, 10])
254 if overall_classes_loss:
255 # weighted CE Loss over all classes
256 loss = loss_fn(pred, y)
File C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\parallel\distributed.py:1008, in DistributedDataParallel.forward(self, *inputs, **kwargs)
1004 if self._join_config.enable:
1005 # Notify joined ranks whether they should sync in backwards pass or not.
1006 self._check_global_requires_backward_grad_sync(is_joined_rank=False)
-> 1008 output = self._run_ddp_forward(*inputs, **kwargs)
1010 # sync params according to location (before/after forward) user
1011 # specified as part of hook, if hook was specified.
1012 if self._check_sync_bufs_post_fwd():
File C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\parallel\distributed.py:969, in DistributedDataParallel._run_ddp_forward(self, *inputs, **kwargs)
962 if self.device_ids:
963 inputs, kwargs = _to_kwargs(
964 inputs,
965 kwargs,
966 self.device_ids[0],
967 self.use_side_stream_for_tensor_copies
968 )
--> 969 return module_to_run(*inputs[0], **kwargs[0])
970 else:
971 return module_to_run(*inputs, **kwargs)
File C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File ~\Desktop\Forum\unit.py:657, in Classifier.forward(self, x, pos)
655 feature_matrix_batch = pos.unsqueeze(0)
656 # feature_matrix_batch size = (1,N,I,D) where N=batch number, I=members, D=member dimensionality
--> 657 output = self.neuralNet(feature_matrix_batch)
658 # output size = (S,N,D) where S= stack size, N=batch number, D'=member dimensionality
659 output = torch.mean(output, dim=0)
File C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\container.py:139, in Sequential.forward(self, input)
137 def forward(self, input):
138 for module in self:
--> 139 input = module(input)
140 return input
File C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File ~\Desktop\Forum\unit.py:448, in ShrinkingUnitStack.forward(self, feature_matrix_batch)
446 feature_matrix_batch = self.selfCorrStack(feature_matrix_batch)
447 # feature_matrix_batch size = (S',N,I,D) where S'=stack_size, N=batch number, I=members, D=member dimensionality
--> 448 feature_matrix_batch_, conv_feature_matrix_batch, cluster_index = self.kmeansConvStack(feature_matrix_batch)
449 feature_matrix_batch = self.localAdaptFeaAggreStack(feature_matrix_batch, conv_feature_matrix_batch)
450 output = self.graphMaxPoolStack(feature_matrix_batch, cluster_index)
File C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File ~\Desktop\Forum\unit.py:519, in KMeansConvStack.forward(self, feature_matrix_batch)
517 def forward(self, feature_matrix_batch: torch.Tensor):
518 # feature_matrix_batch size = (S,N,I,D) where S=stack size, N=batch number, I=members, D=member dimensionality
--> 519 feature_matrix_batch, conv_feature_matrix_batch, cluster_index = kmeansConvThreader(self.kmeansConvStack,
520 feature_matrix_batch)
521 # feature_matrix_batch size = (S,N,I,D) where where S=stack_size, N=batch number, I=members, D=member dimensionality
522 # conv_feature_matrix_batch size = (S,N,I,D) where where S=stack_size, N=batch number, I=members, D=member dimensionality
523 # cluster_index size = (S,M) where S=stack_size, M=N*I
524 return feature_matrix_batch, conv_feature_matrix_batch, cluster_index
File ~\Desktop\Forum\unit.py:611, in kmeansConvThreader(modules, input_tensor)
609 list2_append = list(map(lambda x: x[1], list2_append))
610 list3_append = list(map(lambda x: x[1], list3_append))
--> 611 return torch.stack(list1_append), torch.stack(list2_append), torch.stack(list3_append)
RuntimeError: stack expects a non-empty TensorList
# Excerpt of a forward method (indentation lost, body incomplete in this paste).
def forward(self, feature_matrix_batch):
# feature_matrix_batch size = (N,I,D) where N=batch number, I=members, D=member dimensionality
N, I, D = feature_matrix_batch.size()
clusters = []
for i, feature_matrix in enumerate(feature_matrix_batch):
kmeans = KMeans(n_clusters=self.k, init=self.kmeansInit, n_init=self.n_init)
# NOTE(review): no kmeans.fit(...) call is visible between creating the estimator and
# reading kmeans.labels_ below; scikit-learn only sets labels_ during fit, which would
# match the AttributeError in the traceback - confirm against the full source
# offsets the cluster labels of batch element i by i*self.k
labels = np.apply_along_axis(lambda x: x + (i*self.k), axis=0, arr=kmeans.labels_)
def kmeansConvThreader(modules, input_tensor):
    """Run ``modules[i]`` on ``input_tensor[i]`` in parallel threads and stack the results.

    One thread per slice of ``input_tensor`` calls ``kmeansAppender``, which is
    expected to append one entry to each of the three result lists
    (presumably ``(index, tensor)`` pairs - the index is passed in as the last
    argument and element ``[1]`` of every entry is stacked).

    Args:
        modules: Indexable collection of callables, one per slice.
        input_tensor: Iterable of per-module inputs (iterated along dim 0).

    Returns:
        Three stacked tensors, ordered by slice index.

    Raises:
        RuntimeError: If at least one worker thread produced no result, or
            (from ``torch.stack``) if there was nothing to stack at all.
    """
    list1_append = []
    list2_append = []
    list3_append = []
    threads = [
        Thread(target=kmeansAppender, args=(modules[i], t, list1_append, list2_append, list3_append, i))
        for i, t in enumerate(input_tensor)
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    # An exception raised inside a worker only terminates that thread, so a failed
    # worker shows up here as a missing entry. Report that explicitly instead of
    # letting torch.stack fail on an empty list or silently return a short stack.
    expected = len(threads)
    for results in (list1_append, list2_append, list3_append):
        if len(results) != expected:
            raise RuntimeError(
                f"kmeansConvThreader: {expected - len(results)} of {expected} worker thread(s) "
                "produced no result; the original exception was raised inside the worker "
                "thread (see its traceback on stderr)"
            )

    def by_index(entry):
        # order by the slice index only, never by comparing the tensors
        return entry[0]

    stacked = []
    for results in (list1_append, list2_append, list3_append):
        results.sort(key=by_index)
        stacked.append(torch.stack([entry[1] for entry in results]))
    return stacked[0], stacked[1], stacked[2]
AttributeError: 'KMeans' object has no attribute 'labels_'
RuntimeError: stack expects a non-empty TensorList
Thanks for your help
Edit: I figured that leaving out the early stopping solves the problem, but that is not an option for me. Is there a way to include early stopping in a grid search together with multiprocessing?
I am trying to run GridSearchCV in a Jupyter Notebook for multiple datasets, one after another, using multiprocessing for each grid search. This works fine for the first grid search, but after that I keep getting errors for all other grid searches. I even get errors when I re-run the same cell that worked initially.
import numpy as np
from sklearn.model_selection import GridSearchCV
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
def create_model(n_weights=1000, hidden_layers=1, learning_rate=0.001):
    """Build and compile a dense regression network with roughly ``n_weights`` weights.

    The network has one input feature, ``hidden_layers`` ReLU hidden layers and a
    single linear output; it is compiled with Adam and mean squared error.

    Args:
        n_weights: Target number of trainable weights used to size the layers.
        hidden_layers: Number of hidden layers (1, 2 or 3).
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        The compiled Keras ``Sequential`` model.

    Raises:
        ValueError: If ``hidden_layers`` is not 1, 2 or 3.
    """
    # local import: ceil/floor are used below but not imported alongside numpy/keras
    from math import ceil, floor

    # formulas derived from nWeights = sum (d(l-1)+1)*d(l) for all layers l with output dim d(l)
    if hidden_layers == 1:  # 100% of neurons in first hidden layer
        neurons = [ceil((n_weights - 1) / 3)]
    elif hidden_layers == 2:  # 70% / 30% split of neurons
        x = 1/7 * (np.sqrt(21 * n_weights + 79) - 10)
        neurons = list(map(floor, [7/3 * x, x]))
    elif hidden_layers == 3:  # 50% / 30% / 20% split
        x = 1/21 * (np.sqrt(84 * n_weights + 205) - 17)
        neurons = list(map(floor, [5/2 * x, 3/2 * x, x]))
    else:
        raise ValueError('Only 1, 2 or 3 layers allowed')

    # small weight budgets can round a layer down to 0 units (e.g. n_weights=10 with
    # 3 layers); keep at least one unit so that every layer stays valid
    neurons = [max(1, int(width)) for width in neurons]

    model = Sequential([Dense(neurons[0], activation='relu', input_dim=1)])
    for n in neurons[1:]:
        model.add(Dense(n, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), loss='mse')
    return model
# hyper-parameter grid shared by both searches
batch_size = [64, 128]
learning_rate = [0.01, 0.001]
hidden_layers = [1, 2, 3]
n_weights = [10, 30, 60]
p_grid = dict(n_weights=n_weights, hidden_layers=hidden_layers, batch_size=batch_size, learning_rate=learning_rate)
epochs = 5000


def make_early_stop():
    """Return a new, unused EarlyStopping callback.

    A callback instance that already took part in a fit keeps a reference to the
    Keras model it was attached to. With n_jobs=-1 the fit parameters are pickled
    for the worker processes, and that reference is presumably what fails with
    "cannot pickle '_thread.RLock' object" on the next search. Creating one
    callback per search avoids sending a used instance to the workers.
    """
    return keras.callbacks.EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)


# first grid search
model1 = KerasRegressor(create_model, epochs=epochs, verbose=0)
# pass the estimator created for this search (the original referenced an undefined `model`)
grid1 = GridSearchCV(estimator=model1, param_grid=p_grid, n_jobs=-1, cv=4, verbose=1)
earlyStop = make_early_stop()
result1 = grid1.fit(X_train1, y_train1, callbacks=[earlyStop])
# second grid search
model2 = KerasRegressor(create_model, epochs=epochs, verbose=0)
grid2 = GridSearchCV(estimator=model2, param_grid=p_grid, n_jobs=-1, cv=4, verbose=1)
earlyStop = make_early_stop()  # fresh instance, see make_early_stop
# NOTE(review): the same X_train1 / y_train1 are used as in the first search - confirm
# whether the second data set was meant here
result2 = grid2.fit(X_train1, y_train1, callbacks=[earlyStop])
The above code runs two identical grid searches on two different data sets. The first one works fine, but the second fails in this line
grid2 = GridSearchCV(estimator=model, param_grid=p_grid, n_jobs=-1, cv=4, verbose=1)
with the error
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\oli-w\anaconda3\envs\tensorflow\lib\site-packages\joblib\externals\loky\backend\queues.py", line 153, in _feed
obj_ = dumps(obj, reducers=reducers)
File "C:\Users\oli-w\anaconda3\envs\tensorflow\lib\site-packages\joblib\externals\loky\backend\reduction.py", line 271, in dumps
dump(obj, buf, reducers=reducers, protocol=protocol)
File "C:\Users\oli-w\anaconda3\envs\tensorflow\lib\site-packages\joblib\externals\loky\backend\reduction.py", line 264, in dump
_LokyPickler(file, reducers=reducers, protocol=protocol).dump(obj)
File "C:\Users\oli-w\anaconda3\envs\tensorflow\lib\site-packages\joblib\externals\cloudpickle\cloudpickle_fast.py", line 602, in dump
return Pickler.dump(self, obj)
TypeError: cannot pickle '_thread.RLock' object
"""
The above exception was the direct cause of the following exception:
PicklingError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_9700/1799096090.py in <module>
1 # second grid search
2 model2 = KerasRegressor(create_model, epochs=epochs, verbose=0)
----> 3 grid2 = GridSearchCV(estimator=model, param_grid=p_grid, n_jobs=-1, cv=4, verbose=1)
4 result2 = grid2.fit(X_train1, y_train1, callbacks=[earlyStop])
~\anaconda3\envs\tensorflow\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
889 return results
890
--> 891 self._run_search(evaluate_candidates)
892
893 # multimetric is determined here because in the case of a callable
~\anaconda3\envs\tensorflow\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1390 def _run_search(self, evaluate_candidates):
1391 """Search all candidates in param_grid"""
-> 1392 evaluate_candidates(ParameterGrid(self.param_grid))
1393
1394
~\anaconda3\envs\tensorflow\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
836 )
837
--> 838 out = parallel(
839 delayed(_fit_and_score)(
840 clone(base_estimator),
~\anaconda3\envs\tensorflow\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1054
1055 with self._backend.retrieval_context():
-> 1056 self.retrieve()
1057 # Make sure that we get a last message telling us we are done
1058 elapsed_time = time.time() - self._start_time
~\anaconda3\envs\tensorflow\lib\site-packages\joblib\parallel.py in retrieve(self)
933 try:
934 if getattr(self._backend, 'supports_timeout', False):
--> 935 self._output.extend(job.get(timeout=self.timeout))
936 else:
937 self._output.extend(job.get())
~\anaconda3\envs\tensorflow\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
~\anaconda3\envs\tensorflow\lib\concurrent\futures\_base.py in result(self, timeout)
437 raise CancelledError()
438 elif self._state == FINISHED:
--> 439 return self.__get_result()
440 else:
441 raise TimeoutError()
~\anaconda3\envs\tensorflow\lib\concurrent\futures\_base.py in __get_result(self)
386 def __get_result(self):
387 if self._exception:
--> 388 raise self._exception
389 else:
390 return self._result
PicklingError: Could not pickle the task to send it to the workers.
I would understand if the grid search didn't work at all, but it confuses me that it works once and then starts throwing errors. I was only able to run another grid search after I restarted the kernel. What can I do in order to make multiple grid searches possible in one sitting without restarting the kernel?
I have a trained LSTM-AE, of which the architecture is as follows:
In brief, I have an LSTM-AE of depth 3; the numbers of cells in the LSTM layers on the encoder side are [120, 80, 50] (and symmetric for the decoder). I built the model using the code shown on this page. For information: because I want to train the LSTM-AE directly on variable-length time series, I didn't specify the number of time steps in the input layer, which means the model is trained on batches of size 1 (one time series per batch).
I can extract the encoder just fine, but I cannot do the same for the decoder :-(... My goal is to check, given a vector of 50 features (which are extracted by the encoder), whether the decoder can reconstruct the input series.
Here's my attempt so far:
# Restore the trained autoencoder from disk.
model = load_model(path_to_model)

# The decoder is made of the last five layers of the autoencoder; pick them by
# their position counted from the end of the layer list.
rep_vec = model.layers[-5]
dec_3 = model.layers[-4]
dec_2 = model.layers[-3]
dec_1 = model.layers[-2]
time_dist = model.layers[-1]

# Feed a new input through those layers, innermost first.
in_layer = Input(shape=(None, 50))
out_layer = in_layer
for decoder_layer in (rep_vec, dec_3, dec_2, dec_1, time_dist):
    out_layer = decoder_layer(out_layer)
decoder = Model(in_layer, out_layer, name='decoder')

res = decoder(input_feature) # input_feature has shape (50,)
I obtained this error:
InvalidArgumentError: slice index 1 of dimension 0 out of bounds. [Op:StridedSlice] name: decoder/repeat/strided_slice/
If you are interested in the full error log...
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
Input In [86], in <module>
13 out_layer = time_dist(dec_1(dec_2(dec_3(rep_vec(in_layer)))))
14 decoder = Model(in_layer, out_layer, name='decoder')
---> 15 res = decoder(input_feature)
File ~/venv/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:1030, in Layer.__call__(self, *args, **kwargs)
1026 inputs = self._maybe_cast_inputs(inputs, input_list)
1028 with autocast_variable.enable_auto_cast_variables(
1029 self._compute_dtype_object):
-> 1030 outputs = call_fn(inputs, *args, **kwargs)
1032 if self._activity_regularizer:
1033 self._handle_activity_regularization(inputs, outputs)
File ~/venv/lib/python3.8/site-packages/tensorflow/python/keras/engine/functional.py:420, in Functional.call(self, inputs, training, mask)
401 #doc_controls.do_not_doc_inheritable
402 def call(self, inputs, training=None, mask=None):
403 """Calls the model on new inputs.
404
405 In this case `call` just reapplies
(...)
418 a list of tensors if there are more than one outputs.
419 """
--> 420 return self._run_internal_graph(
421 inputs, training=training, mask=mask)
File ~/venv/lib/python3.8/site-packages/tensorflow/python/keras/engine/functional.py:556, in Functional._run_internal_graph(self, inputs, training, mask)
553 continue # Node is not computable, try skipping.
555 args, kwargs = node.map_arguments(tensor_dict)
--> 556 outputs = node.layer(*args, **kwargs)
558 # Update tensor_dict.
559 for x_id, y in zip(node.flat_output_ids, nest.flatten(outputs)):
File ~/venv/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:1030, in Layer.__call__(self, *args, **kwargs)
1026 inputs = self._maybe_cast_inputs(inputs, input_list)
1028 with autocast_variable.enable_auto_cast_variables(
1029 self._compute_dtype_object):
-> 1030 outputs = call_fn(inputs, *args, **kwargs)
1032 if self._activity_regularizer:
1033 self._handle_activity_regularization(inputs, outputs)
File ~/venv/lib/python3.8/site-packages/tensorflow/python/keras/layers/core.py:919, in Lambda.call(self, inputs, mask, training)
915 return var
917 with backprop.GradientTape(watch_accessed_variables=True) as tape,\
918 variable_scope.variable_creator_scope(_variable_creator):
--> 919 result = self.function(inputs, **kwargs)
920 self._check_variables(created_variables, tape.watched_variables())
921 return result
File D:/PhD/Code/feature_learning/train_models/train_lstmae.py:30, in repeat_vector(args)
File ~/venv/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:206, in add_dispatch_support.<locals>.wrapper(*args, **kwargs)
204 """Call target, and fall back on dispatchers if there is a TypeError."""
205 try:
--> 206 return target(*args, **kwargs)
207 except (TypeError, ValueError):
208 # Note: convert_to_eager_tensor currently raises a ValueError, not a
209 # TypeError, when given unexpected types. So we need to catch both.
210 result = dispatch(wrapper, args, kwargs)
File ~/venv/lib/python3.8/site-packages/tensorflow/python/ops/array_ops.py:1040, in _slice_helper(tensor, slice_spec, var)
1038 var_empty = constant([], dtype=dtypes.int32)
1039 packed_begin = packed_end = packed_strides = var_empty
-> 1040 return strided_slice(
1041 tensor,
1042 packed_begin,
1043 packed_end,
1044 packed_strides,
1045 begin_mask=begin_mask,
1046 end_mask=end_mask,
1047 shrink_axis_mask=shrink_axis_mask,
1048 new_axis_mask=new_axis_mask,
1049 ellipsis_mask=ellipsis_mask,
1050 var=var,
1051 name=name)
File ~/venv/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:206, in add_dispatch_support.<locals>.wrapper(*args, **kwargs)
204 """Call target, and fall back on dispatchers if there is a TypeError."""
205 try:
--> 206 return target(*args, **kwargs)
207 except (TypeError, ValueError):
208 # Note: convert_to_eager_tensor currently raises a ValueError, not a
209 # TypeError, when given unexpected types. So we need to catch both.
210 result = dispatch(wrapper, args, kwargs)
File ~/venv/lib/python3.8/site-packages/tensorflow/python/ops/array_ops.py:1213, in strided_slice(input_, begin, end, strides, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask, var, name)
1210 if strides is None:
1211 strides = ones_like(begin)
-> 1213 op = gen_array_ops.strided_slice(
1214 input=input_,
1215 begin=begin,
1216 end=end,
1217 strides=strides,
1218 name=name,
1219 begin_mask=begin_mask,
1220 end_mask=end_mask,
1221 ellipsis_mask=ellipsis_mask,
1222 new_axis_mask=new_axis_mask,
1223 shrink_axis_mask=shrink_axis_mask)
1225 parent_name = name
1227 if var is not None:
File ~/venv/lib/python3.8/site-packages/tensorflow/python/ops/gen_array_ops.py:10505, in strided_slice(input, begin, end, strides, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask, name)
10503 return _result
10504 except _core._NotOkStatusException as e:
> 10505 _ops.raise_from_not_ok_status(e, name)
10506 except _core._FallbackException:
10507 pass
File ~/venv/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:6897, in raise_from_not_ok_status(e, name)
6895 message = e.message + (" name: " + name if name is not None else "")
6896 # pylint: disable=protected-access
-> 6897 six.raise_from(core._status_to_exception(e.code, message), None)
File <string>:3, in raise_from(value, from_value)
InvalidArgumentError: slice index 1 of dimension 0 out of bounds. [Op:StridedSlice] name: decoder/repeat/strided_slice/
I appreciate very much any advice you would give me!
Edit
Here is the code I used to build the model:
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.initializers import GlorotUniform
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.backend import shape
def repeat_vector(args):
    """Repeat a vector once per time step of a reference sequence.

    ``args`` is indexed positionally: ``args[0]`` is the tensor to repeat and
    ``args[1]`` is the tensor whose second axis supplies the repeat count, so
    the number of repetitions is resolved dynamically from the input series.
    """
    layer_to_repeat, sequence_layer = args[0], args[1]
    n_steps = shape(sequence_layer)[1]
    return RepeatVector(n_steps)(layer_to_repeat)
# Hyper-parameters of the LSTM autoencoder.
n_atts = 3  # each time step carries 3 measurements
n_units = [120, 80, 50]  # encoder LSTM widths, first to last; the decoder walks them in reverse
n_layers = len(n_units)
init = GlorotUniform(seed=420)
reg = None
optimizer = Adam(learning_rate=0.0001)
activ = 'tanh'
loss_metric = 'mse'

# The time axis is left as None so series of any length can be fed in.
inputs = Input(shape=(None, n_atts), name='input_layer')

# Options shared by every LSTM layer of the model.
lstm_kwargs = dict(kernel_initializer=init, kernel_regularizer=reg, activation=activ)

# Encoder: every LSTM except the last one returns its full output sequence;
# the last one uses return_sequences=False and yields the encoded vector.
encoded = inputs
for i in range(n_layers):
    encoded = LSTM(
        n_units[i],
        name='encoder_{}'.format(i + 1),
        return_sequences=(i != n_layers - 1),
        **lstm_kwargs,
    )(encoded)

# Bridge: repeat the encoded vector along the time axis of the input series.
repeated = Lambda(repeat_vector, output_shape=(None, n_units[-1]), name='repeat')([encoded, inputs])

# Decoder: the same widths in reverse order, all returning sequences.
decoded = repeated
for i in range(n_layers):
    decoded = LSTM(
        n_units[n_layers - 1 - i],
        return_sequences=True,
        name='decoder_{}'.format(i + 1),
        **lstm_kwargs,
    )(decoded)

# Output layer: one Dense(n_atts) applied at every time step.
tdist = TimeDistributed(Dense(n_atts))(decoded)

# Assemble and compile the autoencoder.
model = Model(inputs, tdist, name='lstm-ae')
model.compile(optimizer=optimizer, loss=loss_metric)
For information, I use tensorflow 2.5.
Because the number of units is read from a config file, I wrote the code this way to add the layers programmatically.
I am trying to use a pretrained BERT model for intent classification. Here is my code in a Jupyter notebook.
class DataPreparation:
    """Tokenize a train/test pair of DataFrames and pad the ids for BERT.

    Each row's ``text`` column is tokenized, wrapped in ``[CLS]`` / ``[SEP]``
    and converted to token ids; the ``intent`` column is mapped to its index
    in ``classes``.  After construction ``train_x`` / ``test_x`` hold one
    fixed-length row of ids per example and ``train_y`` / ``test_y`` hold the
    class indices.
    """

    text_column = "text"
    label_column = "intent"

    def __init__(self, train, test, tokenizer: "FullTokenizer", classes, max_seq_len=192):
        """Prepare both splits.

        Args:
            train: DataFrame with ``text`` and ``intent`` columns.
            test: DataFrame with the same columns as ``train``.
            tokenizer: object providing ``tokenize`` and
                ``convert_tokens_to_ids``.
            classes: sequence of labels; a label's position is its class id.
            max_seq_len: upper bound on the padded sequence length.
        """
        self.tokenizer = tokenizer
        # Running maximum of the observed sequence lengths; prepare_data updates it.
        self.max_seq_len = 0
        self.classes = classes
        (self.train_x, self.train_y), (self.test_x, self.test_y) = map(
            self.prepare_data, [train, test]
        )
        print("max seq_len", self.max_seq_len)
        # Never pad beyond the caller's cap, nor beyond the longest sequence seen.
        self.max_seq_len = min(self.max_seq_len, max_seq_len)
        self.train_x, self.test_x = map(self.data_padding, [self.train_x, self.test_x])

    def prepare_data(self, df):
        """Return ``(token_id_lists, class_indices)`` for every row of ``df``.

        Raises ``self.max_seq_len`` to the longest id sequence encountered.
        The id lists generally differ in length, so they are returned as a 1-D
        object array holding one Python list per row; letting NumPy infer a
        shape from ragged lists is an error in NumPy >= 1.24.

        Raises:
            ValueError: if a row's label is not present in ``self.classes``.
        """
        x, y = [], []
        for _, row in tqdm(df.iterrows()):
            text = row[DataPreparation.text_column]
            label = row[DataPreparation.label_column]
            tokens = ["[CLS]"] + self.tokenizer.tokenize(text) + ["[SEP]"]
            token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            self.max_seq_len = max(self.max_seq_len, len(token_ids))
            x.append(token_ids)
            y.append(self.classes.index(label))
        # Fill element by element so each cell keeps its list, even when all
        # rows happen to have the same length.
        ragged = np.empty(len(x), dtype=object)
        for i, token_ids in enumerate(x):
            ragged[i] = token_ids
        return ragged, np.array(y)

    def data_padding(self, ids):
        """Truncate or zero-pad every id sequence to ``self.max_seq_len``.

        ``[CLS]`` / ``[SEP]`` are already part of the ids produced by
        ``prepare_data``, so no extra room is reserved for them.  A sequence
        that is too long is cut before its final id, which is kept (the
        ``[SEP]`` when the sequence comes from ``prepare_data``).

        Args:
            ids: iterable of id sequences (lists or arrays).

        Returns:
            Array with one row of exactly ``self.max_seq_len`` ids per sequence.
        """
        max_len = self.max_seq_len
        x = []
        for input_ids in ids:
            # Accept array rows as well as lists; list ``+`` below must concatenate.
            input_ids = list(input_ids)
            if len(input_ids) > max_len:
                head = input_ids[:max(max_len - 1, 0)]
                tail = input_ids[-1:] if max_len > 0 else []
                input_ids = head + tail
            input_ids = input_ids + [0] * (max_len - len(input_ids))
            x.append(np.array(input_ids))
        return np.array(x)
# Build the tokenizer from the vocab.txt file inside the BERT checkpoint directory.
tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
def model_defination(max_seq_len, bert_ckpt_file):
    """Build a softmax intent classifier on top of a BERT layer.

    The BERT configuration is read from the module-level ``bert_config_file``
    and the number of output units from the module-level ``classes``.  The
    head takes the first position of the BERT output and applies
    dropout -> Dense(768, tanh) -> dropout -> Dense(len(classes), softmax).
    Stock weights are loaded from ``bert_ckpt_file`` after the model is built.

    Args:
        max_seq_len: length of the ``input_ids`` sequences fed to the model.
        bert_ckpt_file: checkpoint passed to ``load_stock_weights``.

    Returns:
        The assembled ``keras.Model``.
    """
    # Only the read needs the open file handle.
    with tf.io.gfile.GFile(bert_config_file, "r") as reader:
        config_json = reader.read()

    stock_config = StockBertConfig.from_json_string(config_json)
    bert_params = map_stock_config_to_params(stock_config)
    bert_params.adapter_size = None
    bert = BertModelLayer.from_params(bert_params, name="bert")

    input_ids = keras.layers.Input(shape=(max_seq_len, ), dtype='int32', name="input_ids")
    bert_output = bert(input_ids)
    print("bert shape", bert_output.shape)

    # Classification head, fed by the first sequence position of the BERT output.
    head = keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
    head = keras.layers.Dropout(0.5)(head)
    head = keras.layers.Dense(units=768, activation="tanh")(head)
    head = keras.layers.Dropout(0.5)(head)
    probabilities = keras.layers.Dense(units=len(classes), activation="softmax")(head)

    model = keras.Model(inputs=input_ids, outputs=probabilities)
    model.build(input_shape=(None, max_seq_len))

    load_stock_weights(bert, bert_ckpt_file)
    return model
# Distinct intent labels of the training set; a label's list position is its class id.
classes = train.intent.unique().tolist()
# Tokenize and pad both splits, capping the sequence length at 128 ids.
data = DataPreparation(train, test, tokenizer, classes, max_seq_len=128)
# Notebook inspection: shape of the padded training ids and the first label index.
data.train_x.shape
data.train_y[0]
# Build the classifier for the sequence length chosen during data preparation.
model = model_defination(data.max_seq_len, bert_ckpt_file)
Now, when I try to call the function, I get an error. The parameter values are max_seq_len = 55 and bert_ckpt_file = the BERT checkpoint file.
When I create the model, I get the error below:
TypeError Traceback (most recent call last)
<ipython-input-17-af3e534b3882> in <module>
----> 1 model = model_defination(data.max_seq_len, bert_ckpt_file)
<ipython-input-16-a83a622dafe3> in model_defination(max_seq_len, bert_ckpt_file)
9 input_ids = keras.layers.Input(shape=(max_seq_len, ), dtype='int32',name="input_ids")
10 #input_spec = tf.keras.layers.InputSpec(ndim=3)
---> 11 bert_output = bert(input_ids)
12
13 print("bert shape", bert_output.shape)
~\Anaconda3\lib\site-packages\keras\engine\base_layer.py in __call__(self, *args, **kwargs)
974 # >> model = tf.keras.Model(inputs, outputs)
975 if _in_functional_construction_mode(self, inputs, args, kwargs, input_list):
--> 976 return self._functional_construction_call(inputs, args, kwargs,
977 input_list)
978
~\Anaconda3\lib\site-packages\keras\engine\base_layer.py in _functional_construction_call(self, inputs, args, kwargs, input_list)
1112 layer=self, inputs=inputs, build_graph=True, training=training_value):
1113 # Check input assumptions set after layer building, e.g. input shape.
-> 1114 outputs = self._keras_tensor_symbolic_call(
1115 inputs, input_masks, args, kwargs)
1116
~\Anaconda3\lib\site-packages\keras\engine\base_layer.py in _keras_tensor_symbolic_call(self, inputs, input_masks, args, kwargs)
846 return tf.nest.map_structure(keras_tensor.KerasTensor, output_signature)
847 else:
--> 848 return self._infer_output_signature(inputs, args, kwargs, input_masks)
849
850 def _infer_output_signature(self, inputs, args, kwargs, input_masks):
~\Anaconda3\lib\site-packages\keras\engine\base_layer.py in _infer_output_signature(self, inputs, args, kwargs, input_masks)
886 self._maybe_build(inputs)
887 inputs = self._maybe_cast_inputs(inputs)
--> 888 outputs = call_fn(inputs, *args, **kwargs)
889
890 self._handle_activity_regularization(inputs, outputs)
~\Anaconda3\lib\site-packages\tensorflow\python\autograph\impl\api.py in wrapper(*args, **kwargs)
693 except Exception as e: # pylint:disable=broad-except
694 if hasattr(e, 'ag_error_metadata'):
--> 695 raise e.ag_error_metadata.to_exception(e)
696 else:
697 raise
TypeError: in user code:
C:\Users\kamrul.moin\Anaconda3\lib\site-packages\bert\model.py:80 call *
output = self.encoders_layer(embedding_output, mask=mask, training=training)
C:\Users\kamrul.moin\Anaconda3\lib\site-packages\keras\engine\base_layer.py:1030 __call__ **
self._maybe_build(inputs)
C:\Users\kamrul.moin\Anaconda3\lib\site-packages\keras\engine\base_layer.py:2659 _maybe_build
self.build(input_shapes) # pylint:disable=not-callable
C:\Users\kamrul.moin\Anaconda3\lib\site-packages\bert\transformer.py:209 build
self.input_spec = keras.layers.InputSpec(shape=input_shape)
C:\Users\kamrul.moin\Anaconda3\lib\site-packages\keras\engine\base_layer.py:2777 __setattr__
super(tf.__internal__.tracking.AutoTrackable, self).__setattr__(name, value) # pylint: disable=bad-super-call
C:\Users\kamrul.moin\Anaconda3\lib\site-packages\tensorflow\python\training\tracking\base.py:530 _method_wrapper
result = method(self, *args, **kwargs)
C:\Users\kamrul.moin\Anaconda3\lib\site-packages\keras\engine\base_layer.py:1296 input_spec
raise TypeError('Layer input_spec must be an instance of InputSpec. '
TypeError: Layer input_spec must be an instance of InputSpec. Got: InputSpec(shape=(None, 55, 768), ndim=3)
I have solved the error. It was due to the shape of my training data: the index had been added as a column in the training data. After resetting the index of the training data, it works correctly.
The code below solved the error:
# Give both frames a fresh default index; drop=True discards the old index
# instead of inserting it as a new column.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
I am trying to write a seq2seq decoder with the TensorFlow tf.contrib.seq2seq package.
I am wondering whether my code is correct and whether there is a better way to write it. The documentation is not easy to read.
Alternatively, my question could be: how can I easily debug this kind of code? How can I inspect intermediate results in TensorFlow?
# GRU-based decoder that wraps its cell with Luong attention over the encoder
# outputs and projects the decoded sequence through a learned matrix.
class Decoder:
# Stores the embedding matrix and sizes, creates the GRU cell and the output
# projection of shape (hidden_size, cn_total_words); cn_total_words is read
# from the enclosing module.
# NOTE(review): max_length is accepted but never stored or used in this class.
def __init__(self, embedding, hidden_size, num_layers=1, max_length=15):
self.embedding = embedding
self.hidden_size = hidden_size
self.num_layers = num_layers
self.cell = tf.nn.rnn_cell.GRUCell(hidden_size)
self.linear = tf.Variable(tf.random_normal(shape=(self.hidden_size, cn_total_words))*0.1)
# Runs the attention decoder on `inputs` (token ids looked up in the embedding)
# and returns (projected outputs, state); `state` is returned exactly as passed in.
def __call__(self, inputs, state, encoder_outputs, encoder_state, decoder_length, mode="train"):
with tf.variable_scope("decoder") as scope:
inputs = tf.nn.embedding_lookup(self.embedding, inputs)
# NOTE(review): the tiled encoder_state is not referenced again in this method.
encoder_state = tf.tile(tf.expand_dims(encoder_state, 1), (1, tf.shape(inputs)[1], 1))
attention_mechanism = tf.contrib.seq2seq.LuongAttention(self.hidden_size, encoder_outputs)
attn_cell = tf.contrib.seq2seq.AttentionWrapper(self.cell, attention_mechanism, self.hidden_size)
# Choose how decoder inputs are fed: the given inputs when training, greedy
# feedback of the previous prediction when inferring.
# NOTE(review): any other value of `mode` leaves `helper` unassigned.
if mode == "train":
helper = tf.contrib.seq2seq.TrainingHelper(inputs=inputs, sequence_length=decoder_length)
elif mode == "infer":
helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding=self.embedding,
start_tokens=tf.tile([en_dict["BOS"]], [tf.shape(inputs)[0]]), end_token=en_dict["EOS"])
# The decoder starts from the attention cell's zero state, not from `state`.
decoder = tf.contrib.seq2seq.BasicDecoder(cell=attn_cell, helper=helper,
initial_state=attn_cell.zero_state(tf.shape(inputs)[0], tf.float32))
outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=decoder)
# NOTE(review): the traceback below reports mixed float32/int32 tensors at this
# concat. `outputs` is presumably a BasicDecoderOutput(rnn_output, sample_id),
# so iterating it yields both fields; `outputs.rnn_output` is probably what
# was intended - confirm against the tf.contrib.seq2seq documentation.
outputs = tf.concat([tf.expand_dims(out, 1) for out in outputs], 1)
# Project the last axis of the outputs through self.linear.
outputs = tf.tensordot(outputs, self.linear, axes=[[2], [0]])
return outputs, state
I got the following error when running the code
--------------------------------------------------------------------------- ValueError Traceback (most recent call
last)
~/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py
in apply_op(self, op_type_name, name, **keywords)
434 preferred_dtype=default_dtype,
--> 435 as_ref=input_arg.is_ref)
436 if input_arg.number_attr and len(
~/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/ops.py
in internal_convert_n_to_tensor(values, dtype, name, as_ref,
preferred_dtype)
736 as_ref=as_ref,
--> 737 preferred_dtype=preferred_dtype))
738 return ret
~/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/ops.py
in internal_convert_to_tensor(value, dtype, name, as_ref,
preferred_dtype)
675 if ret is None:
--> 676 ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
677
~/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/ops.py
in _TensorTensorConversionFunction(t, dtype, name, as_ref)
548 "Tensor conversion requested dtype %s for Tensor with dtype %s: %r"
--> 549 % (dtype.name, t.dtype.name, str(t)))
550 return t
ValueError: Tensor conversion requested dtype float32 for Tensor with
dtype int32: 'Tensor("seq2seq-train/decoder/ExpandDims_2:0", shape=(?,
1, ?), dtype=int32)'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call
last) in ()
4 emb_en = np.random.uniform(low=-0.1, high=0.1, size=(en_total_words, hidden_size))
5 emb_cn = np.random.uniform(low=-0.1, high=0.1, size=(cn_total_words, hidden_size))
----> 6 model = Seq2Seq(hidden_size, num_layers, emb_en, emb_cn)
7 sess = tf.Session()
8 init = tf.global_variables_initializer()
in init(self, hidden_size,
num_layers, embed_words_en, embed_words_cn)
81 encoder_outputs, encoder_state = self.encoder(self.encoder_inputs, self.encoder_length)
82 decoder_length = tf.cast(tf.reduce_sum(self.decoder_mask, 1), tf.int32)
---> 83 decoder_outputs, decoder_state = self.decoder(self.decoder_inputs, encoder_state, encoder_outputs,
encoder_state, decoder_length)
84
85 # decoder_outputs.append(decoder_out)
in call(self, inputs, state,
encoder_outputs, encoder_state, decoder_length, mode)
50
51 outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=decoder)
---> 52 outputs = tf.concat([tf.expand_dims(out, 1) for out in outputs], 1)
53
54 outputs = tf.tensordot(outputs, self.linear, axes=[[2], [0]])
~/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py
in concat(values, axis, name) 1064 return
gen_array_ops._concat_v2(values=values, 1065
axis=axis,
-> 1066 name=name) 1067 1068
~/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py
in _concat_v2(values, axis, name)
491 """
492 result = _op_def_lib.apply_op("ConcatV2", values=values, axis=axis,
--> 493 name=name)
494 return result
495
~/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py
in apply_op(self, op_type_name, name, **keywords)
461 (prefix, dtype.name))
462 else:
--> 463 raise TypeError("%s that don't all match." % prefix)
464 else:
465 raise TypeError("%s that are invalid." % prefix)
TypeError: Tensors in list passed to 'values' of 'ConcatV2' Op have
types [float32, int32] that don't all match.