Tensorflow: How to use a generator for fit() which runs in parallel with multiple processes - python-3.x

I am trying to train a model on a data set which does not fit in my RAM.
Therefore I am using a data generator which inherits from tensorflow.keras.utils.Sequence as shown below.
This is working. However because I am doing processing on the images my training is CPU bound. When looking in GPU-Z my GPU is only at 10-20% but one of my CPU Cores is at its max.
To solve this I am trying to run the generator in parallel on all my 16 cores. However when I set use_multiprocessing=True in the fit() function the program freezes. And using workers=8 does not speed up the process just produces batches in uneven intervals.
ex.:
batch 1-8 is processed immediately than there is some delay and than batch 9-16 is processed.
The code below shows what I am trying to do.
#read the dataset
x, o_y = reader.read_dataset_whole(ETLCharacterGroups.kanji)
#split data into 90/10 percent parts
percentage = round(len(x) / 100 * 80)
x_train = x[:percentage]
x_test = x[percentage:]
y_train = o_y[:percentage]
y_test = o_y[percentage:]
def distort_sample(img : Image) -> (Image, [int], [int]):
"""
Distort the given image randomly.
Randomly applies the transformations:
- rotation
- shear
- scale
- translate
- sharpen
- blur
Returns the distorted image.
"""
offset, scale = (0, 0), (64, 64)
t = random.choice(["sine"]) # "rotate", "shear", "scale",
f = random.choice(["blur", "sharpen", "smooth"])
# randomly apply transformations...
# rotate image
if("rotate" in t):
img = img.rotate(random.uniform(-30, 30))
# shear image
if("shear" in t):
y_shear = random.uniform(-0.2, 0.2)
x_shear = random.uniform(-0.2, 0.2)
img = img.transform(img.size, PImage.AFFINE, (1, x_shear, 0, y_shear, 1, 0))
# scale and translate image
if("scale" in t):
#scale the image
size_x = random.randrange(20, 63)
size_y = random.randrange(20, 63)
scale = (size_x, size_y)
offset = (math.ceil((64 - size_x) / 2), math.ceil((64 - size_y) / 2))
img = img.resize(scale)
# put it again on a black background (translated)
background = PImage.new('L', (64, 64))
trans_x = random.randrange(0, math.floor((64 - size_x)))
trans_y = random.randrange(0, math.floor((64 - size_y)))
offset = (trans_x, trans_y)
background.paste(img, offset)
img = background
if("sine" in t):
t_img = np.array(img)
A = t_img.shape[0] / 3.0
w = 2.0 / t_img.shape[1]
shift = lambda x: random.uniform(0.15, 0.2) * A * np.sin(-2*np.pi*x * w)
for i in range(t_img.shape[0]):
t_img[:,i] = np.roll(t_img[:,i], int(shift(i)))
img = PImage.fromarray(t_img)
# blur
if("blur" in f):
img = img.filter(ImageFilter.GaussianBlur(radius=random.uniform(0.5, 1.2)))
# sharpen
if("sharpen" in f):
img = img.filter(ImageFilter.SHARPEN)
# smooth
if("smooth" in f):
img = img.filter(ImageFilter.SMOOTH)
return img, offset, scale
class DataGenerator(tf.keras.utils.Sequence):
def __init__(self, x_col, y_col, batch_size, mode="training", shuffle=True):
self.batch_size = batch_size
self.undistorted_images = batch_size // 2
self.shuffle = shuffle
self.indices = len(x_col)
self.x_col = x_col
self.y_col = y_col
def __len__(self):
return self.indices // self.batch_size
def on_epoch_end(self):
if(False):
rng_state = np.random.get_state()
np.random.shuffle(x)
np.random.set_state(rng_state)
np.random.shuffle(o_y)
def __getitem__(self, index):
X, Y = [], []
for i in range(index * self.undistorted_images, (index+1) * self.undistorted_images):
base_img = self.x_col[i]
img = PImage.fromarray(np.uint8(base_img.reshape(64, 64) * 255))
# distort_sample() creates random variations of an image
img, *unused = distort_sample(img)
# add transformed image
X.append(np.array(img).reshape(64, 64, 1))
Y.append(self.y_col[i])
# add base image
X.append(base_img)
Y.append(self.y_col[i])
return np.array(X), np.array(Y)
#instantiate generators
training_generator = DataGenerator(x_col = x_train, y_col = y_train, batch_size = 256)
validation_generator = DataGenerator(x_col = x_test, y_col = y_test, batch_size = 256)
#train the model
hist = model.fit(
x=training_generator,
epochs=100,
validation_data=training_generator,
max_queue_size=50,
workers=8,
#use_multiprocessing=True <- this freezes the program
)

In the end I needed to make the Data generator use multi processing. To do this, the arrays needed to be stored in shared memory and than used in the sub processes.
import multiprocessing as mp
import numpy as np
from PIL import Image as PImage
from PIL import ImageFilter
import random
import math
import tensorflow as tf
shared_dict = {}
def distort_sample(img : PImage) -> (PImage, [int], [int]):
"""
Distort the given image randomly.
Randomly applies the transformations:
rotation, shear, scale, translate,
Randomly applies the filter:
sharpen, blur, smooth
Returns the distorted image.
"""
offset, scale = (0, 0), (64, 64)
t = random.choice(["sine", "rotate", "shear", "scale"])
f = random.choice(["blur", "sharpen", "smooth"])
# randomly apply transformations...
# rotate image
if("rotate" in t):
img = img.rotate(random.uniform(-15, 15))
# shear image
if("shear" in t):
y_shear = random.uniform(-0.2, 0.2)
x_shear = random.uniform(-0.2, 0.2)
img = img.transform(img.size, PImage.AFFINE, (1, x_shear, 0, y_shear, 1, 0))
# scale and translate image
if("scale" in t):
#scale the image
size_x = random.randrange(25, 63)
size_y = random.randrange(25, 63)
scale = (size_x, size_y)
offset = (math.ceil((64 - size_x) / 2), math.ceil((64 - size_y) / 2))
img = img.resize(scale)
# put it again on a black background (translated)
background = PImage.new('L', (64, 64))
trans_x = random.randrange(0, math.floor((64 - size_x)))
trans_y = random.randrange(0, math.floor((64 - size_y)))
offset = (trans_x, trans_y)
background.paste(img, offset)
img = background
if("sine" in t):
t_img = np.array(img)
A = t_img.shape[0] / 3.0
w = 2.0 / t_img.shape[1]
shift_factor = random.choice([-1, 1]) * random.uniform(0.15, 0.2)
shift = lambda x: shift_factor * A * np.sin(-2*np.pi*x * w)
for i in range(t_img.shape[0]):
t_img[:,i] = np.roll(t_img[:,i], int(shift(i)))
img = PImage.fromarray(t_img)
# blur
if("blur" in f):
img = img.filter(ImageFilter.GaussianBlur(radius=random.uniform(0.5, 1.2)))
# sharpen
if("sharpen" in f):
img = img.filter(ImageFilter.SHARPEN)
# smooth
if("smooth" in f):
img = img.filter(ImageFilter.SMOOTH)
return img, offset, scale
def generator_func(start_index, end_index, x_shape, y_shape):
X, Y = [], []
x_loc = np.frombuffer(shared_dict["x"], dtype="float16").reshape(x_shape)
y_loc = np.frombuffer(shared_dict["y"], dtype="b").reshape(y_shape)
for i in range(start_index, end_index):
base_img = x_loc[i]
img = PImage.fromarray(np.uint8(base_img.reshape(64, 64) * 255))
img, *unused = distort_sample(img)
# add transformed image
X.append(np.array(img).reshape(64, 64, 1))
Y.append(y_loc[i])
X.append(np.array(img).reshape(64, 64, 1))
Y.append(y_loc[i])
# add base image
#X.append(base_img)
#Y.append(y_loc[i])
return X, Y
def generator_initializer(_x_shared, _y_shared):
shared_dict["x"] = _x_shared
shared_dict["y"] = _y_shared
def generator_func(start_index, end_index, x_shape, y_shape):
X, Y = [], []
x_loc = np.frombuffer(shared_dict["x"], dtype="float16").reshape(x_shape)
y_loc = np.frombuffer(shared_dict["y"], dtype="b").reshape(y_shape)
for i in range(start_index, end_index):
base_img = x_loc[i]
img = PImage.fromarray(np.uint8(base_img.reshape(64, 64) * 255))
img, *unused = distort_sample(img)
# add transformed image
X.append(np.array(img).reshape(64, 64, 1))
Y.append(y_loc[i])
X.append(np.array(img).reshape(64, 64, 1))
Y.append(y_loc[i])
# add base image
#X.append(base_img)
#Y.append(y_loc[i])
return X, Y
class DataGenerator(tf.keras.utils.Sequence):
def __init__(self, num_samples, batch_size,
percentage, mode,
x_shared, y_shared,
x_np_shape, y_np_shape,
processes, shuffle=True):
self.num_samples = num_samples
# 50% original images + 50% augmented images
self.batch_size = batch_size // 2
self.percentage = percentage
# an offset to devide the data set into test and train
self.start_index = 0
if(mode == "testing"):
self.start_index = num_samples - (num_samples // 100 * percentage)
# is this a train or a test generator
self.mode = mode
# how many processes should be used for this generator
self.processes = processes
# should the arrays be shuffled after each epoch
self.shuffle = shuffle
self.x_np_shape = x_np_shape
self.y_np_shape = y_np_shape
# a pool of processes for generating augmented data
self.pool = mp.Pool(processes=self.processes,
initializer=generator_initializer,
initargs=(x_shared, y_shared))
def __len__(self):
return (self.num_samples // 100 * self.percentage) // self.batch_size
def on_epoch_end(self):
if(False):
rng_state = np.random.get_state()
np.random.shuffle(x_np)
np.random.set_state(rng_state)
np.random.shuffle(y_np)
def __getitem__(self, index):
arguments = []
slice_size = self.batch_size // self.processes
current_batch = index * self.batch_size
for i in range(self.processes):
slice_start = self.start_index + (current_batch + i * slice_size)
slice_end = self.start_index + (current_batch + (i+1) * slice_size)
arguments.append([slice_start, slice_end, self.x_np_shape, self.y_np_shape])
return_values = self.pool.starmap(generator_func, arguments)
X, Y = [], []
for imgs, labels in return_values:
X.append(imgs)
Y.append(labels)
return np.concatenate(X).astype(np.float16), np.concatenate(Y).astype(np.float16)

Related

Object Detection - RuntimeError: stack expects each tensor to be equal size

I created a custom dataset for object detection named ReceiptDataset as below.
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
class ReceiptDataset(torch.utils.data.Dataset):
def __init__(self, train_dir,width,height,labels,transforms=None):
self.images = os.listdir(train_dir)
self.width = width
self.height = height
self.train_dir = train_dir
self.labels = labels
self.transforms = transforms
def __getitem__(self,idx):
img_name = self.images[idx]
img_path = os.path.join(self.train_dir,img_name)
#print(f"img_name: {img_name}")
img = cv2.imread(img_path)
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
img_res = cv2.resize(img_rgb,(self.width,self.height), cv2.INTER_AREA)
img_res /= 255.0
annot = self.labels[str(img_name)]
lbls = []
boxes = []
target = {}
ht, wt, _ = img.shape
#print(f"img_res shape: {img_res.shape}, orig shape: {wt}, {ht}")
for item in annot:
x,y,box_wt,box_ht,lbl = item
x_min = x
x_max = x + box_wt
y_min = y
y_max = y + box_ht
x_min_corr = (x_min / wt) * self.width
x_max_corr = (x_max /wt ) * self.width
y_min_corr = (y_min / ht) * self.height
y_max_corr = (y_max / ht) * self.height
boxes.append([x_min_corr, y_min_corr, x_max_corr, y_max_corr])
lbls.append( classes.index(str(lbl)) )
#print(f"dls_lbls: {lbls}, {len(lbls)}")
#lbls += [-1] * (NUM_CLASSES - len(lbls))
boxes = torch.as_tensor(boxes, dtype=torch.float32)
lbls = torch.as_tensor(lbls, dtype=torch.int64)
area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
iscrowd = torch.zeros((boxes.shape[0],), dtype=torch.int64)
target["boxes"] = boxes
target["labels"] = lbls
target["image_id"] = torch.as_tensor(idx)
target["area"] = area
target["iscrowd"] = iscrowd
#print(f"dls_lbls -- 2: {target['labels']}, { target['labels'].shape }")
if self.transforms:
trans = self.transforms(image=img_res,
bboxes = target["boxes"],
labels=lbls
)
img_res = trans["image"]
target["boxes"] = torch.Tensor(trans["bboxes"])
return img_res, target
def __len__(self):
return len(self.images)
and I created an instance with:
train_dataset = ReceiptDataset("label-detector/images",width,height,plabels)
and my training snippet is :
from engine import train_one_epoch, evaluate
for epoch in range(num_epochs):
train_one_epoch(model,optim,train_loader,device,epoch,print_freq=2)
lr_scheduler.step()
evaluate(model,test_loader,device)
but anytime I run the training loop, I’m getting a runtime error:
RuntimeError: stack expects each tensor to be equal size, but got [11,4] at entry 0 and [9,4] at entry 1
There are 17 classes in total and each image has a minimum of 4 annotations.
I noticed the problem seems to be coming from my labels list/tensor in the dataset class, the size of the labels list/tensor varies based on the number of annotated items in an image, but I can’t seem to figure out a way to fix this.
Thank you!
I solved it by implementing a custom collate function for the dataloader that returns a batch of my dataset as needed by my model.
def collate_fn_seq(batch):
images = [ item[0] for item in batch ]
targets = [ item[1] for item in batch ]
imgs = []
for image in images:
img = torch.from_numpy(image).permute(2, 0, 1)
imgs.append(img)
boxes = [target["boxes"] for target in targets]
labels = [target["labels"] for target in targets]
image_ids = [ target["image_id"] for target in targets ]
areas = [target["area"] for target in targets]
iscrowds = [target["iscrowd"] for target in targets]
tars = []
for i in range(len(batch)):
box = boxes[i]
label = labels[i]
image_id = image_ids[i]
area = areas[i]
iscrowd = iscrowds[i]
target = {"boxes": box, "labels": label, "image_id": image_id, "area": area, "iscrowd": iscrowd}
tars.append(target)
return imgs, tars
and included it in my dataloaders using:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn_seq)

Evaluation gives 0 score

I am working on a instance segmentation problem on mask rcnn with pytorch. Training part is working with below code but evaluation gives 0 score at every mAP. What's the problem in the code?
More info:
I use Albumentations for transforms and some files from pytorch vision for training.
Some problems I've been through:
When I use coco for bbox format instead of pascal voc it gives following error.
RuntimeError: Boolean value of Tensor with more than one value is ambiguous
When put labels out of the convert_seg_boolMask function, it gives following error.
RuntimeError: Boolean value of Tensor with more than one value is ambiguous
def get_transforms(train=False):
if train:
transform = A.Compose([
ToTensorV2()
],bbox_params=A.BboxParams("pascal_voc",label_fields=["labels","iscrowd"]))
else:
transform = A.Compose([
ToTensorV2()
],bbox_params=A.BboxParams("pascal_voc",label_fields=["labels","iscrowd"]))
return transform
class Dataset(datasets.VisionDataset):
def __init__(self, coco_, data_dir, transform=None, target_transform=None, transforms=None):
super().__init__(data_dir, transforms, transform, target_transform)
self.coco_info = coco_
self.data_dir = data_dir
self.transforms = transforms
if isinstance(self.coco_info,dict):
self.ids = [x["id"] for x in self.coco_info["images"] if len(self._load_target(x["id"]))>0]
def _load_image(self, id: int):
name = loadImgs(self.coco_info["images"],id)[0]['file_name']
image = cv2.imread(os.path.join(self.data_dir, name))
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)/255
return image
def _load_target(self, id):
return loadAnns(self.coco_info["annotations"],id)
def n_classes(self):
category_names = list(set(sorted([a["name"] for a in self.coco_info["categories"]])))
self.classes = ["__background__"]+[i for i in category_names]
return self.classes
def __getitem__(self,idx):
id = self.ids[idx]
image = self._load_image(id)
target = copy.deepcopy(self._load_target(id))
image_shape = (image.shape[0],image.shape[1])
img_info = {
"img_shape":image_shape,
"image_id":id,
"labels":[t["category_id"]for t in target],
"segmentation":[t["segmentation"][0] for t in target],
"id": [t["id"] for t in target]
}
mask, labels = self.convert_seg_to_boolMask(img_info)
obj_ids = np.unique(mask)
obj_ids = obj_ids[1:]
masks = torch.tensor(mask == obj_ids[:, None, None])
boxes = []
bbox = np.array([t["bbox"] for t in target])
for xmin,ymin,width,height in bbox:
xmax = xmin+width
ymax = ymin+height
boxes.append([xmin, ymin, xmax, ymax])
boxes = torch.tensor(boxes)
labels = torch.tensor(labels)
image_id = torch.tensor([id])
iscrowd = torch.tensor([t["iscrowd"] for t in target])
transformed = self.transforms(image=image, masks=masks, bboxes=boxes, labels=labels, iscrowd=iscrowd)
image = transformed['image']
masks = torch.tensor(transformed["masks"])
boxes = torch.tensor(transformed['bboxes'])
labels = torch.tensor(transformed["labels"])
iscrowd = torch.tensor(transformed["iscrowd"])
area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
image_id = torch.tensor(image_id)
target = {}
target["boxes"] = boxes
target["labels"] = labels
target["masks"] = masks
target["image_id"] = image_id
target["area"] = area
target["iscrowd"] = iscrowd
return image, target
def __len__(self):
return len(self.ids)
def convert_seg_to_boolMask(self,img_info):
mask = np.zeros(img_info["img_shape"], dtype=np.uint8)
mask = Image.fromarray(mask)
draw = ImageDraw.Draw(mask)
for seg, i in zip(img_info["segmentation"],img_info["id"]):
points = [tuple([k,l]) for k,l in zip(seg[0::2],seg[1::2])]
draw.polygon(xy=points,
outline=tuple([i]),
fill=tuple([i]))
mask = np.array(mask)
labels = img_info["labels"]
return mask, labels

Pytorch transformer forward function masks implementation for decoder forward function

I am trying to use and learn PyTorch Transformer with DeepMind math dataset. I have tokenized (char not word) sequence that is fed into model. Models forward function is doing once forward for encoder and multiple forwards for decoder (till all batch outputs reach token, this is still TODO).
I am struggling with Transformer masks and decoder forward as it throws the error:
k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
RuntimeError: shape '[-1, 24, 64]' is invalid for input of size 819200.
Source is N = 32, S = 50, E = 512. Target is N = 32, S = 3, E = 512.
It is possible that I have wrong implementation of masks or that source and target lengths are different, not realy sure.
class PositionalEncoding(nn.Module):
# function to positionally encode src and target sequencies
def __init__(self, d_model, dropout=0.1, max_len=5000):
super(PositionalEncoding, self).__init__()
self.dropout = nn.Dropout(p=dropout)
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
self.register_buffer('pe', pe)
def forward(self, x):
x = x + self.pe[:x.size(0), :]
return self.dropout(x)
class MyTransformerModel(nn.Module):
# should implement init and forward function
# define separate functions for masks
# define forward function with
# implement:
# embedding layer
# positional encoding
# encoder layer
# decoder layer
# final classification layer
# encoder -> forward once
# decoder -> forward multiple times (for one encoder forward)
# decoder output => concatenate to input e.g. decoder_input = torch.cat([decoder_input], [decoder_output])
# early stopping => all in batch reach <eos> token
def __init__(self, vocab_length = 30, sequence_length = 512, num_encoder_layers = 3, num_decoder_layers = 2, num_hidden_dimension = 256, feed_forward_dimensions = 1024, attention_heads = 8, dropout = 0.1, pad_idx = 3, device = "CPU", batch_size = 32):
super(MyTransformerModel, self).__init__()
self.src_embedding = nn.Embedding(vocab_length, sequence_length)
self.pos_encoder = PositionalEncoding(sequence_length, dropout)
self.src_mask = None # attention mask
self.memory_mask = None # attention mask
self.pad_idx = pad_idx
self.device = device
self.batch_size = batch_size
self.transformer = nn.Transformer(
sequence_length,
attention_heads,
num_encoder_layers,
num_decoder_layers,
feed_forward_dimensions,
dropout,
)
def src_att_mask(self, src_len):
mask = (torch.triu(torch.ones(src_len, src_len)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
return mask
def no_peak_att_mask(self, batch_size, src_len, time_step):
mask = np.zeros((batch_size, src_len), dtype=bool)
mask[:, time_step: ] = 1 # np.NINF
mask = torch.from_numpy(mask)
return mask
def make_src_key_padding_mask(self, src):
# mask "<pad>"
src_mask = src.transpose(0, 1) == self.pad_idx
return src_mask.to(self.device)
def make_trg_key_padding_mask(self, trg):
tgt_mask = trg.transpose(0, 1) == self.pad_idx
return tgt_mask.to(self.device)
def forward(self, src, trg):
src_seq_length, N = src.shape
trg_seq_length, N = trg.shape
embed_src = self.src_embedding(src)
position_embed_src = self.pos_encoder(embed_src)
embed_trg = self.src_embedding(trg)
position_embed_trg = self.pos_encoder(embed_trg)
src_padding_mask = self.make_src_key_padding_mask(src)
trg_padding_mask = self.make_trg_key_padding_mask(trg)
trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(self.device)
time_step = 1
att_mask = self.no_peak_att_mask(self.batch_size, src_seq_length, time_step).to(self.device)
encoder_output = self.transformer.encoder.forward(position_embed_src, src_key_padding_mask = src_padding_mask)
# TODO : implement loop for transformer decoder forward fn, implement early stopping
# where to feed decoder_output?
decoder_output = self.transformer.decoder.forward(position_embed_trg, encoder_output, trg_mask, att_mask, trg_padding_mask, src_padding_mask)
return decoder_output
Can anyone pin point where I have made a mistake?
It looks like I have messed dimensions order (as Transformer does not have batch first option). Corrected code is below:
class MyTransformerModel(nn.Module):
def __init__(self, d_model = 512, vocab_length = 30, sequence_length = 512, num_encoder_layers = 3, num_decoder_layers = 2, num_hidden_dimension = 256, feed_forward_dimensions = 1024, attention_heads = 8, dropout = 0.1, pad_idx = 3, device = "CPU", batch_size = 32):
#, ninp, device, nhead=8, nhid=2048, nlayers=2, dropout=0.1, src_pad_idx = 1, max_len=5000, forward_expansion= 4):
super(MyTransformerModel, self).__init__()
self.src_embedding = nn.Embedding(vocab_length, d_model)
self.pos_encoder = PositionalEncoding(d_model, dropout)
self.vocab_length = vocab_length
self.d_model = d_model
self.src_mask = None # attention mask
self.memory_mask = None # attention mask
self.pad_idx = pad_idx
self.device = device
self.batch_size = batch_size
self.transformer = nn.Transformer(
d_model,
attention_heads,
num_encoder_layers,
num_decoder_layers,
feed_forward_dimensions,
dropout,
)
self.fc = nn.Linear(d_model, vocab_length)
# self.init_weights() <= used in tutorial
def src_att_mask(self, src_len):
mask = (torch.triu(torch.ones(src_len, src_len)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
return mask
def no_peak_att_mask(self, batch_size, src_len, time_step):
mask = np.zeros((batch_size, src_len), dtype=bool)
mask[:, time_step: ] = 1 # np.NINF
mask = torch.from_numpy(mask)
# mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
return mask
def make_src_key_padding_mask(self, src):
# mask "<pad>"
src_mask = src.transpose(0, 1) == self.pad_idx
# src_mask = src == self.pad_idx
# (N, src_len)
return src_mask.to(self.device)
def make_trg_key_padding_mask(self, trg):
# same as above -> expected tgt_key_padding_mask: (N, T)
tgt_mask = trg.transpose(0, 1) == self.pad_idx
# tgt_mask = trg == self.pad_idx
# (N, src_len)
return tgt_mask.to(self.device)
def init_weights(self):
initrange = 0.1
nn.init.uniform_(self.encoder.weight, -initrange, initrange)
nn.init.zeros_(self.decoder.weight)
nn.init.uniform_(self.decoder.weight, -initrange, initrange)
def forward(self, src, trg):
N, src_seq_length = src.shape
N, trg_seq_length = trg.shape
# S - source sequence length
# T - target sequence length
# N - batch size
# E - feature number
# src: (S, N, E) (sourceLen, batch, features)
# tgt: (T, N, E)
# src_mask: (S, S)
# tgt_mask: (T, T)
# memory_mask: (T, S)
# src_key_padding_mask: (N, S)
# tgt_key_padding_mask: (N, T)
# memory_key_padding_mask: (N, S)
src = rearrange(src, 'n s -> s n')
trg = rearrange(trg, 'n t -> t n')
print("src shape {}".format(src.shape))
print(src)
print("trg shape {}".format(trg.shape))
print(trg)
embed_src = self.src_embedding(src)
print("embed_src shape {}".format(embed_src.shape))
print(embed_src)
position_embed_src = self.pos_encoder(embed_src)
print("position_embed_src shape {}".format(position_embed_src.shape))
print(position_embed_src)
embed_trg = self.src_embedding(trg)
print("embed_trg shape {}".format(embed_trg.shape))
print(embed_trg)
position_embed_trg = self.pos_encoder(embed_trg)
# position_embed_trg = position_embed_trg.transpose(0, 1)
print("position_embed_trg shape {}".format(position_embed_trg.shape))
print(position_embed_trg)
src_padding_mask = self.make_src_key_padding_mask(src)
print("KEY - src_padding_mask shape {}".format(src_padding_mask.shape))
print("should be of shape: src_key_padding_mask: (N, S)")
print(src_padding_mask)
trg_padding_mask = self.make_trg_key_padding_mask(trg)
print("KEY - trg_padding_mask shape {}".format(trg_padding_mask.shape))
print("should be of shape: trg_key_padding_mask: (N, T)")
print(trg_padding_mask)
trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(self.device)
print("trg_mask shape {}".format(trg_mask.shape))
print("trg_mask should be of shape tgt_mask: (T, T)")
print(trg_mask)
# att_mask = self.src_att_mask(trg_seq_length).to(self.device)
time_step = 1
# error => memory_mask: expected shape! (T, S) !!! this is not a key_padding_mask!
# att_mask = self.no_peak_att_mask(self.batch_size, src_seq_length, time_step).to(self.device)
# print("att_mask shape {}".format(att_mask.shape))
# print("att_mask should be of shape memory_mask: (T, S)")
# print(att_mask)
att_mask = None
# get encoder output
# forward(self, src: Tensor, mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None)
# forward encoder just once for a batch
# attention forward of encoder expects => src, src_mask, src_key_padding_mask +++ possible positional encoding error !!!
encoder_output = self.transformer.encoder.forward(position_embed_src, src_key_padding_mask = src_padding_mask)
print("encoder_output")
print("encoder_output shape {}".format(encoder_output.shape))
print(encoder_output)
# forward decoder till all in batch did not reach <eos>?
# def forward(self, tgt: Tensor, memory: Tensor, tgt_mask: Optional[Tensor] = None,
# memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None,
# memory_key_padding_mask: Optional[Tensor] = None)
# first forward
decoder_output = self.transformer.decoder.forward(position_embed_trg, encoder_output, trg_mask, att_mask, trg_padding_mask, src_padding_mask)
# TODO: target in => target out shifted by one, loop till all in batch meet stopping criteria || max len is reached
#
print("decoder_output")
print("decoder_output shape {}".format(decoder_output.shape))
print(decoder_output)
output = rearrange(decoder_output, 't n e -> n t e')
output = self.fc(output)
print("output")
print("output shape {}".format(output.shape))
print(output)
predicted = F.log_softmax(output, dim=-1)
print("predicted")
print("predicted shape {}".format(predicted.shape))
print(predicted)
# top k
top_value, top_index = torch.topk(predicted, k=1)
top_index = torch.squeeze(top_index)
print("top_index")
print("top_index shape {}".format(top_index.shape))
print(top_index)
print("top_value")
print("top_value shape {}".format(top_value.shape))
print(top_value)
return top_index

Keras Neural Style Transfer: backend.gradients returns None

I am trying to implement Neural Style Transfer using Keras and trying to keep it as simple as possible. While trying to find gradient using backend.gradients() function of keras, it returns [None]. My code is as follows:
content_image = cv2.imread("C:/Users/Max/Desktop/IMG_20170331_103755.jpg")
content_image = cv2.resize(content_image, (512,512))
style_image = cv2.imread("C:/Users/Max/Desktop/starry.jpg")
style_image = cv2.resize(style_image, (512,512))
content_array = np.asarray(content_image, dtype=np.float32)
content_array = np.expand_dims(content_array, axis=0)
style_array = np.asarray(style_image, dtype=np.float32)
style_array = np.expand_dims(style_array, axis=0)
# Constants:
epochs = 1
height = 512
width = 512
num_channels = 3
step_size = 10
content_layer = ['block2_conv2']
style_layer = ['block1_conv2', 'block2_conv2', 'block3_conv3','block4_conv3', 'block5_conv3']
loss_total = backend.variable(0.0)
# VGG16 Model:
model = VGG16(input_shape = [height, width, num_channels],weights='imagenet', include_top=False)
# Defining losses:
def content_loss(Content, Mixed):
content_loss = backend.mean(backend.square(Mixed - Content))
return content_loss
def gram(layer):
flat = backend.reshape(layer, shape=[1, -1])
gram = backend.dot(flat, backend.transpose(flat))
return gram
def style_loss(Style, Mixed):
S_G = gram(Style)
M_G = gram(Mixed)
size = height*width
return backend.sum(backend.square(S_G - M_G)) / (4. * (num_channels ** 2) * (size ** 2))
'''
def denoise(Image):
loss = backend.mean(backend.abs(Image[:,1:,:,:] - Image[:,:-1,:,:]) + backend.abs(Image[:,:,1:,:] - Image[:,:,:-1,:]))
return loss
'''
# Backend Functions:
output_c = backend.function(inputs = [model.layers[0].input] , outputs = [model.get_layer(content_layer[0]).output])
output_s = backend.function(inputs = [model.layers[0].input] , outputs = [model.get_layer(layer).output for layer in style_layer])
content_output = output_c([content_array])
style_output = output_s([style_array])
# Randomly generated image:
Mixed = np.random.uniform(0, 255, [1, height, width, 3]) - 128
# Loop:
for i in range(epochs):
mixed_c = output_c([Mixed])
mixed_c = mixed_c[0]
loss_c = content_loss(content_output[0], mixed_c)
total = []
mixed_s = output_s([Mixed])
for i in range(len(style_layer)):
style = style_loss(style_output[i], mixed_s[i])
total.append(style)
loss_s = backend.sum(total)
#loss_d = denoise(Mixed)
loss_total = w_c * loss_c + w_s * loss_s #+ w_d * loss_d
gradient = backend.gradients(loss_total, Mixed)
gradient = np.squeeze(gradient)
step_size = step_size / (np.std(gradient) + 1e-8)
Mixed -= gradient * step_size
What changes should i make to get the gradients working properly. I am clueless as to what went wrong.
Thanks!
You're taking gradient of Mixed which is a numpy array and not a variable. You need to define a tensor which will then have value of Mixed.
From Keras documentation:
gradients
keras.backend.gradients(loss, variables)
Returns the gradients of variables w.r.t. loss.
Arguments
loss: Scalar tensor to minimize.
variables: List of variables.

How to create layer0 input for input images with 3 channels

Hi I am following the http://deeplearning.net/tutorial/code/convolutional_mlp.py code to implement a conv neural net. I have input images where the channel is important and hence I want to have 3 channel feature map as layer 0 input.
So I need something like this
layer0_input = x.reshape((batch_size, 3, 240, 135)) # width 240, height 135, 3 channels
instead of
layer0_input = x.reshape((batch_size, 1, 28, 28)) # 28*28 normalized MNIST gray scale images
which will be used here
layer0 = LeNetConvPoolLayer(
rng,
input=layer0_input,
image_shape=(batch_size, 3, 240, 135),
filter_shape=(nkerns[0], 1, 5, 5),
poolsize=(2, 2)
)
where that x is provided to theano as
train_model = theano.function(
[index],
cost,
updates=updates,
givens={
x: train_set_x[index * batch_size: (index + 1) * batch_size],
y: train_set_y[index * batch_size: (index + 1) * batch_size]
}
)
So - my question is - how should I create (shape) that train_set_x ?
With (gray scale intensity - i.e single channel) train_set_x is created as
shared_x = theano.shared(numpy.asarray(data_x,
dtype=theano.config.floatX),
where data_x is a flattened numpy array of length 784 (for 28*28 pixels)
Thanks a lot for advice
I was able to get it working. I am pasting some code here which might help some one. Not very elegant - but works.
def shuffle_in_unison(a, b):
#courtsey http://stackoverflow.com/users/190280/josh-bleecher-snyder
assert len(a) == len(b)
shuffled_a = np.empty(a.shape, dtype=a.dtype)
shuffled_b = np.empty(b.shape, dtype=b.dtype)
permutation = np.random.permutation(len(a))
for old_index, new_index in enumerate(permutation):
shuffled_a[new_index] = a[old_index]
shuffled_b[new_index] = b[old_index]
return shuffled_a, shuffled_b
def createDataSet(imagefolder):
os.chdir(imagefolder)
# total number of files
number_of_files = len([item for item in os.listdir('.') if os.path.isfile(os.path.join('.', item))])
# get a shuffled list : I needed this because my image names were of the format n_x_<some details>.jpg
# where n was my target and x was a number from 0 to m-1 where m was the number of samples
# of the target value n. So I needed so shuffle and iterate while putting images in train
# test and validate arrays
image_index_array = range(0,number_of_files)
random.seed(12)
random.shuffle(image_index_array)
# split 80/10/10 - train/test/val
trainsize = int(number_of_files*.8)
testsize = int(number_of_files*.1)
valsize = number_of_files - trainsize - testsize
# create the random value arrays of train/test/val by slicing the total image index array
train_index_array = image_index_array[0:trainsize]
test_index_array = image_index_array[trainsize:trainsize+testsize]
validate_index_array = image_index_array[trainsize+testsize:]
# initialize the data structures
dataset = {'train':[[],[]],'test':[[],[]],'validate':[[],[]]}
i_counter = 0
train_X = []
train_y = []
test_X = []
test_y = []
val_X = []
val_y = []
for item in os.listdir('.'):
if not os.path.isfile(os.path.join('.', item)):
continue
if item.endswith('.pkl'):
continue
print 'Processing item ' + item
item_y = item.split('_')[0]
item_x = cv2.imread(item)
height, width = item_x.shape[:2]
# this was my requirement - skip it if you do not need it
if(height != 135 or width != 240):
continue
# get 3 channels
b,g,r = cv2.split(item_x)
item_x = [b,g,r]
item_x = np.array(item_x)
item_x = item_x.reshape(3,135*240)
if i_counter in test_index_array:
test_X.append(item_x)
test_y.append(item_y)
elif i_counter in validate_index_array:
val_X.append(item_x)
val_y.append(item_y)
else:
train_X.append(item_x)
train_y.append(item_y)
i_counter = i_counter + 1
# fix the dimensions. Flatten out the channel and intensity dimensions
train_X = np.array(train_X)
train_X = train_X.reshape(train_X.shape[0],train_X.shape[1]*train_X.shape[2])
test_X = np.array(test_X)
test_X = test_X.reshape(test_X.shape[0],test_X.shape[1]*test_X.shape[2])
val_X = np.array(val_X)
val_X = val_X.reshape(val_X.shape[0],val_X.shape[1]*val_X.shape[2])
train_y = np.array(train_y)
test_y = np.array(test_y)
val_y = np.array(val_y)
# shuffle the train and test arrays in unison
train_X,train_y = shuffle_in_unison(train_X,train_y)
test_X,test_y = shuffle_in_unison(test_X,test_y)
# pickle them
dataset['train'] = [train_X,train_y]
dataset['test'] = [test_X,test_y]
dataset['validate'] = [val_X,val_y]
output = open('pcount.pkl', 'wb')
cPickle.dump(dataset, output)
output.close`
Once you have this pickle file
You can use it in convolutional_mlp.py like this.
layer0_input = x.reshape((batch_size, 3, 135, 240))
# Construct the first convolutional pooling layer:
# filtering reduces the image size to (135-8+1 , 240-5+1) = (128, 236)
# maxpooling reduces this further to (128/2, 236/2) = (64, 118)
# 4D output tensor is thus of shape (batch_size, nkerns[0], 64, 118)
layer0 = LeNetConvPoolLayer(
rng,
input=layer0_input,
image_shape=(batch_size, 3, 135, 240),
filter_shape=(nkerns[0], 3, 8, 5),
poolsize=(2, 2)
)
The load_data function in logistic_sgd.py will need a small change as below
f = open(dataset, 'rb')
dump = cPickle.load(f)
train_set = dump['train']
valid_set = dump['validate']
test_set = dump['test']
f.close()
Hope this helps

Resources