PyTorch: how to use torchvision.transforms.AugMix with torch.float32?
I am trying to apply data augmentation to an image dataset using torchvision.transforms.AugMix, but I get the following error: TypeError: Only torch.uint8 image tensors are supported, but found torch.float32.
I tried to convert it to int, but I have another error.
My code where I am trying to use the AugMix function:
transform = torchvision.transforms.Compose(
torchvision.transforms.Resize((224, 224)), # resize to 224*224
torchvision.transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), # normalization
to_tensor = torchvision.transforms.ToTensor()
class BreastDataset(
def __init__(self, json_path, data_dir_path='./dataset', clinical_data_path=None, is_preloading=True):
self.data_dir_path = data_dir_path
self.is_preloading = is_preloading
with open(json_path) as f:
print(f"load data from {json_path}")
self.json_data = json.load(f)
def __len__(self):
return len(self.json_data)
def __getitem__(self, index):
label = int(self.json_data[index]["label"])
patient_id = self.json_data[index]["id"]
patch_paths = self.json_data[index]["patch_paths"]
data = {}
if self.is_preloading:
data["bag_tensor"] = self.bag_tensor_list[index]
data["bag_tensor"] = self.load_bag_tensor([os.path.join(self.data_dir_path, p_path) for p_path in patch_paths])
data["label"] = label
data["patient_id"] = patient_id
data["patch_paths"] = patch_paths
return data
def load_bag_tensor(self, patch_paths):
"""Load a bag data as tensor with shape [N, C, H, W]"""
patch_tensor_list = []
for p_path in patch_paths:
patch ="RGB")
patch_tensor = transform(patch) # [C, H, W]
patch_tensor = torch.unsqueeze(patch_tensor, dim=0) # [1, C, H, W]
bag_tensor =, dim=0) # [N, C, H, W]
return bag_tensor
Any help is appreciated! Thank you in advance!

For me, applying AugMix first and then ToTensor() worked:
transformation = transforms.Compose([
transforms.AugMix(severity= 6,mixture_width=2),
transforms.RandomGrayscale(p = 0.35)

torchvision.transforms.AugMix takes images in uint8 format. This means that every pixel is 1 (grayscale) or 3 (RGB) numbers between 0 and 255, which is the classic image format.
torch.Tensor.type(torch.float32) casts a uint8 tensor to float32, but it is unlikely to be the only transformation that was applied to your image. float32 images are often normalized to be in the range [-1, 1] or [0, 1]. The common ways to do so are:
img = img.type(torch.float32) / 128.0 - 1.0 # [-1, 1]
img = img.type(torch.float32) / 255.0 # [0, 1]
Once you know which case applies, you can cast back to uint8:
img = (img + 1.0) * 128.0 # case [-1, 1]
img = img * 255.0 # case [0, 1]
img = torch.clip(img, 0.0, 255.0)
img = img.type(torch.uint8)


How to pad audio clips or mel spectrograms in pytorch custom dataloader?

I am trying to make an audio Siamese network, but in the training loop I get a size mismatch in my tensors: stack expects each tensor to be equal size, but got [1, 128, 121] at entry 0 and [1, 128, 205] at entry 1.
I am unsure where I messed up with my data, since while gathering it I made sure to pad all my audio clips to the same size with background audio. So I have to implement some other way to pad the audio clips. I thought about padding the clips to a static size bigger than all my clips in my custom dataloader, but that still gives me the same error. Any ideas where I am messing up?
class OHDataset(data.Dataset):
    """Triplet dataset returning (anchor, positive, negative) spectrograms.

    Clip names are read from the first column of the CSV file and resolved
    to ``<audio_dir>/<name>.wav``.

    # Arguments
        audio_dir: directory containing the ``.wav`` files.
        audio_dataset: path of the CSV file listing the clip names.
        transform: "spectrogram" or "mel_spectrogram"; selects which helper
            is used to convert each clip.
    """

    def __init__(self, audio_dir, audio_dataset, transform="mel_spectrogram"):
        self.audio_labels = pd.read_csv(audio_dataset)
        self.audio_dir = audio_dir
        self.output_format = transform

    def __len__(self):
        return len(self.audio_labels)

    def __getitem__(self, item, n_fft=200, hop_length=120):
        """Return the (anchor, positive, negative) triplet for row `item`.

        # Arguments
            item: row index into the CSV.
            n_fft: forwarded to the spectrogram helper.
            hop_length: forwarded to the spectrogram helper.

        # Returns
            Tuple (anchor, positive, negative), or None when the configured
            output format is neither "spectrogram" nor "mel_spectrogram".
        """
        name = self.audio_labels.iloc[item, 0]

        # A name without a digit is itself the anchor; use the next row as
        # its positive example.
        # NOTE(review): `item + 1` runs past the end for the last row —
        # confirm the CSV never ends on a digit-less name.
        positive = name
        if not re.search(r'\d', positive):
            positive = self.audio_labels.iloc[item + 1, 0]

        # The anchor is the clip name with all digits stripped.
        anchor = re.sub(r'\d+', '', name)

        # randrange excludes the upper bound; randint(0, len) could return
        # len and index one past the last row.
        negative = self.audio_labels.iloc[random.randrange(len(self.audio_labels)), 0]

        pos_audio_path = os.path.join(self.audio_dir, positive + ".wav")
        neg_audio_path = os.path.join(self.audio_dir, negative + ".wav")
        anchor_audio_path = os.path.join(self.audio_dir, anchor + ".wav")

        if self.output_format == "spectrogram":
            convert = getSpectrogram
        elif self.output_format == "mel_spectrogram":
            convert = getMELSpectrogram
        else:
            return None

        pos_out = convert(pos_audio_path, n_fft, hop_length)
        neg_out = convert(neg_audio_path, n_fft, hop_length)
        anchor_out = convert(anchor_audio_path, n_fft, hop_length)
        return anchor_out, pos_out, neg_out
def train(dataloader, model, loss_fn, optimizer):
size = len(dataloader.dataset)
for batch, (A, P, N) in enumerate(dataloader):
anchor = model(A).to(device)
positive = model(P).to(device)
negative = model(N).to(device)
loss = loss_fn(anchor, positive, negative)

Object Detection - RuntimeError: stack expects each tensor to be equal size

I created a custom dataset for object detection named ReceiptDataset as below.
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
class ReceiptDataset(
def __init__(self, train_dir,width,height,labels,transforms=None):
self.images = os.listdir(train_dir)
self.width = width
self.height = height
self.train_dir = train_dir
self.labels = labels
self.transforms = transforms
def __getitem__(self,idx):
img_name = self.images[idx]
img_path = os.path.join(self.train_dir,img_name)
#print(f"img_name: {img_name}")
img = cv2.imread(img_path)
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
img_res = cv2.resize(img_rgb,(self.width,self.height), cv2.INTER_AREA)
img_res /= 255.0
annot = self.labels[str(img_name)]
lbls = []
boxes = []
target = {}
ht, wt, _ = img.shape
#print(f"img_res shape: {img_res.shape}, orig shape: {wt}, {ht}")
for item in annot:
x,y,box_wt,box_ht,lbl = item
x_min = x
x_max = x + box_wt
y_min = y
y_max = y + box_ht
x_min_corr = (x_min / wt) * self.width
x_max_corr = (x_max /wt ) * self.width
y_min_corr = (y_min / ht) * self.height
y_max_corr = (y_max / ht) * self.height
boxes.append([x_min_corr, y_min_corr, x_max_corr, y_max_corr])
lbls.append( classes.index(str(lbl)) )
#print(f"dls_lbls: {lbls}, {len(lbls)}")
#lbls += [-1] * (NUM_CLASSES - len(lbls))
boxes = torch.as_tensor(boxes, dtype=torch.float32)
lbls = torch.as_tensor(lbls, dtype=torch.int64)
area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
iscrowd = torch.zeros((boxes.shape[0],), dtype=torch.int64)
target["boxes"] = boxes
target["labels"] = lbls
target["image_id"] = torch.as_tensor(idx)
target["area"] = area
target["iscrowd"] = iscrowd
#print(f"dls_lbls -- 2: {target['labels']}, { target['labels'].shape }")
if self.transforms:
trans = self.transforms(image=img_res,
bboxes = target["boxes"],
img_res = trans["image"]
target["boxes"] = torch.Tensor(trans["bboxes"])
return img_res, target
def __len__(self):
return len(self.images)
and I created an instance with:
train_dataset = ReceiptDataset("label-detector/images",width,height,plabels)
and my training snippet is :
from engine import train_one_epoch, evaluate
for epoch in range(num_epochs):
but anytime I run the training loop, I’m getting a runtime error:
RuntimeError: stack expects each tensor to be equal size, but got [11,4] at entry 0 and [9,4] at entry 1
There are 17 classes in total and each image has a minimum of 4 annotations.
I noticed the problem seems to be coming from my labels list/tensor in the dataset class, the size of the labels list/tensor varies based on the number of annotated items in an image, but I can’t seem to figure out a way to fix this.
Thank you!
I solved it by implementing a custom collate function for the dataloader that returns a batch of my dataset as needed by my model.
def collate_fn_seq(batch):
    """Collate detection samples without stacking them.

    The number of boxes differs per image, so the default collate function
    (which stacks tensors) fails. This keeps every sample separate and
    returns plain Python lists instead.

    # Arguments
        batch: list of (image, target) pairs, where image is an HWC numpy
            array and target is a dict with the keys "boxes", "labels",
            "image_id", "area" and "iscrowd".

    # Returns
        imgs: list of CHW tensors, one per sample.
        tars: list of target dicts, one per sample.
    """
    imgs = []
    tars = []
    for item in batch:
        image, target = item[0], item[1]
        # HWC numpy array -> CHW tensor.
        imgs.append(torch.from_numpy(image).permute(2, 0, 1))
        tars.append({
            "boxes": target["boxes"],
            "labels": target["labels"],
            "image_id": target["image_id"],
            "area": target["area"],
            "iscrowd": target["iscrowd"],
        })
    return imgs, tars
and included it in my dataloaders using:
train_loader =, batch_size=8, shuffle=True, collate_fn=collate_fn_seq)

Evaluation gives 0 score

I am working on an instance segmentation problem with Mask R-CNN in PyTorch. The training part works with the code below, but evaluation gives a score of 0 for every mAP. What's the problem in the code?
More info:
I use Albumentations for transforms and some files from pytorch vision for training.
Some problems I've been through:
When I use coco for bbox format instead of pascal voc it gives following error.
RuntimeError: Boolean value of Tensor with more than one value is ambiguous
When I move the labels out of the convert_seg_to_boolMask function, it gives the following error.
RuntimeError: Boolean value of Tensor with more than one value is ambiguous
def get_transforms(train=False):
if train:
transform = A.Compose([
transform = A.Compose([
return transform
class Dataset(datasets.VisionDataset):
def __init__(self, coco_, data_dir, transform=None, target_transform=None, transforms=None):
super().__init__(data_dir, transforms, transform, target_transform)
self.coco_info = coco_
self.data_dir = data_dir
self.transforms = transforms
if isinstance(self.coco_info,dict):
self.ids = [x["id"] for x in self.coco_info["images"] if len(self._load_target(x["id"]))>0]
def _load_image(self, id: int):
name = loadImgs(self.coco_info["images"],id)[0]['file_name']
image = cv2.imread(os.path.join(self.data_dir, name))
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)/255
return image
def _load_target(self, id):
return loadAnns(self.coco_info["annotations"],id)
def n_classes(self):
category_names = list(set(sorted([a["name"] for a in self.coco_info["categories"]])))
self.classes = ["__background__"]+[i for i in category_names]
return self.classes
def __getitem__(self,idx):
id = self.ids[idx]
image = self._load_image(id)
target = copy.deepcopy(self._load_target(id))
image_shape = (image.shape[0],image.shape[1])
img_info = {
"labels":[t["category_id"]for t in target],
"segmentation":[t["segmentation"][0] for t in target],
"id": [t["id"] for t in target]
mask, labels = self.convert_seg_to_boolMask(img_info)
obj_ids = np.unique(mask)
obj_ids = obj_ids[1:]
masks = torch.tensor(mask == obj_ids[:, None, None])
boxes = []
bbox = np.array([t["bbox"] for t in target])
for xmin,ymin,width,height in bbox:
xmax = xmin+width
ymax = ymin+height
boxes.append([xmin, ymin, xmax, ymax])
boxes = torch.tensor(boxes)
labels = torch.tensor(labels)
image_id = torch.tensor([id])
iscrowd = torch.tensor([t["iscrowd"] for t in target])
transformed = self.transforms(image=image, masks=masks, bboxes=boxes, labels=labels, iscrowd=iscrowd)
image = transformed['image']
masks = torch.tensor(transformed["masks"])
boxes = torch.tensor(transformed['bboxes'])
labels = torch.tensor(transformed["labels"])
iscrowd = torch.tensor(transformed["iscrowd"])
area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
image_id = torch.tensor(image_id)
target = {}
target["boxes"] = boxes
target["labels"] = labels
target["masks"] = masks
target["image_id"] = image_id
target["area"] = area
target["iscrowd"] = iscrowd
return image, target
def __len__(self):
return len(self.ids)
def convert_seg_to_boolMask(self,img_info):
mask = np.zeros(img_info["img_shape"], dtype=np.uint8)
mask = Image.fromarray(mask)
draw = ImageDraw.Draw(mask)
for seg, i in zip(img_info["segmentation"],img_info["id"]):
points = [tuple([k,l]) for k,l in zip(seg[0::2],seg[1::2])]
mask = np.array(mask)
labels = img_info["labels"]
return mask, labels

When I compile YOLOv3 I get warnings

"""YOLO v3 output
import numpy as np
import keras.backend as K
from keras.models import load_model
import os
class YOLO:
    """YOLO v3 detector built on a pre-trained Keras model."""

    def __init__(self, obj_threshold, nms_threshold):
        """Init.

        # Arguments
            obj_threshold: Float, minimum class score for a box to be kept.
            nms_threshold: Float, maximum overlap allowed between kept boxes.
        """
        self._t1 = obj_threshold
        self._t2 = nms_threshold
        self._yolo = load_model('data/yolo.h5')

    def _process_feats(self, out, anchors, mask):
        """process output features.

        # Arguments
            out: Tensor (1, N, N, 3, 4 + 1 + 80) or, flattened,
                (1, N, N, 3 * (4 + 1 + 80)), output feature map of yolo.
            anchors: List, anchors for box.
            mask: List, mask for anchors.

        # Returns
            boxes: ndarray (N, N, 3, 4), x,y,w,h for per box.
            box_confidence: ndarray (N, N, 3, 1), confidence for per box.
            box_class_probs: ndarray (N, N, 3, 80), class probs for per box.
        """
        grid_h, grid_w = map(int, out.shape[1: 3])

        anchors = [anchors[i] for i in mask]
        num_anchors = len(anchors)
        # Reshape to batch, height, width, num_anchors, box_params.
        anchors_tensor = K.reshape(K.variable(anchors),
                                   [1, 1, num_anchors, 2])

        out = out[0]
        if len(out.shape) == 3:
            # Flat head (H, W, anchors * params): split the last axis per
            # anchor so that it broadcasts against anchors_tensor.
            out = K.reshape(out, [grid_h, grid_w, num_anchors, -1])

        box_xy = K.get_value(K.sigmoid(out[..., :2]))
        # Width/height are predicted as log-scale factors of the anchor size.
        box_wh = K.get_value(K.exp(out[..., 2:4]) * anchors_tensor)
        box_confidence = K.get_value(K.sigmoid(out[..., 4]))
        box_confidence = np.expand_dims(box_confidence, axis=-1)
        box_class_probs = K.get_value(K.sigmoid(out[..., 5:]))

        # col[i, j] == j and row[i, j] == i, both of shape (grid_h, grid_w).
        col = np.tile(np.arange(0, grid_w), grid_h).reshape(-1, grid_w)
        row = np.tile(np.arange(0, grid_h).reshape(-1, 1), grid_w)

        col = col.reshape(grid_h, grid_w, 1, 1).repeat(num_anchors, axis=-2)
        row = row.reshape(grid_h, grid_w, 1, 1).repeat(num_anchors, axis=-2)
        grid = np.concatenate((col, row), axis=-1)

        # Cell-relative offsets -> fractions of the whole image.
        box_xy += grid
        box_xy /= (grid_w, grid_h)
        box_wh /= (416, 416)
        # Box centre -> top-left corner.
        box_xy -= (box_wh / 2.)
        boxes = np.concatenate((box_xy, box_wh), axis=-1)

        return boxes, box_confidence, box_class_probs

    def _filter_boxes(self, boxes, box_confidences, box_class_probs):
        """Filter boxes with object threshold.

        # Arguments
            boxes: ndarray, boxes of objects.
            box_confidences: ndarray, confidences of objects.
            box_class_probs: ndarray, class_probs of objects.

        # Returns
            boxes: ndarray, filtered boxes.
            classes: ndarray, classes for boxes.
            scores: ndarray, scores for boxes.
        """
        box_scores = box_confidences * box_class_probs
        box_classes = np.argmax(box_scores, axis=-1)
        box_class_scores = np.max(box_scores, axis=-1)
        pos = np.where(box_class_scores >= self._t1)

        boxes = boxes[pos]
        classes = box_classes[pos]
        scores = box_class_scores[pos]

        return boxes, classes, scores

    def _nms_boxes(self, boxes, scores):
        """Suppress non-maximal boxes.

        # Arguments
            boxes: ndarray, boxes of objects (columns x, y, w, h).
            scores: ndarray, scores of objects.

        # Returns
            keep: ndarray, index of effective boxes.
        """
        x = boxes[:, 0]
        y = boxes[:, 1]
        w = boxes[:, 2]
        h = boxes[:, 3]

        areas = w * h
        order = scores.argsort()[::-1]

        keep = []
        while order.size > 0:
            # The highest-scoring remaining box is always kept.
            i = order[0]
            keep.append(i)

            xx1 = np.maximum(x[i], x[order[1:]])
            yy1 = np.maximum(y[i], y[order[1:]])
            xx2 = np.minimum(x[i] + w[i], x[order[1:]] + w[order[1:]])
            yy2 = np.minimum(y[i] + h[i], y[order[1:]] + h[order[1:]])

            w1 = np.maximum(0.0, xx2 - xx1 + 1)
            h1 = np.maximum(0.0, yy2 - yy1 + 1)
            inter = w1 * h1

            ovr = inter / (areas[i] + areas[order[1:]] - inter)
            # Drop every box overlapping the kept one by more than _t2;
            # `inds` is relative to order[1:], hence the + 1.
            inds = np.where(ovr <= self._t2)[0]
            order = order[inds + 1]

        keep = np.array(keep)

        return keep

    def _yolo_out(self, outs, shape):
        """Process output of yolo base net.

        # Argument:
            outs: output of yolo base net.
            shape: shape of original image.

        # Returns:
            boxes: ndarray, boxes of objects.
            classes: ndarray, classes of objects.
            scores: ndarray, scores of objects.
            All three are None when no box survives filtering and NMS.
        """
        masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
        anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],
                   [59, 119], [116, 90], [156, 198], [373, 326]]

        boxes, classes, scores = [], [], []

        for out, mask in zip(outs, masks):
            b, c, s = self._process_feats(out, anchors, mask)
            b, c, s = self._filter_boxes(b, c, s)
            boxes.append(b)
            classes.append(c)
            scores.append(s)

        boxes = np.concatenate(boxes)
        classes = np.concatenate(classes)
        scores = np.concatenate(scores)

        # Scale boxes back to original image shape.
        width, height = shape[1], shape[0]
        image_dims = [width, height, width, height]
        boxes = boxes * image_dims

        nboxes, nclasses, nscores = [], [], []
        # NMS is run independently for every detected class.
        for cls in set(classes):
            inds = np.where(classes == cls)
            b = boxes[inds]
            c = classes[inds]
            s = scores[inds]

            keep = self._nms_boxes(b, s)

            nboxes.append(b[keep])
            nclasses.append(c[keep])
            nscores.append(s[keep])

        if not nclasses and not nscores:
            return None, None, None

        boxes = np.concatenate(nboxes)
        classes = np.concatenate(nclasses)
        scores = np.concatenate(nscores)

        return boxes, classes, scores

    def predict(self, image, shape):
        """Detect the objects with yolo.

        # Arguments
            image: ndarray, processed input image.
            shape: shape of original image.

        # Returns
            boxes: ndarray, boxes of objects.
            classes: ndarray, classes of objects.
            scores: ndarray, scores of objects.
        """
        outs = self._yolo.predict(image)
        boxes, classes, scores = self._yolo_out(outs, shape)

        return boxes, classes, scores
This is the YOLO v3 code, and when I run the main program I get this error:
InvalidArgumentError: Incompatible shapes: [13,13,2] vs. [1,1,3,2] [Op:Mul]
Main part is
import cv2
import numpy as np
from yolo_model import YOLO
# Detector with object-score threshold 0.6 and NMS overlap threshold 0.5.
yolo = YOLO(0.6, 0.5)

# One class name per line; the line index is the class id.
file = "data/coco_classes.txt"
with open(file) as f:
    class_name = f.readlines()
all_classes = [c.strip() for c in class_name]

f = "dog_cat.jpg"
path = "images/"+f
image = cv2.imread(path)
if image is None:
    # cv2.imread signals an unreadable/missing file by returning None.
    raise FileNotFoundError(path)

# Network input: 416x416, float32, scaled to [0, 1], with a batch axis.
pimage = cv2.resize(image, (416,416))
pimage = np.array(pimage, dtype = "float32")
pimage /= 255.0
pimage = np.expand_dims(pimage, axis = 0)

# yolo
boxes, classes, scores = yolo.predict(pimage, image.shape)

# predict() returns (None, None, None) when nothing was detected.
if boxes is not None:
    for box, score, cl in zip(boxes, scores, classes):
        x,y,w,h = box
        # Round to the nearest pixel and clamp to the image origin.
        left = max(0, np.floor(x + 0.5).astype(int))
        top = max(0, np.floor(y + 0.5).astype(int))
        right = max(0, np.floor(x + w + 0.5).astype(int))
        bottom = max(0, np.floor(y + h + 0.5).astype(int))
        cv2.rectangle(image, (left,top), (right, bottom),(255,0,0),2)
        cv2.putText(image, "{} {}".format(all_classes[cl],score),(left,top-6),cv2.FONT_HERSHEY_SIMPLEX,0.6, (0,0,255),1,cv2.LINE_AA)
The problem occurs at box_wh = K.get_value(K.exp(out[..., 2:4]) * anchors_tensor). Is the multiplication necessary? And what does box_wh do?

Tensorflow: How to use a generator for fit() which runs in parallel with multiple processes

I am trying to train a model on a data set which does not fit in my RAM.
Therefore I am using a data generator which inherits from tensorflow.keras.utils.Sequence as shown below.
This is working. However because I am doing processing on the images my training is CPU bound. When looking in GPU-Z my GPU is only at 10-20% but one of my CPU Cores is at its max.
To solve this I am trying to run the generator in parallel on all my 16 cores. However, when I set use_multiprocessing=True in the fit() function the program freezes. And using workers=8 does not speed up the process; it just produces batches at uneven intervals.
Batches 1-8 are processed immediately, then there is some delay, and then batches 9-16 are processed.
The code below shows what I am trying to do.
#read the dataset
x, o_y = reader.read_dataset_whole(ETLCharacterGroups.kanji)
#split data into 80/20 percent parts
percentage = round(len(x) / 100 * 80)
x_train = x[:percentage]
x_test = x[percentage:]
y_train = o_y[:percentage]
y_test = o_y[percentage:]
def distort_sample(img : Image) -> (Image, [int], [int]):
Distort the given image randomly.
Randomly applies the transformations:
- rotation
- shear
- scale
- translate
- sharpen
- blur
Returns the distorted image.
offset, scale = (0, 0), (64, 64)
t = random.choice(["sine"]) # "rotate", "shear", "scale",
f = random.choice(["blur", "sharpen", "smooth"])
# randomly apply transformations...
# rotate image
if("rotate" in t):
img = img.rotate(random.uniform(-30, 30))
# shear image
if("shear" in t):
y_shear = random.uniform(-0.2, 0.2)
x_shear = random.uniform(-0.2, 0.2)
img = img.transform(img.size, PImage.AFFINE, (1, x_shear, 0, y_shear, 1, 0))
# scale and translate image
if("scale" in t):
#scale the image
size_x = random.randrange(20, 63)
size_y = random.randrange(20, 63)
scale = (size_x, size_y)
offset = (math.ceil((64 - size_x) / 2), math.ceil((64 - size_y) / 2))
img = img.resize(scale)
# put it again on a black background (translated)
background ='L', (64, 64))
trans_x = random.randrange(0, math.floor((64 - size_x)))
trans_y = random.randrange(0, math.floor((64 - size_y)))
offset = (trans_x, trans_y)
background.paste(img, offset)
img = background
if("sine" in t):
t_img = np.array(img)
A = t_img.shape[0] / 3.0
w = 2.0 / t_img.shape[1]
shift = lambda x: random.uniform(0.15, 0.2) * A * np.sin(-2*np.pi*x * w)
for i in range(t_img.shape[0]):
t_img[:,i] = np.roll(t_img[:,i], int(shift(i)))
img = PImage.fromarray(t_img)
# blur
if("blur" in f):
img = img.filter(ImageFilter.GaussianBlur(radius=random.uniform(0.5, 1.2)))
# sharpen
if("sharpen" in f):
img = img.filter(ImageFilter.SHARPEN)
# smooth
if("smooth" in f):
img = img.filter(ImageFilter.SMOOTH)
return img, offset, scale
class DataGenerator(tf.keras.utils.Sequence):
def __init__(self, x_col, y_col, batch_size, mode="training", shuffle=True):
self.batch_size = batch_size
self.undistorted_images = batch_size // 2
self.shuffle = shuffle
self.indices = len(x_col)
self.x_col = x_col
self.y_col = y_col
def __len__(self):
return self.indices // self.batch_size
def on_epoch_end(self):
rng_state = np.random.get_state()
def __getitem__(self, index):
X, Y = [], []
for i in range(index * self.undistorted_images, (index+1) * self.undistorted_images):
base_img = self.x_col[i]
img = PImage.fromarray(np.uint8(base_img.reshape(64, 64) * 255))
# distort_sample() creates random variations of an image
img, *unused = distort_sample(img)
# add transformed image
X.append(np.array(img).reshape(64, 64, 1))
# add base image
return np.array(X), np.array(Y)
#instantiate generators
training_generator = DataGenerator(x_col = x_train, y_col = y_train, batch_size = 256)
validation_generator = DataGenerator(x_col = x_test, y_col = y_test, batch_size = 256)
#train the model
hist =
#use_multiprocessing=True <- this freezes the program
In the end I needed to make the data generator use multiprocessing. To do this, the arrays needed to be stored in shared memory and then used in the subprocesses.
import multiprocessing as mp
import numpy as np
from PIL import Image as PImage
from PIL import ImageFilter
import random
import math
import tensorflow as tf
shared_dict = {}
def distort_sample(img : PImage) -> (PImage, [int], [int]):
Distort the given image randomly.
Randomly applies the transformations:
rotation, shear, scale, translate,
Randomly applies the filter:
sharpen, blur, smooth
Returns the distorted image.
offset, scale = (0, 0), (64, 64)
t = random.choice(["sine", "rotate", "shear", "scale"])
f = random.choice(["blur", "sharpen", "smooth"])
# randomly apply transformations...
# rotate image
if("rotate" in t):
img = img.rotate(random.uniform(-15, 15))
# shear image
if("shear" in t):
y_shear = random.uniform(-0.2, 0.2)
x_shear = random.uniform(-0.2, 0.2)
img = img.transform(img.size, PImage.AFFINE, (1, x_shear, 0, y_shear, 1, 0))
# scale and translate image
if("scale" in t):
#scale the image
size_x = random.randrange(25, 63)
size_y = random.randrange(25, 63)
scale = (size_x, size_y)
offset = (math.ceil((64 - size_x) / 2), math.ceil((64 - size_y) / 2))
img = img.resize(scale)
# put it again on a black background (translated)
background ='L', (64, 64))
trans_x = random.randrange(0, math.floor((64 - size_x)))
trans_y = random.randrange(0, math.floor((64 - size_y)))
offset = (trans_x, trans_y)
background.paste(img, offset)
img = background
if("sine" in t):
t_img = np.array(img)
A = t_img.shape[0] / 3.0
w = 2.0 / t_img.shape[1]
shift_factor = random.choice([-1, 1]) * random.uniform(0.15, 0.2)
shift = lambda x: shift_factor * A * np.sin(-2*np.pi*x * w)
for i in range(t_img.shape[0]):
t_img[:,i] = np.roll(t_img[:,i], int(shift(i)))
img = PImage.fromarray(t_img)
# blur
if("blur" in f):
img = img.filter(ImageFilter.GaussianBlur(radius=random.uniform(0.5, 1.2)))
# sharpen
if("sharpen" in f):
img = img.filter(ImageFilter.SHARPEN)
# smooth
if("smooth" in f):
img = img.filter(ImageFilter.SMOOTH)
return img, offset, scale
def generator_func(start_index, end_index, x_shape, y_shape):
X, Y = [], []
x_loc = np.frombuffer(shared_dict["x"], dtype="float16").reshape(x_shape)
y_loc = np.frombuffer(shared_dict["y"], dtype="b").reshape(y_shape)
for i in range(start_index, end_index):
base_img = x_loc[i]
img = PImage.fromarray(np.uint8(base_img.reshape(64, 64) * 255))
img, *unused = distort_sample(img)
# add transformed image
X.append(np.array(img).reshape(64, 64, 1))
X.append(np.array(img).reshape(64, 64, 1))
# add base image
return X, Y
def generator_initializer(_x_shared, _y_shared):
shared_dict["x"] = _x_shared
shared_dict["y"] = _y_shared
def generator_func(start_index, end_index, x_shape, y_shape):
X, Y = [], []
x_loc = np.frombuffer(shared_dict["x"], dtype="float16").reshape(x_shape)
y_loc = np.frombuffer(shared_dict["y"], dtype="b").reshape(y_shape)
for i in range(start_index, end_index):
base_img = x_loc[i]
img = PImage.fromarray(np.uint8(base_img.reshape(64, 64) * 255))
img, *unused = distort_sample(img)
# add transformed image
X.append(np.array(img).reshape(64, 64, 1))
X.append(np.array(img).reshape(64, 64, 1))
# add base image
return X, Y
class DataGenerator(tf.keras.utils.Sequence):
def __init__(self, num_samples, batch_size,
percentage, mode,
x_shared, y_shared,
x_np_shape, y_np_shape,
processes, shuffle=True):
self.num_samples = num_samples
# 50% original images + 50% augmented images
self.batch_size = batch_size // 2
self.percentage = percentage
# an offset to devide the data set into test and train
self.start_index = 0
if(mode == "testing"):
self.start_index = num_samples - (num_samples // 100 * percentage)
# is this a train or a test generator
self.mode = mode
# how many processes should be used for this generator
self.processes = processes
# should the arrays be shuffled after each epoch
self.shuffle = shuffle
self.x_np_shape = x_np_shape
self.y_np_shape = y_np_shape
# a pool of processes for generating augmented data
self.pool = mp.Pool(processes=self.processes,
initargs=(x_shared, y_shared))
def __len__(self):
return (self.num_samples // 100 * self.percentage) // self.batch_size
def on_epoch_end(self):
rng_state = np.random.get_state()
def __getitem__(self, index):
arguments = []
slice_size = self.batch_size // self.processes
current_batch = index * self.batch_size
for i in range(self.processes):
slice_start = self.start_index + (current_batch + i * slice_size)
slice_end = self.start_index + (current_batch + (i+1) * slice_size)
arguments.append([slice_start, slice_end, self.x_np_shape, self.y_np_shape])
return_values = self.pool.starmap(generator_func, arguments)
X, Y = [], []
for imgs, labels in return_values:
return np.concatenate(X).astype(np.float16), np.concatenate(Y).astype(np.float16)
