I am trying to implement a custom dataset loader. First I resize the images and labels by the same ratio, chosen from the range (0.98, 1.1); then I randomly crop both images and labels with the same parameters so that I can feed them into the network. However, I am getting an error from PyTorch's functional API. Here is my code:
class RandomCrop(object):
    """Randomly crop an image and its mask using the same crop window.

    The transform is called with a dict holding a PIL image under
    ``"image"`` and its label under ``"mask"`` and returns a dict with the
    same keys containing the cropped pair.

    Args:
        size: ``(height, width)`` of the crop, or a single int for a
            square crop.
        padding: Optional padding applied to both inputs before cropping.
        pad_if_needed: Pad inputs that are smaller than ``size`` so the
            crop is always possible.
        fill: Fill value used when padding.
        padding_mode: Padding mode forwarded to the pad function.
    """

    def __init__(self, size, padding=None, pad_if_needed=True, fill=0, padding_mode='constant'):
        # A bare int is shorthand for a square crop.
        if isinstance(size, int):
            size = (size, size)
        self.size = size
        self.padding = padding
        self.pad_if_needed = pad_if_needed
        self.fill = fill
        self.padding_mode = padding_mode

    @staticmethod
    def get_params(img, output_size):
        """Return ``(top, left, height, width)`` of a random crop window.

        ``img.size`` is read as ``(width, height)`` and ``output_size`` as
        ``(height, width)``. Declared static because it is invoked as
        ``self.get_params(img, size)`` and takes no ``self`` parameter.
        """
        w, h = img.size
        th, tw = output_size
        if w == tw and h == th:
            return 0, 0, h, w
        i = random.randint(0, h - th)
        j = random.randint(0, w - tw)
        return i, j, th, tw

    def __call__(self, data):
        """Pad (if required) and crop ``data["image"]`` and ``data["mask"]``."""
        img, mask = data["image"], data["mask"]
        # Use the image-transform pad (works on PIL images); the tensor pad
        # from torch.nn.functional fails with "'Image' object has no
        # attribute 'dim'".
        pad = transforms.functional.pad

        if self.padding is not None:
            img = pad(img, self.padding, self.fill, self.padding_mode)
            mask = pad(mask, self.padding, self.fill, self.padding_mode)

        # pad the width if needed
        if self.pad_if_needed and img.size[0] < self.size[1]:
            img = pad(img, (self.size[1] - img.size[0], 0), self.fill, self.padding_mode)
            mask = pad(mask, (self.size[1] - mask.size[0], 0), self.fill, self.padding_mode)
        # pad the height if needed
        if self.pad_if_needed and img.size[1] < self.size[0]:
            img = pad(img, (0, self.size[0] - img.size[1]), self.fill, self.padding_mode)
            mask = pad(mask, (0, self.size[0] - mask.size[1]), self.fill, self.padding_mode)

        # One set of parameters for both inputs keeps image and mask aligned.
        i, j, h, w = self.get_params(img, self.size)
        crop_image = transforms.functional.crop(img, i, j, h, w)
        crop_mask = transforms.functional.crop(mask, i, j, h, w)
        return {"image": crop_image, "mask": crop_mask}
Here is the error:
AttributeError: 'Image' object has no attribute 'dim'

I had mistakenly imported `torch.nn.functional.pad` instead of `torchvision.transforms.functional.pad`. After fixing the import, everything ran smoothly.


vtk: how to obtain the image pixel index from a world point

If I pick a world point from an image, how can I convert the world coordinate to an image index?
import vtk
import numpy as np
from vtk.util.numpy_support import numpy_to_vtk
def numpyToVTK(data, multi_component=False, type='float'):
    """Convert a NumPy array into a ``vtkImageData``.

    Args:
        data: For ``multi_component=False`` a 2-D or 3-D scalar array
            (a 2-D array gets a trailing singleton axis). For
            ``multi_component=True`` a 3-D array whose last axis holds the
            components of each pixel.
        multi_component: Whether the last axis holds per-pixel components.
        type: ``'float'`` or ``'char'``; selects the VTK scalar type.

    Returns:
        A ``vtkImageData`` whose point scalars hold ``data``.

    Raises:
        RuntimeError: If ``type`` is neither ``'float'`` nor ``'char'``.
    """
    if type == 'float':
        data_type = vtk.VTK_FLOAT
    elif type == 'char':
        data_type = vtk.VTK_UNSIGNED_CHAR
    else:
        raise RuntimeError('unknown type')

    if not multi_component:
        if len(data.shape) == 2:
            data = data[:, :, np.newaxis]
        # Reverse the axes before flattening so the first array axis varies
        # fastest in the flat buffer.
        flat_data_array = data.transpose(2, 1, 0).flatten()
        vtk_data = numpy_to_vtk(num_array=flat_data_array, deep=True, array_type=data_type)
        shape = data.shape
    else:
        assert len(data.shape) == 3, 'only test for 2D RGB'
        flat_data_array = data.transpose(1, 0, 2)
        flat_data_array = flat_data_array.reshape(-1, data.shape[2])
        vtk_data = numpy_to_vtk(num_array=flat_data_array, deep=True, array_type=data_type)
        shape = [data.shape[0], data.shape[1], 1]

    img = vtk.vtkImageData()
    # Attach the converted buffer; without this the image has no pixel data.
    img.GetPointData().SetScalars(vtk_data)
    img.SetDimensions(shape[0], shape[1], shape[2])
    return img
# Actors from the previous mouse event, kept so they can be replaced.
sphereActor = None
textActor = None


def mouseMoveEvent(iren, event):
    """Mark the world point under the cursor with a sphere and a label.

    Args:
        iren: The interactor that fired the event; supplies the cursor
            position.
        event: Event name passed by VTK (unused).

    Relies on the module-level renderer ``render``.
    """
    global sphereActor, textActor

    x, y = iren.GetEventPosition()
    picker = vtk.vtkWorldPointPicker()
    picker.Pick(x, y, 0, render)
    worldPoint = picker.GetPickPosition()

    ## convert world point to image index
    # NOTE(review): vtkImageData.TransformPhysicalPointToContinuousIndex is
    # the suggested way to map worldPoint to an image index.

    sphere = vtk.vtkSphereSource()
    sphere.SetCenter(worldPoint[0], worldPoint[1], worldPoint[2])
    sphereMapper = vtk.vtkPolyDataMapper()
    sphereMapper.SetInputConnection(sphere.GetOutputPort())

    # Drop the previous marker before creating the new one; otherwise every
    # mouse move would leave a sphere behind.
    if sphereActor is not None:
        render.RemoveActor(sphereActor)
    sphereActor = vtk.vtkActor()
    sphereActor.SetMapper(sphereMapper)
    # VTK colour components are in the range 0..1.
    sphereActor.GetProperty().SetColor(1, 0, 0)
    render.AddActor(sphereActor)

    if textActor is not None:
        render.RemoveActor2D(textActor)
    textActor = vtk.vtkTextActor()
    textActor.SetInput('world coordinate: (%.2f, %.2f, %.2f)'%(worldPoint[0], worldPoint[1], worldPoint[2]))
    textActor.GetTextProperty().SetColor(1, 0, 0)
    render.AddActor2D(textActor)

    iren.Render()
# Synthetic 128x128 test image whose pixel value is row index + column index.
row_idx, col_idx = np.indices((128, 128))
img = (row_idx + col_idx).astype(float)
vtkImg = numpyToVTK(img)

imgActor = vtk.vtkImageActor()
imgActor.SetInputData(vtkImg)

render = vtk.vtkRenderer()
render.AddActor(imgActor)
# render.Render()

renWin = vtk.vtkRenderWindow()
renWin.AddRenderer(render)

iren = vtk.vtkRenderWindowInteractor()
iren.SetRenderWindow(renWin)
iren.AddObserver('MouseMoveEvent', mouseMoveEvent)

# Start the event loop so the mouse-move observer actually fires.
iren.Initialize()
renWin.Render()
iren.Start()
In the above code, if I don't rotate the image, the world point is (x, y, 0):
This agrees with what I know. For the world point (x, y, z) and the image index (i, j, k), the conversion should be:
worldPoint (x,y,z) = i*spacingX*directionX + j*spacingY*directionY + k*spacingZ*directionZ + originPoint
In the above code, the image is converted from numpy, thus:
directionX = [1, 0, 0]
directionY = [0, 1, 0]
directionZ = [0, 0, 1]
originPoint=[0, 0, 0]
In this way, x=i, y=j, z=k. Since this image is a 2D image, the k should be 0 and 'z' should also be 0.
Then, I rotate the image, z is not 0. Like the following picture.
I don't know why z is -0.24.
This suggests that the following conversion is wrong. How can I obtain the image index from the world point?
worldPoint (x,y,z) = i*spacingX*directionX + j*spacingY*directionY + k*spacingZ*directionZ + originPoint
Any suggestion is appreciated!
vtkImageData has the method TransformPhysicalPointToContinuousIndex for going from world space to image space and TransformIndexToPhysicalPoint to go the other way.
I don't think the computation you're doing is right, since the direction is a 3x3 rotation matrix rather than three independent axis vectors.

When I run YOLOv3 I get a shape error

"""YOLO v3 output."""
import numpy as np
import keras.backend as K
from keras.models import load_model
import os
class YOLO:
    """YOLO v3 detector: runs the Keras model and decodes its outputs."""

    def __init__(self, obj_threshold, nms_threshold):
        """Init.

        # Arguments
            obj_threshold: Integer, threshold for object.
            nms_threshold: Integer, threshold for box.
        """
        self._t1 = obj_threshold
        self._t2 = nms_threshold
        self._yolo = load_model('data/yolo.h5')

    @staticmethod
    def _sigmoid(x):
        """Element-wise logistic function (tanh form avoids exp overflow)."""
        return 0.5 * (1.0 + np.tanh(0.5 * x))

    def _process_feats(self, out, anchors, mask):
        """process output features.

        # Arguments
            out: Tensor (1, N, N, 3, 4 + 1 + 80), output feature map of yolo.
                The flattened head output (1, N, N, 3 * (4 + 1 + 80)) is
                accepted as well and split along the anchor axis.
            anchors: List, anchors for box.
            mask: List, mask for anchors.

        # Returns
            boxes: ndarray (N, N, 3, 4), x,y,w,h for per box.
            box_confidence: ndarray (N, N, 3, 1), confidence for per box.
            box_class_probs: ndarray (N, N, 3, 80), class probs for per box.
        """
        out = np.asarray(out, dtype=np.float32)
        selected = [anchors[i] for i in mask]
        num_anchors = len(selected)

        # Multiplying (N, N, 2) by the (1, 1, 3, 2) anchor tensor cannot
        # broadcast, so a flattened head output must be split per anchor first.
        if out.ndim == 4:
            batch, rows, cols = out.shape[:3]
            out = out.reshape(batch, rows, cols, num_anchors, -1)
        grid_h, grid_w = int(out.shape[1]), int(out.shape[2])

        # Reshape to batch, height, width, num_anchors, box_params.
        anchors_tensor = np.asarray(selected, dtype=out.dtype).reshape(1, 1, num_anchors, 2)
        out = out[0]

        box_xy = self._sigmoid(out[..., :2])
        # The network predicts log-scale factors; multiplying by the anchor
        # size turns them into box width/height in input-image pixels.
        box_wh = np.exp(out[..., 2:4]) * anchors_tensor
        box_confidence = self._sigmoid(out[..., 4:5])
        box_class_probs = self._sigmoid(out[..., 5:])

        # Cell offsets (column, row) for every grid cell, broadcast over anchors.
        col, row = np.meshgrid(np.arange(grid_w), np.arange(grid_h))
        grid = np.stack((col, row), axis=-1)[:, :, np.newaxis, :].astype(out.dtype)

        box_xy += grid
        box_xy /= np.array((grid_w, grid_h), dtype=out.dtype)
        box_wh /= np.array((416, 416), dtype=out.dtype)
        # Convert box centre to top-left corner.
        box_xy -= (box_wh / 2.)
        boxes = np.concatenate((box_xy, box_wh), axis=-1)

        return boxes, box_confidence, box_class_probs

    def _filter_boxes(self, boxes, box_confidences, box_class_probs):
        """Filter boxes with object threshold.

        # Arguments
            boxes: ndarray, boxes of objects.
            box_confidences: ndarray, confidences of objects.
            box_class_probs: ndarray, class_probs of objects.

        # Returns
            boxes: ndarray, filtered boxes.
            classes: ndarray, classes for boxes.
            scores: ndarray, scores for boxes.
        """
        box_scores = box_confidences * box_class_probs
        box_classes = np.argmax(box_scores, axis=-1)
        box_class_scores = np.max(box_scores, axis=-1)
        pos = np.where(box_class_scores >= self._t1)

        boxes = boxes[pos]
        classes = box_classes[pos]
        scores = box_class_scores[pos]

        return boxes, classes, scores

    def _nms_boxes(self, boxes, scores):
        """Suppress non-maximal boxes.

        # Arguments
            boxes: ndarray, boxes of objects.
            scores: ndarray, scores of objects.

        # Returns
            keep: ndarray, index of effective boxes.
        """
        x = boxes[:, 0]
        y = boxes[:, 1]
        w = boxes[:, 2]
        h = boxes[:, 3]

        areas = w * h
        order = scores.argsort()[::-1]

        keep = []
        while order.size > 0:
            i = order[0]
            keep.append(i)
            rest = order[1:]

            xx1 = np.maximum(x[i], x[rest])
            yy1 = np.maximum(y[i], y[rest])
            xx2 = np.minimum(x[i] + w[i], x[rest] + w[rest])
            yy2 = np.minimum(y[i] + h[i], y[rest] + h[rest])

            w1 = np.maximum(0.0, xx2 - xx1 + 1)
            h1 = np.maximum(0.0, yy2 - yy1 + 1)
            inter = w1 * h1

            ovr = inter / (areas[i] + areas[rest] - inter)
            inds = np.where(ovr <= self._t2)[0]
            # +1 because `ovr` is indexed relative to order[1:].
            order = order[inds + 1]

        # Integer dtype keeps the result usable as an index even when empty.
        keep = np.array(keep, dtype=int)

        return keep

    def _yolo_out(self, outs, shape):
        """Process output of yolo base net.

        # Argument:
            outs: output of yolo base net.
            shape: shape of original image.

        # Returns:
            boxes: ndarray, boxes of objects.
            classes: ndarray, classes of objects.
            scores: ndarray, scores of objects.
            All three are None when nothing passes the thresholds.
        """
        masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
        anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],
                   [59, 119], [116, 90], [156, 198], [373, 326]]

        boxes, classes, scores = [], [], []

        for out, mask in zip(outs, masks):
            b, c, s = self._process_feats(out, anchors, mask)
            b, c, s = self._filter_boxes(b, c, s)
            boxes.append(b)
            classes.append(c)
            scores.append(s)

        boxes = np.concatenate(boxes)
        classes = np.concatenate(classes)
        scores = np.concatenate(scores)

        # Scale boxes back to original image shape.
        width, height = shape[1], shape[0]
        image_dims = [width, height, width, height]
        boxes = boxes * image_dims

        nboxes, nclasses, nscores = [], [], []
        # Non-maximum suppression is applied independently per class.
        for cls in set(classes):
            inds = np.where(classes == cls)
            b = boxes[inds]
            c = classes[inds]
            s = scores[inds]

            keep = self._nms_boxes(b, s)

            nboxes.append(b[keep])
            nclasses.append(c[keep])
            nscores.append(s[keep])

        if not nclasses and not nscores:
            return None, None, None

        boxes = np.concatenate(nboxes)
        classes = np.concatenate(nclasses)
        scores = np.concatenate(nscores)

        return boxes, classes, scores

    def predict(self, image, shape):
        """Detect the objects with yolo.

        # Arguments
            image: ndarray, processed input image.
            shape: shape of original image.

        # Returns
            boxes: ndarray, boxes of objects.
            classes: ndarray, classes of objects.
            scores: ndarray, scores of objects.
        """
        outs = self._yolo.predict(image)
        boxes, classes, scores = self._yolo_out(outs, shape)

        return boxes, classes, scores
This is the YOLO v3 code, and when I run the main program I get this error:
InvalidArgumentError: Incompatible shapes: [13,13,2] vs. [1,1,3,2] [Op:Mul]
Main part is
import cv2
import numpy as np
from yolo_model import YOLO
# Detector with object threshold 0.6 and NMS threshold 0.5.
yolo = YOLO(0.6, 0.5)

file = "data/coco_classes.txt"
with open(file) as f:
    class_name = f.readlines()
all_classes = [c.strip() for c in class_name]

f = "dog_cat.jpg"
path = "images/"+f
image = cv2.imread(path)
if image is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise FileNotFoundError(path)

# The network expects a 416x416 float image scaled to [0, 1] with a batch axis.
pimage = cv2.resize(image, (416,416))
pimage = np.array(pimage, dtype = "float32")
pimage /= 255.0
pimage = np.expand_dims(pimage, axis = 0)

# yolo
boxes, classes, scores = yolo.predict(pimage, image.shape)

# predict() returns (None, None, None) when nothing is detected.
if boxes is not None:
    for box, score, cl in zip(boxes, scores, classes):
        x, y, w, h = box
        # Round to the nearest pixel and clamp at the image origin.
        left = max(0, int(np.floor(x + 0.5)))
        top = max(0, int(np.floor(y + 0.5)))
        right = max(0, int(np.floor(x + w + 0.5)))
        bottom = max(0, int(np.floor(y + h + 0.5)))
        cv2.rectangle(image, (left, top), (right, bottom),(255,0,0),2)
        cv2.putText(image, "{} {}".format(all_classes[cl],score),(left,top-6),cv2.FONT_HERSHEY_SIMPLEX,0.6, (0,0,255),1,cv2.LINE_AA)
The problem occurs at box_wh = K.get_value(K.exp(out[..., 2:4]) * anchors_tensor). Is the multiplication necessary? And what does box_wh represent?

Python OpenCv2 place image over face found

I am loading several images that should be placed over my face, and I am having difficulty getting each image to cover the square drawn around the detected face. I have looked at many resources, but for some reason I receive an error when attempting to follow their methods.
Every time I do so , I receive an error
ValueError: could not broadcast input array from shape (334,334,3) into shape (234,234,3)
I think the images might be too large, however I tried to resize them to see if they will fit to no avail.
here is my code:
import cv2
import sys
import logging as log
import datetime as dt
from time import sleep
import os
import random
from timeit import default_timer as timer
cascPath = "haarcascade_frontalface_default.xml"
faceCascade = cv2.CascadeClassifier(cascPath)
video_capture = cv2.VideoCapture(0)
anterior = 0
#s_img = cv2.imread("my.jpg")

# Step 1: cut every detected face out of the PNGs in Faces/ and save it.
increment = 0
for filename in os.listdir("Faces/"):
    if filename.endswith(".png"):
        FullFile = (os.path.join("Faces/", filename))
        frame = cv2.imread(FullFile, cv2.IMREAD_UNCHANGED)
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = faceCascade.detectMultiScale( gray,scaleFactor=1.1, minNeighbors=5, minSize=(30, 30) )
        edges = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 9, 9)
        for (x, y, w, h) in faces:
            roi_color = frame[y:( y ) + ( h ), x:x + w]
            status = cv2.imwrite('export/faces_detected'+ str( increment ) +'.png', roi_color)
            increment = increment + 1

# Step 2: load the exported faces as overlay masks.
masks = []
for filename in os.listdir("export/"):
    if filename.endswith(".png"):
        FullFile = (os.path.join("export/", filename))
        s_img = cv2.imread(FullFile)
        if s_img is not None:
            masks.append(s_img)

MasksSize = len(masks)
if MasksSize == 0:
    # random.randint(0, -1) would raise a less helpful ValueError below.
    raise SystemExit('No mask images found in export/.')

Start = timer()
End = timer()
nrand = random.randint(0, MasksSize -1 )
increment = 0

# Step 3: paste a mask over every face seen by the webcam.
while True:
    if not video_capture.isOpened():
        print('Unable to load camera.')
        sleep(5)
        continue

    # Capture frame-by-frame
    ret, frame = video_capture.read()
    if not ret:
        break

    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    faces = faceCascade.detectMultiScale(
        gray,
        scaleFactor=1.1,
        minNeighbors=5,
        minSize=(30, 30)
    )
    edges = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 9, 9)

    # Draw a rectangle around the faces
    for (x, y, w, h) in faces:
        # Switch to another random mask every 3 seconds.
        if (End - Start) > 3:
            Start = timer()
            End = timer()
            nrand = random.randint(0, MasksSize -1 )

        # -75 and +20 added to fit my face
        cv2.rectangle(frame, (x, y - 75), (x+w, y+h+20), (0, 255, 0), 2)
        s_img = masks[nrand]
        increment = increment + 1

        # The overlay must have exactly the shape of the target region.
        # Resizing it to the detected (w, h) box avoids
        # "could not broadcast input array from shape ... into shape ...".
        maskresize = cv2.resize(s_img, (w, h))
        frame[y:y + h, x:x + w] = maskresize
    End = timer()

    if anterior != len(faces):
        anterior = len(faces)
        log.info("faces: "+str(len(faces))+" at "+str(dt.datetime.now()))

    # Display the resulting frame
    cv2.imshow('Video', frame)
    #cv2.imshow('Video', cartoon)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# When everything is done, release the capture
video_capture.release()
cv2.destroyAllWindows()
In the following line,
frame[y:y+s_img.shape[0] , x:x+s_img.shape[1]] = s_img
you are trying to assign s_img to frame[y:y+s_img.shape[0] , x:x+s_img.shape[1]], but the two have different shapes.
You can check the shapes of the two by printing the shape (it will be the same as the shapes mentioned in the error).
Try resizing s_img to the shape of the target region and then assign it.
Refer to this link:
I used this function to resize the image to scale.
def image_resize(image, width = None, height = None, inter = cv2.INTER_AREA):
    """Resize ``image`` to a target width or height, keeping its aspect ratio.

    Args:
        image: Array whose first two axes are (height, width).
        width: Target width in pixels, or None.
        height: Target height in pixels, or None. Only used when ``width``
            is None; if both are given, ``width`` wins and ``height`` is
            ignored.
        inter: Interpolation flag forwarded to ``cv2.resize``.

    Returns:
        The resized image, or ``image`` itself when neither dimension is
        given.
    """
    # grab the image size
    (h, w) = image.shape[:2]

    # if both the width and height are None, then return the
    # original image
    if width is None and height is None:
        return image

    if width is None:
        # scale by the height ratio
        r = height / float(h)
        dim = (int(w * r), height)
    else:
        # scale by the width ratio
        r = width / float(w)
        dim = (width, int(h * r))

    # cv2.resize takes the target size as (width, height)
    return cv2.resize(image, dim, interpolation = inter)
Then later on called
r= image_resize(s_img, height = h, width=w)
frame[y:y+r.shape[0] , x:x+r.shape[1]] = r
Answer taken from here too:
Resize an image without distortion OpenCV

stylegan encoder print image is too small

I am trying to print a style-mixing image from the StyleGAN encoder, but my printed images are too small, and I am not sure what I am doing wrong.
my latent space
# Latent representations of the two faces to be mixed.
jon = np.load('latent_representations/example0.npy')
drogo = np.load('latent_representations/example1.npy')
# Loading already learned latent directions
smile_direction = np.load('ffhq_dataset/latent_directions/smile.npy')
gender_direction = np.load('ffhq_dataset/latent_directions/gender.npy')
age_direction = np.load('ffhq_dataset/latent_directions/age.npy')
my draw style mix loop
def draw_style_mixing_figure(png, Gs, w, h, src_dlatents, dst_dlatents, style_ranges, out_size=(512, 512)):
    """Build a style-mixing grid from source and destination dlatents.

    The first row shows the source images, the first column the destination
    images, and cell (row, col) shows destination ``row`` with the layers in
    ``style_ranges[row]`` replaced by those of source ``col``.

    Args:
        png: Output path; accepted for interface compatibility but unused,
            the canvas is returned rather than saved.
        Gs: Generator network exposing ``components.synthesis.run``.
        w: Width in pixels of one grid cell.
        h: Height in pixels of one grid cell.
        src_dlatents: Array indexed as [image, layer, component].
        dst_dlatents: Array indexed as [image, layer, component].
        style_ranges: For every destination image, the layer indices to
            copy from the source.
        out_size: ``(width, height)`` the finished canvas is resized to.
            The default shrinks the whole grid to 512x512, so every cell
            ends up far smaller than ``w`` x ``h``; pass ``None`` to get
            the full-resolution canvas.

    Returns:
        The assembled ``PIL.Image``.
    """
    synthesize = Gs.components.synthesis.run
    src_images = synthesize(src_dlatents, randomize_noise=False, **synthesis_kwargs)
    dst_images = synthesize(dst_dlatents, randomize_noise=False, **synthesis_kwargs)

    # One extra row and column hold the unmixed source/destination images.
    canvas = PIL.Image.new('RGB', (w * (len(src_dlatents) + 1), h * (len(dst_dlatents) + 1)), 'white')
    for col, src_image in enumerate(list(src_images)):
        canvas.paste(PIL.Image.fromarray(src_image, 'RGB'), ((col + 1) * w, 0))
    for row, dst_image in enumerate(list(dst_images)):
        canvas.paste(PIL.Image.fromarray(dst_image, 'RGB'), (0, (row + 1) * h))
        row_dlatents = np.stack([dst_dlatents[row]] * len(src_dlatents))
        row_dlatents[:, style_ranges[row]] = src_dlatents[:, style_ranges[row]]
        row_images = synthesize(row_dlatents, randomize_noise=False, **synthesis_kwargs)
        for col, image in enumerate(list(row_images)):
            canvas.paste(PIL.Image.fromarray(image, 'RGB'), ((col + 1) * w, (row + 1) * h))

    if out_size is None:
        return canvas
    return canvas.resize(out_size)
my printing image order
# Keyword arguments forwarded to every synthesis call made by
# draw_style_mixing_figure.
synthesis_kwargs = dict(output_transform=dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True), minibatch_size=1)
# NOTE(review): not referenced anywhere in the visible code.
_Gs_cache = dict()
# NOTE(review): range(1,1) is empty, so no layers are copied from the source
# and the mixed cell equals the destination image -- confirm the intended
# layer range. The trailing comma only wraps the call's result in a 1-tuple.
draw_style_mixing_figure(os.path.join(config.result_dir, 'style-mixing.png'), Gs, w=1024, h=1024, src_dlatents=jon.reshape((1, 12, 512)), dst_dlatents=drogo.reshape((1, 12, 512)), style_ranges=[range(1,1)]),
But resulting pictures are too small
any idea how to make them bigger?

Delay in output video stream when using YOLOV3 Detection using OpenCV

This is my mask-detection code using YOLOv3 weights that I trained myself. Whenever I run the program, the output video of the detections is delayed. Here is the code; please have a look.
import cv2
import numpy as np
net = cv2.dnn.readNet("yolov3_custom_final.weights", "yolov3_custom.cfg")

# NOTE(review): the class-names path is empty in the source; fill in the
# real file before running.
with open("", "r") as f:
    classes = f.read().splitlines()

cap = cv2.VideoCapture(0 + cv2.CAP_DSHOW)
font = cv2.FONT_HERSHEY_PLAIN
# The output layer names do not change between frames, so query them once.
output = net.getUnconnectedOutLayersNames()

while True:
    ret, img = cap.read()
    if not ret:
        break
    height, width, _ = img.shape

    blob = cv2.dnn.blobFromImage(img, 1 / 255, (416, 416), (0, 0, 0), swapRB=True, crop=False)
    net.setInput(blob)
    layers = net.forward(output)

    box = []
    confidences = []
    class_ids = []

    for out in layers:
        for detection in out:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > 0.3:
                # Detections are normalised centre/size; convert them to a
                # pixel-space top-left corner plus width/height.
                centre_x = int(detection[0] * width)
                centre_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)
                x = int(centre_x - w / 2)
                y = int(centre_y - h / 2)
                box.append([x, y, w, h])
                confidences.append(float(confidence))
                class_ids.append(class_id)

    indexes = np.array(cv2.dnn.NMSBoxes(box, confidences, 0.5, 0.4), dtype=int)
    colors = np.random.uniform(0, 255, size=(len(box), 3))

    for i in indexes.flatten():
        x, y, w, h = box[i]
        label = str(classes[class_ids[i]])
        confidence = str(round(confidences[i], 2))
        # Pass the colour as a plain list of floats.
        color = colors[i].tolist()
        cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
        cv2.putText(img, label + "I" + confidence, (x, y + 20), font, 2, (255, 255, 255), 2)

    cv2.imshow("Final", img)
    if cv2.waitKey(1) & 0xff == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()
Can someone please help me with this issue, or suggest a way to reduce the lag in my output video stream?
After some research, I found a likely answer to this question. I am running my YOLO model on a local machine that has no GPU, and that is what causes the delay: each frame must finish processing on the CPU before the next frame is read.
