Adding multiple classes in Mask R-CNN - python-3.x

I am using Matterport Mask R-CNN as my model and I'm trying to build my dataset for training. After much deliberation over the problem below, I think what I'm actually asking is: how do I add more than one class (+ BG)?
I get the following AssertionError:
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-21-c20768952b65> in <module>()
     15
     16 # display image with masks and bounding boxes
---> 17 display_instances(image, bbox, masks, class_ids/4, train_set.class_names)

/usr/local/lib/python3.6/dist-packages/mask_rcnn-2.1-py3.6.egg/mrcnn/visualize.py in display_instances(image, boxes, masks, class_ids, class_names, scores, title, figsize, ax, show_mask, show_bbox, colors, captions)
    103         print("\n*** No instances to display *** \n")
    104     else:
--> 105         assert boxes.shape[0] == masks.shape[-1] == class_ids.shape[0]
    106
    107     # If no axis is passed, create one and automatically call show()

AssertionError:
The problem appears to come from masks.shape[-1] == class_ids.shape[0] evaluating to False, which should not be the case.
I have now traced it back to masks.shape[-1] being 4 times the value of class_ids.shape[0], and I think this may have something to do with having 4 classes in the data. Unfortunately, I haven't worked out how to solve this problem.
# load the masks for an image
def load_mask(self, image_id):
    # get details of image
    info = self.image_info[image_id]
    # define box file location
    path = info['annotation']
    # load XML
    boxes, w, h = self.extract_boxes(path)
    # create one array for all masks, each on a different channel
    masks = zeros([h, w, len(boxes)], dtype='uint8')
    # create masks
    class_ids = list()
    for i in range(len(boxes)):
        box = boxes[i]
        row_s, row_e = box[1], box[3]
        col_s, col_e = box[0], box[2]
        masks[row_s:row_e, col_s:col_e, i] = 1
        class_ids.append(self.class_names.index('Resistor'))
        class_ids.append(self.class_names.index('LED'))
        class_ids.append(self.class_names.index('Capacitor'))
        class_ids.append(self.class_names.index('Diode'))
    return masks, asarray(class_ids, dtype='int32')

# load the masks and the class ids
mask, class_ids = train_set.load_mask(image_id)
print(mask, "and", class_ids)
# display image with masks and bounding boxes
display_instances(image, bbox, mask, class_ids, train_set.class_names)

There are a couple of modifications you need to make to add multiple classes:
1) In load_dataset, register each class with self.add_class("dataset", class_id, "class_name"), and modify the last line (the add_image call) to pass class_ids for the number of classes you have.
# load the dataset definitions
def load_dataset(self, dataset_dir, is_train=True):
    # define the classes
    self.add_class("dataset", 1, "car")
    self.add_class("dataset", 2, "rider")
    # define data locations
    images_dir = dataset_dir + '/images_mod/'
    annotations_dir = dataset_dir + '/annots_mod/'
    # find all images
    for filename in listdir(images_dir):
        # extract image id
        image_id = filename[:-4]
        # skip all images after 3000 if we are building the train set
        if is_train and int(image_id) >= 3000:
            continue
        # skip all images before 3000 if we are building the test/val set
        if not is_train and int(image_id) < 3000:
            continue
        img_path = images_dir + filename
        ann_path = annotations_dir + image_id + '.xml'
        # add to dataset
        self.add_image('dataset', image_id=image_id, path=img_path, annotation=ann_path, class_ids=[0, 1, 2])
2) Now, in extract_boxes, you need to modify the loop to find each object and then read its name and bounding-box coordinates. If you have 2 classes and your XML files contain exactly those classes, you do not need the if statement when appending coordinates to boxes. But if you want to consider fewer classes than are present in the XML files, you need the if statement; otherwise every box will be turned into a mask.
# extract bounding boxes from an annotation file
def extract_boxes(self, filename):
    # load and parse the file
    tree = ElementTree.parse(filename)
    # get the root of the document
    root = tree.getroot()
    # extract each bounding box
    boxes = list()
    for box in root.findall('.//object'):
        name = box.find('name').text
        xmin = int(box.find('./bndbox/xmin').text)
        ymin = int(box.find('./bndbox/ymin').text)
        xmax = int(box.find('./bndbox/xmax').text)
        ymax = int(box.find('./bndbox/ymax').text)
        coors = [xmin, ymin, xmax, ymax, name]
        if name == 'car' or name == 'rider':
            boxes.append(coors)
    # extract image dimensions
    width = int(root.find('.//size/width').text)
    height = int(root.find('.//size/height').text)
    return boxes, width, height
3) Finally, in load_mask, an if-else statement needs to be added so that each box is assigned to the correct class.
# load the masks for an image
def load_mask(self, image_id):
    # get details of image
    info = self.image_info[image_id]
    # define box file location
    path = info['annotation']
    # load XML
    boxes, w, h = self.extract_boxes(path)
    # create one array for all masks, each on a different channel
    masks = zeros([h, w, len(boxes)], dtype='uint8')
    # create masks
    class_ids = list()
    for i in range(len(boxes)):
        box = boxes[i]
        row_s, row_e = box[1], box[3]
        col_s, col_e = box[0], box[2]
        if box[4] == 'car':
            masks[row_s:row_e, col_s:col_e, i] = 1
            class_ids.append(self.class_names.index('car'))
        else:
            masks[row_s:row_e, col_s:col_e, i] = 2
            class_ids.append(self.class_names.index('rider'))
    return masks, asarray(class_ids, dtype='int32')
In my case, I need 2 classes while numerous classes are available in the XML files. Using the above code, I got the following image:

If you want to train multiple classes, you can use the following code.
In load_dataset, register each class with self.add_class("dataset", class_id, "class_name"), and modify the last line (the add_image call) to pass class_ids for the number of classes you have.
# define classes
self.add_class("dataset", 1, "class1name")
self.add_class("dataset", 2, "class2name")
# define data locations
images_dir = dataset_dir + '/images/'
annotations_dir = dataset_dir + '/annots/'
# find all images
for filename in listdir(images_dir):
    # extract image id
    image_id = filename[:-4]
    # skip bad images
    if image_id in ['00090']:
        continue
    # skip all images after 150 if we are building the train set
    if is_train and int(image_id) >= 150:
        continue
    # skip all images before 150 if we are building the test/val set
    if not is_train and int(image_id) < 150:
        continue
    img_path = images_dir + filename
    ann_path = annotations_dir + image_id + '.xml'
    # add to dataset
    self.add_image('dataset', image_id=image_id, path=img_path, annotation=ann_path, class_ids=[0, 1, 2])
You don't need to modify anything in the function below:
def extract_boxes(self, filename):
    # load and parse the file
    tree = ElementTree.parse(filename)
    # get the root of the document
    root = tree.getroot()
    # extract each bounding box
    boxes = list()
    for box in root.findall('.//bndbox'):
        xmin = int(box.find('xmin').text)
        ymin = int(box.find('ymin').text)
        xmax = int(box.find('xmax').text)
        ymax = int(box.find('ymax').text)
        coors = [xmin, ymin, xmax, ymax]
        boxes.append(coors)
    # extract image dimensions
    width = int(root.find('.//size/width').text)
    height = int(root.find('.//size/height').text)
    return boxes, width, height
3) In the function below, "if i == 0" refers to the first bounding box. For multiple bounding boxes (i.e. multiple classes), use i == 1, i == 2, and so on.
# load the masks for an image
def load_mask(self, image_id):
    # get details of image
    info = self.image_info[image_id]
    # define box file location
    path = info['annotation']
    # load XML
    boxes, w, h = self.extract_boxes(path)
    # create one array for all masks, each on a different channel
    masks = zeros([h, w, len(boxes)], dtype='uint8')
    # create masks
    class_ids = list()
    for i in range(len(boxes)):
        box = boxes[i]
        row_s, row_e = box[1], box[3]
        col_s, col_e = box[0], box[2]
        if i == 0:
            masks[row_s:row_e, col_s:col_e, i] = 1
            class_ids.append(self.class_names.index('class1name'))
        else:
            masks[row_s:row_e, col_s:col_e, i] = 2
            class_ids.append(self.class_names.index('class2name'))
    # return boxes[0], masks, asarray(class_ids, dtype='int32') to check the points
    return masks, asarray(class_ids, dtype='int32')

Related

ValueError: Cannot set tensor: Dimension mismatch. Got 198 but expected 300 for dimension 1 of input 175

I am using TensorFlow Lite for object detection, specifically the SSDLite-MobileNet-v2 object detection model from Google. Along with object detection I am also performing color detection of that particular object using OpenCV, following this YouTube tutorial:
Simple color detection
There, the frame is first converted to HSV format, then the HSV value of the pixel at the center of the object's bounding box is read and the color is estimated from it.
For object detection I'm referring to the following GitHub code:
https://github.com/EdjeElectronics/TensorFlow-Lite-Object-Detection-on-Android-and-Raspberry-Pi
The code for object and color detection:
class VideoStream:
    """Camera object that controls video streaming from the Picamera"""
    def __init__(self, resolution=(640, 480), framerate=30):
        # Initialize the PiCamera and the camera image stream
        self.stream = cv2.VideoCapture(0)
        ret = self.stream.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter_fourcc(*'MJPG'))
        ret = self.stream.set(3, resolution[0])
        ret = self.stream.set(4, resolution[1])
        # Read first frame from the stream
        (self.grabbed, self.frame) = self.stream.read()
        # Variable to control when the camera is stopped
        self.stopped = False

    def start(self):
        # Start the thread that reads frames from the video stream
        Thread(target=self.update, args=()).start()
        return self

    def update(self):
        # Keep looping indefinitely until the thread is stopped
        while True:
            # If the camera is stopped, stop the thread
            if self.stopped:
                # Close camera resources
                self.stream.release()
                return
            # Otherwise, grab the next frame from the stream
            (self.grabbed, self.frame) = self.stream.read()

    def read(self):
        # Return the most recent frame
        return self.frame

    def stop(self):
        # Indicate that the camera and thread should be stopped
        self.stopped = True

# Define and parse input arguments
parser = argparse.ArgumentParser()
parser.add_argument('--modeldir', help='Folder the .tflite file is located in',
                    required=True)
parser.add_argument('--graph', help='Name of the .tflite file, if different than detect.tflite',
                    default='detect.tflite')
parser.add_argument('--labels', help='Name of the labelmap file, if different than labelmap.txt',
                    default='labelmap.txt')
parser.add_argument('--threshold', help='Minimum confidence threshold for displaying detected objects',
                    default=0.5)
parser.add_argument('--resolution', help='Desired webcam resolution in WxH. If the webcam does not support the resolution entered, errors may occur.',
                    default='1280x720')
parser.add_argument('--edgetpu', help='Use Coral Edge TPU Accelerator to speed up detection',
                    action='store_true')
args = parser.parse_args()

MODEL_NAME = args.modeldir
GRAPH_NAME = args.graph
LABELMAP_NAME = args.labels
min_conf_threshold = float(args.threshold)
resW, resH = args.resolution.split('x')
imW, imH = int(resW), int(resH)
use_TPU = args.edgetpu

# Import TensorFlow libraries
# If tflite_runtime is installed, import interpreter from tflite_runtime, else import from regular tensorflow
# If using Coral Edge TPU, import the load_delegate library
pkg = importlib.util.find_spec('tflite_runtime')
if pkg:
    from tflite_runtime.interpreter import Interpreter
    if use_TPU:
        from tflite_runtime.interpreter import load_delegate
else:
    from tensorflow.lite.python.interpreter import Interpreter
    if use_TPU:
        from tensorflow.lite.python.interpreter import load_delegate

# If using Edge TPU, assign filename for Edge TPU model
if use_TPU:
    # If user has specified the name of the .tflite file, use that name, otherwise use default 'edgetpu.tflite'
    if (GRAPH_NAME == 'detect.tflite'):
        GRAPH_NAME = 'edgetpu.tflite'

# Get path to current working directory
CWD_PATH = os.getcwd()

# Path to .tflite file, which contains the model that is used for object detection
PATH_TO_CKPT = os.path.join(CWD_PATH, MODEL_NAME, GRAPH_NAME)

# Path to label map file
PATH_TO_LABELS = os.path.join(CWD_PATH, MODEL_NAME, LABELMAP_NAME)

# Load the label map
with open(PATH_TO_LABELS, 'r') as f:
    labels = [line.strip() for line in f.readlines()]

# Have to do a weird fix for label map if using the COCO "starter model" from
# https://www.tensorflow.org/lite/models/object_detection/overview
# First label is '???', which has to be removed.
if labels[0] == '???':
    del(labels[0])

# Load the TensorFlow Lite model.
# If using Edge TPU, use special load_delegate argument
if use_TPU:
    interpreter = Interpreter(model_path=PATH_TO_CKPT,
                              experimental_delegates=[load_delegate('libedgetpu.so.1.0')])
    print(PATH_TO_CKPT)
else:
    interpreter = Interpreter(model_path=PATH_TO_CKPT)

interpreter.allocate_tensors()

# Get model details
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
height = input_details[0]['shape'][1]
width = input_details[0]['shape'][2]

floating_model = (input_details[0]['dtype'] == np.float32)

input_mean = 127.5
input_std = 127.5

# Initialize frame rate calculation
frame_rate_calc = 1
freq = cv2.getTickFrequency()

# Initialize video stream
videostream = VideoStream(resolution=(imW, imH), framerate=30).start()
time.sleep(1)

while True:
    # Start timer (for calculating frame rate)
    t1 = cv2.getTickCount()
    # Grab frame from video stream
    frame1 = videostream.read()
    # Acquire frame and resize to expected shape [1xHxWx3]
    frame = frame1.copy()
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame_resized = cv2.resize(frame_rgb, (width, height))
    input_data = np.expand_dims(frame_resized, axis=0)

    # Normalize pixel values if using a floating model (i.e. if model is non-quantized)
    if floating_model:
        input_data = (np.float32(input_data) - input_mean) / input_std

    # Perform the actual detection by running the model with the image as input
    interpreter.set_tensor(input_details[0]['index'], input_data)
    interpreter.invoke()

    # Retrieve detection results
    boxes = interpreter.get_tensor(output_details[0]['index'])[0]    # Bounding box coordinates of detected objects
    classes = interpreter.get_tensor(output_details[1]['index'])[0]  # Class index of detected objects
    scores = interpreter.get_tensor(output_details[2]['index'])[0]   # Confidence of detected objects
    # num = interpreter.get_tensor(output_details[3]['index'])[0]    # Total number of detected objects

    # Loop over all detections and draw detection box if confidence is above minimum threshold
    for i in range(len(scores)):
        if ((scores[i] > min_conf_threshold) and (scores[i] <= 1.0)):
            # Get bounding box coordinates and draw box
            # Interpreter can return coordinates that are outside of image dimensions, need to force them to be within image using max() and min()
            ymin = int(max(1, (boxes[i][0] * imH)))
            xmin = int(max(1, (boxes[i][1] * imW)))
            ymax = int(min(imH, (boxes[i][2] * imH)))
            xmax = int(min(imW, (boxes[i][3] * imW)))
            # print(ymin, xmin, ymax, xmax)
            cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (10, 255, 0), 2)

            # Draw label
            object_name = labels[int(classes[i])]
            print(object_name)

            # Color detection
            hsv_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
            height, width = (ymax - ymin), (xmax - xmin)
            cx = int(width / 2)
            cy = int(height / 2)
            # Pick pixel value
            pixel_center = hsv_frame[cy + ymin, cx + xmin]
            hue_value = pixel_center[0]
            sat_value = pixel_center[1]
            val_value = pixel_center[2]
            print(hue_value, sat_value, val_value)
            color = "Undefined"
            if hue_value > 166 and val_value < 95:
                color = "black"
            elif hue_value < 5:
                color = "RED"
            elif hue_value < 22:
                color = "ORANGE"
            elif hue_value < 33:
                color = "YELLOW"
            elif hue_value < 67:
                color = "GREEN"
            elif hue_value < 117:
                color = "BLUE"
            elif hue_value < 144:
                color = "VIOLET"
            elif hue_value < 160:
                color = "PINK"
            else:
                color = "RED"
            print(color)

            label = '%s: %d%%' % (object_name, int(scores[i] * 100))  # Example: 'person: 72%'
            labelSize, baseLine = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)  # Get font size
            label_ymin = max(ymin, labelSize[1] + 10)  # Make sure not to draw label too close to top of window
            cv2.rectangle(frame, (xmin, label_ymin - labelSize[1] - 10), (xmin + labelSize[0], label_ymin + baseLine - 10), (255, 255, 255), cv2.FILLED)  # Draw white box to put label text in
            cv2.putText(frame, label, (xmin, label_ymin - 7), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 2)  # Draw label text

    # Draw framerate in corner of frame
    cv2.putText(frame, 'FPS: {0:.2f}'.format(frame_rate_calc), (30, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 0), 2, cv2.LINE_AA)

    # All the results have been drawn on the frame, so it's time to display it.
    cv2.imshow('Object detector', frame)

    # Press 'q' to quit
    if cv2.waitKey(1) == ord('q'):
        break

# Clean up
cv2.destroyAllWindows()
videostream.stop()
I am new to TensorFlow Lite. When I run this code, it runs perfectly for the first iteration of the while loop and outputs the object and its color (cup and orange), but then I get this error:
cup
20 215 197
ORANGE
Traceback (most recent call last):
File "D:\download\TensorFlow-Lite-Object-Detection-on-Android-and-Raspberry-Pi-master\TFLite_detection_webcam.py", line 188,
in <module>
interpreter.set_tensor(input_details[0]['index'],input_data)
File "C:\Users\admin\AppData\Local\Programs\Python\Python39\lib\site-packages\tensorflow\lite\python\interpreter.py", line 705, in set_tensor
self._interpreter.SetTensor(tensor_index, value)
ValueError: Cannot set tensor: Dimension mismatch. Got 198 but expected 300 for dimension 1 of input 175.
But if I remove the color detection code completely from the while loop (shown below), it runs without any error and returns the object name.
Color detection code in the while loop:
# color detection
hsv_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
height, width = (ymax - ymin), (xmax - xmin)
cx = int(width / 2)
cy = int(height / 2)
# Pick pixel value
pixel_center = hsv_frame[cy + ymin, cx + xmin]
hue_value = pixel_center[0]
sat_value = pixel_center[1]
val_value = pixel_center[2]
print(hue_value, sat_value, val_value)
color = "Undefined"
if hue_value > 166 and val_value < 95:
    color = "black"
elif hue_value < 5:
    color = "RED"
elif hue_value < 22:
    color = "ORANGE"
elif hue_value < 33:
    color = "YELLOW"
elif hue_value < 67:
    color = "GREEN"
elif hue_value < 117:
    color = "BLUE"
elif hue_value < 144:
    color = "VIOLET"
elif hue_value < 160:
    color = "PINK"
else:
    color = "RED"
print(color)
How do I solve this problem?
The error says that the input tensor dimensions are wrong.
The cause is that the color detection code overwrites the variables width and height, which hold the network's expected input size, with the bounding-box dimensions. On the next iteration, cv2.resize then produces a frame of the wrong shape and set_tensor fails.
Solution: rename the width and height variables in your color detection code.
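For example, a minimal sketch of the color detection block with the clashing variables renamed (box_h and box_w are hypothetical names; anything that does not shadow the model's height and width will do):
# color detection (sketch): box_h/box_w are used so the model's expected
# input height/width are not overwritten
hsv_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
box_h, box_w = (ymax - ymin), (xmax - xmin)
cx = int(box_w / 2)
cy = int(box_h / 2)
# pick the HSV value at the center of the bounding box
pixel_center = hsv_frame[cy + ymin, cx + xmin]
hue_value, sat_value, val_value = pixel_center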

How to find the dimensions of an object using realsense (L515 camera)

I have a RealSense L515 camera and I want to find the size of an object in the scene. In my case, I am running darknet to detect a dummy object. Once the object is detected, I want to use the depth frame and the color frame to calculate the length and breadth of the object (roughly), for example an apple. The bounding box is drawn around the apple. Now, how do I use this bounding box data along with the color frame and depth frame to find the dimensions of the apple? As the bounding box is roughly the size of the apple, I want to convert its pixel coordinates into the approximate real-life dimensions of the apple. I read online about using point clouds, but I am new to this, so I am quite unclear how to proceed.
import darknet
import cv2
import numpy as np
import pyrealsense2 as rs

"""##############. Function definitions. ##################"""

# Define the detection function
def image_detection(image, network, class_names, class_colors, thresh):
    # Darknet doesn't accept numpy images.
    # Create one with image we reuse for each detect
    width = darknet.network_width(network)
    height = darknet.network_height(network)
    darknet_image = darknet.make_image(width, height, 3)
    #image = cv2.imread(image_path)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_resized = cv2.resize(image_rgb, (width, height), interpolation=cv2.INTER_LINEAR)
    darknet.copy_image_from_bytes(darknet_image, image_resized.tobytes())
    detections = darknet.detect_image(network, class_names, darknet_image, thresh=thresh)
    darknet.free_image(darknet_image)
    image = darknet.draw_boxes(detections, image_resized, class_colors)
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB), detections

# Initialize and declare the neural network along with data files, config files etc
quantity_apples = []
config_file = "/home/jetson/Desktop/pano_l515/yolov4.cfg"
data_file = "/home/jetson/Desktop/pano_l515/coco.data"
weights = "/home/jetson/Desktop/pano_l515/yolov4.weights"

network, class_names, class_colors = darknet.load_network(
    config_file,
    data_file,
    weights,
    batch_size=1
)

# Realsense from align-depth2color.py
# Create a pipeline
pipeline = rs.pipeline()

# Create a config and configure the pipeline to stream
# different resolutions of color and depth streams
config = rs.config()

# Get device product line for setting a supporting resolution
pipeline_wrapper = rs.pipeline_wrapper(pipeline)
pipeline_profile = config.resolve(pipeline_wrapper)
device = pipeline_profile.get_device()
device_product_line = str(device.get_info(rs.camera_info.product_line))

config.enable_stream(rs.stream.depth, 1024, 768, rs.format.z16, 30)
if device_product_line == 'L500':
    print(device_product_line)
    config.enable_stream(rs.stream.color, 1280, 720, rs.format.bgr8, 30)
else:
    config.enable_stream(rs.stream.color, 640, 480, rs.format.bgr8, 30)

# Start streaming
profile = pipeline.start(config)

# Getting the depth sensor's depth scale (see rs-align example for explanation)
depth_sensor = profile.get_device().first_depth_sensor()
depth_scale = depth_sensor.get_depth_scale()
print("Depth Scale is: ", depth_scale)

# We will be removing the background of objects more than
# clipping_distance_in_meters meters away
clipping_distance_in_meters = 1  # 1 meter
clipping_distance = clipping_distance_in_meters / depth_scale

# Create an align object
# rs.align allows us to perform alignment of depth frames to other frames
# The "align_to" is the stream type to which we plan to align depth frames.
align_to = rs.stream.color
align = rs.align(align_to)

# Streaming loop
try:
    for i in range(0, 2):
        # Get frameset of color and depth
        frames = pipeline.wait_for_frames()
        # frames.get_depth_frame() is a 640x360 depth image

        # Align the depth frame to color frame
        aligned_frames = align.process(frames)

        # Get aligned frames
        aligned_depth_frame = aligned_frames.get_depth_frame()  # aligned_depth_frame is a 640x480 depth image
        color_frame = aligned_frames.get_color_frame()

        # Validate that both frames are valid
        if not aligned_depth_frame or not color_frame:
            continue

        depth_image = np.asanyarray(aligned_depth_frame.get_data())
        color_image = np.asanyarray(color_frame.get_data())

        dn_frame_width = 416
        dn_frame_height = 416

        frame_width = color_image.shape[1]
        frame_height = color_image.shape[0]

        # Passing the image to darknet
        image, detections = image_detection(color_image, network, class_names, class_colors, thresh=0.05)

        for i in range(len(detections)):
            xc_percent = detections[i][2][0] / dn_frame_width
            yc_percent = detections[i][2][1] / dn_frame_height
            w_percent = detections[i][2][2] / dn_frame_width
            h_percent = detections[i][2][3] / dn_frame_height
            xc = xc_percent * frame_width
            yc = yc_percent * frame_height
            w = w_percent * frame_width
            h = h_percent * frame_height
            xmin = xc - w / 2.0
            ymin = yc - h / 2.0
            xmax = xc + w / 2.0
            ymax = yc + h / 2.0

            # If object is detected, increase the count of the object in the frame
            if detections[i][0] == "apple":
                cv2.rectangle(color_image, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0, 0, 255), 2)
                cv2.putText(color_image, "apple", (int(xmin), int(ymin - 10)), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)
                #cv2.imwrite(output_path, frame)

        # Render images:
        #   depth align to color on left
        #   depth on right
        depth_colormap = cv2.applyColorMap(cv2.convertScaleAbs(depth_image, alpha=0.03), cv2.COLORMAP_JET)
        images = np.hstack((color_image, depth_colormap))

        cv2.imwrite("test_images.jpg", color_image)
        #cv2.namedWindow('Align Example', cv2.WINDOW_NORMAL)
        #cv2.imshow('Align Example', images)
        key = cv2.waitKey(1)
        # Press esc or 'q' to close the image window
        #if key & 0xFF == ord('q') or key == 27:
        cv2.destroyAllWindows()
        #break
finally:
    pipeline.stop()
This is the output image so far. How do I proceed?
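One possible way to proceed (a sketch under assumptions, not part of the original post): since the depth frame is already aligned to the color frame, you can deproject pixels of the bounding box into 3D camera coordinates using the color stream's intrinsics and the measured depth, then take Euclidean distances between the deprojected corners as a rough width and height. The helper below assumes the aligned_depth_frame and color_frame from the loop above and uses the standard pyrealsense2 calls get_distance and rs2_deproject_pixel_to_point; the box corners usually lie on the background rather than on the object itself, so treat the result as an approximation.
import numpy as np
import pyrealsense2 as rs

def approx_box_size(aligned_depth_frame, color_frame, xmin, ymin, xmax, ymax):
    # Intrinsics of the color stream (the depth frame is aligned to it)
    intrin = color_frame.profile.as_video_stream_profile().intrinsics
    # Depth in metres at the center of the box; a median over a small patch
    # around the center would be more robust than a single pixel
    cx, cy = int((xmin + xmax) / 2), int((ymin + ymax) / 2)
    depth = aligned_depth_frame.get_distance(cx, cy)
    # Deproject box corners at the object's depth into 3D points (metres)
    top_left = rs.rs2_deproject_pixel_to_point(intrin, [int(xmin), int(ymin)], depth)
    top_right = rs.rs2_deproject_pixel_to_point(intrin, [int(xmax), int(ymin)], depth)
    bottom_left = rs.rs2_deproject_pixel_to_point(intrin, [int(xmin), int(ymax)], depth)
    # Euclidean distances give an approximate physical width and height
    width_m = np.linalg.norm(np.subtract(top_right, top_left))
    height_m = np.linalg.norm(np.subtract(bottom_left, top_left))
    return width_m, height_m
Called inside the detection loop as approx_box_size(aligned_depth_frame, color_frame, xmin, ymin, xmax, ymax), this would give the apple's approximate width and height in metres.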

How to remove an image and its label from a dataset

When I import a dataset of images and the CSV file with the corresponding labels, I made a function to extract every image and its label. But now I have to remove some images that do not fit specific criteria. Is there a way to remove the corresponding labels as well?
This is the function used to load the images and the labels:
def imp_img():
    dirname = '/s/desk/img/'
    x = np.zeros((1000, 100, 100), dtype=np.float32)
    for i in range(x.shape[0]):
        img = Image.open(dirname + 'img_%02d.png' % (i))
        img = np.array(img)
        x[i] = img
    path = '/s/desk/labels_classificatio.csv'
    labels = pd.read_csv(path, usecols=["category"], sep=";")
    y = np.array(labels)
    return x, y
This is how they are imported
x, y = imp_img()
x = x/255.0
y = y.reshape(y.shape[0], 1)
x.shape, y.shape
and now I made a for loop to remove the images that are too dark:
c = []
for i in x:
    if np.sum(i) >= 100:
        c.append(i)
c = np.asarray(c)
The problem now is that I have fewer images than I have labels. Is there a way to remove the corresponding label as well?
You're looking for enumerate. It lets you loop over an iterable while maintaining a count. Instead of for i in x we'll do for i, img in enumerate(x), which lets us keep a loop counter i. This way you can subset the labels corresponding to the images that meet your criteria.
code:
c = []
c_labels = []
for i, img in enumerate(x):
    if np.sum(img) >= 100:
        c.append(img)
        c_labels.append(y[i])

c = np.asarray(c)
c_labels = np.asarray(c_labels)
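As a side note (not part of the original answer), the same filtering can be done without an explicit loop by building a boolean mask from the per-image pixel sums and applying it to both arrays:
# keep is True for every image whose pixel sum meets the criterion
keep = x.sum(axis=(1, 2)) >= 100
c = x[keep]
c_labels = y[keep]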

An efficient way to extract, and compare and match fingerprint minutiae

I am currently working on a program that detects and matches fingerprints as part of a fingerprint sensor. After processing the image, I obtain key points using Harris corner detection. Then, using the ORB feature extractor, I obtain descriptors in the form of an array.
The problem is that the number of key points I get for two different images of the same fingerprint is different. Hence, the descriptor arrays obtained are also of different sizes.
I have used Hamming distances to measure the difference between the descriptor arrays of two images, and hence the difference between the fingerprints themselves. However, due to the different array sizes, I'm finding it difficult to set a threshold that works for all fingerprints.
def get_descriptors(img):
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    img = clahe.apply(img)
    img = image_enhance.image_enhance(img)  # for image-processing
    img = numpy.array(img, dtype=numpy.uint8)
    # Threshold
    ret, img = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
    # Normalize to 0 and 1 range
    img[img == 255] = 1
    # Thinning
    skeleton = skeletonize(img)
    skeleton = numpy.array(skeleton, dtype=numpy.uint8)
    skeleton = removedot(skeleton)
    # Harris corners
    harris_corners = cv2.cornerHarris(img, 3, 3, 0.04)
    harris_normalized = cv2.normalize(harris_corners, 0, 255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32FC1)
    threshold_harris = 125
    # Extract keypoints
    keypoints = []
    for x in range(0, harris_normalized.shape[0]):
        for y in range(0, harris_normalized.shape[1]):
            if harris_normalized[x][y] > threshold_harris:
                keypoints.append(cv2.KeyPoint(y, x, 1))
    # Define descriptor
    orb = cv2.ORB_create()
    # Compute descriptors
    _, des = orb.compute(img, keypoints)
    return (keypoints, des)

def main():
    img1 = cv2.imread("C:/Users/Nimesh Shahdadpuri/Desktop/DMRC Intern/database/106_1.tif", cv2.IMREAD_GRAYSCALE)
    kp1, des1 = get_descriptors(img1)
    #print (des1)
    #print (des1.shape)
    img2 = cv2.imread("C:/Users/Nimesh Shahdadpuri/Desktop/DMRC Intern/database/106_2.tif", cv2.IMREAD_GRAYSCALE)
    kp2, des2 = get_descriptors(img2)
    #print (des2)
    # Matching between descriptors
    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = bf.match(des1, des2)
    matches = sorted(matches, key=lambda match: match.distance)
    #print (len(matches))
    # Plot keypoints
    img4 = cv2.drawKeypoints(img1, kp1, outImage=None)
    img5 = cv2.drawKeypoints(img2, kp2, outImage=None)
    #f, axarr = plt.subplots(1,2)
    print("First Fingerprint")
    #axarr[0].imshow(img4)
    plt.imshow(img4)
    plt.show()
    print("Second Fingerprint")
    #axarr[1].imshow(img5)
    plt.imshow(img5)
    plt.show()
    # Plot matches
    img3 = cv2.drawMatches(img1, kp1, img2, kp2, matches, flags=2, outImg=None)
    print("All the matching points and the corresponding distances")
    plt.imshow(img3)
    plt.show()
    # Calculate score
    score = 0
    for match in matches:
        score += match.distance
    score_threshold = 40
    matchper = score / len(matches)
    print(matchper)
    if matchper < score_threshold:
        print("Fingerprint matches.")
    else:
        print("Fingerprint does not match.")
I am looking for an efficient way to define a general threshold that works for all fingerprints. I would also welcome suggestions for an alternative approach to defining and matching the key points.
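One way to make the score independent of the number of keypoints (a sketch only, not from the original post) is to use k-nearest-neighbour matching with Lowe's ratio test and then score by the fraction of descriptors that survive the test, which is already normalised to the range 0 to 1:
def match_score(des1, des2, ratio=0.75):
    # The ratio test keeps a match only when its best Hamming distance is
    # clearly smaller than the second best, filtering ambiguous matches.
    bf = cv2.BFMatcher(cv2.NORM_HAMMING)  # crossCheck must stay False for knnMatch
    knn_matches = bf.knnMatch(des1, des2, k=2)
    good = [p[0] for p in knn_matches
            if len(p) == 2 and p[0].distance < ratio * p[1].distance]
    return len(good) / max(len(des1), 1)
A cut-off on this fraction (for example, somewhere around 0.3) would still need tuning on your own database, but it no longer depends on how many keypoints each image produced.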

Finding contours using opencv (image processing)

I have been working on a project, a car number plate detector, which I am implementing using OpenCV. After updating my Python version and OpenCV, I am getting an error from findContours:
imgContours, contours, npaHierarchy = cv2.findContours(imgThreshCopy, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) # find all contours
ValueError: not enough values to unpack (expected 3, got 2)
This is the code that I used:
def findPossibleCharsInScene(imgThresh):
    listOfPossibleChars = []  # this will be the return value
    intCountOfPossibleChars = 0
    imgThreshCopy = imgThresh.copy()
    imgContours, contours, npaHierarchy = cv2.findContours(imgThreshCopy, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)  # find all contours
    height, width = imgThresh.shape
    imgContours = np.zeros((height, width, 3), np.uint8)
    for i in range(0, len(contours)):  # for each contour
        if Main.showSteps == True:  # show steps
            cv2.drawContours(imgContours, contours, i, Main.SCALAR_WHITE)
        # end if
        possibleChar = PossibleChar.PossibleChar(contours[i])
        if DetectChars.checkIfPossibleChar(possibleChar):  # if contour is a possible char, note this does not compare to other chars (yet) . . .
            intCountOfPossibleChars = intCountOfPossibleChars + 1  # increment count of possible chars
            listOfPossibleChars.append(possibleChar)  # and add to list of possible chars
        # end if
    # end for
    if Main.showSteps == True:  # show steps
        print("\nstep 2 - len(contours) = " + str(len(contours)))  # 2362 with MCLRNF1 image
        print("step 2 - intCountOfPossibleChars = " + str(intCountOfPossibleChars))  # 131 with MCLRNF1 image
        cv2.imshow("2a", imgContours)
    # end if
    return listOfPossibleChars
# end function
What changes should I make to correct it?
In OpenCV 4.x, cv2.findContours() returns only two values, contours and hierarchy; the three-value signature was specific to OpenCV 3.x. You should change the call to the following:
contours, npaHierarchy = cv2.findContours(imgThreshCopy, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
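If the script has to run under both OpenCV 3 and OpenCV 4, a common pattern (shown here as an illustration, not part of the original answer) is to keep only the last two returned values, which works with either signature:
# OpenCV 3 returns (image, contours, hierarchy); OpenCV 2/4 return (contours, hierarchy)
result = cv2.findContours(imgThreshCopy, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
contours, npaHierarchy = result[-2:]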
