how to measure distance between center of objects - openCV? - python-3.x

I have the following code that extracts and "tracks" black objects within a video file or webcam stream. I am new to Python and CV and am struggling to understand how to extract the center points of the tracked objects for comparison.
Code:
#for this demo the only colour we are interested in is black or near black
#remember open cv uses BGR not RGB
groupColorLower = np.array([0,0,0], dtype = "uint8")
groupColorUpper = np.array([179,179,179], dtype ="uint8")
# construct the argument parse and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-v", "--video", required=True,
help="path to the input video file")
args = vars(ap.parse_args())
######### Reading input video or opening webcam#########
# if a video path was not supplied, grab the reference
# to the webcam
if not args.get("video", False):
camera = cv2.VideoCapture(0)
# otherwise, load the video
else:
camera = cv2.VideoCapture(args["video"])
######### Finished Reading input video or opening webcam#########
# keep looping
while True:
# grab the current frame
(grabbed, frame) = camera.read()
# if we are viewing a video and we did not grab a
# frame, then we have reached the end of the video
if args.get("video") and not grabbed:
break
# determine which pixels fall within the black boundaries
# and then blur the binary image
blue = cv2.inRange(frame, groupColorLower, groupColorUpper)
blue = cv2.GaussianBlur(blue, (3, 3), 0)
# find contours in the image
(_, contours1, _) = cv2.findContours(blue.copy(), cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE)
#lets try to sort contours1 left to right
(contours1, _) = contours.sort_contours(contours1)
if len(contours1) > 0:
#now lets extract the center point of the left most contour
fixedPoint = cv2.moments(contours1[0])
fixedPointX = int (fixedPoint["m10"] / fixedPoint["m00"])
fixedPointY = int (fixedPoint["m01"] / fixedPoint["m00"])
#lets draw a white dot in left most object
cv2.circle(frame, (fixedPointX, fixedPointY), 7, (255,255,255), -1)
#lets use the nearest to fixedPoint as anchor (left most [1] where [0] is fixedPoint
if len(contours1) > 1:
# sort the contours so we can iterate over them
cnt2 = sorted([(c, cv2.boundingRect(c)[0]) for c in contours1], key = lambda x: x[1], reverse = True)
#draw boxes around all objects we are tracking
for (c, _) in cnt2:
(x, y, w, h) = cv2.boundingRect(c)
cv2.rectangle(frame, (x, y), (x + w, y +h), (255, 255, 0), 4)
M = cv2.moments(c)
cX = int(M["m10"] / M["m00"])
cY = int(M["m01"] / M["m00"])
# draw a dot in center of the black object on the image
cv2.circle(frame, (cX, cY), 7, (100, 100, 100), -1)
centre.append((int(M["m10"]/M["m00"]), int(M["m01"]/M["m00"])))
What I would like to be able to do is print the distance (in pixels) between the objects as they are found but I am not sure how to access this data.
The fixed point calculation works as I assume the left most object can be hard coded, but I need to get the coords for any other objects located.
The code currently loops over each black object and places a dot in the center of it.
Thanks for any help offered:
andy

Related

How to count objects in video using openCV and python?

I want to count vehicles when the rectangle touches(intersects) the line. I don't know more about counting objects in video frame. Based on my pre-search, I have found out some useful information such as centroid, euclidean distance and etc. But, i don't have much knowledge about the geometry math behind it. Here is an example video. https://www.youtube.com/watch?v=z1Cvn3_4yGo. I think they used the same method.
I have got the center of the rectangle and drawn a red line across the road. Now I want to count vehicles when it touches the line. Bellow, I have uploaded my current output of the system in a image format as well as my current code.
# import the necessary packages
from imutils.video import VideoStream
from imutils.video import FPS
import argparse
import imutils
import time
import cv2
tracker = cv2.TrackerCSRT_create()
vs = cv2.VideoCapture("Video.mp4")
initBB = None
# loop over frames from the video stream
while vs.isOpened():
ret,frame = vs.read()
cv2.line(frame, (933 , 462), (1170 , 462), (0,0,255), 3)
# check to see if we are currently tracking an object
if initBB is not None:
# grab the new bounding box coordinates of the object
(success, box) = tracker.update(frame)
# check to see if the tracking was a success
if success:
(x, y, w, h) = [int(v) for v in box]
cv2.rectangle(frame, (x, y), (x + w, y + h),
(0, 255, 0), 2)
cX = int((x + x+w) / 2.0)
cY = int((y + y+h) / 2.0)
cv2.circle(frame, (cX, cY), 4, (255, 0, 0), -1)
# show the output frame
cv2.imshow("Frame", frame)
key = cv2.waitKey(1) & 0xFF
if key == ord("s"):
# select the bounding box of the object we want to track (make
# sure you press ENTER or SPACE after selecting the ROI)
initBB = cv2.selectROI("Frame", frame, fromCenter=False,
showCrosshair=True)
# start OpenCV object tracker using the supplied bounding box
# coordinates, then start the FPS throughput estimator as well
tracker.init(frame, initBB)
fps = FPS().start()
# if the `q` key was pressed, break from the loop
elif key == ord("q"):
break
else:
vs.release()
cv2.destroyAllWindows()
You should just check whether the lower two x axis points of your bonding box are greater than or equal to the x axis points of the line

What is the best way to extract text contained within a table in a pdf using python?

I'm constructing a program to extract text from a pdf, put it in a structured format, and send it off to a database. I have roughly 1,400 individual pdfs that all follow a similar format, but nuances in the verbiage and plan designs that the documents summarize make it tricky.
I've played around with a couple different pdf readers in python including tabula-py and pdfminer but none of them are quite getting to what I'd like to do. Tabula reads in all of the text very well, however it pulls everything as it explicitly lays horizontally, excluding the fact that some of the text is wrapped in a box. For example, if you open up the sample SBC I have attached where it reads "What is the overall deductible?" Tabula will read in "What is the overall $500/Individual or..." skipping the fact that the word "deductible" is really part of the first sentence. (Note the files I'm working with are pdfs but I've attached a jpeg because I couldn't figure out how to attach a pdf.)
import tabula
df = tabula.read_pdf(*filepath*, pandas_options={'header': None))
print(df.iloc[0][0])
print(df)
In the end, I'd really like to be able to parse out the text within each box so that I can better identify what values belong to deductible, out-of-pocket limts, copays/coinsurance, etc. I thought possibly some sort of OCR would allow me to recognize which parts of the PDF are contained in the blue rectangles and then pull the string from there, but I really don't know where to start with that.Sample SBC
#jpnadas In this case the code you copied from my answer in this post isn't really suitable because it addresses the case when a table doesn't have surrounding grid. That algorithm looks for repeating blocks of texts and tries to find a pattern that resembles a table heuristically.
But in this particular case the table does have the grid and by taking this advantage we can achieve a lot more accurate result.
The strategy is the following:
Increase image gamma to make the grid darker
Get rid of colour and apply Otsu thresholding
Find long vertical an horizontal lines in the image and create a mask from it using erode and dilate functions
Find the cell blocks in the mask using findContours function.
Find table objects
5.1 The rest can be as in the post about finding a table without the
grid: find table structure heuristically
5.2 Alternative approach could be using hierarchy returned by the findContours function. This approach is even more accurate and
allows to find multiple tables on a single image.
Having cell coordinates it's easy to extract certain cell image from the original image:
cell_image = image[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
Apply OCR to each cell_image.
BUT! I consider the OpenCV approach as a last resort when you're not able to read the PDF's contents: for instance in case when a PDF contains raster image inside.
If it's a vector-based PDF and its contents are readable it makes more sense to find the table inside contents and just read the text from it instead of doing heavy 'OCR lifting'.
Here's the code for reference for more accurate table recognition:
import os
import imutils
import numpy as np
import argparse
import cv2
def gamma_correction(image, gamma = 1.0):
look_up_table = np.empty((1,256), np.uint8)
for i in range(256):
look_up_table[0,i] = np.clip(pow(i / 255.0, gamma) * 255.0, 0, 255)
result = cv2.LUT(image, look_up_table)
return result
def pre_process_image(image):
# Let's get rid of color first
# Applying gamma to make the table lines darker
gamma = gamma_correction(image, 2)
# Getting rid of color
gray = cv2.cvtColor(gamma, cv2.COLOR_BGR2GRAY)
# Then apply Otsu threshold to reveal important areas
ret, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
# inverting the thresholded image
return ~thresh
def get_horizontal_lines_mask(image, horizontal_size=100):
horizontal = image.copy()
horizontal_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1))
horizontal = cv2.erode(horizontal, horizontal_structure, anchor=(-1, -1), iterations=1)
horizontal = cv2.dilate(horizontal, horizontal_structure, anchor=(-1, -1), iterations=1)
return horizontal
def get_vertical_lines_mask(image, vertical_size=100):
vertical = image.copy()
vertical_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical_size))
vertical = cv2.erode(vertical, vertical_structure, anchor=(-1, -1), iterations=1)
vertical = cv2.dilate(vertical, vertical_structure, anchor=(-1, -1), iterations=1)
return vertical
def make_lines_mask(preprocessed, min_horizontal_line_size=100, min_vertical_line_size=100):
hor = get_horizontal_lines_mask(preprocessed, min_horizontal_line_size)
ver = get_vertical_lines_mask(preprocessed, min_vertical_line_size)
mask = np.zeros((preprocessed.shape[0], preprocessed.shape[1], 1), dtype=np.uint8)
mask = cv2.bitwise_or(mask, hor)
mask = cv2.bitwise_or(mask, ver)
return ~mask
def find_cell_boxes(mask):
# Looking for the text spots contours
# OpenCV 3
# img, contours, hierarchy = cv2.findContours(pre, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
# OpenCV 4
contours = cv2.findContours(mask, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
contours = imutils.grab_contours(contours)
contours = sorted(contours, key=cv2.contourArea, reverse=True)
image_width = mask.shape[1]
# Getting the texts bounding boxes based on the text size assumptions
boxes = []
for contour in contours:
box = cv2.boundingRect(contour)
w = box[2]
# Excluding the page box shape but adding smaller boxes
if w < 0.95 * image_width:
boxes.append(box)
return boxes
def find_table_in_boxes(boxes, cell_threshold=10, min_columns=2):
rows = {}
cols = {}
# Clustering the bounding boxes by their positions
for box in boxes:
(x, y, w, h) = box
col_key = x // cell_threshold
row_key = y // cell_threshold
cols[row_key] = [box] if col_key not in cols else cols[col_key] + [box]
rows[row_key] = [box] if row_key not in rows else rows[row_key] + [box]
# Filtering out the clusters having less than 2 cols
table_cells = list(filter(lambda r: len(r) >= min_columns, rows.values()))
# Sorting the row cells by x coord
table_cells = [list(sorted(tb)) for tb in table_cells]
# Sorting rows by the y coord
table_cells = list(sorted(table_cells, key=lambda r: r[0][1]))
return table_cells
def build_vertical_lines(table_cells):
if table_cells is None or len(table_cells) <= 0:
return [], []
max_last_col_width_row = max(table_cells, key=lambda b: b[-1][2])
max_x = max_last_col_width_row[-1][0] + max_last_col_width_row[-1][2]
max_last_row_height_box = max(table_cells[-1], key=lambda b: b[3])
max_y = max_last_row_height_box[1] + max_last_row_height_box[3]
hor_lines = []
ver_lines = []
for box in table_cells:
x = box[0][0]
y = box[0][1]
hor_lines.append((x, y, max_x, y))
for box in table_cells[0]:
x = box[0]
y = box[1]
ver_lines.append((x, y, x, max_y))
(x, y, w, h) = table_cells[0][-1]
ver_lines.append((max_x, y, max_x, max_y))
(x, y, w, h) = table_cells[0][0]
hor_lines.append((x, max_y, max_x, max_y))
return hor_lines, ver_lines
if __name__ == "__main__":
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True, help="path to images directory")
args = vars(ap.parse_args())
in_file = args["image"]
filename_base = in_file.replace(os.path.splitext(in_file)[1], "")
img = cv2.imread(in_file)
pre_processed = pre_process_image(img)
# Visualizing pre-processed image
cv2.imwrite(filename_base + ".pre.png", pre_processed)
lines_mask = make_lines_mask(pre_processed, min_horizontal_line_size=1800, min_vertical_line_size=500)
# Visualizing table lines mask
cv2.imwrite(filename_base + ".mask.png", lines_mask)
cell_boxes = find_cell_boxes(lines_mask)
cells = find_table_in_boxes(cell_boxes)
# apply OCR to each cell rect here
# the cells array contains cell coordinates in tuples (x, y, w, h)
hor_lines, ver_lines = build_vertical_lines(cells)
# Visualize the table lines
vis = img.copy()
for line in hor_lines:
[x1, y1, x2, y2] = line
cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)
for line in ver_lines:
[x1, y1, x2, y2] = line
cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)
cv2.imwrite(filename_base + ".result.png", vis)
Some parameters are hard-coded:
page size threshold - 0.95
min horizontal line size - 1800 px
min vertical line size - 500 px
You can provide them as configurable parameters or make them relative to image size.
Results:
I think that the best way to do what you need is to find and isolate the cells in the file and then apply OCR to each individual cell.
There are a number of solutions in SO for that, I got the code from this answer and played around a little with the parameters to get the output below (not perfect yet, but you can tweak it a little bit yourself).
import os
import cv2
import imutils
# This only works if there's only one table on a page
# Important parameters:
# - morph_size
# - min_text_height_limit
# - max_text_height_limit
# - cell_threshold
# - min_columns
def pre_process_image(img, save_in_file, morph_size=(23, 23)):
# get rid of the color
pre = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Otsu threshold
pre = cv2.threshold(pre, 250, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
# dilate the text to make it solid spot
cpy = pre.copy()
struct = cv2.getStructuringElement(cv2.MORPH_RECT, morph_size)
cpy = cv2.dilate(~cpy, struct, anchor=(-1, -1), iterations=1)
pre = ~cpy
if save_in_file is not None:
cv2.imwrite(save_in_file, pre)
return pre
def find_text_boxes(pre, min_text_height_limit=20, max_text_height_limit=120):
# Looking for the text spots contours
contours, _ = cv2.findContours(pre, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
# Getting the texts bounding boxes based on the text size assumptions
boxes = []
for contour in contours:
box = cv2.boundingRect(contour)
h = box[3]
if min_text_height_limit < h < max_text_height_limit:
boxes.append(box)
return boxes
def find_table_in_boxes(boxes, cell_threshold=100, min_columns=3):
rows = {}
cols = {}
# Clustering the bounding boxes by their positions
for box in boxes:
(x, y, w, h) = box
col_key = x // cell_threshold
row_key = y // cell_threshold
cols[row_key] = [box] if col_key not in cols else cols[col_key] + [box]
rows[row_key] = [box] if row_key not in rows else rows[row_key] + [box]
# Filtering out the clusters having less than 2 cols
table_cells = list(filter(lambda r: len(r) >= min_columns, rows.values()))
# Sorting the row cells by x coord
table_cells = [list(sorted(tb)) for tb in table_cells]
# Sorting rows by the y coord
table_cells = list(sorted(table_cells, key=lambda r: r[0][1]))
return table_cells
def build_lines(table_cells):
if table_cells is None or len(table_cells) <= 0:
return [], []
max_last_col_width_row = max(table_cells, key=lambda b: b[-1][2])
max_x = max_last_col_width_row[-1][0] + max_last_col_width_row[-1][2]
max_last_row_height_box = max(table_cells[-1], key=lambda b: b[3])
max_y = max_last_row_height_box[1] + max_last_row_height_box[3]
hor_lines = []
ver_lines = []
for box in table_cells:
x = box[0][0]
y = box[0][1]
hor_lines.append((x, y, max_x, y))
for box in table_cells[0]:
x = box[0]
y = box[1]
ver_lines.append((x, y, x, max_y))
(x, y, w, h) = table_cells[0][-1]
ver_lines.append((max_x, y, max_x, max_y))
(x, y, w, h) = table_cells[0][0]
hor_lines.append((x, max_y, max_x, max_y))
return hor_lines, ver_lines
if __name__ == "__main__":
in_file = os.path.join(".", "test.jpg")
pre_file = os.path.join(".", "pre.png")
out_file = os.path.join(".", "out.png")
img = cv2.imread(os.path.join(in_file))
pre_processed = pre_process_image(img, pre_file)
text_boxes = find_text_boxes(pre_processed)
cells = find_table_in_boxes(text_boxes)
hor_lines, ver_lines = build_lines(cells)
# Visualize the result
vis = img.copy()
# for box in text_boxes:
# (x, y, w, h) = box
# cv2.rectangle(vis, (x, y), (x + w - 2, y + h - 2), (0, 255, 0), 1)
for line in hor_lines:
[x1, y1, x2, y2] = line
cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)
for line in ver_lines:
[x1, y1, x2, y2] = line
cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)
cv2.imwrite(out_file, vis)

How to clear numbers from the image using openCV?

I'm trying to remove numbers which are laying inside the circular part of image, numbers are in black in color and background varies between red,yellow, blue and green.
I am using opencv to remove those numbers. I used a mask which extracts numbers from image, with help of cv2.inpaint tried to remove those numbers from images.
For my further analysis I required to have clear image. But my current approach gives distorted image and numbers are not completely removed.
I tried changing the threshold values, lowering will neglect numbers from dark shaded area such as from green and red.
import cv2
img = cv2.imread('scan_1.jpg')
mask = cv2.threshold(img,50,255,cv2.THRESH_BINARY_INV)[1][:,:,0]
cv2.imshow('mask', mask)
cv2.waitKey(0)
cv2.destroyAllWindows()
dst = cv2.inpaint(img, mask, 5, cv2.INPAINT_TELEA)
cv2.imshow('dst',dst)
cv2.waitKey(0)
cv2.destroyAllWindows()
cv2.imwrite('ost_1.jpg',dst)
Input images: a) scan_1.jpg
b) scan_2.jpg
Output images: a) ost_1.jpg
b) ost_2.jpg
Expected Image: Circles can ignored, but something similar to it is required.
Here is my attempt, a better/easier solution might be acquired if you do not care about preserving texts outside of your circle.
import cv2
import numpy as np
# connectivity method used for finding connected components, 4 vs 8
CONNECTIVITY = 4
# HSV threshold for finding black pixels
H_THRESHOLD = 179
S_THRESHOLD = 255
V_THRESHOLD = 150
# read image
img = cv2.imread("a1.jpg")
img_height = img.shape[0]
img_width = img.shape[1]
# save a copy for creating resulting image
result = img.copy()
# convert image to grayscale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# found the circle in the image
circles = cv2.HoughCircles(gray, cv2.HOUGH_GRADIENT, 1.7, minDist= 100, param1 = 48, param2 = 100, minRadius=70, maxRadius=100)
# draw found circle, for visual only
circle_output = img.copy()
# check if we found exactly 1 circle
num_circles = len(circles)
print("Number of found circles:{}".format(num_circles))
if (num_circles != 1):
print("invalid number of circles found ({}), should be 1".format(num_circles))
exit(0)
# save center position and radius of found circle
circle_x = 0
circle_y = 0
circle_radius = 0
if circles is not None:
# convert the (x, y) coordinates and radius of the circles to integers
circles = np.round(circles[0, :]).astype("int")
for (x, y, radius) in circles:
circle_x, circle_y, circle_radius = (x, y, radius)
cv2.circle(circle_output, (circle_x, circle_y), circle_radius, (255, 0, 0), 4)
print("circle center:({},{}), radius:{}".format(x,y,radius))
# keep a median filtered version of image, will be used later
median_filtered = cv2.medianBlur(img, 21)
# Convert BGR to HSV
hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
# define range of black color in HSV
lower_val = np.array([0,0,0])
upper_val = np.array([H_THRESHOLD,S_THRESHOLD,V_THRESHOLD])
# Threshold the HSV image to get only black colors
mask = cv2.inRange(hsv, lower_val, upper_val)
# find connected components
components = cv2.connectedComponentsWithStats(mask, CONNECTIVITY, cv2.CV_32S)
# apply median filtering to found components
#centers = components[3]
num_components = components[0]
print("Number of found connected components:{}".format(num_components))
labels = components[1]
stats = components[2]
for i in range(1, num_components):
left = stats[i, cv2.CC_STAT_LEFT] - 10
top = stats[i, cv2.CC_STAT_TOP] - 10
width = stats[i, cv2.CC_STAT_WIDTH] + 10
height = stats[i, cv2.CC_STAT_HEIGHT] + 10
# iterate each pixel and replace them if
#they are inside circle
for row in range(top, top+height+1):
for col in range(left, left+width+1):
dx = col - circle_x
dy = row - circle_y
if (dx*dx + dy*dy <= circle_radius * circle_radius):
result[row, col] = median_filtered[row, col]
# smooth the image, may be necessary?
#result = cv2.blur(result, (3,3))
# display image(s)
cv2.imshow("img", img)
cv2.imshow("gray", gray)
cv2.imshow("found circle:", circle_output)
cv2.imshow("mask", mask)
cv2.imshow("result", result)
cv2.waitKey(0)
cv2.destroyAllWindows()
Result for a1:

Drawing Scaled grid system using openCV and python

What I want to do is draw a grid on an image that has smaller squares at the top and larger ones near the bottom. I am lost on how to do this though. I figured out how to draw a grid on the image using open cv but don't know how to fine turn it to get the results I want.
What I will be using this for is to find where a bounding box point is for a detected object. I will need a smaller bounding box at the top and a larger one closer to the bottom of the image.
This is what I want to do:
But I cannot figure out how to get the curves in the code.
This is what I have so far:
And this the is code that I am using.
'''
#This method draws simple grid overthe image based on the passed step
#The pxstep controls the size of the grid
'''
def drawBasicGrid(image, pxstep, midX, midY):
x = pxstep
y = pxstep
#Draw all x lines
while x < img.shape[1]:
cv2.line(img, (x, 0), (x, img.shape[0]), color=(255, 0, 255), thickness=1)
x += pxstep
while y < img.shape[0]:
cv2.line(img, (0, y), (img.shape[1], y), color=(255, 0, 255),thickness=1)
y += pxstep
This draws the basic grid.
and this creates thebounding boxes that I use for detection of the bounding points of a detected object.
def makeBoundingBox(h,w, step):
#BBox is 100*100 square or step which is defied
y = 0
bbox = []
while y < h:
#print("Y value", y)
x=0
while x < w:
#print("X Value", x)
bbox.append([(x,y), (x+step, y+step)])
x += step
y += step
return bbox
And this is how I am using it.
#Prepare Images
img = cv2.imread(image)
#img = imutils.resize(img, width=300)
#gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
(H,W) = img.shape[:2]
#Create bounding boxes for the image
'''
#Creates the boxes for people detection
#It will look for which bounding boxes the person is in and then
#Draw a box around their feet
'''
#People Detection
blob = cv2.dnn.blobFromImage(cv2.resize(img, (300, 300)),0.007843,(300, 300), 127.5)
net.setInput(blob)
detections = net.forward()
'''
#Get the center point of the image
'''
midY = H//2
midX = W//2
print("Middle Y pixel", midY)
#Draw center line
cv2.line(img, (0, midY), (W, midY), color=green, thickness=2)
cv2.line(img, (midX, 0), (midX, H), color=green, thickness=2)
#Visual grid drawn
drawBasicGrid(img, step, midX,midY)
bbox = makeBoundingBox(H, W, step) #Used for finding where the endx and startx BBOX points are
Any help on this will be appreciated.

Template Matching: efficient way to create mask for minMaxLoc?

Template matching in OpenCV is great. And you can pass a mask to cv2.minMaxLoc so that you only search (sort of) in part of the image for the template you want. You can also use a mask at the matchTemplate operation, but this only masks the template.
I want to find a template and I want to be assured that this template is within some other region of my image.
Calculating the mask for minMaxLoc seems kind of heavy. That is, calculating an accurate mask feels heavy. If you calculate a mask the easy way, it ignores the size of the template.
Examples are in order. My input images are show below. They're a bit contrived. I want to find the candy bar, but only if it's completely inside the white circle of the clock face.
clock1
clock2
template
In clock1, the candy bar is inside the circular clock face and it's a "PASS". But in clock2, the candy bar is only partially inside the face and I want it to be a "FAIL". Here's a code sample for doing it the easy way. I use cv.HoughCircles to find the clock face.
import numpy as np
import cv2
img = cv2.imread('clock1.jpg')
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
template = cv2.imread('template.png')
t_h, t_w = template.shape[0:2] # template height and width
# find circle in gray image using Hough transform
circles = cv2.HoughCircles(gray, method = cv2.HOUGH_GRADIENT, dp = 1,
minDist = 150, param1 = 50, param2 = 70,
minRadius = 131, maxRadius = 200)
i = circles[0,0]
x0 = i[0]
y0 = i[1]
r = i[2]
# display circle on color image
cv2.circle(img,(x0, y0), r,(0,255,0),2)
# do the template match
result = cv2.matchTemplate(img, template, cv2.TM_CCOEFF_NORMED)
# finally, here is the part that gets tricky. we want to find highest
# rated match inside circle and we'd like to use minMaxLoc
# make mask by drawing circle on zero array
mask = np.zeros(result.shape, dtype = np.uint8) # minMaxLoc will throw
# error w/o np.uint8
cv2.circle(mask, (x0, y0), r, color = 1, thickness = -1)
# call minMaxLoc
min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result, mask = mask)
# draw found rectangle on img
if max_val > 0.4: # use 0.4 as threshold for finding candy bar
cv2.rectangle(img, max_loc, (max_loc[0]+t_w, max_loc[1]+t_h), (0,255,0), 4)
cv2.imwrite('output.jpg', img)
output using clock1
output using clock2
finds candy bar even
though part of it is outside circle
So to properly make a mask, I use a bunch of NumPy operations. I make four separate masks (one for each corner of the template bounding box) and then AND them together. I'm not aware of any convenience functions in OpenCV that would do the mask for me. I'm a little nervous that all of the array operations will be expensive. Is there a better way to do this?
h, w = result.shape[0:2]
# make arrays that hold x,y coords
grid = np.indices((h, w))
x = grid[1]
y = grid[0]
top_left_mask = np.hypot(x - x0, y - y0) - r < 0
top_right_mask = np.hypot(x + t_w - x0, y - y0) - r < 0
bot_left_mask = np.hypot(x - x0, y + t_h - y0) - r < 0
bot_right_mask = np.hypot(x + t_w - x0, y + t_h - y0) - r < 0
mask = np.logical_and.reduce((top_left_mask, top_right_mask,
bot_left_mask, bot_right_mask))
mask = mask.astype(np.uint8)
cv2.imwrite('mask.png', mask*255)
Here's what the "fancy" mask looks like:
Seems about right. It cannot be circular because of the template shape. If I run clock2.jpg with this mask I get:
It works. No candy bars are identified. But I wish I could do it in fewer lines of code...
EDIT:
I've done some profiling. I ran 100 cycles of the "easy" way and the "accurate" way and calculated frames per second (fps):
easy way: 12.7 fps
accurate way: 7.8 fps
so there is some price to pay for making the mask with NumPy. These tests were done on a relatively powerful workstation. It could get uglier on more modest hardware...
Method 1: 'mask' image before cv2.matchTemplate
Just for kicks, I tried to make my own mask of the image that I pass to cv2.matchTemplate to see what kind of performance I can achieve. To be clear, this isn't a proper mask -- I set all of the pixels to ignore to one color (black or white). This is to get around the fact only TM_SQDIFF and TM_CORR_NORMED support a proper mask.
#Alexander Reynolds makes a very good point in the comments that some care must be taken if the template image (the thing we're trying to find) has lots of black or lots of white. For many problems, we will know a priori what the template looks like and we can specify a white background or black background.
I use cv2.multiply, which seems to be faster than numpy.multiply. cv2.multiply has the added advantage that it automatically clips the results to the range 0 to 255.
import numpy as np
import cv2
import time
img = cv2.imread('clock1.jpg')
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
template = cv2.imread('target.jpg')
t_h, t_w = template.shape[0:2] # template height and width
mask_background = 'WHITE'
start_time = time.time()
for i in range(100): # do 100 cycles for timing
# find circle in gray image using Hough transform
circles = cv2.HoughCircles(gray, method = cv2.HOUGH_GRADIENT, dp = 1,
minDist = 150, param1 = 50, param2 = 70,
minRadius = 131, maxRadius = 200)
i = circles[0,0]
x0 = i[0]
y0 = i[1]
r = i[2]
# display circle on color image
cv2.circle(img,(x0, y0), r,(0,255,0),2)
if mask_background == 'BLACK': # black = 0, white = 255 on grayscale
mask = np.zeros(img.shape, dtype = np.uint8)
elif mask_background == 'WHITE':
mask = 255*np.ones(img.shape, dtype = np.uint8)
cv2.circle(mask, (x0, y0), r, color = (1,1,1), thickness = -1)
img2 = cv2.multiply(img, mask) # element wise multiplication
# values > 255 are truncated at 255
# do the template match
result = cv2.matchTemplate(img2, template, cv2.TM_CCOEFF_NORMED)
# call minMaxLoc
min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)
# draw found rectangle on img
if max_val > 0.4:
cv2.rectangle(img, max_loc, (max_loc[0]+t_w, max_loc[1]+t_h), (0,255,0), 4)
fps = 100/(time.time()-start_time)
print('fps ', fps)
cv2.imwrite('output.jpg', img)
Profiling results:
BLACK background 12.3 fps
WHITE background 12.1 fps
Using this method has very little performance hit relative to 12.7 fps in original question. However, it has the drawback that it will still find templates that still stick over the edge a little bit. Depending on the exact nature of the problem, this may be acceptable in many applications.
Method 2: use cv2.boxFilter to create mask for minMaxLoc
In this technique, we start with a circular mask (as in OP), but then modify it with cv2.boxFilter. We change the anchor from default center of kernel to the top left corner (0, 0)
import numpy as np
import cv2
import time
img = cv2.imread('clock1.jpg')
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
template = cv2.imread('target.jpg')
t_h, t_w = template.shape[0:2] # template height and width
print('t_h, t_w ', t_h, ' ', t_w)
start_time = time.time()
for i in range(100):
# find circle in gray image using Hough transform
circles = cv2.HoughCircles(gray, method = cv2.HOUGH_GRADIENT, dp = 1,
minDist = 150, param1 = 50, param2 = 70,
minRadius = 131, maxRadius = 200)
i = circles[0,0]
x0 = i[0]
y0 = i[1]
r = i[2]
# display circle on color image
cv2.circle(img,(x0, y0), r,(0,255,0),2)
# do the template match
result = cv2.matchTemplate(img, template, cv2.TM_CCOEFF_NORMED)
# finally, here is the part that gets tricky. we want to find highest
# rated match inside circle and we'd like to use minMaxLoc
# start to make mask by drawing circle on zero array
mask = np.zeros(result.shape, dtype = np.float)
cv2.circle(mask, (x0, y0), r, color = 1, thickness = -1)
mask = cv2.boxFilter(mask,
ddepth = -1,
ksize = (t_w, t_h),
anchor = (0,0),
normalize = True,
borderType = cv2.BORDER_ISOLATED)
# mask now contains values from zero to 1. we want to make anything
# less than 1 equal to zero
_, mask = cv2.threshold(mask, thresh = 0.9999,
maxval = 1.0, type = cv2.THRESH_BINARY)
mask = mask.astype(np.uint8)
# call minMaxLoc
min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result, mask = mask)
# draw found rectangle on img
if max_val > 0.4:
cv2.rectangle(img, max_loc, (max_loc[0]+t_w, max_loc[1]+t_h), (0,255,0), 4)
fps = 100/(time.time()-start_time)
print('fps ', fps)
cv2.imwrite('output.jpg', img)
This code gives a mask identical to OP, but at 11.89 fps. This technique gives us more accuracy with slightly more performance hit than Method 1.

Resources