Text Recognize using tesseract - python-3.x

Hello, I'm trying to recognize text in an image using Tesseract, but I am unable to get any result.
I'm using the EAST technique to detect the text. I have one more question: how can I extend the padding of the detected box? Also, cv2.putText does not work in this case.
original code for text detection: https://github.com/opencv/opencv/blob/master/samples/dnn/text_detection.cpp
import cv2
import numpy as np
import argparse
import time
import math
import matplotlib.pyplot as plt
import skimage.io as io
import os
from imutils.object_detection import non_max_suppression
import pytesseract
print(np.__version__)
def decode_predictions(scores, geometry, min_confidence=None):
    """Decode the EAST output maps into rotated boxes and confidences.

    Args:
        scores: score volume; indexed as ``scores[0, 0, y, x]``.
        geometry: geometry volume; channels 0-3 of ``geometry[0, c, y, x]``
            are the distances used to derive the box, channel 4 is the
            rotation angle in radians.
        min_confidence: detections scoring below this value are dropped.
            When ``None`` the script-level ``args["min_confidence"]`` is
            used, which is what the function always did before.

    Returns:
        ``(boxes, confidences)`` where each box is a rotated rect
        ``((cx, cy), (w, h), angle_in_degrees)`` and ``confidences`` holds
        the matching scores as Python floats.
    """
    if min_confidence is None:
        min_confidence = args["min_confidence"]

    # grab the number of rows and columns from the scores volume, then
    # initialize our set of bounding box rectangles and corresponding
    # confidence scores
    (numRows, numCols) = scores.shape[2:4]
    boxes = []
    confidences = []

    # loop over the number of rows
    for y in range(0, numRows):
        # extract the scores (probabilities), followed by the geometrical
        # data used to derive potential bounding box coordinates that
        # surround text
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]

        # loop over the number of columns
        for x in range(0, numCols):
            # if our score does not have sufficient probability, ignore it
            if scoresData[x] < min_confidence:
                continue

            # compute the offset factor as our resulting feature maps will
            # be 4x smaller than the input image
            (offsetX, offsetY) = (x * 4.0, y * 4.0)

            # extract the rotation angle for the prediction and then
            # compute the sin and cosine
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            # use the geometry volume to derive the width and height of
            # the bounding box
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]

            # compute the rotated rect for the text prediction bounding box
            offset = (
                offsetX + (cos * xData1[x]) + (sin * xData2[x]),
                offsetY - (sin * xData1[x]) + (cos * xData2[x]),
            )
            p1 = (-sin * h + offset[0], -cos * h + offset[1])
            p3 = (-cos * w + offset[0], sin * w + offset[1])
            center = (0.5 * (p1[0] + p3[0]), 0.5 * (p1[1] + p3[1]))

            # add the bounding box coordinates and probability score to
            # our respective lists (angle converted from radians to degrees)
            boxes.append((center, (w, h), -angle * 180.0 / math.pi))
            confidences.append(float(scoresData[x]))

    return (boxes, confidences)
args = {
    "image": "C:\\Users\\ckunwar\\Test_Images\\licence_plate1\\52.jpg",
    "east": "frozen_east_text_detection.pb",
    "min_confidence": 0.25,
    "nms_thresh": 0.24,
    "width": 480,
    "height": 320,
    # fraction of the box width/height added on every side before OCR
    "padding": 0.0,
}

# load the input image and grab the image dimensions
image = cv2.imread(args["image"])
if image is None:
    # cv2.imread returns None instead of raising when the file is unreadable
    raise FileNotFoundError("could not read image: {}".format(args["image"]))
orig = image.copy()
(origH, origW) = orig.shape[:2]
(H, W) = image.shape[:2]

# set the new width and height and then determine the ratio in change
# for both the width and height
(newW, newH) = (args["width"], args["height"])
rW = W / float(newW)
rH = H / float(newH)

# resize the image and grab the new image dimensions
image = cv2.resize(image, (newW, newH))
(H, W) = image.shape[:2]

# define the two output layer names for the EAST detector model that
# we are interested in -- the first is the output probabilities and the
# second can be used to derive the bounding box coordinates of text
layerNames = ["feature_fusion/Conv_7/Sigmoid", "feature_fusion/concat_3"]

# load the pre-trained EAST text detector
print("[INFO] loading EAST text detector...")
net = cv2.dnn.readNet(args["east"])

# construct a blob from the image and then perform a forward pass of
# the model to obtain the two output layer sets
blob = cv2.dnn.blobFromImage(image, 1.0, (W, H), (123.68, 116.78, 103.94), swapRB=True, crop=False)
start = time.time()
net.setInput(blob)
(scores, geometry) = net.forward(layerNames)
end = time.time()

# show timing information on text prediction
print("[INFO] text detection took {:.6f} seconds".format(end - start))
(boxes, confidences) = decode_predictions(scores, geometry)

# apply non-maxima suppression to suppress weak, overlapping bounding boxes
indices = cv2.dnn.NMSBoxesRotated(boxes, confidences, args["min_confidence"], args["nms_thresh"])
# The result may be an Nx1 array, a flat array or an empty tuple depending on
# the OpenCV build; flattening handles all three.
indices = np.array(indices).reshape(-1)

# NOTE(review): "--psm 7" (single text line) may suit cropped regions better
# than sparse-text mode 11 -- confirm against the Tesseract documentation.
config = ("-l eng --oem 3 --psm 11")
results = []

# loop over the bounding boxes that survived NMS
for i in indices:
    # get 4 corners of the rotated rect
    vertices = cv2.boxPoints(boxes[int(i)])

    # scale the corner coordinates back to the original image size
    vertices[:, 0] *= rW
    vertices[:, 1] *= rH

    # axis-aligned bounds of the rotated box, grown by the requested padding
    (startX, startY) = vertices.min(axis=0)
    (endX, endY) = vertices.max(axis=0)
    dX = (endX - startX) * args["padding"]
    dY = (endY - startY) * args["padding"]
    startX = max(0, int(startX - dX))
    startY = max(0, int(startY - dY))
    endX = min(origW, int(math.ceil(endX + dX)))
    endY = min(origH, int(math.ceil(endY + dY)))
    if endX <= startX or endY <= startY:
        continue

    # OCR only the detected region, once per box, instead of the whole image
    roi = cv2.cvtColor(orig[startY:endY, startX:endX], cv2.COLOR_BGR2RGB)
    text = pytesseract.image_to_string(roi, config=config)
    results.append(((startX, startY, endX, endY), vertices, text))

# sort the results top to bottom by the y coordinate of the region
results = sorted(results, key=lambda r: r[0][1])
output = orig.copy()
for ((startX, startY, endX, endY), vertices, text) in results:
    print("OCR TEXT")
    print("========")
    print("{}\n".format(text))

    # strip non-ASCII characters so the text can be drawn with OpenCV
    text = "".join([c if ord(c) < 128 else "" for c in text]).strip()

    # draw the rotated box; drawing functions need integer coordinates
    outline = vertices.astype(np.int32).reshape(-1, 1, 2)
    cv2.polylines(output, [outline], True, (0, 255, 0), 2)

    # putText requires the bottom-left origin of the text as third argument
    cv2.putText(output, text, (startX, max(startY - 10, 20)), cv2.FONT_HERSHEY_TRIPLEX, 0.8, (0, 0, 255), 2)

# show the output image
cv2.imshow("Text Detection", output)
cv2.waitKey(0)

Related

YoloV1: how to turn predicted bounding-box relative coordinates into absolute ones

I'm struggling to create a light YoloV1 (with only one bounding box) on MNIST dataset (I randomly paste 28x28 digit into a 75x75 black background).
I can't figure out how to turn relative-to-cell coordinates into absolute coordinates.
So far, I have been using the ground-truth bounding boxes to find the cell that should contain an object; I save its i,j position and then use that position to convert my predictions back to absolute coordinates.
This method works, but when it is time to run detection on a real image I won't have the ground-truth coordinates, and therefore neither the object's i,j position nor the absolute position of the predicted bounding box.
I provide some line of code :
Encoding absolute coordinates of shape (N,4) to (N,S,S,5)
def encode(self, box):
    """
    Encode one absolute bounding box into a grid target of shape (S, S, 5).

    box : torch.Tensor (or sequence) of 4 values
        Absolute coordinates [xmin, ymin, w_bbox, h_bbox] in pixels of a
        75x75 image.

    Returns a zero tensor of shape (self.S, self.S, 5) whose cell [j, i]
    (row, column) holding the box centre is set to
    [x_norm, y_norm, rw, rh, 1.]: the centre relative to that cell and the
    width/height relative to the image.
    """
    img_size = 75
    ### Absolute box infos
    xmin, ymin, w_bbox, h_bbox = torch.as_tensor(box, dtype=torch.float32)
    ### Relative box infos
    rw = w_bbox / img_size
    rh = h_bbox / img_size
    rx_min = xmin / img_size
    ry_min = ymin / img_size
    ### x and y box center coords
    rxc = rx_min + rw / 2
    ryc = ry_min + rh / 2
    ### Object grid location
    i = int((rxc / self.cell_size).ceil()) - 1
    j = int((ryc / self.cell_size).ceil()) - 1
    # A centre lying exactly on coordinate 0 yields -1 above, which would
    # silently index the last cell; keep the indices inside the grid.
    last_cell = self.S - 1
    i = min(max(i, 0), last_cell)
    j = min(max(j, 0), last_cell)
    ### x & y of the cell left-top corner
    x0 = i * self.cell_size
    y0 = j * self.cell_size
    ### x & y of the box on the cell, normalized from 0.0 to 1.0.
    x_norm = (rxc - x0) / self.cell_size
    y_norm = (ryc - y0) / self.cell_size
    box_target = torch.zeros(self.S, self.S, 4 + 1)
    box_target[j, i, :5] = torch.tensor(
        [float(x_norm), float(y_norm), float(rw), float(rh), 1.0]
    )
    return box_target
Convert relative-to-cell coordinates into absolute ones
def relative2absolute(box_true: torch.Tensor, box_pred: torch.Tensor, img_size: int = 75) -> tuple:
    """
    Turns bounding box relative to cell coordinates into absolute coordinates
    (pixels). Used to calculate IoU and to plot boxes.

    Args:
        box_true : torch.Tensor of shape (N, S, S, 5)
            Groundtruth bounding box coordinates to convert.
        box_pred : torch.Tensor of shape (N, S, S, 5)
            Predicted bounding box coordinates to convert.
        img_size : int
            Side length in pixels of the (square) image. Defaults to 75.
    Return:
        box_true_absolute : torch.Tensor of shape (K, 4)
        box_pred_absolute : torch.Tensor of shape (K, 4)
            [xmin, ymin, xmax, ymax] for each of the K cells of ``box_true``
            whose last channel is non-zero (K == N with one object per image).
    """
    assert len(box_true.shape)==4 and len(box_pred.shape)==4, "Bbox should be of size (N,S,S,5)."
    # Grid size comes from the tensor itself instead of a hard-coded 6.
    cell_size = 1.0 / box_true.shape[1]

    ### Locate the object cells through the objectness channel (index 4).
    # Striding over every non-zero entry breaks as soon as one of the stored
    # coordinates is exactly 0.
    img_idx, row_idx, col_idx = box_true[..., 4].nonzero().unbind(dim=1)

    def _to_corners(boxes: torch.Tensor) -> torch.Tensor:
        # Convert the selected cells of one tensor to absolute corner boxes.
        x_cell, y_cell, rel_w, rel_h = boxes[img_idx, row_idx, col_idx, 0:4].unbind(dim=1)
        ### Relative-to-image centre: the column index drives x, the row index y
        xc = (x_cell + col_idx) * cell_size
        yc = (y_cell + row_idx) * cell_size
        ### Absolute top left corner
        xmin = (xc - rel_w / 2) * img_size
        ymin = (yc - rel_h / 2) * img_size
        ### Absolute bottom right corner
        xmax = xmin + rel_w * img_size
        ymax = ymin + rel_h * img_size
        return torch.stack((xmin, ymin, xmax, ymax), dim=-1)

    return _to_corners(box_true), _to_corners(box_pred)

Converting a color image into grayscale using a 3x3 convolution kernel

I am writing a python script that would use a 3x3 kernel to convert an image from color to grayscale.
I created a function that takes an 'image' and a 'kernel' as parameters, and returns the grayscale version of the image.
Inside of the function I split the image into 3 individual channels: redChannel, greenChannel, and blueChannel.
Then I take an average of these three channels as such: image = (red + green + blue) / 3.
I stored the values for Image height and width as follows: (Hi, Wi) = image.shape[:2] and I did the same for storing the height and width of the kernel, (Hk, Wk) = kernel.shape[:2].
I also included the padding for the image so that the kernel would not run out of bounds
pad = (Wk - 1) // 2.
Then I created two for loops that would iterate across height and width of the image using Hi and Wi.
Inside of the for loops, I reshaped the image into so that I could multiply it with the kernel. Then I store the computed result in an output array.
This is the full code:
from skimage.exposure import rescale_intensity
import numpy as np
import cv2
def convolve(image, kernel):
    """Convert *image* to grayscale and filter it with *kernel*.

    Args:
        image: array of shape (H, W, 3+) or (H, W); colour images are
            reduced to the mean of their first three channels.
        kernel: 2-D array with odd height and width. It is applied without
            flipping, i.e. each output pixel is ``(window * kernel).sum()``.

    Returns:
        uint8 array of shape (H, W), rounded and clipped to [0, 255].

    Raises:
        ValueError: if a kernel dimension is even.
    """
    image = np.asarray(image)
    kernel = np.asarray(kernel, dtype="float32")
    (Hk, Wk) = kernel.shape[:2]
    if Hk % 2 == 0 or Wk % 2 == 0:
        raise ValueError("kernel height and width must be odd")

    # Average in float: adding uint8 channels wraps around at 256.
    if image.ndim == 3:
        gray = image[..., :3].astype("float32").mean(axis=2)
    else:
        gray = image.astype("float32")
    (Hi, Wi) = gray.shape

    # Replicate the border so the window never leaves the image.
    pad_y = (Hk - 1) // 2
    pad_x = (Wk - 1) // 2
    padded = np.pad(gray, ((pad_y, pad_y), (pad_x, pad_x)), mode="edge")

    # Accumulate one shifted copy of the image per kernel tap. The former
    # loops ran over range(Hi, Hk + pad), which is empty for any image
    # larger than the kernel, so the output stayed all zeros (black).
    output = np.zeros((Hi, Wi), dtype="float32")
    for ky in range(Hk):
        for kx in range(Wk):
            output += kernel[ky, kx] * padded[ky:ky + Hi, kx:kx + Wi]

    return np.clip(np.rint(output), 0, 255).astype("uint8")
image = cv2.imread("mandrill.png")
if image is None:
    # cv2.imread returns None instead of raising when the file is unreadable
    raise FileNotFoundError("could not read mandrill.png")
# A 3x3 averaging kernel must sum to 1; weights of 1/3 sum to 3 and would
# triple the brightness of every pixel.
kernel = np.ones((3, 3)) / 9.0
cv2.imshow("Output", convolve(image, kernel))
cv2.waitKey(0)
cv2.destroyAllWindows()
I cannot seem to find any issues with the code, but the result is a black screen.
Any help will be greatly appreciated))
I found the answer using a slightly different approach.
This method reads the pixel values of the image and splits each one into 3 color channels (R, G, B). The line res = np.dot(kernel, v) multiplies each pixel's channel vector by a 3x3 grayscale kernel. The three if statements clamp the resulting values so that they do not exceed 255.
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
def convolve(img, kernel):
    """Mix the colour channels of every pixel of *img* with a 3x3 matrix.

    Each pixel (r, g, b) is replaced in place by ``kernel @ [r, g, b]``,
    rounded to the nearest integer and clamped to [0, 255].

    Args:
        img: image object exposing ``size`` as (width, height) and
            ``load()`` returning pixel access indexed by ``[x, y]``; every
            pixel must be a 3-tuple.
        kernel: 3x3 array-like of channel weights.

    Returns:
        The same ``img`` object, modified in place.
    """
    width, height = img.size
    pixels = img.load()
    weights = np.asarray(kernel, dtype=float)
    for py in range(height):
        for px in range(width):
            r, g, b = pixels[px, py]
            mixed = weights.dot(np.array([r, g, b], dtype=float)).tolist()
            # Clamp at both ends: negative weights can push a channel
            # below 0 just as large ones push it above 255.
            pixels[px, py] = tuple(
                min(255, max(0, int(round(channel)))) for channel in mixed
            )
    return img
# convolve() unpacks every pixel as (r, g, b), so force a 3-channel image.
img = Image.open('mandrill.jpg').convert('RGB')
grayscale = np.ones((3, 3)) * (1/3)
convolve(img, grayscale)
plt.imshow(img)
# Without show() nothing is displayed when this runs as a plain script.
plt.show()

What is the best way to extract text contained within a table in a pdf using python?

I'm constructing a program to extract text from a pdf, put it in a structured format, and send it off to a database. I have roughly 1,400 individual pdfs that all follow a similar format, but nuances in the verbiage and plan designs that the documents summarize make it tricky.
I've played around with a couple different pdf readers in python including tabula-py and pdfminer but none of them are quite getting to what I'd like to do. Tabula reads in all of the text very well, however it pulls everything as it explicitly lays horizontally, excluding the fact that some of the text is wrapped in a box. For example, if you open up the sample SBC I have attached where it reads "What is the overall deductible?" Tabula will read in "What is the overall $500/Individual or..." skipping the fact that the word "deductible" is really part of the first sentence. (Note the files I'm working with are pdfs but I've attached a jpeg because I couldn't figure out how to attach a pdf.)
import tabula
filepath = "sample_sbc.pdf"  # path of the PDF to parse
tables = tabula.read_pdf(filepath, pandas_options={'header': None})
# Recent tabula-py releases return a list of DataFrames (one per table),
# older ones a single DataFrame; support both.
if isinstance(tables, list):
    if not tables:
        raise ValueError("no table found in {}".format(filepath))
    df = tables[0]
else:
    df = tables
print(df.iloc[0][0])
print(df)
In the end, I'd really like to be able to parse out the text within each box so that I can better identify what values belong to deductible, out-of-pocket limits, copays/coinsurance, etc. I thought possibly some sort of OCR would allow me to recognize which parts of the PDF are contained in the blue rectangles and then pull the string from there, but I really don't know where to start with that. (Attachment: Sample SBC)
@jpnadas In this case the code you copied from my answer in this post isn't really suitable, because it addresses the case where a table doesn't have a surrounding grid. That algorithm looks for repeating blocks of text and tries to heuristically find a pattern that resembles a table.
But in this particular case the table does have the grid and by taking this advantage we can achieve a lot more accurate result.
The strategy is the following:
Increase image gamma to make the grid darker
Get rid of colour and apply Otsu thresholding
Find long vertical and horizontal lines in the image and create a mask from them using the erode and dilate functions
Find the cell blocks in the mask using findContours function.
Find table objects
5.1 The rest can be as in the post about finding a table without the
grid: find table structure heuristically
5.2 Alternative approach could be using hierarchy returned by the findContours function. This approach is even more accurate and
allows to find multiple tables on a single image.
Having cell coordinates it's easy to extract certain cell image from the original image:
cell_image = image[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
Apply OCR to each cell_image.
BUT! I consider the OpenCV approach as a last resort when you're not able to read the PDF's contents: for instance in case when a PDF contains raster image inside.
If it's a vector-based PDF and its contents are readable it makes more sense to find the table inside contents and just read the text from it instead of doing heavy 'OCR lifting'.
Here's the code for reference for more accurate table recognition:
import os
import imutils
import numpy as np
import argparse
import cv2
def gamma_correction(image, gamma=1.0):
    """Apply gamma correction to an 8-bit image through a lookup table.

    Args:
        image: uint8 array of any shape.
        gamma: exponent applied to the intensities normalised to [0, 1].

    Returns:
        uint8 array with the same shape as *image*.

    Raises:
        TypeError: if *image* is not of dtype uint8.
    """
    image = np.asarray(image)
    if image.dtype != np.uint8:
        raise TypeError("gamma_correction expects a uint8 image")
    # One table entry per possible intensity; the cast truncates exactly as
    # storing into a uint8 array element by element did.
    levels = np.arange(256, dtype=np.float64) / 255.0
    look_up_table = np.clip(np.power(levels, gamma) * 255.0, 0, 255).astype(np.uint8)
    # Indexing the table with the image maps every pixel in one step.
    return look_up_table[image]
def pre_process_image(image):
    """Return an inverted, Otsu-thresholded version of a BGR image.

    The image is gamma-corrected with gamma 2, converted to grayscale,
    thresholded with Otsu's method and finally inverted.
    """
    # Applying gamma to make the table lines darker
    darkened = gamma_correction(image, 2)
    # Getting rid of color
    gray = cv2.cvtColor(darkened, cv2.COLOR_BGR2GRAY)
    # Then apply Otsu threshold to reveal important areas
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # inverting the thresholded image
    return ~thresh
def get_horizontal_lines_mask(image, horizontal_size=100):
    """Keep only structures at least *horizontal_size* pixels wide.

    The image is eroded and then dilated with a ``horizontal_size x 1``
    rectangle (a morphological opening). The input is left untouched.
    """
    horizontal_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1))
    # erode() already returns a new array, so no defensive copy is needed.
    eroded = cv2.erode(image, horizontal_structure, anchor=(-1, -1), iterations=1)
    return cv2.dilate(eroded, horizontal_structure, anchor=(-1, -1), iterations=1)
def get_vertical_lines_mask(image, vertical_size=100):
    """Keep only structures at least *vertical_size* pixels tall.

    The image is eroded and then dilated with a ``1 x vertical_size``
    rectangle (a morphological opening). The input is left untouched.
    """
    vertical_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical_size))
    # erode() already returns a new array, so no defensive copy is needed.
    eroded = cv2.erode(image, vertical_structure, anchor=(-1, -1), iterations=1)
    return cv2.dilate(eroded, vertical_structure, anchor=(-1, -1), iterations=1)
def make_lines_mask(preprocessed, min_horizontal_line_size=100, min_vertical_line_size=100):
    """Build an inverted mask of the long horizontal and vertical lines.

    Args:
        preprocessed: single-channel image as returned by pre_process_image.
        min_horizontal_line_size: minimum length of horizontal lines to keep.
        min_vertical_line_size: minimum length of vertical lines to keep.

    Returns:
        The bitwise inverse of the union of both line masks.
    """
    hor = get_horizontal_lines_mask(preprocessed, min_horizontal_line_size)
    ver = get_vertical_lines_mask(preprocessed, min_vertical_line_size)
    # OR-ing the two masks directly replaces the all-zero seed mask, which
    # contributed nothing to the result.
    return ~cv2.bitwise_or(hor, ver)
def find_cell_boxes(mask, max_width_ratio=0.95):
    """Return the bounding boxes of the contours found in *mask*.

    Args:
        mask: single-channel image to search for contours.
        max_width_ratio: boxes at least this fraction of the image width
            are dropped (used to exclude the page outline).

    Returns:
        List of ``(x, y, w, h)`` boxes, ordered by decreasing contour area.
    """
    # grab_contours hides the different findContours return shapes of
    # OpenCV 3 and OpenCV 4.
    contours = imutils.grab_contours(
        cv2.findContours(mask, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    )
    contours = sorted(contours, key=cv2.contourArea, reverse=True)
    max_width = max_width_ratio * mask.shape[1]
    # Excluding the page box shape but adding smaller boxes
    return [box for box in map(cv2.boundingRect, contours) if box[2] < max_width]
def find_table_in_boxes(boxes, cell_threshold=10, min_columns=2):
    """Group boxes into table rows.

    Args:
        boxes: iterable of ``(x, y, w, h)`` tuples.
        cell_threshold: boxes whose ``y // cell_threshold`` is equal are
            put in the same row.
        min_columns: rows with fewer boxes than this are discarded.

    Returns:
        List of rows ordered by the y of their first cell; each row is a
        list of boxes sorted by x.
    """
    rows = {}
    # Clustering the bounding boxes by their vertical position. The column
    # clusters built previously were stored under the wrong key and never
    # read, so they are gone.
    for box in boxes:
        (x, y, w, h) = box
        rows.setdefault(y // cell_threshold, []).append(box)
    # Filtering out the clusters having fewer than min_columns cells and
    # sorting the cells of each row by x coord
    table_cells = [sorted(row) for row in rows.values() if len(row) >= min_columns]
    # Sorting rows by the y coord
    table_cells.sort(key=lambda r: r[0][1])
    return table_cells
def build_vertical_lines(table_cells):
    """Derive the grid lines of a table from its cells.

    Despite the name, both horizontal and vertical lines are built.

    Args:
        table_cells: list of rows, each a list of ``(x, y, w, h)`` boxes,
            as returned by find_table_in_boxes.

    Returns:
        ``(hor_lines, ver_lines)``; every line is ``(x1, y1, x2, y2)``.
        Both lists are empty when *table_cells* is None or empty.
    """
    if not table_cells:
        return [], []

    # Right edge: taken from the row whose last cell is the widest.
    widest_last = max(table_cells, key=lambda row: row[-1][2])[-1]
    max_x = widest_last[0] + widest_last[2]
    # Bottom edge: taken from the tallest cell of the last row.
    tallest_bottom = max(table_cells[-1], key=lambda cell: cell[3])
    max_y = tallest_bottom[1] + tallest_bottom[3]

    first_row = table_cells[0]
    # One horizontal line along the top of every row, starting at its first cell
    hor_lines = [(row[0][0], row[0][1], max_x, row[0][1]) for row in table_cells]
    # One vertical line along the left side of every cell of the first row
    ver_lines = [(cell[0], cell[1], cell[0], max_y) for cell in first_row]
    # Closing lines on the right and at the bottom of the table
    ver_lines.append((max_x, first_row[-1][1], max_x, max_y))
    hor_lines.append((first_row[0][0], max_y, max_x, max_y))
    return hor_lines, ver_lines
if __name__ == "__main__":
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--image", required=True, help="path to the input image")
    args = vars(ap.parse_args())
    in_file = args["image"]
    # splitext()[0] drops only the trailing extension; str.replace would
    # also remove the same text anywhere else in the path.
    filename_base = os.path.splitext(in_file)[0]
    img = cv2.imread(in_file)
    if img is None:
        # cv2.imread returns None instead of raising when the file is unreadable
        raise FileNotFoundError("could not read image: {}".format(in_file))
    pre_processed = pre_process_image(img)
    # Visualizing pre-processed image
    cv2.imwrite(filename_base + ".pre.png", pre_processed)
    lines_mask = make_lines_mask(pre_processed, min_horizontal_line_size=1800, min_vertical_line_size=500)
    # Visualizing table lines mask
    cv2.imwrite(filename_base + ".mask.png", lines_mask)
    cell_boxes = find_cell_boxes(lines_mask)
    cells = find_table_in_boxes(cell_boxes)
    # apply OCR to each cell rect here
    # the cells array contains cell coordinates in tuples (x, y, w, h)
    hor_lines, ver_lines = build_vertical_lines(cells)
    # Visualize the table lines
    vis = img.copy()
    for (x1, y1, x2, y2) in hor_lines + ver_lines:
        cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)
    cv2.imwrite(filename_base + ".result.png", vis)
Some parameters are hard-coded:
page size threshold - 0.95
min horizontal line size - 1800 px
min vertical line size - 500 px
You can provide them as configurable parameters or make them relative to image size.
Results:
I think that the best way to do what you need is to find and isolate the cells in the file and then apply OCR to each individual cell.
There are a number of solutions in SO for that, I got the code from this answer and played around a little with the parameters to get the output below (not perfect yet, but you can tweak it a little bit yourself).
import os
import cv2
import imutils
# This only works if there's only one table on a page
# Important parameters:
# - morph_size
# - min_text_height_limit
# - max_text_height_limit
# - cell_threshold
# - min_columns
def pre_process_image(img, save_in_file, morph_size=(23, 23)):
    """Binarise a BGR image and dilate its dark content into solid blobs.

    Args:
        img: BGR image.
        save_in_file: path where the result is written, or None to skip.
        morph_size: size of the rectangular structuring element.

    Returns:
        The processed single-channel image.
    """
    # get rid of the color
    pre = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Otsu threshold
    # NOTE(review): with THRESH_OTSU the threshold is presumably computed
    # automatically and the 250 ignored -- confirm against the OpenCV docs.
    pre = cv2.threshold(pre, 250, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    # dilate the text to make it solid spot; the text is dark, so the image
    # is inverted before the dilation and inverted back afterwards
    struct = cv2.getStructuringElement(cv2.MORPH_RECT, morph_size)
    pre = ~cv2.dilate(~pre, struct, anchor=(-1, -1), iterations=1)
    if save_in_file is not None:
        cv2.imwrite(save_in_file, pre)
    return pre
def find_text_boxes(pre, min_text_height_limit=20, max_text_height_limit=120):
    """Return the bounding boxes of contours whose height looks like text.

    Args:
        pre: single-channel image to search for contours.
        min_text_height_limit: boxes must be strictly taller than this.
        max_text_height_limit: boxes must be strictly shorter than this.

    Returns:
        List of ``(x, y, w, h)`` boxes.
    """
    # Looking for the text spots contours. OpenCV 3 returns
    # (image, contours, hierarchy) and OpenCV 4 (contours, hierarchy); the
    # contours are the second-to-last element in both.
    contours = cv2.findContours(pre, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]
    # Getting the texts bounding boxes based on the text size assumptions
    return [
        box
        for box in map(cv2.boundingRect, contours)
        if min_text_height_limit < box[3] < max_text_height_limit
    ]
def find_table_in_boxes(boxes, cell_threshold=100, min_columns=3):
    """Group boxes into table rows.

    Args:
        boxes: iterable of ``(x, y, w, h)`` tuples.
        cell_threshold: boxes whose ``y // cell_threshold`` is equal are
            put in the same row.
        min_columns: rows with fewer boxes than this are discarded.

    Returns:
        List of rows ordered by the y of their first cell; each row is a
        list of boxes sorted by x.
    """
    rows = {}
    # Clustering the bounding boxes by their vertical position. The column
    # clusters built previously were stored under the wrong key and never
    # read, so they are gone.
    for box in boxes:
        (x, y, w, h) = box
        rows.setdefault(y // cell_threshold, []).append(box)
    # Filtering out the clusters having fewer than min_columns cells and
    # sorting the cells of each row by x coord
    table_cells = [sorted(row) for row in rows.values() if len(row) >= min_columns]
    # Sorting rows by the y coord
    table_cells.sort(key=lambda r: r[0][1])
    return table_cells
def build_lines(table_cells):
    """Derive the horizontal and vertical grid lines of a table.

    Args:
        table_cells: list of rows, each a list of ``(x, y, w, h)`` boxes,
            as returned by find_table_in_boxes.

    Returns:
        ``(hor_lines, ver_lines)``; every line is ``(x1, y1, x2, y2)``.
        Both lists are empty when *table_cells* is None or empty.
    """
    if not table_cells:
        return [], []

    # Right edge: taken from the row whose last cell is the widest.
    widest_last = max(table_cells, key=lambda row: row[-1][2])[-1]
    max_x = widest_last[0] + widest_last[2]
    # Bottom edge: taken from the tallest cell of the last row.
    tallest_bottom = max(table_cells[-1], key=lambda cell: cell[3])
    max_y = tallest_bottom[1] + tallest_bottom[3]

    first_row = table_cells[0]
    # One horizontal line along the top of every row, starting at its first cell
    hor_lines = [(row[0][0], row[0][1], max_x, row[0][1]) for row in table_cells]
    # One vertical line along the left side of every cell of the first row
    ver_lines = [(cell[0], cell[1], cell[0], max_y) for cell in first_row]
    # Closing lines on the right and at the bottom of the table
    ver_lines.append((max_x, first_row[-1][1], max_x, max_y))
    hor_lines.append((first_row[0][0], max_y, max_x, max_y))
    return hor_lines, ver_lines
if __name__ == "__main__":
    in_file = os.path.join(".", "test.jpg")
    pre_file = os.path.join(".", "pre.png")
    out_file = os.path.join(".", "out.png")
    img = cv2.imread(in_file)
    if img is None:
        # cv2.imread returns None instead of raising when the file is unreadable
        raise FileNotFoundError("could not read image: {}".format(in_file))
    pre_processed = pre_process_image(img, pre_file)
    text_boxes = find_text_boxes(pre_processed)
    cells = find_table_in_boxes(text_boxes)
    hor_lines, ver_lines = build_lines(cells)
    # Visualize the result
    vis = img.copy()
    # To debug the detected text boxes as well, draw them here with
    # cv2.rectangle(vis, (x, y), (x + w - 2, y + h - 2), (0, 255, 0), 1)
    # for every (x, y, w, h) in text_boxes.
    for (x1, y1, x2, y2) in hor_lines + ver_lines:
        cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)
    cv2.imwrite(out_file, vis)

How to perform Earth Mover's Distance instead of DoG for center surround difference on multiscale level in images in python 3.7

I am working on an image processing project in which I have to compute the center-surround difference with the Earth Mover's Distance (EMD) at multiple scales, but I can't figure out how the center-surround difference works or how I could use EMD for it.
I found a Python function for EMD, but it works on the histograms of 2 source images, whereas in my problem I have only one source image.
I am generating multi scales of the image using skimage's pyramid_gaussian function using solution provided on
link: https://gist.github.com/duhaime/211365edaddf7ff89c0a36d9f3f7956c
I tried:
def get_img(path, norm_size=True, norm_exposure=False):
    """Read the image at *path* as an integer grayscale array.

    Args:
        path: location of the image file.
        norm_size: resize the image to the module-level (height, width).
        norm_exposure: equalise the exposure with normalize_exposure.

    Returns:
        2-D integer array.
    """
    img = imread(path, flatten=True).astype(int)
    if norm_size:
        img = resize(img, (height, width), anti_aliasing=True, preserve_range=True)
        # NOTE(review): resize presumably returns floats (skimage) -- the
        # histogram code indexes a list with the pixel values, so convert
        # back to integers.
        img = np.rint(img).astype(int)
    if norm_exposure:
        img = normalize_exposure(img)
    return img
def get_histogram(img):
    """Return the normalised 256-bin histogram of a 2-D image.

    Args:
        img: 2-D array of intensities in [0, 255].

    Returns:
        Float array of length 256 holding the fraction of pixels at each
        intensity.

    Raises:
        IndexError: if a value is larger than 255.
        ValueError: if a value is negative (raised by ``np.bincount``).
    """
    img = np.asarray(img)
    h, w = img.shape
    # bincount does the per-pixel counting in one vectorised pass.
    counts = np.bincount(img.astype(np.intp).ravel(), minlength=256)
    if counts.size > 256:
        raise IndexError("pixel values must be in the range 0-255")
    return counts / float(h * w)
def normalize_exposure(img):
    """Equalise the exposure of a 2-D image through its cumulative histogram.

    Args:
        img: 2-D array of intensities; it is cast to int before use.

    Returns:
        Integer array of the same shape with remapped intensities.
    """
    img = img.astype(int)
    hist = get_histogram(img)
    # running total of the histogram (cumulative distribution); cumsum is
    # linear where re-summing every prefix was quadratic
    cdf = np.cumsum(hist)
    # determine the normalization values for each unit of the cdf
    sk = np.uint8(255 * cdf)
    # normalize each position in the output image in a single lookup
    return sk[img].astype(int)
def earth_movers_distance(path_a, path_b):
    """Return the Earth Mover's Distance between two images' histograms.

    Both images are loaded with exposure normalisation and reduced to
    their intensity histograms.

    Args:
        path_a: path of the first image.
        path_b: path of the second image.

    Returns:
        The distance, in intensity levels.
    """
    img_a = get_img(path_a, norm_exposure=True)
    img_b = get_img(path_b, norm_exposure=True)
    hist_a = get_histogram(img_a)
    hist_b = get_histogram(img_b)
    # The histograms are weights over the intensity levels, not samples:
    # the levels are the values and the bin heights their weights.
    levels = np.arange(len(hist_a))
    return wasserstein_distance(levels, levels, u_weights=hist_a, v_weights=hist_b)
if __name__ == '__main__':
    image = cv2.imread("images/test3.jpg")
    if image is None:
        # cv2.imread returns None instead of raising when the file is unreadable
        raise FileNotFoundError("could not read images/test3.jpg")
    pyramidlist = []
    for (i, resized) in enumerate(pyramid_gaussian(image, downscale=1.4)):
        # stop once a layer gets smaller than 30 pixels in either direction
        if resized.shape[0] < 30 or resized.shape[1] < 30:
            break
        cv2.imshow(f"Layer {i+1}", resized)
        cv2.waitKey(0)
        # keep the whole layer; resized[i] would store only its i-th row
        pyramidlist.append(resized)
    print(pyramidlist)
    print(len(pyramidlist))
    cv2.destroyAllWindows()
But I don't know how to use EMD after generating the pyramid, or how to calculate the center-surround difference.

OpenCV: Segment each digit from the given image. Digits are written in each cell of a row matrix. Each cell is bounded by margins

I have been trying to recognise handwritten characters (digits/letters) from a form document. Form documents typically have rows of cells in which the applicant fills in their information, one character per bounded cell. However, I'm unable to segment the digits (currently my input consists only of digits) from the bounding boxes.
I went through the following steps:
Reading the image (as a grayscale image) via "imread" method of opencv2. Initial Image size:19 x 209(in pixels).
pic = "crop/cropped000.jpg"
# flag 0 loads the image as grayscale
newImg = cv2.imread(pic, 0)
if newImg is None:
    # cv2.imread returns None instead of raising when the file is unreadable
    raise FileNotFoundError("could not read image: " + pic)
Resizing the image 200% its original size via "resize" method of opencv2. I used INTER_AREA Interpolation. Resized Image size: 38 x 418(in pixels)
# Double both dimensions; cv2.resize takes the target size as (width, height).
h, w = newImg.shape
doubled_size = (2 * w, 2 * h)
resizedImg = cv2.resize(newImg, doubled_size, interpolation=cv2.INTER_AREA)
Applied Canny edge detection.
# Canny thresholds placed symmetrically around the median intensity,
# limited to the 0-255 range.
sigma = 0.33
v = np.median(resizedImg)
lower = int(max(0, (1.0 - sigma) * v))
upper = int(min(255, (1.0 + sigma) * v))
edgedImg = cv2.Canny(resizedImg, lower, upper)
Cropped the contours and saved them as images in 'BB' directory.
import os

# OpenCV 3 returns (image, contours, hierarchy) and OpenCV 4 only
# (contours, hierarchy); the last two elements work with both.
contours, hierarchy = cv2.findContours(edgedImg.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2:]
# make sure the output directory exists before writing the crops into it
os.makedirs('BB', exist_ok=True)
num = 0
for c in contours:
    x, y, w, h = cv2.boundingRect(c)
    num += 1
    new_img = resizedImg[y:y+h, x:x+w]
    cv2.imwrite('BB/'+str(num).zfill(3) + '.jpg', new_img)
Entire code in summary:
import os

pic = "crop/cropped000.jpg"
# flag 0 loads the image as grayscale
newImg = cv2.imread(pic, 0)
if newImg is None:
    # cv2.imread returns None instead of raising when the file is unreadable
    raise FileNotFoundError("could not read image: " + pic)
h,w = newImg.shape
print(newImg.shape)
# double both dimensions; cv2.resize takes the size as (width, height)
resizedImg = cv2.resize(newImg, (2*w,2*h), interpolation=cv2.INTER_AREA)
print(resizedImg.shape)
# Canny thresholds placed around the median intensity
v = np.median(resizedImg)
sigma = 0.33
lower = int(max(0, (1.0 - sigma) * v))
upper = int(min(255, (1.0 + sigma) * v))
edgedImg = cv2.Canny(resizedImg, lower, upper)
# OpenCV 3 returns (image, contours, hierarchy) and OpenCV 4 only
# (contours, hierarchy); the last two elements work with both.
contours, hierarchy = cv2.findContours(edgedImg.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2:]
# make sure the output directory exists before writing the crops into it
os.makedirs('BB', exist_ok=True)
num = 0
for c in contours:
    x, y, w, h = cv2.boundingRect(c)
    num += 1
    new_img = resizedImg[y:y+h, x:x+w]
    cv2.imwrite('BB/'+str(num).zfill(3) + '.jpg', new_img)
Images produced are posted here:
https://imgur.com/a/GStIcdj
I had to double the image size because Canny edge detection was producing double edges for each object (it still does, however). I have also experimented with other OpenCV functionality such as thresholding, Gaussian blur, dilation and erosion, but all in vain.
# we need one more parameter for Date cell width : as this could be different for diff bank
def crop_image_data_from_date_field(image, new_start_h, new_end_h, new_start_w, new_end_w, cell_width):
    """Cut the 8 cells of a DD/MM/YYYY date field and recognise the digits.

    Every cell is cropped, resized to 28x28, written to
    ``cropped_date/cropped_<n>.png`` (n = 1..8) and the list of files is
    handed to ``custom_predict.predict_digit``.

    Args:
        image: image containing the date field.
        new_start_h: top row of the field.
        new_end_h: bottom row of the field (exclusive).
        new_start_w: left column of the first cell.
        new_end_w: unused; kept so existing callers keep working.
        cell_width: horizontal distance between consecutive cells.

    Returns:
        Whatever ``custom_predict.predict_digit`` returns for the crops.
    """
    import os

    # cv2.imwrite does not create missing directories.
    os.makedirs('cropped_date', exist_ok=True)
    cropped_image_list = []
    for i in range(8):  # as date has only 8 fields: DD/MM/YYYY
        left = new_start_w + i * cell_width
        # skip the first column of the cell and keep 21 pixel columns
        cropped_img = image[new_start_h:new_end_h, left + 1:left + 22]
        cropped_img = cv2.resize(cropped_img, (28, 28))
        image_name = 'cropped_date/cropped_' + str(i + 1) + '.png'
        cv2.imwrite(image_name, cropped_img)
        cropped_image_list.append(image_name)
    recvd_value = custom_predict.predict_digit(cropped_image_list)
    return recvd_value
You need to specify the width of each cell and its x, y, w, h values.
I think this will help you.

Resources