Optimise memory usage with plt.imread() - python-3.x

I am working with a lot of images (120k). Each image is one colour component of a single underlying image, so I have 30k unique images, each broken down into 4 component images: one each for red, green, blue, and yellow.
For each image ID, I merge the 4 components (RGB + yellow) into an (M,N,4) array (where M and N are the dimensions of the image).
I work with the following code:
import pandas as pd
import numpy as np
import pickle
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from os import listdir

train_labels_data = pd.read_csv('/Documents/train.csv')

def merge_rgb(img_id, colours=['red', 'blue', 'green', 'yellow'], path='train'):
    """
    For each image, returns an array of shape (M,N,4)
    where each of the 4 slices is red, blue, green or yellow.
    """
    merged_colour_img = []
    for colour in colours:
        full_path = path + '/' + img_id + '_' + colour + '.png'
        colour_img = mpimg.imread(full_path)
        merged_colour_img.append(colour_img)
    merged_colour_img = np.dstack(merged_colour_img)
    return merged_colour_img
def train_data_label(train_labels_data):
    """
    From the train_labels csv file, create a list of labels, and create a large
    array for the train data in the same order.
    """
    train_ids = [img_id for img_id in train_labels_data['Id']]
    train_labels = [label for label in train_labels_data['Target']]
    print('Labels and Ids collected')
    train_data = []
    i = 0
    for img_id in train_ids:
        print('Merging Image')
        train_data_img = merge_rgb(img_id)
        print('Merging done, appending the (M,N,4) array to a list')
        train_data.append(train_data_img)
        i += 1
        print('Done appending, going to next image')
        print(i)
    print('Stacking all images in one big array')
    train_data = np.stack(train_data)
    return train_labels, train_data
train_labels, train_data = train_data_label(train_labels_data)
# SAVE OUTPUT
data_pickle_train = pickle.dumps(train_data)
with open("/Documents/train_data.pkl", "wb") as data:
    data.write(data_pickle_train)

data_pickle_train_labels = pickle.dumps(train_labels)
with open("/Documents/train_data_labels.pkl", "wb") as data:
    data.write(data_pickle_train_labels)
However, this code uses a lot of memory and crashes before all the images are processed. Since I am working with images, I suspect I could improve the merge_rgb function. Any advice on how to do that?
Thanks,
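One direction that might help (an illustrative sketch, not from the original post): for PNG input, mpimg.imread returns float32 arrays scaled to [0, 1], so every channel value costs 4 bytes. Converting each component to uint8 cuts memory 4x, and writing each merged image straight into a memory-mapped .npy file keeps only one image in RAM at a time. The dimensions M, N and the list train_ids below are placeholders; in practice they come from a sample image and the CSV.

import numpy as np
import matplotlib.image as mpimg

def merge_rgb_uint8(img_id, colours=('red', 'blue', 'green', 'yellow'), path='train'):
    """Stack the four channel images as uint8 instead of float32 (4x smaller)."""
    channels = []
    for colour in colours:
        img = mpimg.imread(path + '/' + img_id + '_' + colour + '.png')  # float32 in [0, 1]
        channels.append((img * 255).astype(np.uint8))  # 1 byte per value instead of 4
    return np.dstack(channels)

# Placeholders: in practice M, N come from one sample image and
# train_ids = list(train_labels_data['Id'])
M, N = 512, 512
train_ids = ['id1', 'id2']

# Write each merged image directly into an on-disk array so RAM usage stays flat.
train_data = np.lib.format.open_memmap('train_data.npy', mode='w+',
                                       dtype=np.uint8, shape=(len(train_ids), M, N, 4))
for i, img_id in enumerate(train_ids):
    train_data[i] = merge_rgb_uint8(img_id)
train_data.flush()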

Related

How to properly save temperature readings from thermal camera in CSV file, time vs. readings in one row

This code is for an MLX90640 infrared thermal camera. It plots a real-time temperature map across 768 (24x32) pixels using a Raspberry Pi that operates at roughly 1 frame per second, and it also saves the temperature data in a CSV file. It writes one row per second, with the time (HH:MM:SS) in column A and then the 768 readings from column B to column ACN. The problem is that the data in the first and last columns is mixed with double quotes and brackets, e.g. column A is 18:03:38 "[39.1 and column ACN is 36.8]". I used the pop method and the del method to delete the " [ ] characters, but both raise an index-out-of-range error. Any idea what causes this problem?
import RPi.GPIO as GPIO
import time, board, busio
import numpy as np
import adafruit_mlx90640
import matplotlib.pyplot as plt
from adafruit_blinka import Enum, Lockable, agnostic
import csv
import datetime

i2c = busio.I2C(board.SCL, board.SDA, frequency=800000)  # setup I2C for thermal camera
thermal_mapfile = str(datetime.datetime.now().date()) + '_' + str(datetime.datetime.now().time()).replace(':', '.')
thermal_mapfile = thermal_mapfile[:16]  # limit thermal file name to 16 characters
print("Thermal cam is ON")
mlx = adafruit_mlx90640.MLX90640(i2c)  # begin MLX90640 with I2C comm
mlx.refresh_rate = adafruit_mlx90640.RefreshRate.REFRESH_2_HZ  # set refresh rate 2Hz
mlx_shape = (24, 32)
print("Initialized")

# setup the figure for plotting
plt.ion()  # enables interactive plotting
fig, ax = plt.subplots(figsize=(12, 7))
therm1 = ax.imshow(np.zeros(mlx_shape), vmin=0, vmax=60)  # start plot with zeros
cbar = fig.colorbar(therm1)  # setup colorbar for temps
cbar.set_label(r'Temperature [$^{\circ}$C]', fontsize=14)  # colorbar label

t_array = []
frame = [0] * 768
t1 = time.monotonic()
while True:
    try:
        mlx.getFrame(frame)  # read MLX temperatures into frame var
        data_array = np.reshape(frame, mlx_shape)  # reshape to 24x32
        therm1.set_data(np.fliplr(data_array))  # flip left to right
        therm1.set_clim(vmin=np.min(data_array), vmax=np.max(data_array))  # set bounds
        cbar.update_normal(therm1)  # update colorbar range
        plt.title(f"Max Temp: {np.max(data_array):.1f}C")
        plt.pause(0.001)  # required
        t_array.append(time.monotonic() - t1)
    except ValueError:
        continue  # if error, just read again

    for h in range(24):
        for w in range(32):
            t = frame[h*32 + w]
    frame = list(np.around(np.array(frame), 1))  # round array elements to one decimal point

    with open("/home/pi/Thermal_Camera/" + thermal_mapfile + ".csv", "a") as thermalfile:
        writer = csv.writer(thermalfile, delimiter=" ")
        unix_time = time.time()
        formatted_time = datetime.datetime.fromtimestamp(unix_time).strftime('%H:%M:%S')
        writer.writerow([formatted_time, frame])
An example of what I am talking about:
import csv
import datetime

hdrs = ['dt', 'a', 'b', 'c']
data_list = [1, 2, 3]

# Case 1, passing a list directly.
with open('csv_list_test.csv', 'w') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter='|')
    csv_writer.writerow(hdrs)
    csv_writer.writerow([datetime.datetime.now().isoformat(), data_list])

cat csv_list_test.csv
dt|a|b|c
2023-01-24T17:17:44.961821|[1, 2, 3]

# Case 2, unpack list.
with open('csv_list_test.csv', 'w') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter='|')
    csv_writer.writerow(hdrs)
    csv_writer.writerow([datetime.datetime.now().isoformat(), *data_list])

cat csv_list_test.csv
dt|a|b|c
2023-01-24T17:18:32.337160|1|2|3
I use a delimiter that makes it easy to distinguish the columns; delimiter=" " is not a good idea.
In Case 1 you can see that the whole list ends up in column a.
In Case 2, unpacking the list (*data_list) puts the individual elements in the appropriate columns.
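Applied to the thermal-camera loop in the question, the same fix would be (a sketch reusing the question's variable names):

# Unpack the 768 readings so each one lands in its own CSV column,
# instead of writing one stringified list into a single field.
writer.writerow([formatted_time, *frame])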

Why are albumentations Augmentations (Yolo / YoloV5) altering Bounding Boxes if no augmentations are being placed?

I was using the Albumentations library in order to perform some data augmentations on an object detection dataset that I intended to train a YoloV5 model on.
I have to perform the augmentations separately and save the images locally to disk, but when I do, I notice that some of the returned bounding boxes aren't generated properly.
I have my augmentations set up in a separate aug.py file, shown below (augmentations purposefully removed in debugging attempts, see below):
import albumentations as A
import cv2

PROB = 0.5
bbp = A.BboxParams(format="yolo")

horizontal_flip_transform = A.Compose([
], bbox_params=bbp)

vertical_flip_transform = A.Compose([
], bbp)

pixel_dropout_transform = A.Compose([
], bbox_params=bbp)

random_rotate = A.Compose([
], bbox_params=bbp)

# NOTE: THIS METHOD IMPLIES THAT THE IMAGE WIDTHS MUST BE AT LEAST 50 PIXELS
# Remove this aug to remove this constraint
random_crop = A.Compose([
], bbox_params=bbp)

augs = [horizontal_flip_transform, vertical_flip_transform, pixel_dropout_transform, random_rotate, random_crop]

def get_augmentations():
    return augs
And the relevant parts of my implementation for performing the augmentations and saving them to disk is below:
def run_augments_on_image(img_name, bboxes, max_images_to_generate=500):
    ret = []
    img = np.array(Image.open(img_name), dtype=np.uint8)
    transforms = get_augmentations()
    for i in range(min(len(transforms), max_images_to_generate)):
        transformed = transforms[i](image=img, bboxes=bboxes)
        ret.append((transformed["image"], transformed["bboxes"]))
    return ret

def run_and_save_augments_on_image_sets(batch_img_names, bboxes_urls, max_images_to_generate, dataset_dir, trainval):
    num_images = 0
    for i in range(len(batch_img_names)):
        bboxes = []
        with open(os.path.join(dataset_dir, trainval, 'labels', bboxes_urls[i]), 'r') as f:
            for row in f:
                x = row.strip().split(' ')
                x.append(row[0])
                x.pop(0)
                x[0] = float(x[0])
                x[1] = float(x[1])
                x[2] = float(x[2])
                x[3] = float(x[3])
                bboxes.append(x)
        trans = run_augments_on_image(os.path.join(dataset_dir, trainval, 'images', batch_img_names[i]), bboxes)
        img_index = len(os.listdir(os.path.join(dataset_dir, 'train', 'images'))) + len(os.listdir(os.path.join(dataset_dir, 'valid', 'images'))) + 1
        for j in range(len(trans)):
            img_trans, bboxes_trans = trans[j]
            p = Image.fromarray(img_trans).save(os.path.join(dataset_dir, trainval, 'images', f'image-{img_index}.{batch_img_names[j].split(".")[-1]}'))
            with open(os.path.join(dataset_dir, trainval, 'labels', f'image-{img_index}.txt'), 'w') as f:
                for boxs in bboxes_trans:
                    print(f'{boxs[-1]} {boxs[0]} {boxs[1]} {boxs[2]} {boxs[3]}', file=f)
            num_images += 1
            img_index += 1
            if num_images >= max_images_to_generate:
                break
        if num_images >= max_images_to_generate:
            break
For testing purposes (some of the bounding boxes were off), I removed all the actual augmentations, expecting the input image label (one example shown below) to equal the augmented label, since no augmentations were applied. But, as you can see, the two labels are different.
img-original.txt
0 0.5662285714285714 0.2740066225165563 0.5297714285714286 0.4837913907284769
img-augmented.txt
0 0.51488 0.47173333333333334 0.6405099999999999 0.6527333333333334
(The labels above are in normalized xywh YOLO format)
Why is albumentations altering the labels? None of the augmentations in augs.py contain anything.
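One way to narrow this down (an illustrative check, not from the original post): pass a single known box through an empty Compose and see whether the YOLO-format round-trip alone changes the values. The image and box below are made up; the box follows the question's [x_center, y_center, width, height, class] layout.

import albumentations as A
import numpy as np

bbp = A.BboxParams(format="yolo")
identity = A.Compose([], bbox_params=bbp)  # no augmentations at all

img = np.zeros((480, 640, 3), dtype=np.uint8)  # dummy image
boxes = [[0.5662285714, 0.2740066225, 0.5297714286, 0.4837913907, 0]]

out = identity(image=img, bboxes=boxes)
print(out["bboxes"])  # if this differs from boxes, Compose itself is altering them;
                      # if not, the parsing/saving code is the place to look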

How to generate heat map on the Whole Slide Images (.svs format) using some probability values?

I am trying to generate a heat map, or probability map, for Whole Slide Images (WSIs) using probability values. I have coordinate points (which determine areas on the WSIs) and corresponding probability values.
Basic introduction to WSIs: WSIs are large in size (almost 100000 x 100000 pixels), so they can't be opened with a normal image viewer. WSIs are processed using the OpenSlide software.
I have seen previous Stack Overflow posts related to heat maps, but as WSIs are processed in a different way, I am unable to figure out how to apply those solutions. Some examples that I followed: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, etc.
To generate a heat map on WSIs, follow the instructions below:
First of all, extract image patches and save their coordinates, using the code below. The code requires some changes depending on your requirements. It has been copied from: patch extraction code link
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import logging
try:
    import Image
except:
    from PIL import Image
import math
import numpy as np
import openslide
import os
from time import strftime, gmtime

parser = argparse.ArgumentParser(description='Extract a series of patches from a whole slide image')
parser.add_argument("-i", "--image", dest='wsi', nargs='+', required=True, help="path to a whole slide image")
parser.add_argument("-p", "--patch_size", dest='patch_size', default=299, type=int, help="pixel width and height for patches")
parser.add_argument("-b", "--grey_limit", dest='grey_limit', default=0.8, type=float, help="greyscale value to determine if there is sufficient tissue present [default: `0.8`]")
parser.add_argument("-o", "--output", dest='output_name', default="output", help="Name of the output file directory [default: `output/`]")
parser.add_argument("-v", "--verbose",
                    dest="logLevel",
                    choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                    default="INFO",
                    help="Set the logging level")
args = parser.parse_args()

if args.logLevel:
    logging.basicConfig(level=getattr(logging, args.logLevel))

wsi = ' '.join(args.wsi)

""" Set global variables """
mean_grey_values = args.grey_limit * 255
number_of_useful_regions = 0
wsi = os.path.abspath(wsi)
outname = os.path.abspath(args.output_name)
basename = os.path.basename(wsi)
level = 0

def main():
    img, num_x_patches, num_y_patches = open_slide()
    logging.debug('img: {}, num_x_patches = {}, num_y_patches: {}'.format(img, num_x_patches, num_y_patches))
    for x in range(num_x_patches):
        for y in range(num_y_patches):
            img_data = img.read_region((x*args.patch_size, y*args.patch_size), level, (args.patch_size, args.patch_size))
            print_pics(x*args.patch_size, y*args.patch_size, img_data, img)
    pc_uninformative = number_of_useful_regions/(num_x_patches*num_y_patches)*100
    pc_uninformative = round(pc_uninformative, 2)
    logging.info('Completed patch extraction of {} images.'.format(number_of_useful_regions))
    logging.info('{}% of the image is uninformative\n'.format(pc_uninformative))

def print_pics(x_top_left, y_top_left, img_data, img):
    if x_top_left % 100 == 0 and y_top_left % 100 == 0 and x_top_left != 0:
        pc_complete = round(x_top_left / img.level_dimensions[0][0], 2) * 100
        logging.info('{:.2f}% Complete at {}'.format(pc_complete, strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())))
        exit()
    img_data_np = np.array(img_data)
    """ Convert to grayscale"""
    grey_img = rgb2gray(img_data_np)
    if np.mean(grey_img) < mean_grey_values:
        logging.debug('Image grayscale = {} compared to threshold {}'.format(np.mean(grey_img), mean_grey_values))
        global number_of_useful_regions
        number_of_useful_regions += 1
        wsi_base = os.path.basename(wsi)
        wsi_base = wsi_base.split('.')[0]
        img_name = wsi_base + "_" + str(x_top_left) + "_" + str(y_top_left) + "_" + str(args.patch_size)
        #write_img_rotations(img_data_np, img_name)
        logging.debug('Saving {} {} {}'.format(x_top_left, y_top_left, np.mean(grey_img)))
        save_image(img_data_np, 1, img_name)

def gen_x_and_y(xlist, ylist, img):
    for x in xlist:
        for y in ylist:
            img_data = img.read_region((x*args.patch_size, y*args.patch_size), level, (args.patch_size, args.patch_size))
            yield (x, y, img_data)

def open_slide():
    """
    The first level is always the main image
    Get width and height tuple for the first level
    """
    logging.debug('img: {}'.format(wsi))
    img = openslide.OpenSlide(wsi)
    img_dim = img.level_dimensions[0]
    """
    Determine what the patch size should be, and how many iterations it will take to get through the WSI
    """
    num_x_patches = int(math.floor(img_dim[0] / args.patch_size))
    num_y_patches = int(math.floor(img_dim[1] / args.patch_size))
    remainder_x = img_dim[0] % num_x_patches
    remainder_y = img_dim[1] % num_y_patches
    logging.debug('The WSI shape is {}'.format(img_dim))
    logging.debug('There are {} x-patches and {} y-patches to iterate through'.format(num_x_patches, num_y_patches))
    return img, num_x_patches, num_y_patches

def validate_dir_exists():
    if os.path.isdir(outname) == False:
        os.mkdir(outname)
    logging.debug('Validated {} directory exists'.format(outname))
    if os.path.exists(wsi):
        logging.debug('Found the file {}'.format(wsi))
    else:
        logging.debug('Could not find the file {}'.format(wsi))
        exit()

def rgb2gray(rgb):
    """Converts an RGB image into grayscale """
    r, g, b = rgb[:,:,0], rgb[:,:,1], rgb[:,:,2]
    gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
    return gray

def save_image(img, j, img_name):
    tmp = os.path.join(outname, img_name + "_" + str(j) + ".png")
    try:
        im = Image.fromarray(img)
        im.save(tmp)
    except:
        print('Could not print {}'.format(tmp))
        exit()

if __name__ == '__main__':
    validate_dir_exists()
    main()
Secondly, generate the probability values for each patch.
Finally, replace all the pixel values within each patch's coordinates with the corresponding probability value and display the result using a colour map.
This is the basic idea of generating a heat map on WSIs. You can modify the code and the concept to get a heat map as per your wish.
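As an illustration of that final step, here is a minimal sketch (the coordinates, probabilities and dimensions below are made-up placeholders standing in for the outputs of steps 1 and 2): paint each patch's footprint into a downsampled array, then display it with a colour map.

import numpy as np
import matplotlib.pyplot as plt

# Hypothetical inputs from the previous steps
wsi_width, wsi_height = 100000, 100000        # level-0 WSI dimensions
patch_size = 299
scale = 100                                   # downsample so the map fits in memory
patch_coords = [(0, 0), (299, 0), (0, 299)]   # top-left corners of saved patches
patch_probs = [0.1, 0.9, 0.5]                 # model probability per patch

heat = np.zeros((wsi_height // scale + 1, wsi_width // scale + 1))
for (x, y), p in zip(patch_coords, patch_probs):
    # paint the patch footprint, at the downsampled scale, with its probability
    heat[y // scale:(y + patch_size) // scale + 1,
         x // scale:(x + patch_size) // scale + 1] = p

plt.imshow(heat, cmap='jet', vmin=0.0, vmax=1.0)
plt.colorbar(label='probability')
plt.show()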
We have developed a python package for processing whole-slide-images:
https://github.com/amirakbarnejad/PyDmed
Here is a tutorial for getting heatmaps for whole-slide-images:
https://amirakbarnejad.github.io/Tutorial/tutorial_section5.html.
Also here is a sample notebook that gets heatmaps for WSIs using PyDmed:
Link to the sample notebook.
The benefit of PyDmed is that it is multi-processed. The dataloader sends a stream of patches to GPU(s), and the StreamWriter writes to disk in a separate process. Therefore, it is highly efficient. The running time of course depends on the machine, the size of WSIs, etc. On a good machine with a good GPU, PyDmed can generate heatmaps for ~120 WSIs in one day.

Separate Spam and Ham for WordCloud Visualization

I am performing spam detection and want to visualize spam and ham keywords separately in Wordcloud. Here's my .csv file.
import pandas as pd

data = pd.read_csv("spam.csv", encoding='latin-1')
data = data.rename(columns={"v1": "label", "v2": "message"})
data = data.replace({"spam": "1", "ham": "0"})
Here's my code for WordCloud. I need help with spam_words. I cannot generate the right graph.
import matplotlib.pyplot as plt
from wordcloud import WordCloud
spam_words = ' '.join(list(data[data['label'] == 1]['message']))
spam_wc = WordCloud(width=512, height=512).generate(spam_words)
plt.figure(figsize=(10, 8), facecolor='k')
plt.imshow(spam_wc)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
The issue is that the current code replaces "spam" and "ham" with the one-character strings "1" and "0", but you filter the DataFrame based on comparison with the integer 1. Change the replace line to this:
data = data.replace({"spam": 1, "ham": 0})
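With integer labels in place, the same pattern should work for the ham cloud too (a sketch following the question's own code):

# Same pipeline for the ham class, selecting label 0 instead of 1
ham_words = ' '.join(list(data[data['label'] == 0]['message']))
ham_wc = WordCloud(width=512, height=512).generate(ham_words)
plt.figure(figsize=(10, 8), facecolor='k')
plt.imshow(ham_wc)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()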

Matplotlib animation.FuncAnimation: Custom frame generator only yields once

I'm encountering a strange problem with the matplotlib animation module. I'm trying to create an animated bar plot using the following code:
import os, time
from PIL import Image, ImageSequence
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.path as path
import matplotlib.animation as animation
import blackxample

FILE_PREFIX = "cell-isotohyper"
FILE_SUFFIX = ".tif"
FILE_PATH = "./example-video"
XCUT = (91, 91+266)
YCUT = (646, 646+252)
LIMIT = 100
OFFSET = 0
Y_SCALE = 3000
NUM_OF_BINS = 37
BAR_WIDTH = 1.0
BAR_COLOR = 'b'
RANGE = range(0, NUM_OF_BINS//2+1)

def animate(i, fig, ax, bars):
    # i = np.random.randn(1000)
    print(len(i))
    for a in RANGE:
        bars[a].set_height(i[a])
    return (fig, ax, bars)

def main():
    fig, ax = plt.subplots()
    ax.set_ylim(0, Y_SCALE)
    ax.set_xlim(0, NUM_OF_BINS//2+1)
    bars = ax.bar(np.arange(NUM_OF_BINS), [i for i in range(NUM_OF_BINS)], BAR_WIDTH, color=BAR_COLOR)
    ani = animation.FuncAnimation(fig, animate, xframes, fargs=(fig, ax, bars), interval=500)
    plt.show()
This code snippet works completely fine if I'm using randomly generated or constant data via:
def xframes():
    i = 0
    while i < 100:
        yield [2312.7094266223335, 27.238786592368257, 75.252063484372513, 13.678304922077643, 11.879804374653929, 21.900570139020687, 2.930771773796323, 11.945594479736741, 10.88517941461987, 4.4176609254771506, 4.1075871395528338, 1.248363771876285, 1.4798157379442216, 3.5285036346353564, 3.2583080973651732, 3.4640042567344267, 3.130503535456981, 0.67334205875304676, 0.71393606581800562]
        #yield np.histogram(np.random.randn(1000), NUM_OF_BINS//2 + 1)[0]
        i += 1
Using the function aframes instead only yields the first item when used with animation.FuncAnimation(). If aframes is iterated manually, however, the generator works completely fine.
def aframes():
    list_of_files = []
    for dirname, dirnames, filenames in os.walk(FILE_PATH):
        for filename in filenames:
            if filename.startswith(FILE_PREFIX) and filename.endswith(FILE_SUFFIX):
                list_of_files.append(os.path.join(FILE_PATH, filename))
    # Open every picture - in every file
    count = 0
    imagecount = 0
    framecount = 0
    skipped = 0
    for file in list_of_files:
        framecount = 0
        a = Image.open(file)
        for frame in ImageSequence.Iterator(a):
            if count > OFFSET and count <= OFFSET+LIMIT:
                # Cut image beforehand - probably faster
                frame = frame.crop((XCUT[0], YCUT[0], XCUT[1], YCUT[1]))
                # Load image into Matrix
                imageMatrix = blackxample.Matrix.fromPillow(frame)
                try:
                    imageMatrix.findContour()
                    imageMatrix.calculateCentroid()
                    imageMatrix.transform(NUM_OF_BINS)
                    #yield imageMatrix.getTransform()
                    yield [2312.7094266223335, 27.238786592368257, 75.252063484372513, 13.678304922077643, 11.879804374653929, 21.900570139020687, 2.930771773796323, 11.945594479736741, 10.88517941461987, 4.4176609254771506, 4.1075871395528338, 1.248363771876285, 1.4798157379442216, 3.5285036346353564, 3.2583080973651732, 3.4640042567344267, 3.130503535456981, 0.67334205875304676, 0.71393606581800562]
                except blackxample.NoConvergenceError:
                    skipped += 1
                print("[", count, "] done")
                framecount += 1
            count += 1
        imagecount += 1

# Test for frame iterator - works fine
#for i in aframes():
#    print(i)
Does someone have a clue what is happening and why? How can I fix it?
The generator also runs as expected if the three imageMatrix lines inside the try block are commented out, which suggests that there is an error inside imageMatrix.findContour(). But what am I looking for? findContour doesn't do anything weird.
Since I have not found any solution to this problem, I've decided to save the result of aframes() to a file and then read and animate it separately, which works flawlessly without adjusting the animation code.
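That two-step workaround could look something like this (a sketch, not the poster's exact code; it assumes the generator's output has been saved as a 2-D array with one row per frame):

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation

# Step 1 (run once): exhaust the slow generator and save its output.
# np.save('frames.npy', np.array(list(aframes())))

# Step 2: animate from the file; no generator involved any more.
frames = np.load('frames.npy')  # shape: (num_frames, num_bins)
fig, ax = plt.subplots()
ax.set_ylim(0, frames.max())
bars = ax.bar(np.arange(frames.shape[1]), frames[0])

def animate(row):
    for bar, height in zip(bars, row):
        bar.set_height(height)
    return bars

ani = animation.FuncAnimation(fig, animate, frames=frames, interval=500)
plt.show()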
