How can I speed up using Pytorch DataLoader? - pytorch

I have a dataset of about a million rows. Previously, I read the rows, preprocessed the data, and built a list of rows to train on. Then I defined a DataLoader over this data like:
# Batch the rows of the 'train' split using the caller-supplied collate function.
train_dataloader = torch.utils.data.DataLoader(
    mydata['train'],
    collate_fn=data_collator,
    shuffle=shuffle,
    batch_size=node_batch_size,
)
Preprocessing could be time consuming, so I thought to define an IterableDataSet with __iter__ function. Then I could define my Dataloader like:
# An IterableDataset controls its own iteration order, so DataLoader refuses a
# truthy `shuffle` argument for it. The option is therefore left unspecified
# here; any shuffling has to be done inside the dataset's own __iter__.
train_dataloader = torch.utils.data.DataLoader(
    myds['train'],
    batch_size=node_batch_size,
    collate_fn=data_collator,
)
However, training still does not begin until my preprocessing function has been called and an iterator has been created over its result, so it seems I did not gain much speed.
How can I speed things up in this case?
Here is part of my class:
def __iter__(self):
    """Return an iterator over the samples this process is responsible for.

    Without worker processes the range is ``[self.start, self.num_samples)``.
    Inside a DataLoader worker that range is divided into equally sized
    contiguous chunks, one per worker, and only this worker's chunk is used.

    If ``self.flat_data`` is non-empty it is iterated directly and the range
    computed here is not applied to it.
    """
    info = torch.utils.data.get_worker_info()
    if info is None:
        # Single-process data loading: cover the full range.
        first, last = self.start, self.num_samples
    else:
        # Worker process: pick the chunk that belongs to this worker id.
        span = self.num_samples - self.start
        chunk = int(math.ceil(span / float(info.num_workers)))
        first = self.start + info.id * chunk
        last = min(first + chunk, self.num_samples)

    # NOTE(review): the cached rows are returned whole in every worker --
    # confirm that this is intended when num_workers > 0.
    if self.flat_data:
        return iter(self.flat_data)
    return iter(self.fill_data(first, last))
def fill_data(self, iter_start, iter_end, show_progress=False):
flat_data = []
if iter_end < 0:
iter_end = self.num_samples
kk = 0
dlog.info("========================== SPLIT: %s", self.split_name)
dlog.info("get data from %s to %s", iter_start, iter_end)
dlog.info("total rows: %s", len(self.split_df))
if show_progress:
pbar = tqdm(total = self.num_samples)
for index, d in self.split_df.iterrows():
if kk < iter_start:
dlog.info("!!!!!!!!! before start %s", iter_start)
kk += 1
continue
rel = d["prefix"]
...
# preprocessing and adding to returned list

I did the preprocessing in the body of fill_data or __iter__. However, I can use map for the preprocessing instead. Then the preprocessing is called lazily during training, for every batch, and not before training starts.
import pandas as pd
import torch
class MyDataset(torch.utils.data.IterableDataset):
    """Iterable dataset that pre-processes each row only when it is consumed.

    The table is read once in ``__init__``; ``__iter__`` wraps the raw
    ``(prefix, input_text)`` pairs in ``map`` so that ``preproc`` runs per
    item while the consumer iterates, not up front.
    """

    def __init__(self, fname, until=10):
        """Read ``atomic/<fname>`` as a delimited table.

        Args:
            fname: File name, appended to the ``atomic/`` directory.
            until: Stored on the instance; nothing in this class reads it.
        """
        self.df = pd.read_table("atomic/" + fname)
        self.until = until

    def preproc(self, t):
        """Turn one ``(prefix, data)`` pair into the final text sample."""
        prefix, data = t
        text = "Preproc: " + prefix + "|" + data
        print(text)  # to check when it is called
        return text

    def __iter__(self):
        """Return a lazy iterator of pre-processed samples."""
        return map(self.preproc, self.df_iter())

    def df_iter(self):
        """Return a lazy iterator of ``(prefix, input_text)`` tuples.

        The two columns are zipped directly instead of building a full list
        through ``DataFrame.iterrows()``, so nothing is materialised before
        the first sample is requested.
        """
        return zip(self.df["prefix"], self.df["input_text"])

Related

How to run one (pre-trained lstm) model on CPU and second (pose estimation) model on GPU simultaneously using multithreading?

Note: my problem description is long and may be tedious to read, but I am trying to make my case and the error clear.
**What I am doing:**
I am implementing socket transmission in python. In client side, object detection is performed, and detected number of people and detected frames are sent to server side. The server side consists of multiple threaded classes to handle data from client and perform pose estimation to monitor and log GPU utilization. When I run code, the whole code is running while logging GPU memory usage about 46% as expected. Here, I provide below DataManager.py (to handle data from client) and ChildProcess.py (to get frames data from queue and perform pose estimation) which are part of the whole code.
DataManager.py
class DataManagerThread(Thread):
    """Thread that reads length-prefixed pickled messages from a socket.

    Each message is an 8-byte ``"Q"`` length header followed by a pickled
    dict with the keys ``'frame'`` and ``'people'``. The frame is forwarded
    to ``image_queue``; when the peer closes the connection the string
    ``"End"`` is queued so the consumer can stop.
    """

    def __init__(self, queue, sock, index):
        super().__init__()
        self.image_queue = queue
        self.server_socket = sock
        self.index = index

    def run(self):
        """Receive messages until the socket is closed, then queue "End"."""
        header_size = struct.calcsize("Q")
        data = b""
        while True:
            data = self._fill_buffer(data, header_size)
            if data is None:
                break
            msg_size = struct.unpack("Q", data[:header_size])[0]
            data = self._fill_buffer(data[header_size:], msg_size)
            if data is None:
                break
            frame_data, data = data[:msg_size], data[msg_size:]
            # NOTE(review): pickle.loads can execute arbitrary code; only
            # use this with a trusted client.
            data_dict = pickle.loads(frame_data)
            # extract frame and detection information from data dictionary
            img = data_dict['frame']
            people = data_dict['people']
            print(f'Detected number of people: {people}')
            # Hand the frame over to the consumer through the queue.
            self.put_data_to_queue(img)
        # Reached only when the peer closed the connection.
        print("[System] end socket")
        self.put_data_to_queue("End")

    def _fill_buffer(self, data, size):
        """Receive until ``data`` holds at least ``size`` bytes.

        Returns the (possibly longer) buffer, or ``None`` when ``recv``
        returns an empty packet before enough bytes have arrived.
        """
        while len(data) < size:
            packet = self.server_socket.recv(4 * 1024)
            if not packet:
                return None
            data += packet
        return data

    def put_data_to_queue(self, image):
        """Put one item on the image queue."""
        self.image_queue.put(image)
ChildProcess.py
import warnings
warnings.filterwarnings(action="ignore")
from multiprocessing import Process
from tf_pose.estimator import TfPoseEstimator
from tf_pose.networks import get_graph_path, model_wh
import argparse
import cv2
import logging
import time
def str2bool(v):
    """Return True if *v*, lower-cased, is "yes", "true", "t" or "1"."""
    lowered = v.lower()
    return any(lowered == token for token in ("yes", "true", "t", "1"))
def init_logger():
    """Configure the 'TfPoseEstimator-WebCam' logger for DEBUG console output.

    The console handler is attached only once: calling this function again
    leaves the existing handler in place, so records are not emitted twice.
    """
    handler_name = 'TfPoseEstimator-WebCam-console'
    logger = logging.getLogger('TfPoseEstimator-WebCam')
    logger.setLevel(logging.DEBUG)
    if any(h.get_name() == handler_name for h in logger.handlers):
        return
    ch = logging.StreamHandler()
    ch.set_name(handler_name)
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter('[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)
class ChildProcess(Process):
    """Process that takes frames from a queue and runs pose estimation.

    ``run`` builds the estimator inside the child process, then handles
    queue items until it receives a ``str`` (the producer's end marker).
    """

    def __init__(self, queue):
        super().__init__()
        self.image_queue = queue
        self.start_time = time.time()

    def run(self):
        """Initialise the model, then process frames until a str arrives."""
        args, w, h, e = self.init_model()
        print("[Time]", time.time() - self.start_time)
        while True:
            print("[System] Run motion")
            image = self.image_queue.get()
            if isinstance(image, str):
                print("[System end process]")
                break
            self.motionTracking(args, e, w, h, image)

    def motionTracking(self, args, e, w, h, decimg):
        """Run inference on one frame and show the annotated result.

        Args:
            args: Parsed options; ``args.resize_out_ratio`` is used.
            e: The estimator returned by ``init_model``.
            w, h: Target size; default resizing is used when both are > 0.
            decimg: The frame to analyse (drawn on in place).
        """
        humans = e.inference(decimg, resize_to_default=(w > 0 and h > 0),
                             upsample_size=args.resize_out_ratio)
        image = TfPoseEstimator.draw_humans(decimg, humans, imgcopy=False)
        # Vertical pixel position of body part 0 for every detected human.
        # The original compared consecutive values against a threshold of 30
        # but took no action; the values are still collected for that use.
        y1 = [0.0]
        for human in humans:
            try:
                a = human.body_parts[0]
            except (KeyError, IndexError):
                # presumably body part 0 was not detected for this human --
                # TODO confirm against tf_pose's Human.body_parts type
                continue
            y1.append(a.y * image.shape[0])
        cv2.imshow('tf-pose-estimation result', image)
        _ = 0xFF & cv2.waitKey(1)

    def init_model(self):
        """Parse the estimator options and build the TfPoseEstimator.

        Returns:
            ``(args, w, h, e)``: the parsed options, the width and height
            from ``--resize``, and the estimator. When the requested size is
            not positive the estimator is built for 432x368.
        """
        print("[System] model init")
        parser = argparse.ArgumentParser(description='tf-pose-estimation realtime webcam')
        parser.add_argument('--camera', type=int, default=0)
        parser.add_argument('--resize', type=str, default='0x0',
                            help='if provided, resize images before they are processed. default=0x0, Recommends : 432x368 or 656x368 or 1312x736 ')
        parser.add_argument('--resize-out-ratio', type=float, default=4.0,
                            help='if provided, resize heatmaps before they are post-processed. default=1.0')
        parser.add_argument('--model', type=str, default='mobilenet_thin',
                            help='cmu / mobilenet_thin / mobilenet_v2_large / mobilenet_v2_small')
        # type=bool would turn any non-empty string (even "False") into True.
        parser.add_argument('--show-process', type=str2bool, default=False,
                            help='for debug purpose, if enabled, speed for inference is dropped.')
        parser.add_argument('--tensorrt', type=str, default="False",
                            help='for tensorrt process.')
        args = parser.parse_args()
        print('[System] initialization %s : %s' % (args.model, get_graph_path(args.model)))
        w, h = model_wh(args.resize)
        target_size = (w, h) if w > 0 and h > 0 else (432, 368)
        e = TfPoseEstimator(get_graph_path(args.model), target_size=target_size,
                            trt_bool=str2bool(args.tensorrt))
        print("[System] End model")
        return args, w, h, e
What I want to do and the error I am getting:
Here, I want to add my LSTM code to make a prediction from the detected number of people (people = data_dict['people']) in the DataManager.py file.
DataManager(added_lstm).py
# import some necessary libraries
config = tf.compat.v1.ConfigProto()
graph = tf.compat.v1.get_default_graph()
first_session = tf.compat.v1.Session(config=config)
# Load the LSTM with the shared graph and session active and a CPU device
# scope around the load (one `with`; the graph was entered twice before).
with graph.as_default(), first_session.as_default(), tf.device('CPU:0'):
    model = tf.keras.models.load_model('/home/tf-pose-estimation/modules/lstm_model/model1.h5', compile=False)
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
model.summary()
def make_prediction(m):
    """Forecast the next value from a 5-element window that ends with *m*.

    Returns:
        ``(actual, forecast)``: the last value of the window (``m``) and the
        first element of the model's prediction for that window.
    """
    window_size = 5
    smoothing = 0.9
    trend = 3
    ewma_history = [0]
    # NOTE(review): the history is rebuilt on every call, so the window is
    # always four 1s followed by m -- confirm whether state should persist.
    history = [1, 1, 1, 1, 1] + [m]
    window = history[1:] if len(history) > window_size else history
    if len(window) == window_size:
        actual = window[-1]
        batch = np.array(window[-window_size:]).reshape(1, window_size, 1)
        with graph.as_default(), first_session.as_default():
            forecast = model.predict(batch)[0][0]
        # Exponentially weighted average and clamped trend counter; both are
        # local bookkeeping and are not part of the return value.
        smoothed = smoothing * forecast + (1 - smoothing) * ewma_history[-1]
        step = 1 if smoothed > 0.5 else -1
        trend = min(max(trend + step, 0), 2)
        ewma_history.append(smoothed)
    return actual, forecast
class DataManagerThread(Thread):
    """Thread that receives frames from a socket and forecasts people counts.

    Each message is an 8-byte ``"Q"`` length header followed by a pickled
    dict with the keys ``'frame'`` and ``'people'``. The frame is queued for
    the consumer and the people count is passed to ``make_prediction``.
    When the peer closes the connection the string ``"End"`` is queued so
    the consumer is not left waiting on the queue.
    """

    def __init__(self, queue, sock, index):
        super().__init__()
        self.image_queue = queue
        self.server_socket = sock
        self.index = index

    def run(self):
        """Receive messages until the socket is closed, then queue "End"."""
        header_size = struct.calcsize("Q")
        data = b""
        while True:
            data = self._fill_buffer(data, header_size)
            if data is None:
                break
            msg_size = struct.unpack("Q", data[:header_size])[0]
            data = self._fill_buffer(data[header_size:], msg_size)
            if data is None:
                break
            frame_data, data = data[:msg_size], data[msg_size:]
            # NOTE(review): pickle.loads can execute arbitrary code; only
            # use this with a trusted client.
            data_dict = pickle.loads(frame_data)
            # extract frame and detection information from data dictionary
            img = data_dict['frame']
            people = data_dict['people']
            print(f'Detected number of people: {people}')
            self.put_data_to_queue(img)
            pred = make_prediction(people)  # Added for lstm prediction
            print(f"Predictions: {pred}")
        # Reached only when the peer closed the connection.
        print("[System] end socket")
        self.put_data_to_queue("End")

    def _fill_buffer(self, data, size):
        """Receive until ``data`` holds at least ``size`` bytes.

        Returns the (possibly longer) buffer, or ``None`` when ``recv``
        returns an empty packet before enough bytes have arrived.
        """
        while len(data) < size:
            packet = self.server_socket.recv(4 * 1024)
            if not packet:
                return None
            data += packet
        return data

    def put_data_to_queue(self, image):
        """Put one item on the image queue."""
        self.image_queue.put(image)
Here, the only changes are in the DataManager.py file: I load the LSTM model, define the make_prediction function, and call pred = make_prediction(people) inside the DataManagerThread(Thread) class to make a prediction. The other code in this file is unchanged.
When I run both models simultaneously, only the LSTM makes predictions and the pose estimation is just frozen. Also, even though the LSTM model is forced to use the CPU, about 84% of GPU memory is occupied, and I do not know why. My expectation is that the LSTM model should use the CPU, the pose estimation model should use the GPU, and both models should run simultaneously.
When I run each model separately (i.e., the LSTM on the CPU and pose estimation on the GPU), they work well. Specifically, I tested the LSTM model separately by feeding it generated random numbers, and it worked as expected. The LSTM model was trained in TensorFlow 2.5 and both models are running in TensorFlow 2.5.
Below is my PC and Env specifications:
GPU: NVIDIA GeForce RTX 2070 SUPER
Driver Version: 525
CUDA Version: 11.6
Python: 3.9.12
Tensorflow-gpu: 2.5.0
Is there a possible or relevant solution for running the LSTM model on the CPU and the pose estimation model on the GPU simultaneously using multithreading?
Any help appreciated!!!

Distributed sequential windowed data in pytorch

At every epoch of my training, I need to split my dataset in n batches of t consecutive samples. For example, if my data is [1,2,3,4,5,6,7,8,9,10], n = 2 and t = 3 then valid batches would be
[1-2-3, 4-5-6] and [7-8-9, 10-1-2]
[2-3-4, 8-9-10] and [5-6-7, 1-2-3]
My old version is the following, but it samples every point in the data, meaning that I would parse the whole dataset t times per epoch.
# Every index in [0, n) is a candidate starting point of a window.
train_dataset = list(range(n))
train_sampler = (
    torch.utils.data.distributed.DistributedSampler(train_dataset)
    if distributed
    else None
)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    sampler=train_sampler,
    shuffle=(train_sampler is None),
    batch_size=bsize,
    pin_memory=True,
)
for epoch in range(epochs):
    if train_sampler is not None:
        train_sampler.set_epoch(epoch)
    for starting_i in train_loader:
        # One row per starting index: t consecutive indices, wrapped modulo n.
        batch = np.array([np.arange(i, i + t) % n for i in starting_i])
I have now implemented my own sampling function that splits the data into random batches in which each starting index is exactly t away from its two nearest neighbours. In the non-distributed scenario, I can do
for epoch in range(epochs):
    # Random offset, then starting indices spaced t apart and wrapped modulo n.
    pad = np.random.randint(n)
    train_loader = np.arange(pad, n + pad, t) % n
    np.random.shuffle(train_loader)
    n_batches = np.ceil(len(train_loader) / bsize)
    train_loader = np.array_split(train_loader, n_batches)
    for starting_i in train_loader:
        # One row per starting index: t consecutive indices, wrapped modulo n.
        batch = np.array([np.arange(i, i + t) % n for i in starting_i])
How do I make this version distributed? Do I need to make a custom torch.nn.parallel.DistributedDataParallel or torch.utils.data.DataLoader?
I have checked the DistributedSampler class
and my guess is that I have to override the __iter__ method. Am I right?
How does DistributedSampler split the dataset? Is it sequentially among num_replicas?
Say num_replicas = 2. Would my dataset be split into [1,2,3,4,5] and [6,7,8,9,10] between the 2 workers? Or is it random? Like [1,4,7,3,10] and [2,9,5,8,6]? First case would be ok for me because keeps samples sequential, but second would not.
I ended up making my own Dataset where the data is [t, t + window, ... t + n * window]. Every time it is called it randomizes the starting indices of the window. Then the sampler does the shuffling as usual. For reproducibility, it has a set_seed method similar to set_epoch of samplers.
class SequentialWindowedDataset(Dataset):
    """Dataset of window starting indices, shifted by a seed-derived offset.

    The base starting indices are ``0, window, 2*window, ...`` below
    ``size``. Items are those indices shifted by a pseudo-random offset drawn
    from ``seed`` and wrapped modulo ``size``.
    """

    def __init__(self, size, window):
        self.size = size
        self.window = window
        self.seed = 0
        self.data = np.arange(0, self.size, self.window)
        # Cache of the shifted indices and the seed they were computed for.
        self._shifted = None
        self._shifted_seed = None

    def _shifted_starts(self):
        """Return the shifted indices, recomputing only when the seed changed."""
        if self._shifted is None or self._shifted_seed != self.seed:
            rng = np.random.default_rng(self.seed)
            pad = rng.integers(0, self.size)
            self._shifted = (self.data + pad) % self.size
            self._shifted_seed = self.seed
        return self._shifted

    def __getitem__(self, index):
        # The generator and the shifted array used to be rebuilt on every
        # call; the values are identical, they are just computed once per seed.
        return self._shifted_starts()[index]

    def __len__(self):
        return len(self.data)

    def set_seed(self, seed):
        """Select the seed used for the offset of subsequent lookups."""
        self.seed = seed
The following version randomizes the data outside the call and it is much much faster.
class SequentialWindowedDataset(Dataset):
    """Dataset of window starting indices that is re-shuffled explicitly.

    ``data`` starts as ``0, window, 2*window, ...`` below ``size``.
    ``randomize(seed)`` replaces it with those base indices shifted by an
    offset drawn from ``seed`` and wrapped modulo ``size``.
    """

    def __init__(self, size, window):
        self.size = size
        self.window = window
        # Unshifted starting indices; every randomize() call starts from
        # these, so the result depends on the seed alone.
        self._starts = np.arange(0, self.size, self.window)
        self.data = self._starts.copy()

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)

    def randomize(self, seed):
        """Shift the base starting indices by an offset derived from *seed*.

        The shift used to be applied on top of the previous one, which made
        the result depend on how often the method had been called before.
        """
        rng = np.random.default_rng(seed)
        pad = rng.integers(0, self.size)
        self.data = (self._starts + pad) % self.size

Pytorch sequential data loader

I have looked through the documentation with for ex class IterableDataset and Start / End but I'm just not good enough to solve this one at the moment.
Training my model with random batches is fine, but using it for predictions I need it to start from min(index) up to max(index). So I wanted to re-use below and change to fit that.
Now it will take random items from the range so I can get duplicate predictions of the same index number. ex range(5) in index 1,2,3,4,5 might give 4,2,2,3,4 not desired 1,2,3,4,5.
# One sample per batch, with shuffling enabled.
test_dataloader = DataLoader(
    test_dataset,
    shuffle=True,
    batch_size=1,
)
DataLoader shuffle = False, then it just takes len / max of index.
I probably need to change the sampler.
class CompanyDataset(Dataset):
    """Dataset yielding one random training/forecast window per sensor.

    Rows of the CSV are grouped by the ``index`` column (sensors are numbered
    from 1). Each item is a window of ``training_length`` input rows followed
    by ``forecast_window`` target rows, taken at a random offset.
    """

    # Feature columns read for both the input and the target window.
    FEATURE_COLUMNS = ["A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10", "A11"]

    def __init__(self, csv_name, root_dir, training_length, forecast_window):
        """
        Args:
            csv_name (string): Name of the csv file inside ``root_dir``.
            root_dir (string): Directory
            training_length (int): Number of rows in the input window.
            forecast_window (int): Number of rows in the target window.
        """
        # load raw data file
        csv_file = os.path.join(root_dir, csv_name)
        self.df = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = MinMaxScaler()
        self.T = training_length
        self.S = forecast_window

    def __len__(self):
        # return number of sensors
        return len(self.df.groupby(by=["index"]))

    # Will pull an index between 0 and __len__.
    def __getitem__(self, idx):
        """Return ``(index_in, index_tar, _input, target, station)``.

        The first feature column of both windows is rescaled with a scaler
        fitted on the input window; the fitted scaler is written to
        ``scalar_item.joblib``.
        """
        # Sensors are indexed from 1
        idx = idx + 1
        # Filter the sensor's rows once instead of once per use.
        rows = self.df[self.df["index"] == idx]
        # np.random.seed(0)
        start = np.random.randint(0, len(rows) - self.T - self.S)
        split = start + self.T
        stop = split + self.S
        # Previously bound to `Company` while the return statement used the
        # undefined name `station`.
        station = str(rows[["station"]][start:start + 1].values.item())
        index_in = torch.tensor(list(range(start, split)))
        index_tar = torch.tensor(list(range(split, stop)))
        features = rows[self.FEATURE_COLUMNS]
        _input = torch.tensor(features[start:split].values)
        # The target selection had an extra `[`, which left the expression
        # unbalanced; it now mirrors the input selection.
        target = torch.tensor(features[split:stop].values)
        scaler = self.transform
        scaler.fit(_input[:, 0].unsqueeze(-1))
        _input[:, 0] = torch.tensor(scaler.transform(_input[:, 0].unsqueeze(-1)).squeeze(-1))
        target[:, 0] = torch.tensor(scaler.transform(target[:, 0].unsqueeze(-1)).squeeze(-1))
        dump(scaler, 'scalar_item.joblib')
        return index_in, index_tar, _input, target, station

Building a dataset with dataloader pytorch getting error cannot import name 'read_data_sets'

I am loading data into a dataset using a PyTorch DataLoader.
I am getting the error: cannot import name 'read_data_sets'.
I tried searching for results from similar issues.
If the problem is that Python is confusing one of my files with a module of the same name, and so cannot find read_data_sets in my file, how do I fix it?
class MRDataset(data.Dataset):
    """Dataset of ``.npy`` volumes with binary labels read from a CSV file.

    Training data comes from ``train/<plane>/`` and ``train-<task>.csv``;
    otherwise ``valid/<plane>/`` and ``valid-<task>.csv`` are used and any
    supplied transform is discarded.
    """

    def __init__(self, root_dir, task, plane, train=True, transform=None, weights=None):
        super().__init__()
        self.task = task
        self.plane = plane
        self.root_dir = root_dir
        self.train = train

        if self.train:
            self.folder_path = self.root_dir + 'train/{0}/'.format(plane)
            csv_path = self.root_dir + 'train-{0}.csv'.format(task)
        else:
            transform = None
            self.folder_path = self.root_dir + 'valid/{0}/'.format(plane)
            csv_path = self.root_dir + 'valid-{0}.csv'.format(task)
        self.records = pd.read_csv(csv_path, header=None, names=['id', 'label'])

        # Left-pad every id with zeros to at least four characters.
        self.records['id'] = self.records['id'].map(lambda i: str(i).rjust(4, '0'))
        self.paths = []
        for filename in self.records['id'].tolist():
            self.paths.append(self.folder_path + filename + '.npy')
        self.labels = self.records['label'].tolist()
        self.transform = transform

        if weights is not None:
            self.weights = torch.FloatTensor(weights)
        else:
            # Weight the positive class by the negative/positive ratio.
            pos = np.sum(self.labels)
            neg = len(self.labels) - pos
            self.weights = torch.FloatTensor([1, neg / pos])

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, index):
        """Return ``(array, label, weights)`` for the sample at *index*.

        Labels 1 and 0 become the tensors ``[[0, 1]]`` and ``[[1, 0]]``; any
        other label value is returned unchanged.
        """
        volume = np.load(self.paths[index])
        label = self.labels[index]
        if label == 0:
            label = torch.FloatTensor([[1, 0]])
        elif label == 1:
            label = torch.FloatTensor([[0, 1]])

        if not self.transform:
            # No transform: repeat the volume three times along a new axis 1.
            volume = np.stack((volume,) * 3, axis=1)
            volume = torch.FloatTensor(volume)
        else:
            volume = self.transform(volume)
        return volume, label, self.weights
There is a model and train class to run this. Arguments specified in train.
Running the train should load data and run through model

AssertionError: Format for classes is `<label> file`

This is a python script for detecting features in a set of images for a SVM.
import os
import sys
import argparse
import _pickle as cPickle
import json
import cv2
import numpy as np
from sklearn.cluster import KMeans
def build_arg_parser():
    """Build the command-line parser for the feature-creation script.

    Options: ``--samples`` (repeatable, one or more values each, stored in
    ``cls``), ``--codebook-file``, ``--feature-map-file`` (all required) and
    ``--scale-image`` (int, default 150, stored in ``scale``).
    """
    option_specs = (
        ("--samples", dict(
            dest="cls", nargs="+", action="append", required=True,
            help="Folders containing the training images. "
                 "The first element needs to be the class label.")),
        ("--codebook-file", dict(
            dest='codebook_file', required=True,
            help="Base file name to store the codebook")),
        ("--feature-map-file", dict(
            dest='feature_map_file', required=True,
            help="Base file name to store the feature map")),
        ("--scale-image", dict(
            dest="scale", type=int, default=150,
            help="Scales the longer dimension of the image down to this size.")),
    )
    parser = argparse.ArgumentParser(description='Creates features for given images')
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser
def load_input_map(label, input_folder):
    """Collect every ``.jpg`` file below *input_folder*, tagged with *label*.

    Args:
        label: Class label stored with every image found.
        input_folder: Directory that is searched recursively.

    Returns:
        A list of ``{'label': label, 'image': path}`` dicts. Directories and
        files are visited in sorted order, so the result does not depend on
        the order in which the filesystem lists them.

    Raises:
        IOError: If *input_folder* is not an existing directory. The message
            names the folder (it used to be printed, with a bare exception).
    """
    if not os.path.isdir(input_folder):
        raise IOError("The folder " + input_folder + " doesn't exist")
    combined_data = []
    for root, dirs, files in os.walk(input_folder):
        dirs.sort()  # in-place sort makes os.walk descend in sorted order
        for filename in sorted(files):
            if filename.endswith('.jpg'):
                combined_data.append({'label': label, 'image': os.path.join(root, filename)})
    return combined_data
class FeatureExtractor(object):
    """Computes dense SIFT descriptors and the codebook built from them."""

    def extract_image_features(self, img):
        """Return the descriptors computed at the dense keypoints of *img*."""
        keypoints = DenseDetector().detect(img)
        _, descriptors = SIFTExtractor().compute(img, keypoints)
        return descriptors

    def get_centroids(self, input_map, num_samples_to_fit=10):
        """Cluster descriptors from a subset of the images in *input_map*.

        At most *num_samples_to_fit* consecutive images are used per label;
        further images are skipped until an entry with a different label
        appears.

        Returns:
            ``(kmeans, centroids)`` as produced by ``Quantizer.quantize``.
        """
        descriptors = []
        taken = 0
        active_label = ''
        for entry in input_map:
            if taken >= num_samples_to_fit:
                if entry['label'] == active_label:
                    continue
                # A new label starts: begin counting again.
                taken = 0
            taken += 1
            if taken == num_samples_to_fit:
                print ("Built centroids for", entry['label'])
            active_label = entry['label']
            image = resize_to_size(cv2.imread(entry['image']), 150)
            descriptors.extend(self.extract_image_features(image))
        kmeans, centroids = Quantizer().quantize(descriptors)
        return kmeans, centroids

    def get_feature_vector(self, img, kmeans, centroids):
        """Delegate to ``Quantizer.get_feature_vector``."""
        quantizer = Quantizer()
        return quantizer.get_feature_vector(img, kmeans, centroids)
def extract_feature_map(input_map, kmeans, centroids):
    """Compute the feature vector of every image listed in *input_map*.

    Returns:
        A list of ``{'label': ..., 'feature_vector': ...}`` dicts; entries
        whose feature vector is ``None`` are left out.
    """
    feature_map = []
    for entry in input_map:
        label = entry['label']
        print ("Extracting features for", entry['image'])
        image = resize_to_size(cv2.imread(entry['image']), 150)
        vector = FeatureExtractor().get_feature_vector(image, kmeans, centroids)
        if vector is None:
            continue
        feature_map.append({'label': label, 'feature_vector': vector})
    return feature_map
class Quantizer(object):
    """K-means vector quantizer that builds bag-of-words histograms."""

    def __init__(self, num_clusters=32):
        self.num_dims = 128
        self.extractor = SIFTExtractor()
        self.num_clusters = num_clusters
        self.num_retries = 10

    def quantize(self, datapoints):
        """Fit k-means on *datapoints* and return ``(kmeans, centroids)``."""
        restarts = max(self.num_retries, 1)
        kmeans = KMeans(self.num_clusters, n_init=restarts, max_iter=10, tol=1.0)
        fitted = kmeans.fit(datapoints)
        return kmeans, fitted.cluster_centers_

    def normalize(self, input_data):
        """Scale *input_data* to sum to 1; return it as is if the sum is not positive."""
        total = np.sum(input_data)
        return input_data / total if total > 0 else input_data

    def get_feature_vector(self, img, kmeans, centroids):
        """Return the normalised cluster histogram of *img* with shape (1, num_clusters)."""
        keypoints = DenseDetector().detect(img)
        _, descriptors = self.extractor.compute(img, keypoints)
        labels = kmeans.predict(descriptors)
        histogram = np.zeros(self.num_clusters)
        # Unbuffered add: counts every occurrence of a repeated label.
        np.add.at(histogram, labels, 1)
        return self.normalize(histogram.reshape(1, histogram.shape[0]))
class DenseDetector(object):
    """Produces keypoints on a regular grid instead of detecting them.

    The original built the detector with
    ``cv2.xfeatures2d.SIFT_create("Dense")`` and configured it via
    ``setInt``, which does not match the SIFT factory's signature. The grid
    is generated directly here instead.
    """

    def __init__(self, step_size=20, feature_scale=40, img_bound=20):
        """
        Args:
            step_size: Distance in pixels between neighbouring keypoints.
            feature_scale: Size assigned to every keypoint.
            img_bound: Margin in pixels kept free at each image border.
        """
        self.step_size = step_size
        self.feature_scale = feature_scale
        self.img_bound = img_bound

    def detect(self, img):
        """Return a list of ``cv2.KeyPoint`` objects covering *img* on a grid."""
        rows, cols = img.shape[:2]
        xs = range(self.img_bound, cols - self.img_bound, self.step_size)
        ys = range(self.img_bound, rows - self.img_bound, self.step_size)
        return [
            cv2.KeyPoint(float(x), float(y), float(self.feature_scale))
            for x in xs
            for y in ys
        ]
class SIFTExtractor(object):
    """Computes SIFT descriptors at caller-supplied keypoints."""

    @staticmethod
    def _create_sift():
        """Create a SIFT object with whichever factory this OpenCV build has."""
        if hasattr(cv2, "SIFT_create"):
            return cv2.SIFT_create()
        if hasattr(cv2, "xfeatures2d"):
            return cv2.xfeatures2d.SIFT_create()
        # Legacy constructor, the only one the original code tried.
        return cv2.SIFT()

    def compute(self, image, kps):
        """Return ``(keypoints, descriptors)`` for the BGR *image*.

        Raises:
            TypeError: If *image* is ``None``.
        """
        if image is None:
            raise TypeError("Not a valid image")
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        kps, des = self._create_sift().compute(gray_image, kps)
        return kps, des
# Scale the image so that its shorter side becomes 'new_size',
# keeping the aspect ratio
def resize_to_size(input_image, new_size=150):
    """Return *input_image* resized so its shorter side measures *new_size*."""
    h, w = input_image.shape[0], input_image.shape[1]
    shorter_side = w if w < h else h
    ds_factor = new_size / float(shorter_side)
    target = (int(w * ds_factor), int(h * ds_factor))
    return cv2.resize(input_image, target)
if __name__=='__main__':
    args = build_arg_parser().parse_args()

    # Every --samples occurrence must supply a label followed by a folder.
    input_map = []
    for cls in args.cls:
        assert len(cls) >= 2, "Format for classes is `<label> file`"
        label = cls[0]
        input_map += load_input_map(label, cls[1])

    # NOTE(review): args.scale is parsed but not applied; the resize calls
    # in this file use a fixed size of 150.

    # Building the codebook
    print ("===== Building codebook =====")
    kmeans, centroids = FeatureExtractor().get_centroids(input_map)
    if args.codebook_file:
        # The file imports `_pickle as cPickle` (there is no `pickle` name),
        # and pickle output is bytes, so the file is opened in binary mode.
        with open(args.codebook_file, 'wb') as f:
            cPickle.dump((kmeans, centroids), f)

    # Input data and labels
    print ("===== Building feature map =====")
    feature_map = extract_feature_map(input_map, kmeans, centroids)
    if args.feature_map_file:
        with open(args.feature_map_file, 'wb') as f:
            cPickle.dump(feature_map, f)
I receive the following error:
Traceback (most recent call last):
File "create_features.py", line 164, in <module>
assert len(cls) >= 2, ("Format for classes is `<label> file`")
AssertionError: Format for classes is `<label> file`
Any idea what could be wrong? I'm just following the instructions in 'OpenCV with Python by Example' by Prateek Joshi, pages 494-526.
Assertions are used to check a condition. If the condition isn't satisfied, an AssertionError is raised. In your case, len(cls) >= 2 isn't satisfied, which means that len(cls) is smaller than 2. Here, cls is the list of values passed to one --samples argument of the program. The first element of this list must be a label, so when you pass a folder (the `file` in the message), you must also specify a label for it.
For example, if you choose the label name my_label, you must pass the folder as --samples my_label my_folder.

Resources