keras BatchGenerator(keras.utils.Sequence) is too slow - multithreading

I'm using a custom batch generator with large dataframe. but the Generator takes too much time to generate a batch, it takes 127s to generate a batch of 1024. I've tried Dask but still, the processing is slow. is there any way to integrate multiprocessing with inside the generator. knowing that I've tried use_multiprocessing=True with workers=12
import keras
from random import randint
import glob
import warnings
import numpy as np
import math
import pandas as pd
import dask.dataframe as dd
class BatchGenerator(keras.utils.Sequence):
'Generates data for Keras'
def __init__(self, labels=None, batch_size=8, n_classes=4, shuffle=True,
seq_len=6, data_path=None, meta_path=None,list_IDs=None):
'Initialization'
self.batch_size = batch_size
self.labels = labels
self.n_classes = n_classes
self.shuffle = shuffle
self.seq_len = seq_len
self.meta_df = meta_path
self.data_df = data_path
self.data_df = self.data_df.astype({"mjd": int})
self.list_IDs = list_IDs
if self.list_IDs==None:
self.list_IDs = list(self.meta_df['object_id'].unique())
self.on_epoch_end()
def __len__(self):
'Denotes the number of batches per epoch'
return int(np.floor(len(self.list_IDs) / self.batch_size))
def __getitem__(self, index):
'Generate one batch of data'
# Generate indexes of the batch
indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
# Find list of IDs
list_IDs_temp = [self.list_IDs[k] for k in indexes]
# Generate data
X, y = self.__data_generation(list_IDs_temp)
return X, y
def on_epoch_end(self):
'Updates indexes after each epoch'
self.indexes = np.arange(len(self.list_IDs))
if self.shuffle == True:
np.random.shuffle(self.indexes)
def __data_generation(self, list_IDs_temp):
X_dat = np.zeros((self.batch_size, self.seq_len,6,1))
Y_mask = np.zeros((self.batch_size, self.seq_len,6,1))
# Y_dat = np.empty((self.batch_size,1), dtype=int)
X_length= np.empty((self.batch_size,1), dtype=int)
for i, trans_id in enumerate(list_IDs_temp):
curve = self.data_df[self.data_df.object_id==trans_id]
mjdlist = list(curve['mjd'].unique())
ts_length = len(mjdlist)
if ts_length <= self.seq_len :
start_ind = 0
else :
start_ind = randint(0, ts_length - self.seq_len)
ts_length = self.seq_len
for j in range(ts_length):
if j+start_ind < len(mjdlist):
step = curve[curve.mjd==mjdlist[j+start_ind]]
for k in range(len(step.mjd)):
obs = step[step.passband==k]
if len(obs) == 0 :
# print('here is one')
continue
else:
if k == 0:
X_dat[i,j,0,0] =obs.flux.iloc[0]
Y_mask[i,j,0,0] = 1
if k == 1:
X_dat[i,j,1,0] = obs.flux.iloc[0]
Y_mask[i,j,1,0] = 1
if k == 2:
X_dat[i,j,2,0] = obs.flux.iloc[0]
Y_mask[i,j,2,0] = 1
if k == 3:
X_dat[i,j,3,0] = obs.flux.iloc[0]
Y_mask[i,j,3,0] = 1
if k == 4:
X_dat[i,j,4,0] = obs.flux.iloc[0]
Y_mask[i,j,4,0] = 1
if k == 5:
X_dat[i,j,5,0] = obs.flux.iloc[0]
Y_mask[i,j,5,0] = 1
# meta = self.meta_df[self.meta_df['object_id'] == trans_id]
# Y_dat[i] = self.labels[int(meta['target'])]
X_length[i,0] = ts_length
flux_max = np.max(X_dat[i])
flux_min = np.min(X_dat[i])
flux_pow = math.log2(flux_max - flux_min)
X_dat[i] /= flux_pow
X_noised = X_dat + np.random.uniform(low=0, high=0.5, size=X_dat.shape)
return [X_noised, X_length, np.reshape(Y_mask,(self.batch_size, self.seq_len*6))], np.reshape(X_dat,(self.batch_size, self.seq_len*6))

To make it faster, the for loop in the function __data_generation should be parallelized. Using the joblib package may help.

Related

PyTorch Dataloader bucket by tensor length

I've been trying to create a custom Dataloader that can serve batches of data that are all same-sized to feed into a Conv2d layer for classification purposes.
Here's some test data
X is a NUMBER OF POINTS x CHOICES x NUM_FEATURES, while y is the label (that can be any integer CHOICES-1)
I'm having trouble writing the Sampler and Dataloader.
import random
import torch
from collections import defaultdict
from sklearn.utils import shuffle
from torch.utils.data import Dataset, DataLoader
from typing import Sequence, Iterator
import numpy as np
sample_probs = np.array([2.04302017e-03, 6.84249612e-03, 3.18776004e-02, 6.69332322e-01,
1.79056125, 1.63388916, 1.31819391, 1.43798623,
2.44057406, 5.51664089e-01, 9.66624185e-02, 1.67495225e-02,
3.59960696e-03, 2.43216687e-05])
X = []
y = []
train_datasets = []
i_dict = {0: 19,
1: 63,
2: 30,
3: 6192,
4: 16564,
5: 15115,
6: 12195,
7: 13303,
8: 22578,
9: 5103,
10: 894,
11: 155,
12: 33,
13: 2}
for i in range(2,16):
temp_x = []
temp_y = []
for j in range(i_dict[i-2]):
temp_x.append(torch.rand(i, 4, 1))
temp_y.append(torch.tensor(random.randint(0,i-1)))
X = torch.stack(temp_x)
y = torch.stack(temp_y)
train_datasets.append((X.clone(),y.clone()))
class WeightedBucketSampler(torch.utils.data.Sampler):
def __init__(self, data, weights: Sequence[float], num_samples: int,
replacement: bool = True, generator=None, shuffle=True, drop_last=False):
super().__init__(data)
self.shuffle = shuffle
self.drop_last = drop_last
self.weights = torch.as_tensor(weights, dtype=torch.double)
self.num_samples = num_samples
self.replacement = replacement
self.generator = generator
self.buckets = defaultdict(list)
'''data is a CustomDataset containing a tensor of COUNT x NUM_ROUTES x FEATURES x 1 and a tensor with the corresponding labels'''
counter = 0
for i in range(len(data)):
self.buckets[i+2] += [data[i][0],data[i][1]]
counter += len(data[i][0])
self.length = counter
def __iter__(self) -> Iterator[int]:
# Choose a bucket depending on the weighted sample
rand_bucket = torch.multinomial(self.weights, self.num_samples, self.replacement, generator=self.generator).tolist()[0]
shifter = sum([len(self.buckets[i+2][0]) for i in range(rand_bucket)])
# Generate random indices from the bucket
rand_tensor = torch.randperm(len(self.buckets[rand_bucket+2][0]), generator=self.generator)
yield from torch.add(rand_tensor, shifter).tolist()
def __len__(self):
return self.length
class CustomDataset(Dataset):
def __init__(self, data):
self.routes = dict()
self.choice = dict()
counter = 0
for i in range(len(data)):
for j in range(len(data[i][0])):
self.routes[counter] = data[i][0][j]
self.choice[counter] = data[i][1][j]
counter += 1
def __len__(self):
return len(self.choice)
def __getitem__(self, idx):
choice = self.choice[idx]
routes = self.routes[idx]
return routes, choice
train_datasets_ds = CustomDataset(train_datasets)
bucket_sampler = WeightedBucketSampler(train_datasets, sample_probs,len(sample_probs), shuffle=True, drop_last=False)
loader = DataLoader(train_datasets_ds, sampler=bucket_sampler, batch_size=32, pin_memory=True)
for X,y in loader:
print(X.size(),y.size())
This code is a combination of WeightedRandomSampler and Bucket sampling code
I'm essentially sampling via the sample weights of each classification to choose a bucket, and from that bucket choose randomly to form a batch up to batch_size.
However, when going through loader, I get the output:
...
torch.Size([32, 10, 4, 1]) torch.Size([32])
torch.Size([32, 10, 4, 1]) torch.Size([32])
torch.Size([32, 10, 4, 1]) torch.Size([32])
torch.Size([18, 10, 4, 1]) torch.Size([18])
The sum of all these batches add up to the elements in bucket 10. So it's right, but it's not jumping to another bucket. Rerunning the code
for X,y in loader:
print(X.size(),y.size())
will produce another bucket's batches.
I'm still learning PyTorch, so some of the code might be inefficient. Would love some advice as well!
Thanks to some help on the unofficial PyTorch Discord channel (sudomaze), I've fixed my problem. There's a need to iterate through all the data in the sampler.
The __len__ function in the sampler also needed fixing.
class WeightedBucketSampler(Sampler[List[int]]):
def __init__(self, data, weights: Sequence[float], num_samples: int,
replacement: bool = True, generator=None, shuffle=True, batch_size=32, drop_last=False):
super().__init__(data)
self.shuffle = shuffle
self.drop_last = drop_last
self.weights = torch.as_tensor(weights, dtype=torch.double)
self.num_samples = num_samples
self.replacement = replacement
self.generator = generator
self.batch_size = batch_size
self.buckets = defaultdict(list)
'''data is a CustomDataset containing a tensor of COUNT x NUM_ROUTES x FEATURES x 1 and a tensor with the corresponding labels'''
counter = 0
for i in range(len(data)):
self.buckets[i+2] += [data[i][0],data[i][1]]
counter += len(data[i][0])
self.length = counter
def __iter__(self) -> Iterator[int]:
# Choose a bucket depending on the weighted sample
rand_bucket = torch.multinomial(self.weights, self.num_samples, self.replacement, generator=self.generator)
batch = [0] * self.batch_size
idx_in_batch = 0
for bucket_idx in rand_bucket.tolist():
bucketsample_count = 0
shifter = sum([len(self.buckets[i+2][0]) for i in range(bucket_idx)])
# Generate random indices from the bucket and shift them
rand_tensor = torch.randperm(len(self.buckets[bucket_idx+2][0]), generator=self.generator)
# print(len(self.buckets[bucket_idx+2][0]), len(rand_tensor.tolist()))
for idx in rand_tensor.tolist():
batch[idx_in_batch] = idx+shifter
idx_in_batch += 1
if idx_in_batch == self.batch_size:
bucketsample_count += self.batch_size
yield batch
idx_in_batch = 0
batch = [0] * self.batch_size
if idx_in_batch > 0:
bucketsample_count += idx_in_batch
yield batch[:idx_in_batch]
# The last remaining tensors are added into one batch. Terminate batch and move to next bucket
idx_in_batch = 0
batch = [0] * self.batch_size
continue
def __len__(self):
return (self.length + (self.batch_size - 1)) // self.batch_size
class CustomDataset(Dataset):
def __init__(self, data):
self.routes = dict()
self.choice = dict()
counter = 0
for i in range(len(data)):
for j in range(len(data[i][0])):
self.routes[counter] = data[i][0][j]
self.choice[counter] = data[i][1][j]
counter += 1
def __len__(self):
return len(self.choice)
def __getitem__(self, idx):
choice = self.choice[idx]
routes = self.routes[idx]
return routes, choice
w = np.array([len(i[0]) for i in train_datasets])
sample_probs = 1/sample_probs*w
train_datasets_ds = CustomDataset(train_datasets)
bucket_sampler = WeightedBucketSampler(train_datasets, sample_probs,len(sample_probs), shuffle=True, batch_size=batch_size, drop_last=False)
train_loader = DataLoader(train_datasets_ds, batch_sampler=bucket_sampler)

Torch throws a RuntimeError: element 0 of tensors does not require grad... but can't find where computational graph is severed

I am getting the above error:
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
I looked this up and it looks like the computational graph is not connected for some reason. However, I cannot find the location where the graph is severed.
My code is a reproduction of the arjovsky WGAN: https://github.com/martinarjovsky/WassersteinGAN
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import torch
import torch.nn as nn
from __future__ import print_function
import random
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
from torch.autograd import Variable
import os
import json
class MLP_G(nn.Module):
def __init__(self, isize, nz, ngf, ngpu):
super(MLP_G, self).__init__()
self.ngpu = ngpu
main = nn.Sequential(
# Z goes into a linear of size: ngf
nn.Linear(nz, ngf),
nn.ReLU(True),
nn.Linear(ngf, ngf),
nn.ReLU(True),
nn.Linear(ngf, ngf),
nn.ReLU(True),
nn.Linear(ngf, isize),
)
self.main = main
self.isize = isize
self.nz = nz
def forward(self, input):
input = input.view(input.size(0), input.size(1))
if isinstance(input.data, torch.cuda.FloatTensor) and self.ngpu > 1:
output = nn.parallel.data_parallel(self.main, input, range(self.ngpu))
else:
output = self.main(input)
return output.view(output.size(0), self.isize)
class MLP_D(nn.Module):
def __init__(self, isize, nz, ndf, ngpu):
super(MLP_D, self).__init__()
self.ngpu = ngpu
main = nn.Sequential(
# Z goes into a linear of size: ndf
nn.Linear(isize, ndf),
nn.ReLU(True),
nn.Linear(ndf, ndf),
nn.ReLU(True),
nn.Linear(ndf, ndf),
nn.ReLU(True),
nn.Linear(ndf, 1),
)
self.main = main
self.isize = isize
self.nz = nz
def forward(self, input):
input = input.view(input.size(0),input.size(1))
if isinstance(input.data, torch.cuda.FloatTensor) and self.ngpu > 1:
output = nn.parallel.data_parallel(self.main, input, range(self.ngpu))
else:
output = self.main(input)
output = output.mean(0)
return output.view(1)
netG = None #path to saved generator
netD = None #discriminator path
batchSize = 1000 #size of batch (which is size of data)
cuda = False
lrD = lrG = .00005
beta1 = .5
niter = 25
experiment = '/content/drive/MyDrive/savefolder'
clamp_upper = .01
clamp_lower = -clamp_upper
manualSeed = random.randint(1, 10000) # fix seed
print("Random Seed: ", manualSeed)
random.seed(manualSeed)
torch.manual_seed(manualSeed)
cudnn.benchmark = True
dataset = torch.tensor(np.stack([x,y, instrument], axis = 1)).float().reshape(-1,3)
ngpu = 1
nz = 4 #three latents and the instrument
ngf = 128
ndf = 128
# custom weights initialization called on netG and netD
def weights_init(m):
classname = m.__class__.__name__
if classname.find('Conv') != -1:
m.weight.data.normal_(0.0, 0.02)
elif classname.find('BatchNorm') != -1:
m.weight.data.normal_(1.0, 0.02)
m.bias.data.fill_(0)
netG = MLP_G(2, nz, ngf, ngpu)
netG.apply(weights_init)
print(netG)
netD = MLP_D(3, nz, ndf, ngpu)
print(netD)
input = torch.FloatTensor(batchSize, 2)
noise = torch.FloatTensor(batchSize, nz-1)
fixed_noise = torch.FloatTensor(batchSize, nz-1).normal_(0, 1)
one = torch.FloatTensor([1])
mone = one * -1
# setup optimizer
optimizerD = optim.Adam(netD.parameters(), lr=lrD, betas=(beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=lrG, betas=(beta1, 0.999))
real_cpu = data = dataset
gen_iterations = 0
for epoch in range(niter):
#data_iter = iter(dataloader)
############################
# (1) Update D network
###########################
for p in netD.parameters(): # reset requires_grad
p.requires_grad = True # they are set to False below in netG update
# train the discriminator Diters times
if gen_iterations < 25 or gen_iterations % 500 == 0:
Diters = 100
else:
Diters = 5
j = 0
while j < Diters:
j += 1
# clamp parameters to a cube
for p in netD.parameters():
p.data.clamp_(clamp_lower, clamp_upper)
# train with real
netD.zero_grad()
if cuda:
real_cpu = real_cpu.cuda()
input.resize_as_(real_cpu).copy_(real_cpu)
inputv = Variable(input, requires_grad=False)
errD_real = netD(inputv)
errD_real.backward(one)#Error Occurs here
# train with fake
noise.resize_(batchSize, nz-1).normal_(0, 1)
noisev = torch.cat([Variable(noise, requires_grad=False), dataset[:,2].reshape(-1,1)], 1)# totally freeze netG
fake = torch.cat([Variable(netG(noisev).data), dataset[:,2].view(-1,1)], 1)
inputv = fake
errD_fake = netD(inputv)
errD_fake.backward(mone)
errD = errD_real - errD_fake
optimizerD.step()
############################
# (2) Update G network
###########################
for p in netD.parameters():
p.requires_grad = False # to avoid computation
netG.zero_grad()
# in case our last batch was the tail batch of the dataloader,
# make sure we feed a full batch of noise
noise.resize_(batchSize, nz-1).normal_(0, 1)
noisev = torch.cat([Variable(noise), dataset[:,2].view(-1,1)], 1)
fake = torch.cat([netG(noisev), dataset[:,2].view(-1,1)], 1)
errG = netD(fake)
errG.backward(one)
optimizerG.step()
gen_iterations += 1
i = 0
print('[%d/%d][%d] Loss_D: %f Loss_G: %f Loss_D_real: %f Loss_D_fake %f'
% (epoch, niter, gen_iterations,
errD.data[0], errG.data[0], errD_real.data[0], errD_fake.data[0]))
# if gen_iterations % 500 == 0:
# real_cpu = real_cpu.mul(0.5).add(0.5)
# vutils.save_image(real_cpu, '{0}/real_samples.png'.format(opt.experiment))
# fake = netG(Variable(fixed_noise, volatile=True))
# fake.data = fake.data.mul(0.5).add(0.5)
# vutils.save_image(fake.data, '{0}/fake_samples_{1}.png'.format(opt.experiment, gen_iterations))
# do checkpointing
torch.save(netG.state_dict(), '{0}/netG_epoch_{1}.pth'.format(experiment, epoch))
torch.save(netD.state_dict(), '{0}/netD_epoch_{1}.pth'.format(experiment, epoch))
Error occurs on the line: errD_real.backward(one). The error might be something regarding zeroing out the computational graph as the code runs for one iteration then throws an error. Thanks for your help.
You most certainly need to add require_grad=True on one. You could define it as:
one = torch.tensor([1], dtype=torch.float16, requires_grad=True)

Building a dataset with dataloader pytorch getting error cannot import name 'read_data_sets'

Loading data into dataset using pytorch dataloader.
Getting error cannot import name 'read_data_sets'
Tried searaching for results from similar issues.
If there is confusion about file instead of module and it can't find read_data_sets in your file How do i change to fix?
class MRDataset(data.Dataset):
def __init__(self, root_dir, task, plane, train=True, transform=None, weights=None):
super().__init__()
self.task = task
self.plane = plane
self.root_dir = root_dir
self.train = train
if self.train:
self.folder_path = self.root_dir + 'train/{0}/'.format(plane)
self.records = pd.read_csv(
self.root_dir + 'train-{0}.csv'.format(task), header=None, names=['id', 'label'])
else:
transform = None
self.folder_path = self.root_dir + 'valid/{0}/'.format(plane)
self.records = pd.read_csv(
self.root_dir + 'valid-{0}.csv'.format(task), header=None, names=['id', 'label'])
self.records['id'] = self.records['id'].map(
lambda i: '0' * (4 - len(str(i))) + str(i))
self.paths = [self.folder_path + filename +
'.npy' for filename in self.records['id'].tolist()]
self.labels = self.records['label'].tolist()
self.transform = transform
if weights is None:
pos = np.sum(self.labels)
neg = len(self.labels) - pos
self.weights = torch.FloatTensor([1, neg / pos])
else:
self.weights = torch.FloatTensor(weights)
def __len__(self):
return len(self.paths)
def __getitem__(self, index):
array = np.load(self.paths[index])
label = self.labels[index]
if label == 1:
label = torch.FloatTensor([[0, 1]])
elif label == 0:
label = torch.FloatTensor([[1, 0]])
if self.transform:
array = self.transform(array)
else:
array = np.stack((array,)*3, axis=1)
array = torch.FloatTensor(array)
# if label.item() == 1:
# weight = np.array([self.weights[1]])
# weight = torch.FloatTensor(weight)
# else:
# weight = np.array([self.weights[0]])
# weight = torch.FloatTensor(weight)
return array, label, self.weights
There is a model and train class to run this. Arguments specified in train.
Running the train should load data and run through model

Is it possible to us a csv file to connect one part of the algo to another

I am trying to modify a triangular arbitrage crypto trading bot to include a predictive capability with a neural network. I've found some open source algorithms on GitHub, but I am having problem integrating them.
I've been trying to separate parts of the code into modules and using a continuously updated csv file to direct the data from the first half of the algorithm into the second, but it just isn't working.
I tried to create modules for different parts of the algorithm, but it didn't work:
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers import Bidirectional
from keras.models import Sequential
from binance.client import Client
from binance.enums import *
from sklearn.metrics import mean_squared_error
import time
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
from pandas import Series
from matplotlib import cm
api_key = BinanceKey1['api_key']
api_secret = BinanceKey1['api_secret']
client = Client(api_key, api_secret)
import csv
import json
from binance.client import Client
import csv
import json
from binance.client import Client
client = Client(api_key, api_secret)
client = Client("", "")
klines1 = client.get_historical_klines("BNBBTC", Client.KLINE_INTERVAL_1MINUTE,
"1 day ago UTC")
csv.write(klines1)
# fetch 30 minute klines for the last month of 2017
klines2 = client.get_historical_klines("ETHBTC", Client.KLINE_INTERVAL_30MINUTE,
"1 Dec, 2017", "1 Jan, 2018")
csv.write(klines2)
# fetch weekly klines since it listed
klines3 = client.get_historical_klines("NEOBTC", Client.KLINE_INTERVAL_1WEEK,
"1 Jan, 2017")
csv.write(klines3)
def load_data(klines, sequence_length):
raw_data = pd.read_csv(klines, dtype=float).values
for x in range(0, raw_data.shape[0]):
for y in range(0, raw_data.shape[1]):
if(raw_data[x][y] == 0):
raw_data[x][y] = raw_data[x-1][y]
data = raw_data.tolist()
result = []
for index in range(len(data) - sequence_length):
result.append(data[index: index + sequence_length])
d0 = np.array(result)
dr = np.zeros_like(d0)
dr[:, 1:, :] = d0[:, 1:, :] / d0[:, 0:1, :] - 1
start = 2400
end = int(dr.shape[0] + 1)
unnormalized_bases = d0[start:end, 0:1, 20]
split_line = round(0.9 * dr.shape[0])
training_data = dr[:int(split_line), :]
np.random.shuffle(training_data)
X_train = training_data[:, :-1]
Y_train = training_data[:, -1]
Y_train = Y_train[:, 20]
X_test = dr[int(split_line):, :-1]
Y_test = dr[int(split_line):, 49, :]
Y_test = Y_test[:, 20]
Y_daybefore = dr[int(split_line):, 48, :]
Y_daybefore = Y_daybefore[:, 20]
sequence_length = sequence_length
window_size = sequence_length - 1
return X_train, Y_train, X_test, Y_test, Y_daybefore, unnormalized_bases, window_size
def initialize_model(window_size, dropout_value, activation_function, loss_function, optimizer):
model = Sequential()
model.add(Bidirectional(LSTM(window_size, return_sequences=True), input_shape=(window_size, X_train.shape[-1]),))
model.add(Dropout(dropout_value))
model.add(Bidirectional(LSTM((window_size*2), return_sequences=True)))
model.add(Dropout(dropout_value))
model.add(Bidirectional(LSTM(window_size, return_sequences=False)))
model.add(Dense(units=1))
model.add(Activation(activation_function))
model.compile(loss=loss_function, optimizer=optimizer)
return model
def fit_model(model, X_train, Y_train, batch_num, num_epoch, val_split):
start = time.time()
model.fit(X_train, Y_train, batch_size= batch_num, nb_epoch=num_epoch, validation_split= val_split)
training_time = int(math.floor(time.time() - start))
return model, training_time
def test_model(model, X_test, Y_test, unnormalized_bases):
y_predict = model.predict(X_test)
real_y_test = np.zeros_like(Y_test)
real_y_predict = np.zeros_like(y_predict)
for i in range(Y_test.shape[0]):
y = Y_test[i]
predict = y_predict[i]
real_y_test[i] = (y+1)*unnormalized_bases[i]
real_y_predict[i] = (predict+1)*unnormalized_bases[i]
fig = plt.figure(figsize=(10,5))
ax = fig.add_subplot(111)
ax.set_title("Bitcoin Price Over Time")
plt.plot(real_y_predict, color = 'green', label = 'Predicted Price')
plt.plot(real_y_test, color = 'red', label = 'Real Price')
ax.set_ylabel("Price (USD)")
ax.set_xlabel("Time (Days)")
ax.legend()
return y_predict, real_y_test, real_y_predict, fig
def price_change(Y_daybefore, Y_test, y_predict):
Y_daybefore = np.reshape(Y_daybefore, (-1, 1))
Y_test = np.reshape(Y_test, (-1, 1))
delta_predict = (y_predict - Y_daybefore) / (1+Y_daybefore)
delta_real = (Y_test - Y_daybefore) / (1+Y_daybefore)
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111)
ax.set_title("Percent Change in Bitcoin Price Per Day")
plt.plot(delta_predict, color='green', label = 'Predicted Percent Change')
plt.plot(delta_real, color='red', label = 'Real Percent Change')
plt.ylabel("Percent Change")
plt.xlabel("Time (Days)")
ax.legend()
plt.show()
return Y_daybefore, Y_test, delta_predict, delta_real, fig
def binary_price(delta_predict, delta_real):
delta_predict_1_0 = np.empty(delta_predict.shape)
delta_real_1_0 = np.empty(delta_real.shape)
for i in range(delta_predict.shape[0]):
if delta_predict[i][0] > 0:
delta_predict_1_0[i][0] = 1
else:
delta_predict_1_0[i][0] = 0
for i in range(delta_real.shape[0]):
if delta_real[i][0] > 0:
delta_real_1_0[i][0] = 1
else:
delta_real_1_0[i][0] = 0
return delta_predict_1_0, delta_real_1_0
def find_positives_negatives(delta_predict_1_0, delta_real_1_0):
true_pos = 0
false_pos = 0
true_neg = 0
false_neg = 0
for i in range(delta_real_1_0.shape[0]):
real = delta_real_1_0[i][0]
predicted = delta_predict_1_0[i][0]
if real == 1:
if predicted == 1:
true_pos += 1
else:
false_neg += 1
elif real == 0:
if predicted == 0:
true_neg += 1
else:
false_pos += 1
return true_pos, false_pos, true_neg, false_neg
def calculate_statistics(true_pos, false_pos, true_neg, false_neg, y_predict, Y_test):
precision = float(true_pos) / (true_pos + false_pos)
recall = float(true_pos) / (true_pos + false_neg)
F1 = float(2 * precision * recall) / (precision + recall)
MSE = mean_squared_error(y_predict.flatten(), Y_test.flatten())
return precision, recall, F1, MSE
X_train, Y_train, X_test, Y_test, Y_daybefore, unnormalized_bases, window_size = load_data("Bitcoin Data.csv", 50)
print (X_train.shape)
print (Y_train.shape)
print (X_test.shape)
print (Y_test.shape)
print (Y_daybefore.shape)
print (unnormalized_bases.shape)
print (window_size)
model = initialize_model(window_size, 0.2, 'linear', 'mse', 'adam')
print model.summary()
model, training_time = fit_model(model, X_train, Y_train, 1024, 100, .05)
print "Training time", training_time, "seconds"
y_predict, real_y_test, real_y_predict, fig1 = test_model(model, X_test, Y_test, unnormalized_bases)
plt.show(fig1)
Y_daybefore, Y_test, delta_predict, delta_real, fig2 = price_change(Y_daybefore, Y_test, y_predict)
plt.show(fig)
delta_predict_1_0, delta_real_1_0 = binary_price(delta_predict, delta_real)
print delta_predict_1_0.shape
print delta_real_1_0.shape
true_pos, false_pos, true_neg, false_neg = find_positives_negatives(delta_predict_1_0, delta_real_1_0)
print "True positives:", true_pos
print "False positives:", false_pos
print "True negatives:", true_neg
print "False negatives:", false_neg
precision, recall, F1, MSE = calculate_statistics(true_pos, false_pos, true_neg, false_neg, y_predict, Y_test)
print "Precision:", precision
print "Recall:", recall
print "F1 score:", F1
print "Mean Squared Error:", MSE
class Client(object):
API_URL = 'https://api.binance.com/api'
WITHDRAW_API_URL = 'https://api.binance.com/wapi'
WEBSITE_URL = 'https://www.binance.com'
PUBLIC_API_VERSION = 'v1'
PRIVATE_API_VERSION = 'v3'
WITHDRAW_API_VERSION = 'v3'
SYMBOL_TYPE_SPOT = 'SPOT'
ORDER_STATUS_NEW = 'NEW'
ORDER_STATUS_PARTIALLY_FILLED = 'PARTIALLY_FILLED'
ORDER_STATUS_FILLED = 'FILLED'
ORDER_STATUS_CANCELED = 'CANCELED'
ORDER_STATUS_PENDING_CANCEL = 'PENDING_CANCEL'
ORDER_STATUS_REJECTED = 'REJECTED'
ORDER_STATUS_EXPIRED = 'EXPIRED'
KLINE_INTERVAL_1MINUTE = '1m'
KLINE_INTERVAL_3MINUTE = '3m'
KLINE_INTERVAL_5MINUTE = '5m'
KLINE_INTERVAL_15MINUTE = '15m'
KLINE_INTERVAL_30MINUTE = '30m'
KLINE_INTERVAL_1HOUR = '1h'
KLINE_INTERVAL_2HOUR = '2h'
KLINE_INTERVAL_4HOUR = '4h'
KLINE_INTERVAL_6HOUR = '6h'
KLINE_INTERVAL_8HOUR = '8h'
KLINE_INTERVAL_12HOUR = '12h'
KLINE_INTERVAL_1DAY = '1d'
KLINE_INTERVAL_3DAY = '3d'
KLINE_INTERVAL_1WEEK = '1w'
KLINE_INTERVAL_1MONTH = '1M'
SIDE_BUY = 'BUY'
SIDE_SELL = 'SELL'
ORDER_TYPE_LIMIT = 'LIMIT'
ORDER_TYPE_MARKET = 'MARKET'
ORDER_TYPE_STOP_LOSS = 'STOP_LOSS'
ORDER_TYPE_STOP_LOSS_LIMIT = 'STOP_LOSS_LIMIT'
ORDER_TYPE_TAKE_PROFIT = 'TAKE_PROFIT'
ORDER_TYPE_TAKE_PROFIT_LIMIT = 'TAKE_PROFIT_LIMIT'
ORDER_TYPE_LIMIT_MAKER = 'LIMIT_MAKER'
TIME_IN_FORCE_GTC = 'GTC'
TIME_IN_FORCE_IOC = 'IOC'
TIME_IN_FORCE_FOK = 'FOK'
ORDER_RESP_TYPE_ACK = 'ACK'
ORDER_RESP_TYPE_RESULT = 'RESULT'
ORDER_RESP_TYPE_FULL = 'FULL'
AGG_ID = 'a'
AGG_PRICE = 'p'
AGG_QUANTITY = 'q'
AGG_FIRST_TRADE_ID = 'f'
AGG_LAST_TRADE_ID = 'l'
AGG_TIME = 'T'
AGG_BUYER_MAKES = 'm'
AGG_BEST_MATCH = 'M'
def run():
initialize_arb()
pass
def initialize_arb():
welcome_message = "\n\n---------------------------------------------------------\n\n"
welcome_message+= "Hello and Welcome to the Binance Arbitrage Crypto Trader Bot Python Script\nCreated 2018 by Joaquin Roibal (#BlockchainEng)"
welcome_message+= "A quick 'run-through' will be performed to introduce you to the functionality of this bot\n"
welcome_message+="To learn more visit medium.com/#BlockchainEng or watch introductory Youtube Videos"
welcome_message+="\nCopyright 2018 by Joaquin Roibal\n"
bot_start_time = str(datetime.now())
welcome_message+= "\nBot Start Time: {}\n\n\n".format(bot_start_time)
print(welcome_message)
data_log_to_file(welcome_message)
time.sleep(5)
try:
status = client.get_system_status()
list_of_symbols = ['ETHBTC', 'BNBETH', 'BNBBTC']
list_of_symbols2 = ['ETHUSDT', 'BNBETH', 'BNBUSDT']
list_of_symbols3 = ['BTCUSDT', 'BNBBTC', 'BNBUSDT']
list_of_arb_sym = [list_of_symbols, list_of_symbols2, list_of_symbols3]
tickers = client.get_orderbook_tickers()
portfolio=[]
with open('Portfolio.txt') as f1:
read_data = f1.readlines()
for line in read_data:
load_portfolio = line
load_portfolio = list(load_portfolio[1:-1].split(','))
i=0
for val in load_portfolio:
if i == 4:
portfolio.append(str(datetime.now()))
break
portfolio.append(float(val))
i+=1
portf_msg = "Starting Portfolio: " + str(portfolio)
print(portf_msg)
portf_file_save(portfolio)
data_log_to_file(portf_msg)
while 1:
calc_profit_list =[]
for arb_market in list_of_arb_sym:
calc_profit_list.append(arbitrage_bin(arb_market, tickers, portfolio, 1, 1))
for profit1 in calc_profit_list:
data_log_to_file(str(profit1))
print(calc_profit_list)
exp_profit = 0
m = n = 0
for exch_market in calc_profit_list:
if exch_market[4]>exp_profit:
exp_profit = exch_market[4]
m = n
n+=1
profit_message = "\nMost Profitable Market: {} \nExpected Profit: {}%".format(list_of_arb_sym[m], exp_profit)
print(profit_message)
data_log_to_file(profit_message)
time.sleep(5)
arb_list_data = []
arb_start_time = str(datetime.now())
for i in range(0,5):
arb_list_data.append(arbitrage_bin(list_of_arb_sym[m], tickers, portfolio, 1, 1, 'Yes'))
time.sleep(30)
arb_end_time = str(datetime.now())
viz_arb_data(arb_list_data, list_of_arb_sym[m], arb_start_time, arb_end_time)
except:
print("\nFAILURE INITIALIZE\n")
def data_log_to_file(message):
with open('CryptoTriArbBot_DataLog.txt', 'a+') as f:
f.write(message)
def portf_file_save(portfolio):
with open('Portfolio.txt', 'a+') as f:
f.write('\n'+str(portfolio))
def arbitrage_bin(list_of_sym, tickers, portfolio, cycle_num=10, cycle_time=30, place_order='No'):
arb_message = "Beginning Binance Arbitrage Function Data Collection - Running\n"
print(arb_message)
data_log_to_file(arb_message)
time.sleep(2)
fee_percentage = 0.05
for i in range(0,1):
"""
pairs = []
for sym in symbols:
for symbol in coins:
if symbol in sym:
pairs.append(sym)
print(pairs)
#From Coin 1 to Coin 2 - ETH/BTC - Bid
#From Coin 2 to Coin 3 - ETH/LTC - Ask
#From Coin 3 to Coin 1 - BTC/LTC - Bid
arb_list = ['ETH/BTC'] #, 'ETH/LTC', 'BTC/LTC']
#Find 'closed loop' of currency rate pairs
j=0
while 1:
if j == 1:
final = arb_list[0][-3:] + '/' + str(arb_list[1][-3:])
print(final)
#if final in symbols:
arb_list.append(final)
break
for sym in symbols:
if sym in arb_list:
pass
else:
if j % 2 == 0:
if arb_list[j][0:3] == sym[0:3]:
if arb_list[j] == sym:
pass
else:
arb_list.append(sym)
print(arb_list)
j+=1
break
if j % 2 == 1:
if arb_list[j][-3:] == sym[-3:]:
if arb_list[j] == sym:
pass
else:
arb_list.append(sym)
print(arb_list)
j+=1
break
"""
print("List of Arbitrage Symbols:", list_of_sym)
list_exch_rate_list = []
if 1:
for k in range(0,cycle_num):
i=0
exch_rate_list = []
data_collect_message1 = "Data Collection Cycle Number: "+str(k) +'\n'
print(data_collect_message1)
data_log_to_file(data_collect_message1)
for sym in list_of_sym:
currency_pair = "Currency Pair: "+str(sym)+"\n"
print(currency_pair)
data_log_to_file(currency_pair)
if sym in list_of_sym:
"""if i == 0: #For first in triangle
depth = client.get_order_book(symbol=sym)
exch_rate_list.append(float(depth['bids'][0][0]))
print(depth['bids'][0][0])
"""
if i % 2==0:
depth = client.get_order_book(symbol=sym)
inv1 = depth['asks'][0][0]
exch_rate_list.append(float(inv1))
Exch_rate1 = "Exchange Rate: {}".format(depth['asks'][0][0]) +'\n'
print(Exch_rate1)
data_log_to_file(Exch_rate1)
if i == 1:
depth = client.get_order_book(symbol=sym)
inv2 = round(1.0/float(depth['bids'][0][0]),6)
exch_rate_list.append(float(inv2))
Exch_rate2 = "Exchange Rate: {}".format(depth['bids'][0][0])+'\n'
print(Exch_rate2)
data_log_to_file(Exch_rate2)
i+=1
else:
exch_rate_list.append(0)
exch_rate_list.append(datetime.now())
rate1 = exch_rate_list[0]
buy_price = "Buy: {}\n".format(rate1)
print(buy_price)
data_log_to_file(buy_price)
rate2 = float(exch_rate_list[2])*float(exch_rate_list[1])
sell_price = "Sell: {}\n".format(rate2)
print(sell_price)
data_log_to_file(sell_price)
if float(rate1)<float(rate2):
arb_1_msg = "Arbitrage Possibility - "
arb_profit = round((float(rate2)-float(rate1))/float(rate2)*100,3)
arb_1_msg += "Potential Profit (Percentage): "+str(arb_profit) +'%\n'
print(arb_1_msg)
data_log_to_file(arb_1_msg)
exch_rate_list.append(arb_profit)
if place_order == 'Yes':
place_order_msg = "PLACING ORDER"
print(place_order_msg)
data_log_to_file(place_order_msg)
portfolio = tri_arb_paper(portfolio, list_of_sym, exch_rate_list)
portf_file_save(portfolio)
else:
arb_2_msg = "No Arbitrage Possibility"
print(arb_2_msg)
data_log_to_file(arb_2_msg)
exch_rate_list.append(0)
exch_msg = "Exchange Rate List: " +str(exch_rate_list)+'\n'
print(exch_msg)
data_log_to_file(exch_msg)
time.sleep(cycle_time)
print('\nARBITRAGE FUNCTIONALITY SUCCESSFUL - Data of Exchange Rates Collected\n')
return exch_rate_list
def tri_arb_paper(portfolio1, sym_list, list_exch_rates):
tri_arb_paper_msg = "\nSTARTING TRI ARB PAPER TRADING FUNCTION\n"
print(tri_arb_paper_msg)
time.sleep(10)
data_log_to_file(tri_arb_paper_msg)
if sym_list[0][-3:]=='BTC':
portf_pos = 0
elif sym_list[0][-3:]=='ETH':
portf_pos = 1
elif sym_list[0][-3:]=='SDT':
portf_pos = 2
elif sym_list[0][-3:]=='BNB':
portf_pos = 3
start_amount = float(portfolio1[portf_pos])
amt_coin2 = start_amount / float(list_exch_rates[0])
amt_coin3 = amt_coin2 * float(list_exch_rates[1])
final_amount = amt_coin3 * float(list_exch_rates[2])
tri_arb_paper_msg = "Starting Amount: "+str(sym_list[0][-3:])+" "+str(start_amount)+'\n'
tri_arb_paper_msg += "Amount Coin 2: "+str(sym_list[0][0:3])+" "+str(amt_coin2)+'\n'
tri_arb_paper_msg += "Amount Coin 3: "+str(sym_list[2][0:3])+" "+str(amt_coin3) +'\n'
tri_arb_paper_msg += "Final Amount: "+str(sym_list[0][-3:])+" "+str(final_amount)+'\n'
print(tri_arb_paper_msg)
data_log_to_file(tri_arb_paper_msg)
portfolio1[portf_pos] = final_amount
portfolio1[-1] = str(datetime.now())
return portfolio1
def viz_arb_data(list_exch_rate_list, arb_market, start_time, end_time):
viz_msg = "RUNNING ARBITRAGE VISUALIZATION FUNCTIONALITY"
print(viz_msg)
data_log_to_file(viz_msg)
rateA = []
rateB = []
rateB_fee = []
price1 = []
price2 = []
time_list = []
profit_list = []
for rate in list_exch_rate_list:
rateA.append(rate[0])
rateB1 = round(float(rate[1])*float(rate[2]),6)
rateB.append(rateB1)
price1.append(rate[1])
price2.append(rate[2])
profit_list.append(rate[4])
time_list.append(rate[3])
viz_msg2 = "Rate A: {} \n Rate B: {} \n Projected Profit (%): {} ".format(rateA, rateB, profit_list) #rateB_fee))
print(viz_msg2)
data_log_to_file(viz_msg2)
fig, host = plt.subplots()
fig.subplots_adjust(right=0.75)
par1 = host.twinx()
par2 = host.twinx()
par2.spines["right"].set_position(("axes", 1.2))
make_patch_spines_invisible(par2)
par2.spines["right"].set_visible(True)
p1, = host.plot(time_list, rateA, "k", label = "{}".format(arb_market[0]))
p1, = host.plot(time_list, rateB, "k+", label = "{} * {}".format(arb_market[1], arb_market[2]))
p2, = par1.plot(time_list, price1, "b-", label="Price - {}".format(arb_market[1]))
p3, = par2.plot(time_list, price2, "g-", label="Price - {}".format(arb_market[2]))
host.set_xlabel("Time")
host.set(title='Triangular Arbitrage - Exchange: {}\nStart Time: {}\n End Time: {}\n'
'Copyright (c) 2018 #BlockchainEng'.format('Binance', start_time, end_time))
host.set_ylabel("Exchange Rate")
par1.set_ylabel("Price - {}".format(arb_market[1]))
par2.set_ylabel("Price - {}".format(arb_market[2]))
host.yaxis.label.set_color(p1.get_color())
tkw = dict(size=4, width=1.5)
host.tick_params(axis='y', colors=p1.get_color(), **tkw)
par1.tick_params(axis='y', colors=p2.get_color(), **tkw)
par2.tick_params(axis='y', colors=p3.get_color(), **tkw)
host.tick_params(axis='x', **tkw)
lines = [p1, p2, p3]
host.legend(lines, [l.get_label() for l in lines])
fname = "Binance_Test.png"
plt.savefig(fname)
""", dpi=None, facecolor='w', edgecolor='w',
orientation='portrait', papertype=None, format=None,
transparent=False, bbox_inches=None, pad_inches=0.1,
frameon=None)"""
print_figure_message = "Data Collected Figure Printed & Saved - " + str(fname)
print(print_figure_message)
data_log_to_file(print_figure_message)
def make_patch_spines_invisible(ax):
ax.set_frame_on(True)
ax.patch.set_visible(False)
for sp in ax.spines.values():
sp.set_visible(False)
"""
def market_depth(sym, num_entries=20):
#Get market depth
#Retrieve and format market depth (order book) including time-stamp
i=0 #Used as a counter for number of entries
#print("Order Book: ", convert_time_binance(client.get_server_time()))
depth = client.get_order_book(symbol=sym)
print(depth)
print(depth['asks'][0])
ask_tot=0.0
ask_price =[]
ask_quantity = []
bid_price = []
bid_quantity = []
bid_tot = 0.0
place_order_ask_price = 0
place_order_bid_price = 0
max_order_ask = 0
max_order_bid = 0
print("\n", sym, "\nDepth ASKS:\n")
print("Price Amount")
for ask in depth['asks']:
if i<num_entries:
if float(ask[1])>float(max_order_ask):
#Determine Price to place ask order based on highest volume
max_order_ask=ask[1]
place_order_ask_price=round(float(ask[0]),5)-0.0001
#ask_list.append([ask[0], ask[1]])
ask_price.append(float(ask[0]))
ask_tot+=float(ask[1])
ask_quantity.append(ask_tot)
#print(ask)
i+=1
j=0 #Secondary Counter for Bids
print("\n", sym, "\nDepth BIDS:\n")
print("Price Amount")
for bid in depth['bids']:
if j<num_entries:
if float(bid[1])>float(max_order_bid):
#Determine Price to place ask order based on highest volume
max_order_bid=bid[1]
place_order_bid_price=round(float(bid[0]),5)+0.0001
bid_price.append(float(bid[0]))
bid_tot += float(bid[1])
bid_quantity.append(bid_tot)
#print(bid)
j+=1
return ask_price, ask_quantity, bid_price, bid_quantity, place_order_ask_price, place_order_bid_price
#Plot Data
"""
if __name__ == "__main__":
run()
Ideally, the code is supposed to find arbitrage opportunities in predicted price changes and execute orders accordingly.

TypeError: __init__() takes from 1 to 4 positional arguments but 9 were given

when l run the following program l got this error :
originDataset = dataset.lmdbDataset(originPath, 'abc', *args)
TypeError: __init__() takes from 1 to 4 positional arguments but 9 were given
This error is relate to the second code source l presented below. it's strange because l don't have 9 argument. what's wrong with my code ?
import sys
origin_path = sys.path
sys.path.append("..")
import dataset
sys.path = origin_path
import lmdb
def writeCache(env, cache):
with env.begin(write=True) as txn:
for k, v in cache.iteritems():
txn.put(k, v)
def convert(originPath, outputPath):
args = [0] * 6
originDataset = dataset.lmdbDataset(originPath, 'abc', *args)
print('Origin dataset has %d samples' % len(originDataset))
labelStrList = []
for i in range(len(originDataset)):
label = originDataset.getLabel(i + 1)
labelStrList.append(label)
if i % 10000 == 0:
print(i)
lengthList = [len(s) for s in labelStrList]
items = zip(lengthList, range(len(labelStrList)))
items.sort(key=lambda item: item[0])
env = lmdb.open(outputPath, map_size=1099511627776)
cnt = 1
cache = {}
nSamples = len(items)
for i in range(nSamples):
imageKey = 'image-%09d' % cnt
labelKey = 'label-%09d' % cnt
origin_i = items[i][1]
img, label = originDataset[origin_i + 1]
cache[labelKey] = label
cache[imageKey] = img
if cnt % 1000 == 0 or cnt == nSamples:
writeCache(env, cache)
cache = {}
print('Written %d / %d' % (cnt, nSamples))
cnt += 1
nSamples = cnt - 1
cache['num-samples'] = str(nSamples)
writeCache(env, cache)
print('Convert dataset with %d samples' % nSamples)
if __name__ == "__main__":
convert('/share/datasets/scene_text/Synth90k/synth90k-val-lmdb', '/share/datasets/scene_text/Synth90k/synth90k-val-ordered-lmdb')
convert('/share/datasets/scene_text/Synth90k/synth90k-train-lmdb', '/share/datasets/scene_text/Synth90k/synth90k-train-ordered-lmdb')
which calls the following program :
#!/usr/bin/python
# encoding: utf-8
import random
import torch
from torch.utils.data import Dataset
from torch.utils.data import sampler
import torchvision.transforms as transforms
import lmdb
import six
import sys
from PIL import Image
import numpy as np
class lmdbDataset(Dataset):
def __init__(self, root=None, transform=None, target_transform=None):
self.env = lmdb.open(
root,
max_readers=1,
readonly=True,
lock=False,
readahead=False,
meminit=False)
if not self.env:
print('cannot creat lmdb from %s' % (root))
sys.exit(0)
with self.env.begin(write=False) as txn:
nSamples = int(txn.get('num-samples'))
self.nSamples = nSamples
self.transform = transform
self.target_transform = target_transform
def __len__(self):
return self.nSamples
def __getitem__(self, index):
assert index <= len(self), 'index range error'
index += 1
with self.env.begin(write=False) as txn:
img_key = 'image-%09d' % index
imgbuf = txn.get(img_key)
buf = six.BytesIO()
buf.write(imgbuf)
buf.seek(0)
try:
img = Image.open(buf).convert('L')
except IOError:
print('Corrupted image for %d' % index)
return self[index + 1]
if self.transform is not None:
img = self.transform(img)
label_key = 'label-%09d' % index
label = str(txn.get(label_key))
if self.target_transform is not None:
label = self.target_transform(label)
return (img, label)
class resizeNormalize(object):
def __init__(self, size, interpolation=Image.BILINEAR):
self.size = size
self.interpolation = interpolation
self.toTensor = transforms.ToTensor()
def __call__(self, img):
img = img.resize(self.size, self.interpolation)
img = self.toTensor(img)
img.sub_(0.5).div_(0.5)
return img
class randomSequentialSampler(sampler.Sampler):
def __init__(self, data_source, batch_size):
self.num_samples = len(data_source)
self.batch_size = batch_size
def __iter__(self):
n_batch = len(self) // self.batch_size
tail = len(self) % self.batch_size
index = torch.LongTensor(len(self)).fill_(0)
for i in range(n_batch):
random_start = random.randint(0, len(self) - self.batch_size)
batch_index = random_start + torch.range(0, self.batch_size - 1)
index[i * self.batch_size:(i + 1) * self.batch_size] = batch_index
# deal with tail
if tail:
random_start = random.randint(0, len(self) - self.batch_size)
tail_index = random_start + torch.range(0, tail - 1)
index[(i + 1) * self.batch_size:] = tail_index
return iter(index)
def __len__(self):
return self.num_samples
class alignCollate(object):
def __init__(self, imgH=32, imgW=128, keep_ratio=False, min_ratio=1):
self.imgH = imgH
self.imgW = imgW
self.keep_ratio = keep_ratio
self.min_ratio = min_ratio
def __call__(self, batch):
images, labels = zip(*batch)
imgH = self.imgH
imgW = self.imgW
if self.keep_ratio:
ratios = []
for image in images:
w, h = image.size
ratios.append(w / float(h))
ratios.sort()
max_ratio = ratios[-1]
imgW = int(np.floor(max_ratio * imgH))
imgW = max(imgH * self.min_ratio, imgW) # assure imgH >= imgW
transform = resizeNormalize((imgW, imgH))
images = [transform(image) for image in images]
images = torch.cat([t.unsqueeze(0) for t in images], 0)
return images, labels

Resources