Tensorflow: using a FIFO queue for code running on GPUs - io

The code below shows my attempt to run an algorithm on single GPUs and feed data to it using a FIFO queue. The data exists in a CSV file. I use a separate python thread to read from the file one line at a time and enqueue the line into a FIFO.
N = 16
num_ckfs =80000
q = [0.01 for i in range(N)]
q_ckfs = np.array([q for i in range(num_ckfs)])
r = [5]
r_ckfs = np.array([r for i in range(num_ckfs)])
init_var = [10.0 for i in range(N)]
init_var_ckfs = np.array([init_var for i in range(num_ckfs)])
init_state = [0.0 for i in range(N)]
init_state_ckfs = np.array([init_state for i in range(num_ckfs)])
class CKF(object):
def __init__(self, num_ckfs, N):
self.init_variances = tf.Variable(init_var_ckfs, name='init_variances', dtype=tf.float64)
self.init_states = tf.Variable(init_state_ckfs, name='init_states', dtype=tf.float64)
init_states_expanded = tf.expand_dims(self.init_states, 2) # num_ckfs X N X 1
self.q_values = tf.constant(q_ckfs, name='q_values', dtype=tf.float64)
self.r_values = tf.constant(r_ckfs, name='r_values', dtype=tf.float64)
self.input_vectors = tf.placeholder(tf.float64, shape=[num_ckfs, N], name='input_vectors')
self.z_k = tf.placeholder(tf.float64, shape=[num_ckfs, 1], name='z_k');
q = tf.FIFOQueue(200, [tf.float64, tf.float64], shapes=[[num_ckfs,1], [num_ckfs,N]])
self.enqueue_op = q.enqueue([self.z_k, self.input_vectors])
observations, inputs = q.dequeue()
#further processing using the input data
with tf.device('/gpu:0'):
ckf_gpu0 = CKF(num_ckfs, N)
def load_and_enqueue():
#read one line at a time
#obvs_list corresponds to the first column
#data_list corresponds to the rest of the columns
session.run(ckf_gpu0.enqueue_op, feed_dict={
ckf_gpu0.input_vectors: data_list[0], ckf_gpu0.z_k: obvs_list[0]})
count += 1
t = threading.Thread(target=load_and_enqueue)
t.start()
for i in range( num_rows):
out = session.run([ckf_gpu0.projected_output ])
The first problem that I have run into is:
InvalidArgumentError (see above for traceback): Cannot assign a device to node 'fifo_queue': Could not satisfy explicit device specification '/device:GPU:0' because no supported kernel for GPU devices is available.
Is there an alternate way to do such a thing, i.e. hide the I/O latency while the computation is being done on a GPU?

Related

How to concatenate gathered data using mpi4py library in python

I used to list append of data employing mpi4py and try to save the data sequentially at the source(root==0) node.
As suggested by Alan22, I've modified the code and it works, but the script does not concatenate properly, so I get the output file as shown in attached figure:01.
Can anybody help how to fix the error message? In addition, whatever I've written in python script [shown below], isn't the best way to solve the problem.
Is there any way to solve this type of problem efficiently? Any help is highly appreciated.
The python script is given as follows:
import numpy as np
from scipy import signal
from mpi4py import MPI
import random
import cmath, math
import matplotlib.pyplot as plt
import time
#File storing path
save_results_to = 'File storing path'
count_day = 1
count_hour = 1
arr_x = [0, 8.49, 0.0, -8.49, -12.0, -8.49, -0.0, 8.49, 12.0]
arr_y = [0, 8.49, 12.0, 8.49, 0.0, -8.49, -12.0, -8.49, -0.0]
M = len(arr_x)
N = len(arr_y)
np.random.seed(12345)
total_rows = 50000
raw_data=np.reshape(np.random.rand(total_rows*N),(total_rows,N))
# Function of CSD:: Using For Loop
fs = 500; # Sampling frequency
def csdMat(data):
dat, cols = data.shape # For 2D data
total_csd = []
for i in range(cols):
col_csd =[]
for j in range( cols):
freq, Pxy = signal.csd(data[:,i], data[:, j], fs=fs, window='hann', nperseg=100, noverlap=70, nfft=5000)
col_csd.append(Pxy)
total_csd.append(col_csd)
pxy = np.array(total_csd)
return freq, pxy
# Finding cross spectral density (CSD)
t0 = time.time()
freq, csd = csdMat(raw_data)
print('The shape of the csd data', csd.shape)
print('Time required {} seconds to execute CSD--For loop'.format(time.time()-t0))
kf=1*2*np.pi/10
resolution = 50 # This is important:: the HIGHER the Resolution, the higher the execution time!!!
grid_size = N * resolution
kx = np.linspace(-kf, kf, ) # space vector
ky = np.linspace(-kf, kf, grid_size) # space vector
def DFT2D(data):
P=len(kx)
Q=len(ky)
dft2d = np.zeros((P,Q), dtype=complex)
for k in range(P):
for l in range(Q):
sum_log = []
mat2d = np.zeros((M,N))
sum_matrix = 0.0
for m in range(M):
for n in range(N):
e = cmath.exp(-1j*((((dx[m]-dx[n])*kx[l])/1) + (((dy[m]-dy[n])*ky[k])/1)))
sum_matrix += data[m, n] * e
dft2d[k,l] = sum_matrix
return dft2d
dx = arr_x[:]; dy = arr_y[:]
comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()
data = []
start_freq = 100
end_freq = 109
freq_range = np.arange(start_freq,end_freq)
no_of_freq = len(freq_range)
for fr_count in range(start_freq, end_freq):
if fr_count % size == rank:
dft = np.zeros((grid_size, grid_size))
spec_csd = csd[:,:, fr_count]
dft = DFT2D(spec_csd) # Call the DFT2D function
spec = np.array(np.real(dft)) # Spectrum or 2D_DFT of data[real part]
print('Shape of spec', spec.shape)
data.append(spec)
#data = np.append(data,spec)
np.seterr(invalid='ignore')
data = comm.gather(data, root =0)
# comm.Allreduce(MPI.IN_PLACE,data,op=MPI.MAX)
print("Rank: ", rank, ". Spectrum shape is:\n", spec.shape)
if rank == 0:
output_data = np.concatenate(data, axis = 0)
#output_data = np.c_(data, axis = 0)
dft_tot = np.array((output_data), dtype='object')
res = np.zeros((grid_size, grid_size))
for k in range(size):
for i in range(no_of_freq):
jj = np.around(freq[freq_range[i]], decimals = 2)
#print('The shape of data after indexing', data1.shape)
#data_final=data1.reshape(data1.shape[0]*data1.shape[1], data1.shape[2])
res[i * size + k] = dft_tot[k][i] #np.array(data[k])
data = np.array(res)
#print('The shape of the dft at root node', data.shape)
np.savetxt(save_results_to + f'Day_{count_day}_hour_{count_hour}_f_{jj}_hz.txt', data.view(float))
I use the following bash script command to run the script ( i.e., my_file.sh). I submit the job with command sbatch my_file.sh
#! /bin/bash -l
#SBATCH -J testmvapich2
#SBATCH -N 1 ## Maximum 04 nodes
#SBATCH --ntasks=10
#SBATCH --cpus-per-task=1 # cpu-cores per task
#SBATCH --mem-per-cpu=3000MB
#SBATCH --time=00:20:00
#SBATCH -p para
#SBATCH --output="stdout.txt"
#SBATCH --error="stderr.txt"
#SBATCH -A camk
##SBATCH --mail-type=ALL
##SBATCH --chdir=/work/cluster_computer/my_name/data_work/MMC331/
eval "$(conda shell.bash hook)"
conda activate myenv
#conda activate fast-mpi4py
cd $SLURM_SUBMIT_DIR
#module purge
#module add mpi/mvapich2-2.2-x86_64
mpirun python3 mpi_test.py
You can try with this after "data = comm.gather(data, root=0)"
if rank == 0:
print('Type of data:', type(data))
dft_tot = np.array((data))#, dtype='object')
print('shape of DATA array:', dft_tot.shape)
#print('Type of dft array:', type(dft_tot))
res = np.zeros((450,450))
for k in range(size):
# for i in range(len(data[rank])):
for i in range(no_of_freq):
jj = np.around(freq[freq_range[k]], decimals = 2)
#data1 = np.array(dft_tot[k])
res[i * size + k] = data[k]
data = np.array(res)#.reshape(data1.shape[0]*data1.shape[1], data1.shape[2])
print('The shape of the dft at root node', data.shape)
np.savetxt(save_results_to + f'Day_{count_day}_hour_{co
Here is the link. Hope it helps mpi4py on HPC: comm.gather
As mentioned in the comments, there are two typos in the code:
The indices for arrays kx and ky have been swapped in the line where variable e is calculated in the function DFT2D(data).
The code is being run for 10 MPI processes for frequencies fr_count in the range start_freq = 100 and end_freq = 109. For this, the loops and arange must be written as for fr_count in range(start_freq, end_freq + 1) and freq_range = np.arange(start_freq, end_freq + 1) as these are not end-point inclusive.
The data = comm.gather(data, root=0) and subsequent output_data = np.concatenate(data, axis=0) operations are performing as they should and as such, the question detracts from the actual issue in the code.
A major issue is that in line res[i * size + k] = dft_tot[k][i] arrays of disparate sizes are being assigned to each other.
Shape of res: 450 x 450
Shape of dft_tot: 10 x 50 x 450
The value of i*size + k ranges from 0 to 110. I think the user expects dft_tot to have the shape 450 x 450, probably due to the indexing confusion mentioned in typo#2 above. Properly done concatenation would yield dft_tot with shape 500 x 450 (since there are 10 arrays of size 50 x 450).
Currently the gather operation returns a list of lists, each containing a NumPy array of size 50 x 450. Technically, it should return a list of NumPy arrays each of size 50 x 450. Adding the line data = data[0] (since data has only one element anyway in each process) before performing data = comm.gather(data, root=0) will achieve this result.
But this whole process seems redundant..
Because there are 10 frequencies considered here. For each frequency, there is a data set of size 50 x 450 . There are 10 MPI processes with each handling one frequency out of the 10. Finally, 10 files are being written corresponding to each frequency. This makes the whole gather operation redundant, as each MPI process can directly write the file corresponding to each frequency.
If instead the dft_tot file was being written as is by rank = 0, then the gather operation would make sense. But splitting the array into the constituent frequencies defeats the point.
This achieves the same result without the gather operation:
comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()
start_freq = 100
end_freq = 109
freq_range = np.arange(start_freq,end_freq+1)
no_of_freq = len(freq_range)
for fr_count in range(start_freq, end_freq+1):
if fr_count % size == rank:
dft = np.zeros((grid_size, grid_size))
spec_csd = csd[:,:, fr_count]
dft = DFT2D(spec_csd) # Call the DFT2D function
spec = np.array(np.real(dft)) # Spectrum or 2D_DFT of data[real part]
print('Shape of spec', spec.shape)
jj = np.around(freq[freq_range[rank]], decimals = 2)
np.savetxt(f'Day_{count_day}_hour_{count_hour}_f_{jj}_hz.txt', spec.view(float))

Python Multiprocessing (Splitting data in smaller chunks - multiple function arguments)

Note from 22.02.21:
-Potentially my problem could also be solved by a more efficient memory usage instead of multiprocessing, since I realized that the memory load gets very high and might be a limiting factor here.
I'm trying to reduce the time that my script needs to run by making use of multiprocessing.
In the past I got some good tips about increasing the speed of the function itself (Increase performance of np.where() loop), but now I would like to make use of all cores of a 32-core workstation.
My function compares entries of two lists (X and Y) with a reference lists Q and Z. For every element in X/Y, it checks whether X[i] occurs somewhere in Q and whether Y[i] occurs in Z. If X[i] == Q[s] AND Y[i] == Z[s], it returns the index "s".
(Note: My real data consists of DNA sequencing reads and I need to map my reads to the reference.)
What I tried so far:
Splitting my long lists X and Y into even chunks (n-chunks, where n == cpu_count)
Trying the "concurrent.futures.ProcessPoolExecutor()" to run the function for each "sublist" in parallel and in the end combine the result of each process to one final dictionary (matchdict). (--> see commented out section)
My problem:
All cores are getting used when I uncomment the multiprocessing section but it ends up with an error (index out of range) which I could not yet resolve. (--> Tip: lower N to 1000 and you will immediately see the error without waiting forever)
Does anyone know how to solve this, or can suggest a better approach to use multiprocessing in my code?
Here is the code:
import numpy as np
import multiprocessing
import concurrent.futures
np.random.seed(1)
def matchdictfunc(index,x,y,q,z): # function to match entries of X and Y to Q and Z and get index of Q/Z where their values match X/Y
lookup = {}
for i, (q, z) in enumerate(zip(Q, Z)):
lookup.setdefault((q, z), []).append(i)
matchlist = [lookup.get((x, y), []) for x, y in zip(X, Y)]
matchdict = {}
for ind, match in enumerate(matchlist):
matchdict[index[ind]] = match
return matchdict
def split(a, n): # function to split list in n even parts
k, m = divmod(len(a), n)
return list((a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)))
def splitinput(index,X,Y,Q,Z): # split large lists X and Y in n-even parts (n = cpu_count), make new list containing n-times Q and Z (to feed Q and Z for every process)
cpu_count = multiprocessing.cpu_count()
#create multiple chunks for X and Y and index:
index_split = split(index,cpu_count)
X_split = split(X,cpu_count)
Y_split = split(Y,cpu_count)
# create list with several times Q and Z since it needs to be same length as X_split etc:
Q_mult = []
Z_mult = []
for _ in range(cpu_count):
Q_mult.append(Q)
Z_mult.append(Z)
return index_split,X_split,Y_split,Q_mult,Z_mult
# N will finally scale up to 10^9
N = 10000000
M = 300
index = [str(x) for x in list(range(N))]
X = np.random.randint(M, size=N)
Y = np.random.randint(M, size=N)
# Q and Z size is fixed at 120000
Q = np.random.randint(M, size=120000)
Z = np.random.randint(M, size=120000)
# convert int32 arrays to str64 arrays and then to list, to represent original data (which are strings and not numbers)
X = np.char.mod('%d', X).tolist()
Y = np.char.mod('%d', Y).tolist()
Q = np.char.mod('%d', Q).tolist()
Z = np.char.mod('%d', Z).tolist()
# single-core:
matchdict = matchdictfunc(index,X,Y,Q,Z)
# split lists to number of processors (cpu_count)
index_split,X_split,Y_split,Q_mult,Z_mult = splitinput(index,X,Y,Q,Z)
## Multiprocessing attempt - FAILS! (index out of range)
# finallist = []
# if __name__ == '__main__':
# with concurrent.futures.ProcessPoolExecutor() as executor:
# results = executor.map(matchlistfunc,X_split,Y_split,Q_mult,Z_mult)
# for result in results:
# finallist.append(result)
# matchdict = {}
# for d in finallist:
# matchdict.update(d)
Your function matchdictfunc currently has arguments x, y, q, z but in fact does not use them, although in the multiprocessing version it will need to use two arguments. There is also no need for function splitinput to replicate Q and Z into returned values Q_split and Z_split. Currently, matchdictfunc is expecting Q and Z to be global variables and we can arrange for that to be the case in the multiprocessing version by using the initializer and initargs arguments when constructing the pool. You should also move code that you do not need to be executed by the sub-processes into the block controlled by if __name__ == '__main__':, such as the arary initialization code. These changes result in:
import numpy as np
import multiprocessing
import concurrent.futures
MULTIPROCESSING = True
def init_pool(q, z):
global Q, Z
Q = q
Z = z
def matchdictfunc(index, X, Y): # function to match entries of X and Y to Q and Z and get index of Q/Z where their values match X/Y
lookup = {}
for i, (q, z) in enumerate(zip(Q, Z)):
lookup.setdefault((q, z), []).append(i)
matchlist = [lookup.get((x, y), []) for x, y in zip(X, Y)]
matchdict = {}
for ind, match in enumerate(matchlist):
matchdict[index[ind]] = match
return matchdict
def split(a, n): # function to split list in n even parts
k, m = divmod(len(a), n)
return list((a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)))
def splitinput(index, X, Y): # split large lists X and Y in n-even parts (n = cpu_count))
cpu_count = multiprocessing.cpu_count()
#create multiple chunks for X and Y and index:
index_split = split(index,cpu_count)
X_split = split(X,cpu_count)
Y_split = split(Y,cpu_count)
return index_split, X_split ,Y_split
def main():
# following required for non-multiprocessing
if not MULTIPROCESSING:
global Q, Z
np.random.seed(1)
# N will finally scale up to 10^9
N = 10000000
M = 300
index = [str(x) for x in list(range(N))]
X = np.random.randint(M, size=N)
Y = np.random.randint(M, size=N)
# Q and Z size is fixed at 120000
Q = np.random.randint(M, size=120000)
Z = np.random.randint(M, size=120000)
# convert int32 arrays to str64 arrays and then to list, to represent original data (which are strings and not numbers)
X = np.char.mod('%d', X).tolist()
Y = np.char.mod('%d', Y).tolist()
Q = np.char.mod('%d', Q).tolist()
Z = np.char.mod('%d', Z).tolist()
# for non-multiprocessing:
if not MULTIPROCESSING:
matchdict = matchdictfunc(index, X, Y)
else:
# for multiprocessing:
# split lists to number of processors (cpu_count)
index_split, X_split, Y_split = splitinput(index, X, Y)
with concurrent.futures.ProcessPoolExecutor(initializer=init_pool, initargs=(Q, Z)) as executor:
finallist = [result for result in executor.map(matchdictfunc, index_split, X_split, Y_split)]
matchdict = {}
for d in finallist:
matchdict.update(d)
#print(matchdict)
if __name__ == '__main__':
main()
Note: I tried this for a smaller value of N = 1000 (printing out the results of matchdict) and the multiprocessing version seemed to return the same results. My machine does not have the resources to run with the full value of N without freezing up everything else.
Another Approach
I am working under the assumption that your DNA data is external and the X and Y values can be read n values at a time or can be read in and written out so that this is possible. Then rather than having all the data resident in memory and splitting it up into 32 pieces, I propose that it be read n values at a time and thus broken up into approximately N/n pieces.
In the following code I have switched to using the imap method from class multiprocessing.pool.Pool. The advantage is that it lazily submits tasks to the process pool, that is, the iterable argument doesn't have to be a list or convertible to a list. Instead the pool will iterate over the iterable sending tasks to the pool in chunksize groups. In the code below, I have used a generator function for the argument to imap, which will generate successive X and Y values. Your actual generator function would first open the DNA file (or files) and read in successive portions of the file.
import numpy as np
import multiprocessing
def init_pool(q, z):
global Q, Z
Q = q
Z = z
def matchdictfunc(t): # function to match entries of X and Y to Q and Z and get index of Q/Z where their values match X/Y
index, X, Y = t
lookup = {}
for i, (q, z) in enumerate(zip(Q, Z)):
lookup.setdefault((q, z), []).append(i)
matchlist = [lookup.get((x, y), []) for x, y in zip(X, Y)]
matchdict = {}
for ind, match in enumerate(matchlist):
matchdict[index[ind]] = match
return matchdict
def next_tuple(n, stop, M):
start = 0
while True:
end = min(start + n, stop)
index = [str(x) for x in list(range(start, end))]
x = np.random.randint(M, size=n)
y = np.random.randint(M, size=n)
# convert int32 arrays to str64 arrays and then to list, to represent original data (which are strings and not numbers)
x = np.char.mod('%d', x).tolist()
y = np.char.mod('%d', y).tolist()
yield (index, x, y)
start = end
if start >= stop:
break
def compute_chunksize(XY_AT_A_TIME, N):
n_tasks, remainder = divmod(N, XY_AT_A_TIME)
if remainder:
n_tasks += 1
chunksize, remainder = divmod(n_tasks, multiprocessing.cpu_count() * 4)
if remainder:
chunksize += 1
return chunksize
def main():
np.random.seed(1)
# N will finally scale up to 10^9
N = 10000000
M = 300
# Q and Z size is fixed at 120000
Q = np.random.randint(M, size=120000)
Z = np.random.randint(M, size=120000)
# convert int32 arrays to str64 arrays and then to list, to represent original data (which are strings and not numbers)
Q = np.char.mod('%d', Q).tolist()
Z = np.char.mod('%d', Z).tolist()
matchdict = {}
# number of X, Y pairs at a time:
# experiment with this, especially as N increases:
XY_AT_A_TIME = 10000
chunksize = compute_chunksize(XY_AT_A_TIME, N)
#print('chunksize =', chunksize) # 32 with 8 cores
with multiprocessing.Pool(initializer=init_pool, initargs=(Q, Z)) as pool:
for d in pool.imap(matchdictfunc, next_tuple(XY_AT_A_TIME, N, M), chunksize):
matchdict.update(d)
#print(matchdict)
if __name__ == '__main__':
import time
t = time.time()
main()
print('total time =', time.time() - t)
Update
I want to eliminate using numpy from the benchmark. It is known that numpy uses multiprocessing for some of its operations and when used in multiprocessing applications can be the cause of of reduced performance. So the first thing I did was to take the OP's original program and where the code was, for example:
import numpy as np
np.random.seed(1)
X = np.random.randint(M, size=N)
X = np.char.mod('%d', X).tolist()
I replaced it with:
import random
random.seed(1)
X = [str(random.randrange(M)) for _ in range(N)]
I then timed the OP's program to get the time for generating the X, Y, Q and Z lists and the total time. On my desktop the times were approximately 20 seconds and 37 seconds respectively! So in my multiprocessing version just generating the arguments for the process pool's processes is more than half the total running time. I also discovered for the second approach, that as I increased the value of XY_AT_A_TIME that the CPU utilization went down from 100% to around 50% but that the total elapsed time improved. I haven't quite figured out why this is.
Next I tried to emulate how the programs would function if they were reading the data in. So I wrote out 2 * N random integers to a file, temp.txt and modified the OP's program to initialize X and Y from the file and then modified my second approach's next_tuple function as follows:
def next_tuple(n, stop, M):
with open('temp.txt') as f:
start = 0
while True:
end = min(start + n, stop)
index = [str(x) for x in range(start, end)] # improvement
x = [f.readline().strip() for _ in range(n)]
y = [f.readline().strip() for _ in range(n)]
yield (index, x, y)
start = end
if start >= stop:
break
Again as I increased XY_AT_A_TIME the CPU utilization went down (best performance I found was value 400000 with CPU utilization only around 40%).
I finally rewrote my first approach trying to be more memory efficient (see below). This updated version again reads the random numbers from a file but uses generator functions for X, Y and index so I don't need memory for both the full lists and the splits. Again, I do not expect duplicated results for the multiprocessing and non-multiprocessing versions because of the way I am assigning the X and Y values in the two cases (a simple solution to this would have been to write the random numbers to an X-value file and a Y-value file and read the values back from the two files). But this has no effect on the running times. But again, the CPU utilization, despite using the default pool size of 8, was only 30 - 40% (it fluctuated quite a bit) and the overall running time was nearly double the non-multiprocessing running time. But why?
import random
import multiprocessing
import concurrent.futures
import time
MULTIPROCESSING = True
POOL_SIZE = multiprocessing.cpu_count()
def init_pool(q, z):
global Q, Z
Q = q
Z = z
def matchdictfunc(index, X, Y): # function to match entries of X and Y to Q and Z and get index of Q/Z where their values match X/Y
lookup = {}
for i, (q, z) in enumerate(zip(Q, Z)):
lookup.setdefault((q, z), []).append(i)
matchlist = [lookup.get((x, y), []) for x, y in zip(X, Y)]
matchdict = {}
for ind, match in enumerate(matchlist):
matchdict[index[ind]] = match
return matchdict
def split(a): # function to split list in POOL_SIZE even parts
k, m = divmod(N, POOL_SIZE)
divisions = [(i + 1) * k + min(i + 1, m) - (i * k + min(i, m)) for i in range(POOL_SIZE)]
parts = []
for division in divisions:
part = [next(a) for _ in range(division)]
parts.append(part)
return parts
def splitinput(index, X, Y): # split large lists X and Y in n-even parts (n = POOL_SIZE)
#create multiple chunks for X and Y and index:
index_split = split(index)
X_split = split(X)
Y_split = split(Y)
return index_split, X_split ,Y_split
def main():
global N
# following required for non-multiprocessing
if not MULTIPROCESSING:
global Q, Z
random.seed(1)
# N will finally scale up to 10^9
N = 10000000
M = 300
# Q and Z size is fixed at 120000
Q = [str(random.randrange(M)) for _ in range(120000)]
Z = [str(random.randrange(M)) for _ in range(120000)]
with open('temp.txt') as f:
# for non-multiprocessing:
if not MULTIPROCESSING:
index = [str(x) for x in range(N)]
X = [f.readline().strip() for _ in range(N)]
Y = [f.readline().strip() for _ in range(N)]
matchdict = matchdictfunc(index, X, Y)
else:
# for multiprocessing:
# split lists to number of processors (POOL_SIZE)
# generator functions:
index = (str(x) for x in range(N))
X = (f.readline().strip() for _ in range(N))
Y = (f.readline().strip() for _ in range(N))
index_split, X_split, Y_split = splitinput(index, X, Y)
with concurrent.futures.ProcessPoolExecutor(POOL_SIZE, initializer=init_pool, initargs=(Q, Z)) as executor:
finallist = [result for result in executor.map(matchdictfunc, index_split, X_split, Y_split)]
matchdict = {}
for d in finallist:
matchdict.update(d)
if __name__ == '__main__':
t = time.time()
main()
print('total time =', time.time() - t)
Resolution?
Can it be that the overhead of transferring the data from the main process to the subprocesses, which involves shared memory reading and writing, is what is slowing everything down? So, this final version was an attempt to eliminate this potential cause for the slowdown. On my desktop I have 8 processors. For the first approach dividing the N = 10000000 X and Y values among them means that each process should be processing N // 8 -> 1250000 values. So I wrote out the random numbers in 16 groups of 1250000 numbers (8 groups for X and 8 groups for Y) as a binary file noting the offset and length of each of these 16 groups using the following code:
import random
random.seed(1)
with open('temp.txt', 'wb') as f:
offsets = []
for i in range(16):
n = [str(random.randrange(300)) for _ in range(1250000)]
b = ','.join(n).encode('ascii')
l = len(b)
offsets.append((f.tell(), l))
f.write(b)
print(offsets)
And from that I constructed lists X_SPECS and Y_SPECS that the worker function matchdictfunc could use for reading in the values X and Y itself as needed. So now instead of passing 1250000 values at a time to this worker function, we are just passing indices 0, 1, ... 7 to the worker function so it knows which group it has to read in. Shared memory access has been totally eliminated in accessing X and Y (it's still required for Q and Z) and the disk access moved to the process pool. The CPU Utilization will, of course, not be 100% because the worker function is doing I/O. But I found that while the running time has now been greatly improved, it still offered no improvement over the original non-multiprocessing version:
OP's original program modified to read `X` and `Y` values in from file: 26.2 seconds
Multiprocessing elapsed time: 29.2 seconds
In fact, when I changed the code to use multithreading by replacing the ProcessPoolExecutor with ThreadPoolExecutor, the elpased time went down almost another second demonstrating the there is very little contention for the Global Interpreter Lock within the worker function, i.e. most of the time is being spent in C-language code. The main work is done by:
matchlist = [lookup.get((x, y), []) for x, y in zip(X, Y)]
When we do this with multiprocessing, we have multiple list comprehensions and multiple zip operations (on smaller lists) being performed by separate processes and we then assemble the results in the end. This is conjecture on my part, but there just may not be any performance gains to be had by taking what are already efficient operations and scaling them down across multiple processors. Or in other words, I am stumped and that was my best guess.
The final version (with some additional optimizations -- please note):
import random
import concurrent.futures
import time
POOL_SIZE = 8
X_SPECS = [(0, 4541088), (4541088, 4541824), (9082912, 4540691), (13623603, 4541385), (18164988, 4541459), (22706447, 4542961), (27249408, 4541847), (31791255, 4542186)]
Y_SPECS = [(36333441, 4542101), (40875542, 4540120), (45415662, 4540802), (49956464, 4540971), (54497435, 4541427), (59038862, 4541523), (63580385, 4541571), (68121956, 4542335)]
def init_pool(q_z):
global Q_Z
Q_Z = q_z
def matchdictfunc(index, i): # function to match entries of X and Y to Q and Z and get index of Q/Z where their values match X/Y
x_offset, x_len = X_SPECS[i]
y_offset, y_len = Y_SPECS[i]
with open('temp.txt', 'rb') as f:
f.seek(x_offset, 0)
X = f.read(x_len).decode('ascii').split(',')
f.seek(y_offset, 0)
Y = f.read(y_len).decode('ascii').split(',')
lookup = {}
for i, (q, z) in enumerate(Q_Z):
lookup.setdefault((q, z), []).append(i)
matchlist = [lookup.get((x, y), []) for x, y in zip(X, Y)]
matchdict = {}
for ind, match in enumerate(matchlist):
matchdict[index[ind]] = match
return matchdict
def split(a): # function to split list in POOL_SIZE even parts
k, m = divmod(N, POOL_SIZE)
divisions = [(i + 1) * k + min(i + 1, m) - (i * k + min(i, m)) for i in range(POOL_SIZE)]
parts = []
for division in divisions:
part = [next(a) for _ in range(division)]
parts.append(part)
return parts
def main():
global N
random.seed(1)
# N will finally scale up to 10^9
N = 10000000
M = 300
# Q and Z size is fixed at 120000
Q = (str(random.randrange(M)) for _ in range(120000))
Z = (str(random.randrange(M)) for _ in range(120000))
Q_Z = list(zip(Q, Z)) # pre-compute the `zip` function
# for multiprocessing:
# split lists to number of processors (POOL_SIZE)
# generator functions:
index = (str(x) for x in range(N))
index_split = split(index)
with concurrent.futures.ProcessPoolExecutor(POOL_SIZE, initializer=init_pool, initargs=(Q_Z,)) as executor:
finallist = executor.map(matchdictfunc, index_split, range(8))
matchdict = {}
for d in finallist:
matchdict.update(d)
print(len(matchdict))
if __name__ == '__main__':
t = time.time()
main()
print('total time =', time.time() - t)
The Cost of Inter-Process Memory Transfers
In the code below function create_files was called to create 100 identical files consisting of a "pickled" list of 1,000,000 numbers. I then used a multiprocessing pool of size 8 twice to read the 100 files and unpickle the files to reconstitute the original lists. The difference between the first case (worker1) and the second case (worker2) was that in the second case the list is returned back to the caller (but not saved so that memory can be garbage collected immediately). The second case took more than three times longer than the first case. This can also explain in part why you do not see a speedup when you switch to multiprocessing.
from multiprocessing import Pool
import pickle
import time
def create_files():
l = [i for i in range(1000000)]
# create 100 identical files:
for file in range(1, 101):
with open(f'pkl/test{file}.pkl', 'wb') as f:
pickle.dump(l, f)
def worker1(file):
file_name = f'pkl/test{file}.pkl'
with open(file_name, 'rb') as f:
obj = pickle.load(f)
def worker2(file):
file_name = f'pkl/test{file}.pkl'
with open(file_name, 'rb') as f:
obj = pickle.load(f)
return file_name, obj
POOLSIZE = 8
if __name__ == '__main__':
#create_files()
pool = Pool(POOLSIZE)
t = time.time()
# no data returned:
for file in range(1, 101):
pool.apply_async(worker1, args=(file,))
pool.close()
pool.join()
print(time.time() - t)
pool = Pool(POOLSIZE)
t = time.time()
for file in range(1, 101):
pool.apply_async(worker2, args=(file,))
pool.close()
pool.join()
print(time.time() - t)
t = time.time()
for file in range(1, 101):
worker2(file)
print(time.time() - t)

how to accelerate 2D convolution with FFT?

# Zero-pad an image
def zero_pad(image, pad_height, pad_width):
H, W = image.shape
out = np.zeros((H+2*pad_height, W+2*pad_width))
out[pad_height:H+pad_height,pad_width:W+pad_width] = image
return out
# An step-by-step implementation of convolution filter
def conv(image, kernel):
Hi, Wi = image.shape
Hk, Wk = kernel.shape
out = np.zeros((Hi, Wi))
image_pad = zero_pad(image, (Hk-1)//2, (Wk-1)//2)
for i in range(Hi):
for j in range(Wi):
out[i,j] = np.sum(kernel*image_pad[i:Hk+i,j:Wk+j])
return out
# accelerate convolution using FFT
def conv_faster(image, kernel):
Hi, Wi = image.shape
Hk, Wk = kernel.shape
out = np.zeros((Hi, Wi))
# expand image and kernel by zero-padding
if( (Hi+Hk) % 2 == 0):
x = (Hi+Hk)
else:
x = (Hi+Hk)-1
if( (Wi+Wk) % 2 == 0):
y = (Wi+Wk)
else:
y = (Wi+Wk)-1
image_pad = np.zeros((x,y))
kernel_pad = np.zeros((x,y))
image_pad[0:Hi,0:Wi] = image
kernel_pad[0:Hk,0:Wk] = kernel
# make image and kernel at the center of frequency domain
for p in range(Hi):
for q in range(Wi):
image_pad[p,q]*=(-1)**(p+q)
for p in range(Hk):
for q in range(Wk):
kernel_pad[p,q]*=(-1)**(p+q)
# do fft for image and kernel
image_pad_fft = np.fft.fft2(image_pad)
kernel_pad_fft = np.fft.fft2(kernel_pad)
# get the imaginary part of kernel's transformation
kernel_pad_fft = 1j*np.imag(kernel_pad_fft)
# multiply the two in frequency domain
out_pad = np.fft.ifft2(image_pad_fft*kernel_pad_fft)
# get he real part of result
out = np.real(out_pad[0:Hi,0:Wi])
# Counteract the preceding centralization
for p in range(Hi):
for q in range(Wi):
out[p,q]*=(-1)**(p+q)
return out
There are some difference between their returns.
I think there result should be same, how could I refine it? Thinks!
the kernel is [[1,0,-1],[2,0,-2],[1,0,-1]]
I input this image for the two function
The step-by-step function obtains this result
The accelerated function obtains this result
For a convolution, the Kernel must be flipped. What you do in conv() is a correlation.
Since your Kernel is symmetric apart from a minus sign, result2 = -result1 in your current results

Incremental PCA

I've never used incremental PCA which exists in sklearn and I'm a bit confused about it's parameters and not able to find a good explanation of them.
I see that there is batch_size in the constructor, but also, when using partial_fit method you can again pass only a part of your data, I've found the following way:
n = df.shape[0]
chunk_size = 100000
iterations = n//chunk_size
ipca = IncrementalPCA(n_components=40, batch_size=1000)
for i in range(0, iterations):
ipca.partial_fit(df[i*chunk_size : (i+1)*chunk_size].values)
ipca.partial_fit(df[iterations*chunk_size : n].values)
Now, what I don't understand is the following - when using partial fit, does the batch_size play any role at all, or not? And how are they related?
Moreover, if both are considered, how should I change their values properly, when wanting to increase the precision while increasing memory footprint (and the other way around, decrease the memory consumption for the price of decreased accuracy)?
The docs say:
batch_size : int or None, (default=None)
The number of samples to use for each batch. Only used when calling fit...
This param is not used within partial_fit, where the batch-size is controlled by the user.
Bigger batches will increase memory-consumption, smaller ones will decrease it.
This is also written in the docs:
This algorithm has constant memory complexity, on the order of batch_size, enabling use of np.memmap files without loading the entire file into memory.
Despite some checks and parameter-heuristics, the whole fit-function looks like this:
for batch in gen_batches(n_samples, self.batch_size_):
self.partial_fit(X[batch], check_input=False)
Here is some an incremental PCA code based on https://github.com/kevinhughes27/pyIPCA which is an implementation of CCIPCA method.
import scipy.sparse as sp
import numpy as np
from scipy import linalg as la
import scipy.sparse as sps
from sklearn import datasets
class CCIPCA:
def __init__(self, n_components, n_features, amnesic=2.0, copy=True):
self.n_components = n_components
self.n_features = n_features
self.copy = copy
self.amnesic = amnesic
self.iteration = 0
self.mean_ = None
self.components_ = None
self.mean_ = np.zeros([self.n_features], np.float)
self.components_ = np.ones((self.n_components,self.n_features)) / \
(self.n_features*self.n_components)
def partial_fit(self, u):
n = float(self.iteration)
V = self.components_
# amnesic learning params
if n <= int(self.amnesic):
w1 = float(n+2-1)/float(n+2)
w2 = float(1)/float(n+2)
else:
w1 = float(n+2-self.amnesic)/float(n+2)
w2 = float(1+self.amnesic)/float(n+2)
# update mean
self.mean_ = w1*self.mean_ + w2*u
# mean center u
u = u - self.mean_
# update components
for j in range(0,self.n_components):
if j > n: pass
elif j == n: V[j,:] = u
else:
# update the components
V[j,:] = w1*V[j,:] + w2*np.dot(u,V[j,:])*u / la.norm(V[j,:])
normedV = V[j,:] / la.norm(V[j,:])
normedV = normedV.reshape((self.n_features, 1))
u = u - np.dot(np.dot(u,normedV),normedV.T)
self.iteration += 1
self.components_ = V / la.norm(V)
return
def post_process(self):
self.explained_variance_ratio_ = np.sqrt(np.sum(self.components_**2,axis=1))
idx = np.argsort(-self.explained_variance_ratio_)
self.explained_variance_ratio_ = self.explained_variance_ratio_[idx]
self.components_ = self.components_[idx,:]
self.explained_variance_ratio_ = (self.explained_variance_ratio_ / \
self.explained_variance_ratio_.sum())
for r in range(0,self.components_.shape[0]):
d = np.sqrt(np.dot(self.components_[r,:],self.components_[r,:]))
self.components_[r,:] /= d
You can test it with
import pandas as pd, ccipca
df = pd.read_csv('iris.csv')
df = np.array(df)[:,:4].astype(float)
pca = ccipca.CCIPCA(n_components=2,n_features=4)
S = 10
print df[0, :]
for i in range(150): pca.partial_fit(df[i, :])
pca.post_process()
The resulting eigenvectors / values will not exaactly be the same as the batch PCA. Results are approximate, but they are useful.

repeating tests in multiple functions python

I have some function for sound processing/ sound processing. And before it was all a single channel. But know i make it less or more multi channel.
At this point i have the feeling i do part of the scrips over and over again.
In this example it are two functions(my original function is longer) but the same happens also in single scripts.
my Two functions
import numpy as np
# def FFT(x, fs, *args, **kwargs):
def FFT(x, fs, output='complex'):
from scipy.fftpack import fft, fftfreq
N = len(x)
X = fft(x) / N
if output is 'complex':
F = np.linspace(0, N) / (N / fs)
return(F, X, [])
elif output is 'ReIm':
F = np.linspace(0, N) / (N / fs)
RE = np.real(X)
IM = np.imag(X)
return(F, RE, IM)
elif output is 'AmPh0':
F = np.linspace(0, (N-1)/2, N/2)
F = F/(N/fs)
# N should be int becouse of nfft
half_spec = np.int(N / 2)
AMP = abs(X[0:half_spec])
PHI = np.arctan(np.real(X[0:half_spec]) / np.imag(X[0:half_spec]))
return(F, AMP, PHI)
elif output is 'AmPh':
half_spec = np.int(N / 2)
F = np.linspace(1, (N-1)/2, N/2 - 1)
F = F/(N/fs)
AMP = abs(X[1:half_spec])
PHI = np.arctan(np.real(X[1:half_spec])/np.imag(X[1:half_spec]))
return(F, AMP, PHI)
def mFFT(x, fs, spectrum='complex'):
fft_shape = np.shape(x)
if len(fft_shape) == 1:
mF, mX1, mX2 = FFT(x, fs, spectrum)
elif len(fft_shape) == 2:
if fft_shape[0] < fft_shape[1]:
pass
elif fft_shape[0] > fft_shape[1]:
x = x.T
fft_shape = np.shape(x)
mF = mX1 = mX2 = []
for channel in range(fft_shape[0]):
si_mF, si_mX1, si_mX2 = FFT(x[channel], fs, spectrum)
if channel == 0:
mF = np.append(mF, si_mF)
mX1 = np.append(mX1, si_mX1)
mX2 = np.append(mX2, si_mX2)
else:
mF = np.vstack((mF, si_mF))
mX1 = np.vstack((mX1, si_mX1))
if si_mX2 == []:
pass
else:
mX2 = np.vstack((mX2, si_mX2))
elif len(fft_shape) > 2:
raise ValueError("Shape of input can't be greather than 2")
return(mF, mX1, mX2)
The second funcion in this case have the problem.
The reason for this checks is best to understand with an example:
I have recorded a sample of 1 second of audio data with 4 microphones.
so i have an ndim array of 4 x 44100 samples.
The FFT works on every even length array. This means that i get an result in both situations (4 x 44100 and 44100 x 4).
For all function after this function i have also 2 data types. or a complex signal or an tuple of two signals (amplitude and phase)... what's create an extra switch/ check in the script.
check type (tuple or complex data)
check direction (ad change it)
Check size / shape
run function and append/ stack this
Are there some methods to make this less repeative i have this situation in at least 10 functions...
Bert,
The problematic I'm understanding is the repeat of the calls you're making to do checks of all sorts. I'm not understanding all but I'm guessing they are made to format your data in a way you'll be able to execute fft on it.
One of the philosophy about computer programming in Python is "It's easier to ask forgiveness than it is to get permission."[1] . This means, you should probably try first and then ask forgiveness (try, except). It's much faster to do it this way then to do a lots of checks on the value. Also, those who are going to use your program should understand how it works pretty easily; make it easy to read without those check by separating the logic business from the technical logic. Don't worry, it's not evident and the fact you're asking is an indicator you're catching something isn't right :).
Here is what I would propose for your case (and it's not the perfect solution!):
def mFFT(x, fs, spectrum='complex'):
#Assume we're correcty align when receiving the data
#:param: x assume that we're multi-channel in the format [channel X soundtrack ]
#also, don't do this:
#mF = mX1 = si_mX2 = []
# see why : https://stackoverflow.com/questions/2402646/python-initializing-multiple-lists-line
mF = []
mX1 = []
mX2 = []
try:
for channel in range(len(x)):
si_mF, si_mX1, si_mX2 = FFT(x[channel], fs, spectrum)
mF.append(si_mF)
mX1.append(si_mX1)
mX2.append(si_mX2)
return (mF, mX1, mX2)
except:
#this is where you would try to point out why it could have failed. One good you had was the check for the orientation of the data and try again;
if np.shape(x)[0] > np.shape(x)[1]:
result = mFFT(x.T,fs,spectrum)
return result
else :
if np.shape(x)[0] > 2:
raise(ValueError("Shape of input isn't supported for greather than 2"))
I gave an example because I believe you expected one, but I'm not giving the perfect answer away ;). The problematic you have is a design problematic and no, there are no easy solution. What I propose to you is to start by assuming that the order is always in this format [ n-th channel X sample size ] (i.e. [ 4 channel X 44100 sample]). That way, you try it out first like this(as in try/except), then maybe as the inverse order.
Another suggestion (and it really depends on your use case), would be to make a data structure class that would manipulate the FFT data to return the complex or the ReIm or the AmPh0 or the AmPh as getters. (so you treat the input data as to be always time and you just give what the users want).
class FFT(object):
def __init__(self,x, fs):
from scipy.fftpack import fft, fftfreq
self.N = len(x)
self.fs = fs
self.X = fft(x) / N
def get_complex(self):
F = np.linspace(0, self.N) / (self.N / self.fs)
return(F, self.X, [])
def get_ReIm(self):
F = np.linspace(0, self.N) / (self.N / self.fs)
RE,IM = np.real(self.X), np.imag(self.X)
return(F, RE, IM)
def get_AmPh0(self):
F = np.linspace(0, (self.N-1)/2, self.N/2)/(self.N/self.fs)
# N should be int because of nfft
half_spec = np.int(self.N / 2)
AMP = abs(self.X[:half_spec])
PHI = np.arctan(np.real(self.X[:half_spec]) / np.imag(self.X[:half_spec]))
return(F, AMP, PHI)
This can then be used to be called depending on the desired output from another class with an eval to get the desire output (but you require to use the same convention across your code ;) ). 2

Resources