How to concatenate gathered data using mpi4py library in python - python-3.x

I am appending data to a list with mpi4py and trying to save the data sequentially at the root node (rank 0).
As suggested by Alan22, I've modified the code and it works, but the script does not concatenate properly, so I get the output file as shown in attached figure:01.
Can anybody help me fix this error? In addition, I suspect that what I've written in the Python script (shown below) isn't the best way to solve the problem.
Is there any way to solve this type of problem efficiently? Any help is highly appreciated.
The python script is given as follows:
import numpy as np
from scipy import signal
from mpi4py import MPI
import random
import cmath, math
import matplotlib.pyplot as plt
import time
#File storing path
save_results_to = 'File storing path'
count_day = 1
count_hour = 1
arr_x = [0, 8.49, 0.0, -8.49, -12.0, -8.49, -0.0, 8.49, 12.0]
arr_y = [0, 8.49, 12.0, 8.49, 0.0, -8.49, -12.0, -8.49, -0.0]
M = len(arr_x)
N = len(arr_y)
total_rows = 50000
# Function of CSD:: Using For Loop
fs = 500  # Sampling frequency


def csdMat(data):
    """Compute the cross spectral density (CSD) of every pair of columns.

    Parameters
    ----------
    data : 2-D array
        Each column is treated as one signal.

    Returns
    -------
    freq : ndarray
        Frequency vector returned by ``signal.csd`` (identical for every
        pair because all calls share the same settings).
    pxy : ndarray
        Array indexed as ``pxy[i, j, f]`` holding the CSD of column ``i``
        against column ``j`` at frequency bin ``f``.
    """
    _, cols = data.shape  # For 2D data
    total_csd = []
    for i in range(cols):
        col_csd = []
        for j in range(cols):
            freq, Pxy = signal.csd(data[:, i], data[:, j], fs=fs, window='hann',
                                   nperseg=100, noverlap=70, nfft=5000)
            # Keep every pairwise spectrum; without this the result is empty.
            col_csd.append(Pxy)
        total_csd.append(col_csd)
    pxy = np.array(total_csd)
    return freq, pxy
# Finding cross spectral density (CSD)
# Times the csdMat call and reports the shape of the resulting CSD array.
# NOTE(review): raw_data is not defined in the visible script -- it must be
# loaded before this point.
t0 = time.time()
freq, csd = csdMat(raw_data)
print('The shape of the csd data', csd.shape)
print('Time required {} seconds to execute CSD--For loop'.format(time.time()-t0))
resolution = 50 # This is important:: the HIGHER the Resolution, the higher the execution time!!!
grid_size = N * resolution
# NOTE(review): kf is not defined in the visible script.
# NOTE(review): kx is built without a sample count, so np.linspace falls back
# to its default length, while ky has grid_size samples -- confirm that this
# asymmetry is intended, because DFT2D indexes both vectors.
kx = np.linspace(-kf, kf, ) # space vector
ky = np.linspace(-kf, kf, grid_size) # space vector
def DFT2D(data):
    """Evaluate the 2-D spatial transform of ``data`` on the (ky, kx) grid.

    For every grid point ``(k, l)`` the result is the sum over ``m < M`` and
    ``n < N`` of
    ``data[m, n] * exp(-1j * ((dx[m]-dx[n])*kx[l] + (dy[m]-dy[n])*ky[k]))``.

    Parameters
    ----------
    data : 2-D array
        Indexed as ``data[m, n]``; only the leading ``M x N`` part is read.

    Returns
    -------
    ndarray of complex, shape ``(P, Q)``.

    Relies on the module-level names ``P``, ``Q``, ``M``, ``N``, ``dx``,
    ``dy``, ``kx`` and ``ky``.
    """
    # The pairwise offsets depend only on the geometry, so build them once
    # instead of recomputing them inside the innermost loop.
    xs = np.asarray(dx, dtype=float)
    ys = np.asarray(dy, dtype=float)
    ddx = xs[:M, None] - xs[None, :N]
    ddy = ys[:M, None] - ys[None, :N]
    weights = np.asarray(data)[:M, :N]

    dft2d = np.zeros((P, Q), dtype=complex)
    for k in range(P):
        for l in range(Q):
            # NOTE(review): kx is indexed by l (range Q) and ky by k (range P);
            # confirm len(kx) >= Q and len(ky) >= P.
            phase = ddx * kx[l] + ddy * ky[k]
            dft2d[k, l] = np.sum(weights * np.exp(-1j * phase))
    return dft2d
dx = arr_x[:]; dy = arr_y[:]
size = comm.Get_size()
rank = comm.Get_rank()
data = []
start_freq = 100
end_freq = 109
freq_range = np.arange(start_freq,end_freq)
no_of_freq = len(freq_range)
for fr_count in range(start_freq, end_freq):
if fr_count % size == rank:
dft = np.zeros((grid_size, grid_size))
spec_csd = csd[:,:, fr_count]
dft = DFT2D(spec_csd) # Call the DFT2D function
spec = np.array(np.real(dft)) # Spectrum or 2D_DFT of data[real part]
print('Shape of spec', spec.shape)
#data = np.append(data,spec)
data = comm.gather(data, root =0)
# comm.Allreduce(MPI.IN_PLACE,data,op=MPI.MAX)
print("Rank: ", rank, ". Spectrum shape is:\n", spec.shape)
if rank == 0:
output_data = np.concatenate(data, axis = 0)
#output_data = np.c_(data, axis = 0)
dft_tot = np.array((output_data), dtype='object')
res = np.zeros((grid_size, grid_size))
for k in range(size):
for i in range(no_of_freq):
jj = np.around(freq[freq_range[i]], decimals = 2)
#print('The shape of data after indexing', data1.shape)
#data_final=data1.reshape(data1.shape[0]*data1.shape[1], data1.shape[2])
res[i * size + k] = dft_tot[k][i] #np.array(data[k])
data = np.array(res)
#print('The shape of the dft at root node', data.shape)
np.savetxt(save_results_to + f'Day_{count_day}_hour_{count_hour}_f_{jj}_hz.txt', data.view(float))
I use the following bash script to run the script (i.e., I submit the job with the command sbatch):
#! /bin/bash -l
#SBATCH -J testmvapich2
#SBATCH -N 1 ## Maximum 04 nodes
#SBATCH --ntasks=10
#SBATCH --cpus-per-task=1 # cpu-cores per task
#SBATCH --mem-per-cpu=3000MB
#SBATCH --time=00:20:00
#SBATCH -p para
#SBATCH --output="stdout.txt"
#SBATCH --error="stderr.txt"
#SBATCH -A camk
##SBATCH --mail-type=ALL
##SBATCH --chdir=/work/cluster_computer/my_name/data_work/MMC331/
eval "$(conda shell.bash hook)"
conda activate myenv
#conda activate fast-mpi4py
#module purge
#module add mpi/mvapich2-2.2-x86_64
mpirun python3

You can try with this after "data = comm.gather(data, root=0)"
if rank == 0:
print('Type of data:', type(data))
dft_tot = np.array((data))#, dtype='object')
print('shape of DATA array:', dft_tot.shape)
#print('Type of dft array:', type(dft_tot))
res = np.zeros((450,450))
for k in range(size):
# for i in range(len(data[rank])):
for i in range(no_of_freq):
jj = np.around(freq[freq_range[k]], decimals = 2)
#data1 = np.array(dft_tot[k])
res[i * size + k] = data[k]
data = np.array(res)#.reshape(data1.shape[0]*data1.shape[1], data1.shape[2])
print('The shape of the dft at root node', data.shape)
np.savetxt(save_results_to + f'Day_{count_day}_hour_{co
Here is the link. Hope it helps mpi4py on HPC: comm.gather

As mentioned in the comments, there are two typos in the code:
The indices for arrays kx and ky have been swapped in the line where variable e is calculated in the function DFT2D(data).
The code is being run for 10 MPI processes for frequencies fr_count in the range start_freq = 100 and end_freq = 109. For this, the loops and arange must be written as for fr_count in range(start_freq, end_freq + 1) and freq_range = np.arange(start_freq, end_freq + 1) as these are not end-point inclusive.
The data = comm.gather(data, root=0) and subsequent output_data = np.concatenate(data, axis=0) operations are performing as they should and as such, the question detracts from the actual issue in the code.
A major issue is that in line res[i * size + k] = dft_tot[k][i] arrays of disparate sizes are being assigned to each other.
Shape of res: 450 x 450
Shape of dft_tot: 10 x 50 x 450
The value of i*size + k ranges from 0 to 110. I think the user expects dft_tot to have the shape 450 x 450, probably due to the indexing confusion mentioned in typo#2 above. Properly done concatenation would yield dft_tot with shape 500 x 450 (since there are 10 arrays of size 50 x 450).
Currently the gather operation returns a list of lists, each containing a NumPy array of size 50 x 450. Technically, it should return a list of NumPy arrays each of size 50 x 450. Adding the line data = data[0] (since data has only one element anyway in each process) before performing data = comm.gather(data, root=0) will achieve this result.
But this whole process seems redundant..
Because there are 10 frequencies considered here. For each frequency, there is a data set of size 50 x 450 . There are 10 MPI processes with each handling one frequency out of the 10. Finally, 10 files are being written corresponding to each frequency. This makes the whole gather operation redundant, as each MPI process can directly write the file corresponding to each frequency.
If instead the dft_tot file was being written as is by rank = 0, then the gather operation would make sense. But splitting the array into the constituent frequencies defeats the point.
This achieves the same result without the gather operation:
# Each rank processes the frequencies assigned to it (round-robin by index)
# and writes one text file per frequency, so no gather is needed.
# Relies on comm, csd, freq, DFT2D, count_day and count_hour defined earlier.
size = comm.Get_size()
rank = comm.Get_rank()
start_freq = 100
end_freq = 109
# +1 because range/arange exclude the end point.
freq_range = np.arange(start_freq, end_freq + 1)
no_of_freq = len(freq_range)
for fr_count in range(start_freq, end_freq + 1):
    if fr_count % size != rank:
        continue  # this frequency belongs to another rank
    spec_csd = csd[:, :, fr_count]
    dft = DFT2D(spec_csd)  # Call the DFT2D function
    spec = np.array(np.real(dft))  # Spectrum or 2D_DFT of data[real part]
    print('Shape of spec', spec.shape)
    # Label the file with the frequency actually processed; indexing by rank
    # is only correct when there is exactly one frequency per rank.
    jj = np.around(freq[fr_count], decimals=2)
    np.savetxt(f'Day_{count_day}_hour_{count_hour}_f_{jj}_hz.txt', spec.view(float))


Gekko feasible in smaller problem while infeasible in larger problem

I am trying to solve the problem as follows with Gekko in python.
I_s is an indicator variable in the problem whose value is 1 if theta is positive and 0 if theta is zero.
I wrote the problem in a code using Gekko, python.
In contrast to my previous posts, I add some constraints with respect to I, which is an indicator variable.
If I set N=10, the solution, theta is all zero, which is the result that I want.
But if I set N=100 or 200, the solution cannot be found. I cannot understand why this happens.
I want to check if theta is also zero in larger N (200).
Is there any way to solve this issue?
My code is below.
# Import package
from gekko import GEKKO
import numpy as np
# Define parameters
P_CO = 600 # $/tonCO
beta_CO2 = 1 # no unit
P_CO2 = 80 # $/tonCO2eq
E_ref = 3.1022616 # tonCO2eq/tonCO
E_dir = -1.600570692 # tonCO2eq/tonCO
E_indir_others = 0.3339226804 # tonCO2eq/tonCO
E_indir_elec_cons = 18.46607256 # GJ/tonCO
C1_CAPEX = 285695 # no unit
C2_CAPEX = 188.42 # no unit
C1_FOX = 82282 # no unit
C2_FOX = 24.094 # no unit
C1_ROX = 4471.5 # no unit
C2_ROX = 96.034 # no unit
C1_UOX = 7934.9 # no unit
C2_UOX = 986.9 # no unit
r = 0.08 # discount rate
N = 10 # number of scenarios
T = 30 # total time period
GWP_init = 0.338723235 # 2020 Electricity GWP in EU 27 countries
theta_max = 1600000 # Max capacity
# Function to make GWP_EU matrix (TxN matrix)
def Electricity_GWP(GWP_init, n_years, num_episodes):
    """Generate noisy electricity-GWP scenarios.

    The yearly mean follows ``0.36258224*exp(-0.16395611*year) + 0.03091272``
    for ``year = 1 .. n_years+1``. Gaussian noise is added whose standard
    deviation grows linearly from 0 in the first row to 25 % of the final
    mean in the last row.

    Parameters
    ----------
    GWP_init : float
        Accepted for interface compatibility; not used in the computation.
    n_years : int
        Number of yearly steps; the result has ``n_years + 1`` rows.
    num_episodes : int
        Number of scenarios (columns).

    Returns
    -------
    ndarray of shape ``(n_years + 1, num_episodes)``.
    """
    years = np.arange(1, n_years + 2)
    GWP_mean = 0.36258224 * np.exp(-0.16395611 * years) + 0.03091272
    GWP_mean = GWP_mean.reshape(-1, 1)
    GWP_Yearly = np.tile(GWP_mean, num_episodes)

    # Take a true scalar, and use linspace so the ramp always has exactly
    # n_years + 1 entries (a float-step arange can be off by one and divides
    # by zero when n_years == 0).
    stdev2050 = float(GWP_mean[-1, 0]) * 0.25
    stdev = np.linspace(0.0, stdev2050, n_years + 1)

    noise = np.zeros((n_years + 1, num_episodes))
    for i in range(n_years + 1):
        noise[i, :] = np.random.normal(0, stdev[i], num_episodes)
    return GWP_Yearly + noise
GWP_EU = Electricity_GWP(GWP_init, T, N) # (T+1)*N matrix
GWP_EU = GWP_EU[1:,:] # T*N matrix
# Build Gekko model
m = GEKKO(remote=False)
theta = m.Array(m.Var, N, lb=0, ub=theta_max)
I = m.Array(m.Var, N, lb=0, ub=1, integer=True)
demand = np.ones((T,1))
demand[0] = 8031887.589
for k in range(1,11):
demand[k] = demand[k-1] * 1.026
for k in range(11,21):
demand[k] = demand[k-1] * 1.016
for k in range(21,T):
demand[k] = demand[k-1] * 1.011
demand = 0.12 * demand
demand = np.tile(demand, N) # T*N matrix
m3 = [[m.min3(demand[t,s],theta[s]) for t in range(T)] for s in range(N)]
obj = m.sum([sum([((1/(1+r))**(t+1))*((P_CO*m3[s][t]) \
+ (beta_CO2*P_CO2*m3[s][t]*(E_ref-E_dir-E_indir_others-E_indir_elec_cons*GWP_EU[t,s])) \
- (C1_CAPEX*I[s]+C2_CAPEX*theta[s]+C1_FOX*I[s]+C2_FOX*theta[s])\
- (C1_ROX*I[s]+C2_ROX*m3[s][t]+C1_UOX*I[s]+C2_UOX*m3[s][t])) for t in range(T)]) for s in range(N)])
for i in range(N):
# obj = m.sum([m.sum([((1/(1+r))**(t+1))*((P_CO*m.min3(demand[t,s], theta[s])) \
# + (beta_CO2*P_CO2*m.min3(demand[t,s], theta[s])*(E_ref-E_dir-E_indir_others-E_indir_elec_cons*GWP_EU[t,s])) \
# - (C1_CAPEX+C2_CAPEX*theta[s]+C1_FOX+C2_FOX*theta[s])-(C1_ROX+C2_ROX*m.min3(demand[t,s], theta[s])+C1_UOX+C2_UOX*m.min3(demand[t,s], theta[s]))) for t in range(T)]) for s in range(N)])
# s = m.sum(m.sum(((1/(1+r))**(t+1))*((P_CO*m.min3(demand[t,s], theta[s])) \
# + beta_CO2*P_CO2*m.min3(demand[t,s], theta[s])*(E_ref-E_dir-E_indir_others-E_indir_elec_cons*GWP_EU[t,s]) \
# - (C1_CAPEX + C2_CAPEX*theta[s]) - (C1_FOX + C2_FOX*theta[s]) - (C1_ROX + C2_ROX*m.min3(demand[t,s], theta[s])) - (C1_UOX + C2_UOX*m.min3(demand[t,s], theta[s])))
# for s in range(N)) for t in range(T))/N
I solved this issue by increasing the big-M constant in the constraint for the indicator variable I from 1,000,000 to 10,000,000.
for i in range(N):
I didn't understand why this worked, but the result gave me the solution of 200*1 array with all zero.

No performance increase when looping of FFTs in Cython

I'm writing a script that tracks the shifts of a sample by estimating the displacement of an ensemble of particles. The first implementation, in Python, works alright, but it takes too long for a large number of samples. To combat this, I tried rewriting the method in Cython, but as this was my first time ever using it, I can't seem to get any performance increase. I know 3D FFTs exist and are often faster than looped 2D FFTs, but in this instance they take too much memory and/or are slower than for-loops.
Python function:
import numpy as np
from scipy.fft import fftshift
import pyfftw
def python_corr(frame_a, frame_b):
    """Cross-correlate matching 2-D samples of two stacks via real FFTs.

    Parameters
    ----------
    frame_a, frame_b : 3-D arrays indexed as ``[sample, y, x]``.

    Returns
    -------
    float32 array with the shape of ``frame_a``; slice ``ind`` is the
    fftshift-ed cross-correlation of ``frame_a[ind]`` with ``frame_b[ind]``.
    """
    DTYPEf = 'float32'
    DTYPEc = 'complex64'
    k = frame_a.shape[0]
    m = frame_a.shape[1]  # size y of 2d sample
    n = frame_a.shape[2]  # size x of 2d sample
    fs = [m, n]  # sample shape
    bs = [m, n // 2 + 1]  # rfft sample shape
    corr = np.zeros([k, m, n], DTYPEf)  # out

    # NOTE(review): the builder calls were lost from the post; they are
    # reconstructed here as real-to-complex / complex-to-real 2-D plans to
    # match the rfft buffer shapes above -- confirm against the original.
    fft_forward = pyfftw.builders.rfft2(
        pyfftw.empty_aligned(fs, dtype=DTYPEf),
        axes=[-2, -1],
    )
    fft_backward = pyfftw.builders.irfft2(
        pyfftw.empty_aligned(bs, dtype=DTYPEc),
        s=fs,  # request the full (m, n) output so odd n is handled too
        axes=[-2, -1],
    )

    for ind in range(k):  # looping over 2D samples
        window_a = frame_a[ind, :, :]
        window_b = frame_b[ind, :, :]
        # np.conj makes its own copy of the first spectrum before the forward
        # plan is called again for the second window.
        spectrum = np.conj(fft_forward(window_a)) * fft_forward(window_b)
        # cross correlation via FFT algorithm
        corr[ind, :, :] = fftshift(fft_backward(spectrum).real, axes=[-2, -1])
    return corr
Cython function:
import numpy as np
from scipy.fft import fftshift
import pyfftw
cimport numpy as np
cimport cython
DTYPEf = np.float32
ctypedef np.float32_t DTYPEf_t
DTYPEc = np.complex64
ctypedef np.complex64_t DTYPEc_t
def cython_corr(
np.ndarray[DTYPEf_t, ndim = 3] frame_a,
np.ndarray[DTYPEf_t, ndim = 3] frame_b,
cdef int ind, k, m, n
k = frame_a.shape[0]
m = frame_a.shape[1] # size y of sample
n = frame_a.shape[2] # size x of sample
cdef DTYPEf_t[:,:] window_a = pyfftw.empty_aligned([m,n], dtype = DTYPEf) # sample a
window_a[:,:] = 0.
cdef DTYPEf_t[:,:] window_b = pyfftw.empty_aligned([m,n], dtype = DTYPEf) # sample b
window_b[:,:] = 0.
cdef DTYPEf_t[:,:] corr = pyfftw.empty_aligned([m,n], dtype = DTYPEf) # cross-corr matrix
corr[:,:] = 0.
cdef DTYPEf_t[:,:,:] out = pyfftw.empty_aligned([k,m,n], dtype = DTYPEf) # out
out[:,:] = 0.
cdef object fft_forward
cdef object fft_backward
cdef DTYPEc_t[:,:] f2a = pyfftw.empty_aligned([m, n//2+1], dtype = DTYPEc) # rfft out of sample a
f2a[:,:] = 0. + 0.j
cdef DTYPEc_t[:,:] f2b = pyfftw.empty_aligned([m, n//2+1], dtype = DTYPEc) # rfft out of sample b
f2b[:,:] = 0. + 0.j
cdef DTYPEc_t[:,:] r = pyfftw.empty_aligned([m, n//2+1], dtype = DTYPEc) # power spectrum of sample a and b
r[:,:] = 0. + 0.j
fft_forward =
pyfftw.empty_aligned([m,n], dtype = DTYPEf),
axes = [0,1],
fft_backward =
pyfftw.empty_aligned([m,n//2+1], dtype = DTYPEc),
axes = [0,1],
for ind in range(k):
window_a = frame_a[ind,:,:]
window_b = frame_b[ind,:,:]
r = np.conj(fft_forward(window_a))*fft_forward(window_b) # power spectrum of sample a and b
corr = fft_backward(r).real # cross correlation
corr = fftshift(corr, axes = [0,1]) # shift Q1 --> Q3, Q2 --> Q4
# the fftshift could be moved out of the loop, but lets use that as a last resort :)
out[ind,:,:] = corr
return out
Test for methods:
import time
aa = bb = np.empty([14000, 24,24]).astype('float32') # a small test with 14000 24x24px samples
print(f'Number of samples: {aa.shape[0]}')
start = time.time()
corr = python_corr(aa, bb)
print(f'Time for Python: {time.time() - start}')
del corr
start = time.time()
corr = cython_corr(aa, bb)
print(f'Time for Cython: {time.time() - start}')
del corr

Speed Up a for Loop - Python

I have a code that works perfectly well but I wish to speed up the time it takes to converge. A snippet of the code is shown below:
def myfunction(x, i):
    """Return ``x`` after one correction step that uses row ``i`` of ``data``.

    The residual ``target[i] - data[i, :] @ x`` is clamped so it is never
    positive, then applied along row ``i`` scaled by that row's squared norm.
    Relies on the module-level ``data``, ``target`` and ``norm``.
    """
    y = x + (min(0, target[i] - data[i, :] @ x)) * data[i] / (norm(data[i])**2)
    return y
rows, columns = data.shape
start = time.time()
iterate = 0
iterate_count = []
norm_count = []
res = 5
x_not = np.ones(columns)
while res > 1e-8:
for row in range(rows):
y = myfunction(x_not, row)
x_not = y
iterate += 1
res = abs(norm_count[-1] - norm_count[-2])
print('Converge at {} iterations'.format(iterate))
print('Duration: {:.4f} seconds'.format(time.time() - start))
I am relatively new in Python. I will appreciate any hint/assistance.
Ax=b is the problem we wish to solve. Here, 'A' is the 'data' and 'b' is the 'target'
Ugh! After spending a while on this I don't think it can be done the way you've set up your problem. In each iteration over the row, you modify x_not and then pass the updated result to get the solution for the next row. This kind of setup can't be vectorized easily. You can learn the thought process of vectorization from the failed attempt, so I'm including it in the answer. I'm also including a different iterative method to solve linear systems of equations. I've included a vectorized version -- where the solution is updated using matrix multiplication and vector addition, and a loopy version -- where the solution is updated using a for loop to demonstrate what you can expect to gain.
1. The failed attempt
Let's take a look at what you're doing here.
def myfunction(x, i):
    """Return ``x`` after one correction step that uses row ``i`` of ``data``.

    The residual ``target[i] - data[i, :] @ x`` is clamped so it is never
    positive, then multiplied by row ``i`` divided by its squared norm.
    Relies on the module-level ``data``, ``target`` and ``norm``.
    """
    y = x + (min(0, target[i] - data[i, :] @ x)) * (data[i] / (norm(data[i])**2))
    return y
You subtract
the dot product of (the ith row of data and x_not)
from the ith row of target,
limited at zero.
You multiply this result by the ith row of data divided by the squared norm of that row (let's call this quotient part2).
Then you add this product to x_not (the whole vector, not just its ith element).
Now let's look at the shapes of the matrices.
data is (M, N).
target is (M, ).
x_not is (N, )
Instead of doing these operations rowwise, you can operate on the entire matrix!
1.1. Simplifying the dot product.
Instead of doing data[i, :] @ x, you can do data @ x_not and this gives an array with the ith element giving the dot product of the ith row with x_not. So now we have data @ x_not with shape (M, )
Then, you can subtract this from the entire target array, so target - (data @ x_not) has shape (M, ).
So far, we have
# Residual of every row at once: target minus the matrix-vector product.
part1 = target - (data @ x_not)
Next, if anything is greater than zero, set it to zero.
part1[part1 > 0] = 0
1.2. Finding rowwise norms.
Finally, you want to multiply this by the row of data, and divide by the square of the L2-norm of that row. To get the norm of each row of a matrix, you do
rownorms = np.linalg.norm(data, axis=1)
This is a (M, ) array, so we need to convert it to a (M, 1) array so we can divide each row. rownorms[:, None] does this. Then divide data by this.
part2 = data / (rownorms[:, None]**2)
1.3. Add to x_not
Finally, we're adding each row of part1 * part2 to the original x_not and returning the result
result = x_not + (part1 * part2).sum(axis=0)
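Here's where we get stuck. In your approach, each call to myfunction() gives a value of part1 that depends on x_not, which was changed by the previous call to myfunction(). The vectorized expression above evaluates every row against the same x_not, so it is not equivalent to your row-by-row update.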
2. Why vectorize?
Using numpy's inbuilt methods instead of looping allows it to offload the calculation to its C backend, so it runs faster. If your numpy is linked to a BLAS backend, you can extract even more speed by using your processor's SIMD registers
The conjugate gradient method is a simple iterative method to solve certain systems of equations. There are other more complex algorithms that can solve general systems well, but this should do for the purposes of our demo. Again, the purpose is not to have an iterative algorithm that will perfectly solve any linear system of equations, but to show what kind of speedup you can expect if you vectorize your code.
Given your system
data @ x_not = target
Let's define some variables:
# Normal equations: multiply both sides of the system by data.T.
A = data.T @ data
b = data.T @ target
And we'll solve the system A @ x = b
# Conjugate-gradient iteration on A @ x = b.
# Relies on A, b, columns and tolerance being defined beforehand.
x = np.zeros((columns,))  # Initial guess. Can be anything
resid = b - A @ x
p = resid
while (np.abs(resid) > tolerance).any():
    Ap = A @ p
    alpha = (resid.T @ resid) / (p.T @ Ap)
    x = x + alpha * p
    resid_new = resid - alpha * Ap
    beta = (resid_new.T @ resid_new) / (resid.T @ resid)
    p = resid_new + beta * p
    resid = resid_new + 0  # "+ 0" makes a copy rather than an alias
To contrast the fully vectorized approach with one that uses iterations to update the rows of x and resid_new, let's define another implementation of the CG solver that does this.
def solve_loopy(data, target, itermax = 100, tolerance = 1e-8):
    """Solve ``data @ x = target`` by conjugate gradient on the normal equations.

    This variant deliberately updates ``x`` and the residual element by
    element in a Python loop so that it can be timed against ``solve_vect``.

    Parameters
    ----------
    data : 2-D array, the coefficient matrix.
    target : 1-D array, the right-hand side.
    itermax : maximum number of iterations.
    tolerance : iteration stops once every residual entry is within this
        absolute value.

    Returns
    -------
    1-D array with one entry per column of ``data``.
    """
    A = data.T @ data
    b = data.T @ target
    rows, columns = data.shape
    x = np.zeros((columns,))  # Initial guess. Can be anything
    resid = b - A @ x
    resid_new = b - A @ x  # separate buffer, overwritten element-wise below
    p = resid
    niter = 0
    while (np.abs(resid) > tolerance).any() and niter < itermax:
        Ap = A @ p
        alpha = (resid.T @ resid) / (p.T @ Ap)
        for i in range(len(x)):
            x[i] = x[i] + alpha * p[i]
            resid_new[i] = resid[i] - alpha * Ap[i]
        # resid_new = resid - alpha * A @ p
        beta = (resid_new.T @ resid_new) / (resid.T @ resid)
        p = resid_new + beta * p
        resid = resid_new + 0  # copy, so the in-place writes above do not alias
        niter += 1
    return x
And our original vector method:
def solve_vect(data, target, itermax = 100, tolerance = 1e-8):
    """Solve ``data @ x = target`` by conjugate gradient on the normal equations.

    Every update is a whole-array operation (matrix product or vector
    addition); there is no per-element Python loop.

    Parameters
    ----------
    data : 2-D array, the coefficient matrix.
    target : 1-D array, the right-hand side.
    itermax : maximum number of iterations.
    tolerance : iteration stops once every residual entry is within this
        absolute value.

    Returns
    -------
    1-D array with one entry per column of ``data``.
    """
    A = data.T @ data
    b = data.T @ target
    rows, columns = data.shape
    x = np.zeros((columns,))  # Initial guess. Can be anything
    resid = b - A @ x
    p = resid
    niter = 0
    while (np.abs(resid) > tolerance).any() and niter < itermax:
        Ap = A @ p
        alpha = (resid.T @ resid) / (p.T @ Ap)
        x = x + alpha * p
        resid_new = resid - alpha * Ap
        beta = (resid_new.T @ resid_new) / (resid.T @ resid)
        p = resid_new + beta * p
        resid = resid_new + 0
        niter += 1
    return x
Let's solve a simple system to see if this works first:
2x1 + x2 = -5
−x1 + x2 = -2
should give a solution of [-1, -3]
data = np.array([[ 2, 1],
[-1, 1]])
target = np.array([-5, -2])
print(solve_loopy(data, target))
print(solve_vect(data, target))
Both give the correct solution [-1, -3], yay! Now on to bigger things:
data = np.random.random((100, 100))
target = np.random.random((100, ))
Let's ensure the solution is still correct:
# Check each solver by substituting its solution back into the system.
sol1 = solve_loopy(data, target)
np.allclose(data @ sol1, target)
# Output: False
sol2 = solve_vect(data, target)
np.allclose(data @ sol2, target)
# Output: False
Hmm, looks like the CG method doesn't work for badly conditioned random matrices we created. Well, at least both give the same result.
np.allclose(sol1, sol2)
# Output: True
But let's not get discouraged! We don't really care if it works perfectly, the point of this is to demonstrate how amazing vectorization is. So let's time this:
import timeit
timeit.timeit('solve_loopy(data, target)', number=10, setup='from __main__ import solve_loopy, data, target')
# Output: 0.25586539999994784
timeit.timeit('solve_vect(data, target)', number=10, setup='from __main__ import solve_vect, data, target')
# Output: 0.12008900000000722
Nice! A ~2x speedup simply by avoiding a loop while updating our solution!
For larger systems, this will be even better.
for N in [10, 50, 100, 500, 1000]:
data = np.random.random((N, N))
target = np.random.random((N, ))
t_loopy = timeit.timeit('solve_loopy(data, target)', number=10, setup='from __main__ import solve_loopy, data, target')
t_vect = timeit.timeit('solve_vect(data, target)', number=10, setup='from __main__ import solve_vect, data, target')
print(N, t_loopy, t_vect, t_loopy/t_vect)
This gives us:
N t_loopy t_vect speedup
00010 0.002823 0.002099 1.345390
00050 0.051209 0.014486 3.535048
00100 0.260348 0.114601 2.271773
00500 0.980453 0.240151 4.082644
01000 1.769959 0.508197 3.482822

Python: Fastest way to subtract elements of datasets of HDF5 file?

Here is one interesting problem.
Input: Input is two arrays (Nx4, sorted in column-2) stored in datasets-1 and 2 in HDF5 file (input.h5). N is huge (originally belonging to 10 GB of file, hence stored in HDF5 file).
Output: Subtracting each column-2 element of dataset-2 from dataset-1, such that the difference (delta) is between +/-4000. Eventually saving this info in dset of a new HDF5 file. I need to refer to this new file back-and-forth, hence HDF5 not a text file.
Concern: I initially used .append method but that crashed the execution for 10GBs input. So, I am now using dset.resize method (and would like to stick to it preferably). I am also using binary search as I was told in one of my last posts. So now, although the script seems to be working for large (10 GBs) of datasets, it is quite slow! The subtraction (for/while) loop is possibly the culprit! Any suggestions on how I can make this fast? I aim to use the fastest approach (and possibly the simplest, since I am a beginner).
import numpy as np
import time
import h5py
import sys
import csv
W = 4000  # Window half-width
FLUSH_ROWS = 100_000  # result rows buffered in memory before each write


def _append_rows(dset, blocks):
    """Grow ``dset`` once and write all buffered row blocks at its end."""
    rows = np.concatenate(blocks, axis=0)
    start = dset.shape[0]
    dset.resize(start + rows.shape[0], axis=0)
    dset[start:start + rows.shape[0], 0:4] = rows


# Both files are closed automatically, even if the loop raises.
with h5py.File('input.h5', 'r') as f_r, h5py.File('data.h5', 'w') as f_w:
    dset1 = f_r.get('dataset_1')
    dset2 = f_r.get('dataset_2')
    r1, c1 = dset1.shape
    r2, c2 = dset2.shape

    # Read column 2 of each dataset into memory once, so the loops below
    # index NumPy arrays instead of going back to the HDF5 file per element.
    # NOTE(review): this holds one column of each dataset in RAM -- confirm
    # that fits for the 10 GB input.
    col1 = dset1[:, 1]
    col2 = dset2[:, 1]

    # **********************************************
    # HDF5 Out Creation
    # **********************************************
    d1 = np.zeros(shape=(0, 4))
    dset = f_w.create_dataset('dataset_1', data=d1, maxshape=(None, None), chunks=True)

    left, right, count = 0, 0, 0
    pending = []      # row blocks waiting to be written
    pending_rows = 0
    for e1 in col1:
        # move left pointer so that is within -delta of e
        while left < r2 and col2[left] - e1 <= -W:
            left += 1
        # move right pointer so that is outside of +delta
        while right < r2 and col2[right] - e1 <= W:
            right += 1
        hits = right - left
        if hits <= 0:
            continue
        # Build all rows for this e1 at once: [index, e1, partner, delta].
        partners = col2[left:right]
        rows = np.empty((hits, 4))
        rows[:, 0] = np.arange(count, count + hits)
        rows[:, 1] = e1
        rows[:, 2] = partners
        rows[:, 3] = e1 - partners
        pending.append(rows)
        pending_rows += hits
        count += hits
        # Resize/write in large batches instead of once per output row.
        if pending_rows >= FLUSH_ROWS:
            _append_rows(dset, pending)
            pending = []
            pending_rows = 0
    if pending:
        _append_rows(dset, pending)
    print("\nFinal shape of dataset created: " + str(dset.shape))
EDIT (Aug 8, chunking HDF5 file as suggested by @kcw78)
@kcw78: So, I tried chunking as well. The following works well for small files (<100MB), but the computation time increases enormously when I work with GBs of data. Can something be improved in my code to make it fast?
My suspicion is for j loop is computationally expensive and may be the reason, any suggestions ?
filename = 'file.h5'
with h5py.File(filename, 'r') as fid:
chunks1 = fid["dataset_1"][:, :]
with h5py.File(filename, 'r') as fid:
chunks2 = fid["dataset_2"][:, :]
print(chunks1.shape, chunks2.shape) # shape is (13900,4) and (138676,4)
count = 0
W = 4000 # Window half-width
# **********************************************
# HDF5-Out Creation
# **********************************************
f_w = h5py.File('data.h5', 'w')
d1 = np.zeros(shape=(0, 4))
dset = f_w.create_dataset('dataset_1', data=d1, maxshape=(None, None), chunks=True)
# chunk size to read from first/second dataset
size1 = 34850
size2 = 34669
# save "n" no. of subtracted values in dset
n = 10**4
u = 0
fill_index = 0
for c in range(4): # read 4 chunks of dataset-1 one-by-one
h = c * size1
chunk1 = chunks1[h:(h + size1)]
for d in range(4): # read chunks of dataset-2
g = d * size2
chunk2 = chunks2[g:(g + size2)]
r2 = chunk2.shape[0]
left, right = 0, 0
for j in range(chunk1.shape[0]): # grab col.2 values from dataset-1
e1 = chunk1[j, 1]
while left < r2 and chunk2[left, 1] - e1 <= -W:
left += 1
# move right pointer so that is outside of +delta
while right < r2 and chunk2[right, 1] - e1 <= W:
right += 1
for i in range(left, right):
if chunk1[j, 0]<8193 and chunk2[i, 0] <8193:
e2 = chunk2[i, 1]
delta = e1 - e2 # subtract col.2 values
count += 1
if fill_index == (n):
dset.resize(dset.shape[0] + n, axis=0)
dset[u:(u + n), 0:4] = [count, e1, e1, delta]
u = u * n
fill_index = 0
fill_index += 1
del chunk2
del chunk1
print(count) # these are (no. of) subtracted values such that the difference is between +/- 4000
EDIT (Jul 31)
I tried reading in chunks and even using memory mapping. It is efficient if I do not perform any subtraction and just go through the chunks. The "for j in range(m):" loop is the one that is inefficient, probably because I am grabbing each value of the chunk from file-1. This is when I am just subtracting and not saving the difference. Can you think of any better logic/implementation to replace "for j in range(m):"?
size1 = 100_000_0
size2 = 100_000_0
filename = ["file-1.txt", "file-2.txt"]
chunks1 = pd.read_csv(filename[0], chunksize=size1,
names=['c1', 'c2', 'lt', 'rt'])
fp1 = np.memmap('newfile1.dat', dtype='float64', mode='w+', shape=(size1,4))
fp2 = np.memmap('newfile2.dat', dtype='float64', mode='w+', shape=(size2,4))
for chunk1 in chunks1: # grab chunks from file-1
m, _ = chunk1.shape
fp1[0:m,:] = chunk1
chunks2 = pd.read_csv(filename[1], chunksize=size2,
names=['ch', 'tmstp', 'lt', 'rt'])
for chunk2 in chunks2: # grab chunks from file-2
k, _ = chunk2.shape
fp2[0:k, :] = chunk2
for j in range(m): # Grabbing values from file-1's chunk
e1 = fp1[j,1]
delta_mat = e1 - fp2 # just a test, actually e1 should be subtracted from col-2 of fp2, not the whole fp2
count += 1
a += k
del chunks2
i += m
prog_count += m

Incremental PCA

I've never used the incremental PCA that exists in sklearn, and I'm a bit confused about its parameters and not able to find a good explanation of them.
I see that there is batch_size in the constructor, but also, when using partial_fit method you can again pass only a part of your data, I've found the following way:
# Feed the DataFrame to IncrementalPCA in consecutive row chunks.
n = df.shape[0]
chunk_size = 100000
ipca = IncrementalPCA(n_components=40, batch_size=1000)
# Stepping by chunk_size covers the trailing partial chunk as well and never
# passes an empty slice when n is an exact multiple of chunk_size.
# NOTE(review): a final chunk with fewer rows than n_components may still be
# rejected by partial_fit -- confirm for your data size.
for start in range(0, n, chunk_size):
    ipca.partial_fit(df[start:start + chunk_size].values)
Now, what I don't understand is the following - when using partial fit, does the batch_size play any role at all, or not? And how are they related?
Moreover, if both are considered, how should I change their values properly, when wanting to increase the precision while increasing memory footprint (and the other way around, decrease the memory consumption for the price of decreased accuracy)?
The docs say:
batch_size : int or None, (default=None)
The number of samples to use for each batch. Only used when calling fit...
This param is not used within partial_fit, where the batch-size is controlled by the user.
Bigger batches will increase memory-consumption, smaller ones will decrease it.
This is also written in the docs:
This algorithm has constant memory complexity, on the order of batch_size, enabling use of np.memmap files without loading the entire file into memory.
Despite some checks and parameter-heuristics, the whole fit-function looks like this:
for batch in gen_batches(n_samples, self.batch_size_):
self.partial_fit(X[batch], check_input=False)
Here is some incremental PCA code, which is an implementation of the CCIPCA method.
import scipy.sparse as sp
import numpy as np
from scipy import linalg as la
import scipy.sparse as sps
from sklearn import datasets
class CCIPCA:
    """Incremental PCA that is updated one sample at a time (CCIPCA).

    Call ``partial_fit`` once per sample, then ``post_process`` to sort and
    normalise the components.
    """

    def __init__(self, n_components, n_features, amnesic=2.0, copy=True):
        """Store the settings and allocate the running mean and components.

        Parameters
        ----------
        n_components : number of components to track.
        n_features : length of each sample vector.
        amnesic : controls the update weights used in ``partial_fit``.
        copy : stored on the instance; not used by the methods of this class.
        """
        self.n_components = n_components
        self.n_features = n_features
        self.copy = copy
        self.amnesic = amnesic
        self.iteration = 0
        # The np.float alias has been removed from NumPy; the builtin float
        # is the same dtype.
        self.mean_ = np.zeros([self.n_features], float)
        # NOTE(review): the divisor was lost from the post; this value is a
        # reconstruction -- confirm against the original implementation.
        self.components_ = np.ones((self.n_components, self.n_features)) / \
            (self.n_features * self.n_components)

    def partial_fit(self, u):
        """Update the running mean and the components with one sample ``u``."""
        n = float(self.iteration)
        V = self.components_

        # amnesic learning params
        if n <= int(self.amnesic):
            w1 = float(n + 2 - 1) / float(n + 2)
            w2 = float(1) / float(n + 2)
        else:
            w1 = float(n + 2 - self.amnesic) / float(n + 2)
            w2 = float(1 + self.amnesic) / float(n + 2)

        # update mean
        self.mean_ = w1 * self.mean_ + w2 * u

        # mean center u
        u = u - self.mean_

        # update components
        for j in range(0, self.n_components):
            if j > n:
                continue  # component j is first set at iteration j
            if j == n:
                V[j, :] = u
                continue
            # update the components
            V[j, :] = w1 * V[j, :] + w2 * np.dot(u, V[j, :]) * u / la.norm(V[j, :])
            normedV = V[j, :] / la.norm(V[j, :])
            normedV = normedV.reshape((self.n_features, 1))
            # remove the part of u along component j before the next one
            u = u - np.dot(np.dot(u, normedV), normedV.T)

        self.iteration += 1
        self.components_ = V / la.norm(V)

    def post_process(self):
        """Sort components by row norm and scale each row to unit length.

        Sets ``explained_variance_ratio_`` to the sorted row norms divided by
        their sum.
        """
        self.explained_variance_ratio_ = np.sqrt(np.sum(self.components_**2, axis=1))
        idx = np.argsort(-self.explained_variance_ratio_)
        self.explained_variance_ratio_ = self.explained_variance_ratio_[idx]
        self.components_ = self.components_[idx, :]
        self.explained_variance_ratio_ = (self.explained_variance_ratio_ /
                                          self.explained_variance_ratio_.sum())
        for r in range(0, self.components_.shape[0]):
            d = np.sqrt(np.dot(self.components_[r, :], self.components_[r, :]))
            self.components_[r, :] /= d
You can test it with
import pandas as pd
import ccipca

# Fit CCIPCA on the first four columns of iris.csv, one row at a time.
df = pd.read_csv('iris.csv')
df = np.array(df)[:, :4].astype(float)
pca = ccipca.CCIPCA(n_components=2, n_features=4)
print(df[0, :])
# Iterate over the rows actually present instead of a hard-coded count.
for row in df:
    pca.partial_fit(row)
The resulting eigenvectors / values will not be exactly the same as those from batch PCA. The results are approximate, but they are useful.
