I have a generator that yields a very large number of tuples (~100 million), and I need to compute the product (np.prod) of each tuple and then sum those products together.
I have the following example code, which works fine for a reasonable number of tuples in the generator but takes a lot of time when that number gets large. I am working on an instance with 64 cores and ~160GB of RAM, and I am looking for a way to optimize my code if possible.
import random
import numpy as np
import multiprocessing as mp
import time
# Sum the per-tuple products of a generator of random pairs, computing the
# products in a pool of worker processes, and report the elapsed time.
x = 1000000  # number of tuples produced by the generator


if __name__ == '__main__':
    # Guard the entry point: when worker processes are started by re-importing
    # this module (the "spawn" start method), unguarded module-level code
    # would try to create a new Pool inside every worker.
    nprocs = mp.cpu_count()
    mygen = ((random.randint(0, 100)/100, random.randint(0, 100)/100) for k in range(x))
    # The context manager shuts the workers down when the block exits, so the
    # pool does not outlive the computation.
    with mp.Pool(processes=nprocs) as pool:
        start = time.time()
        proba_all = sum(pool.map(np.prod, mygen))
        print(proba_all)
        end = time.time()
    print(end - start)
Related
I've run into some difficulties when using multiprocessing Pool in Python 3. I want to do a BIG array calculation using pool.map. Basically, I have a 3D array on which I need to run a computation 10 times, and it generates 10 output files sequentially. This task is repeated 3 times, i.e., in the output we get 3*10=30 output files (*.txt). To do this, I've prepared the following script for a small array calculation (a sample problem). However, when I use this script for a BIG array calculation, or for arrays that come from a series of files, this piece of code (maybe the pool) fills up the memory and does not save any .txt file to the destination directory. There is no error message when I run the file with the command mpirun python3 sample_prob_func.py
Can anybody suggest what the problem in the sample script is, and how to write the code so that it no longer gets stuck? I have not received any error message, and I don't know where the problem occurs. Any help is appreciated. Thanks!
import numpy as np
import multiprocessing as mp
from scipy import signal
import matplotlib.pyplot as plt
import contextlib
import os, glob, re
import random
import cmath, math
import time
import pdb
# Prefix prepended to the name of every output file that gets written.
save_results_to = 'File saving path'

# Two equal-length lists of values (presumably x/y positions -- confirm);
# N is the number of entries in arr_x.
arr_x = [0, 8.49, 0.0, -8.49, -12.0, -8.49, -0.0, 8.49, 12.0]
arr_y = [0, 8.49, 12.0, 8.49, 0.0, -8.49, -12.0, -8.49, -0.0]
N = len(arr_x)

# Seed the global NumPy generator so the three sample arrays are reproducible.
np.random.seed(12345)
total_rows = 5000
# Draw three independent blocks of total_rows*N uniform samples, in order,
# and lay each one out as a (total_rows, N) array.
arr, arr1, arr2 = (
    np.random.rand(total_rows * N).reshape(total_rows, N) for _ in range(3)
)
# Finding cross spectral density (CSD)
def my_func1(data):
    # Placeholder: the actual computation was left out of this sample.
    # NOTE(review): `array1` is never assigned in the visible code, so calling
    # this function as written raises NameError -- the body must be filled in.
    # Do something here
    return array1
# Run the CSD step once per input array and report the total wall-clock time.
t0 = time.time()
my_data1 = my_func1(arr)
my_data2 = my_func1(arr1)
my_data3 = my_func1(arr2)
print('Time required {} seconds to execute CSD--For loop'.format(time.time()-t0))
# One entry per input array, in order. my_data2 has to be listed here:
# repeating my_data3 in its place would process the third result twice and
# never use the second one.
mydata_list = [my_data1, my_data2, my_data3]
def my_func2(data2):
    # Placeholder: the actual per-item computation was left out of this sample.
    # NOTE(review): `from_data2` is never assigned in the visible code, so
    # calling this function as written raises NameError.
    # Do something here
    return from_data2


# Index range [start_freq, stop_freq) that my_func3 slices along the last axis.
start_freq = 100
stop_freq = 110
# 11 evenly spaced values from start_freq to stop_freq, divided by 10 and
# rounded to two decimals; these values appear in the output file names.
freq_range= np.around(np.linspace(start_freq,stop_freq,11)/10, decimals=2)
no_of_freq = len(freq_range)
# Rebound inside the processing loop to the slices returned by my_func3.
list_arr =[]
def my_func3(csd):
    """Collect the slices ``csd[:, :, k]`` for a fixed range of ``k``.

    ``k`` runs over ``range(start_freq, stop_freq)``, where both bounds are
    module-level values. The shape of the stacked slices is printed, and the
    slices are returned as a plain list in index order.
    """
    slices = [csd[:, :, k] for k in range(start_freq, stop_freq)]
    print('Shape of list is :', np.array(slices).shape)
    return slices
def parallel_function(BIG_list_data, processes=10):
    """Apply ``my_func2`` to every item of ``BIG_list_data`` in a process pool.

    Args:
        BIG_list_data: Iterable of work items; each item is passed to
            ``my_func2`` in a worker process.
        processes: Number of worker processes to start. Defaults to 10, the
            value this function used before it became configurable; pass the
            number of CPUs actually available to the job to avoid
            oversubscription.

    Returns:
        A NumPy array built from the list of per-item results. The shape of
        that array is also printed.
    """
    # contextlib.closing() calls pool.close() on exit, so no explicit close()
    # is needed before join().
    with contextlib.closing(mp.Pool(processes=processes)) as pool:
        dft = pool.map(my_func2, BIG_list_data)
    # Wait for the worker processes to exit before continuing.
    pool.join()
    data_arr = np.array(dft)
    print('shape of data :', data_arr.shape)
    return data_arr
# Counters used only to build the output file names.
count_day = 1
count_hour =0
# Process the three entries of mydata_list one after another.
for count in range(3):
    count_hour +=1
    list_arr = my_func3(mydata_list[count]) # Load Numpy files
    # NOTE(review): this prints the shape of the module-level `arr`, not of
    # the `list_arr` just produced -- confirm which one was intended.
    print('Array shape is :', np.array(arr).shape)
    t0 = time.time()
    data_dft = parallel_function(list_arr)
    print('The hour number={} data is processing... '.format(count_hour))
    print('Time in parallel:', time.time() - t0)
    # Write one text file per leading-axis slice of the pool result.
    for i in range(no_of_freq-1): # (11-1=10)
        jj = freq_range[i]
        #print('The hour_number {} and frequency number {} data is processing... '.format(count_hour, jj))
        dft_1hr_complx = data_dft[i,:,:]
        # The file name is appended directly to save_results_to (no path
        # separator is inserted). .view(float) reinterprets the slice's memory
        # as floats -- presumably to store complex values as real/imag pairs;
        # confirm the dtype returned by my_func2.
        np.savetxt(save_results_to + f'csd_Day_{count_day}_Hour_{count_hour}_f_{jj}_hz.txt', dft_1hr_complx.view(float))
As @JérômeRichard suggested, to make your script aware of what the job scheduler has allocated, you need to define the number of processors that will be used to perform this task. The following line could help you: ncpus = int(os.getenv('SLURM_CPUS_PER_TASK', 1))
You need to put this line inside your Python script. Also, inside parallel_function, use with contextlib.closing(mp.Pool(processes=ncpus)) as pool: instead of with contextlib.closing(mp.Pool(processes=10)) as pool:. Thanks
I have multiple threads that use a numpy array.
import threading
import numpy as np
import time
# Array handed to every call of run() below.
shared_array = np.ones((5, 5))


def run(shared_array, nb_iters):
    """Square the input into a new array, then add 2 to it ``nb_iters`` times.

    All arithmetic happens on a local result; the argument itself is left
    untouched and nothing is returned.
    """
    squared = shared_array ** 2
    for _ in range(nb_iters):
        squared += 2
def multi_thread():
    """Run ``run(shared_array, 1000000)`` in five threads and wait for them.

    All five threads are created first, then all are started, then all are
    joined.
    """
    workers = [
        threading.Thread(target=run, args=(shared_array, 1000000))
        for _ in range(5)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
# Time the threaded run. perf_counter() is used because it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals, whereas
# time.time() follows the system clock and can jump if that clock is adjusted.
t0 = time.perf_counter()
multi_thread()
print(time.perf_counter() - t0)
#result: 6.502177000045776
# Time the same total amount of work done sequentially in the main thread.
t0 = time.perf_counter()
# we used 1000000 iterations for each thread => total nb of iterations = 5 * 1000000
run(shared_array, 1000000 * 5)
print(time.perf_counter() - t0)
#result: 6.6372435092926025
The problem is that, after adding the numpy array as an argument, the execution time of 5 parallel threads is equal to that of a sequential execution!
So I want to know how to make a program (similar to this one) run in parallel.
That's a poor example. Python has an internal lock (the global interpreter lock, GIL) that means only one thread at a time can be executing Python code. When you go into numpy, that can run in parallel, but because your array is so small, you are spending almost no time in numpy, so you aren't getting any parallelism to speak of.
I have a simple pi-approximating script like so:
import numpy as np
import matplotlib.pyplot as plt
import time
# Bounds and stride of the sample counts to try, and the reference value the
# estimates are compared against.
start = 10
stop = 1000000
step = 100
exactsolution = np.pi


def montecarlopi(N=1000000):
    """Estimate pi from ``N`` random points drawn with ``np.random.random``.

    Draws ``N`` x-values and then ``N`` y-values, counts the points whose
    Euclidean norm is at most 1, and returns four times that count divided
    by ``N``.
    """
    xs = np.random.random(size=N)
    ys = np.random.random(size=N)
    # One row per point: column 0 holds x, column 1 holds y.
    points = np.array([xs, ys]).T
    inside = np.linalg.norm(points, axis=1) <= 1
    return 4 * np.count_nonzero(inside) / N
if __name__ == '__main__':
    # For every sample count, record the estimate and how long it took.
    times = []
    results = []
    attemps = np.arange(start=start, stop=stop, step=step)
    for n_points in attemps:
        # The smallest runs are very short, so use perf_counter(): it is the
        # monotonic, highest-resolution clock for measuring elapsed time,
        # while time.time() may be too coarse on some platforms.
        start_time = time.perf_counter()
        results.append(montecarlopi(n_points))
        times.append(time.perf_counter() - start_time)
    # Absolute deviation of each estimate from the reference value of pi.
    absolute_errors = np.abs(np.array(results) - exactsolution)
and I want to know how long the calculation takes as a function of the number of random attempts I use. As you can see, I use a for loop to get each of the calculation times I need, but this defeats the purpose of NumPy and slows down my code a lot. Ideally, I'd like to just call montecarlopi() on the whole attemps array, but then I wouldn't have the calculation times.
Is there a way to time each parallelized calculation NumPy does?
I used the timing code from the answer provided here:
https://codereview.stackexchange.com/questions/165245/plot-timings-for-a-range-of-inputs
I only had to change labels to codes in the line:
empty_multi_index = pd.MultiIndex(levels=[[], []], codes=[[], []], names=['func', 'result'])
Timing linear
Then you can run your whole timing experiment using
timings.plot_times([montecarlopi], inputs=np.arange(10, 1000000, 1000), repeats=3)
And get an output like this
Timing Logspace
Or more clear using logspacing
# Use the builtin int as dtype: the np.int alias has been removed from NumPy.
timings.plot_times([montecarlopi], inputs=np.logspace(1, 8, 8, dtype=int), repeats=3)
I am new to pyspark. I have been trying to multiply two sparse RDDs. The code which I have tried generates two sparse matrices, and I have written a function to multiply the two RDDs, but I think this is not the right solution because the computation does not occur in parallel. Can someone help me with it? How can I multiply the RDDs in parallel? I looked through a lot of resources online but could not come up with a solution.
import findspark
findspark.init()
import numpy as np
import pyspark
import random
from scipy.sparse import rand
# Create the Spark context used for the rest of the script.
sc = pyspark.SparkContext(appName="matrix")
np.random.seed(42)
n=4
# Two random n-by-n sparse matrices with density 0.25.
x = rand(n, n, density=0.25)
y = rand(n, n, density=0.25)
# Dense copies of the sparse matrices; these are what gets handed to Spark.
A = x.A
B = y.A
# Wrap each dense matrix in an RDD.
rdd_x = sc.parallelize(A)
rdd_y = sc.parallelize(B)
def multiply(r1, r2):
    """Collect both inputs and combine them column by column.

    With ``A = r1.collect()`` and ``B = r2.collect()``, entry ``i`` of the
    returned list is ``sum(A[j] * B[j][i] for j in range(len(A)))``, for
    every ``i`` in ``range(len(B[0]))``.

    NOTE(review): each term uses the whole element ``A[j]`` rather than a
    single entry ``A[k][j]``, so this is not the textbook row-by-column
    matrix product -- confirm the intended operation. All arithmetic runs
    locally after ``collect()``.
    """
    left = r1.collect()
    right = r2.collect()
    return [
        sum(left[j] * right[j][i] for j in range(len(left)))
        for i in range(len(right[0]))
    ]
# Combine the two different operands: rdd_y is built for this call, and
# passing rdd_x twice would leave it unused.
C = multiply(rdd_x, rdd_y)
print(C)
# Shut the Spark context down once the result has been printed.
sc.stop()
If you're using collect() anyway, you might as well use np.multiply():
C = np.multiply(np.array(rdd_x.collect()), np.array(rdd_y.collect()))
Or if you want a dot product, you can use np.dot():
C = np.dot(np.array(rdd_x.collect()), np.array(rdd_y.collect()))
I'm reading a video file with OpenCV and storing its frames in a list. I then pass this list to a face detection function, which in turn stores the face locations in another list. The problem is that when I give an equal number of frames to the multiprocessing code and to the single-process code, the performance is not very different. Please check my code and suggest a possible solution. I am using Python 3.5, and the number of CPU cores is 4. The multiprocessing code is supposed to give almost 4 times the performance, but it only gains a few seconds.
My code:
import cv2,time,dlib,imutils
from multiprocessing import Pool
# Face detector shared by the sequential code and the pool workers.
detector = dlib.get_frontal_face_detector()
# The path has to be a single-line literal: a single-quoted string cannot
# contain a raw line break, so a path split across two lines is a syntax error.
vidcap = cv2.VideoCapture(r'/home/deeplearning/PycharmProjects/sjtech/jurassic_park_intro.mp4')
count = 0  # number of frames read so far
frame_list = []  # frames in the order they were read
def parallel_detection(f):
    # Function handed to Pool.map: runs the module-level detector on one frame
    # and returns whatever the detector returns.
    # NOTE(review): the second argument (1) is presumably dlib's upsampling
    # count -- confirm against the dlib documentation.
    return detector(f,1)
# Read frames until vidcap.read() reports failure, keeping every frame in memory.
success,image = vidcap.read()
while success:
    print('Read a new frame: ', success)
    frame_list.append(image)
    count += 1
    success,image = vidcap.read()
# NOTE(review): only successfully read frames are appended above, so this
# removes the last good frame (and raises IndexError if no frame was read)
# -- confirm it is intended.
del frame_list[-1]
print("out of while")
# Benchmark the same 40 frames with a process pool and then sequentially.
# Each variant gets its own timer: a single timer around both would report
# the sum of the two runs and make them impossible to compare.
frames = frame_list[900:940]

# The context manager shuts the worker processes down once the map is done.
with Pool() as p:
    t1 = time.time()
    #below is my multiprocessing code, on 40 frames it takes 42 seconds
    face_location = p.map(parallel_detection, frames)
    print('multiprocessing:', time.time() - t1)

t1 = time.time()
#below is single processing code, it takes 50 seconds
face_location = [detector(frame, 1) for frame in frames]
print('single process:', time.time() - t1)