Multiplying two RDD in pyspark - apache-spark

I am new to PySpark. I have been trying to multiply two sparse RDDs. The code I have tried generates two sparse matrices, and I have written a function to multiply them, but I don't think this is the right solution because the computation does not happen in parallel. Can someone help me with it? How can I multiply the RDDs in parallel? I went through a lot of resources but could not come up with a solution.
import findspark
findspark.init()
import numpy as np
import pyspark
from scipy.sparse import rand

sc = pyspark.SparkContext(appName="matrix")
np.random.seed(42)

n = 4
x = rand(n, n, density=0.25)
y = rand(n, n, density=0.25)
A = x.A  # dense numpy array for x
B = y.A  # dense numpy array for y
rdd_x = sc.parallelize(A)  # one matrix row per RDD element
rdd_y = sc.parallelize(B)

def multiply(r1, r2):
    # This collects both RDDs back to the driver, so the actual
    # multiplication runs locally, not in parallel on the cluster.
    A = r1.collect()
    B = r2.collect()
    result = []
    for i in range(len(B[0])):
        total = 0
        for j in range(len(A)):
            total += A[j] * B[j][i]
        result.append(total)
    return result

C = multiply(rdd_x, rdd_y)
print(C)
sc.stop()

If you're using collect() anyway, you might as well use np.multiply():
C = np.multiply(np.array(rdd_x.collect()), np.array(rdd_y.collect()))
Or if you want a dot product, you can use np.dot():
C = np.dot(np.array(rdd_x.collect()), np.array(rdd_y.collect()))
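If you want the multiplication to actually happen on the cluster rather than on the driver, one option is pyspark.mllib's distributed matrix types. The following is a minimal sketch (not part of the original answer) that wraps the rows from the question in an IndexedRowMatrix, converts to a BlockMatrix, and multiplies in a distributed fashion:
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
# Build distributed row matrices from the dense rows A and B created above.
rows_x = sc.parallelize([IndexedRow(i, A[i]) for i in range(n)])
rows_y = sc.parallelize([IndexedRow(i, B[i]) for i in range(n)])
bm_x = IndexedRowMatrix(rows_x).toBlockMatrix()
bm_y = IndexedRowMatrix(rows_y).toBlockMatrix()
# BlockMatrix.multiply runs the matrix product as a distributed Spark job.
product = bm_x.multiply(bm_y)
# Only the (small) result matrix is brought back to the driver.
C = product.toLocalMatrix().toArray()
print(C)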

Related

How to scatter/send all possible column pairs to the child processes and find coherence between the columns using python mpi4py? Parallel computation

I have a big matrix/2D array for which I need to compute the coherence of every possible column pair by parallel computation in Python (e.g. with mpi4py). The coherence [a function] is computed in the child processes, and each child process should send its coherence value to the parent process, which gathers the values into a list. To do this, I've created a small matrix and a list of all possible column pairs as follows:
import numpy as np
from scipy import signal
from itertools import combinations
from mpi4py import MPI

comm = MPI.COMM_WORLD
nproc = comm.Get_size()
rank = comm.Get_rank()

data = np.arange(20).reshape(5, 4)
# List of all possible column pairs
data_col = list(combinations(np.transpose(data), 2))  # list

# Function creation
def myFunc(X, Y):
    ..................
    ..................
    return Real_coh

if rank == 0:
    Data = comm.scatter(data_col, root=0)  # col_pair
Can anyone suggest how to proceed from here? You are welcome to ask any questions/clarifications. Thanks.
Check out the following script [with comm.Barrier for synchronized communication]. In the script, I write and read the data as a chunked h5py dataset, which is memory efficient.
import numpy as np
from scipy import signal
from mpi4py import MPI
import h5py as t

chunk_len = 5000  # No. of rows of the matrix
num_c = 34        # No. of columns of the matrix

# Actual dataset
data_mat = np.random.random((10000, num_c))
shape = (chunk_len, data_mat.shape[1])
chunk_size = (chunk_len, 1)
no_of_chunks = data_mat.shape[1]
with t.File('file_name.h5', 'w') as hf:
    hf.create_dataset("chunked_arr", data=data_mat, chunks=chunk_size, compression='lzf')
del data_mat

def myFunc(dset_X, dset_Y):
    ..............
    ............
    return Real_coh

res = np.zeros((num_c, num_c))

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

for i in range(num_c):
    with t.File('file_name.h5', 'r', libver='latest') as hf:
        dset_X = hf['chunked_arr'][:, i]  # Chunk data reading
    if i % size == rank:
        for j in range(num_c):
            with t.File('file_name.h5', 'r', libver='latest') as hf:
                dset_Y = hf['chunked_arr'][:, j]  # Chunk data reading
            res[i][j] = myFunc(dset_X, dset_Y)  # coherence for this column pair
comm.Barrier()
print('Shape of final result :', res.shape)
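Note that with i % size == rank each rank only fills the rows it owns, so the per-rank res matrices still need to be combined at the end. A minimal sketch of one way to do that with mpi4py's Reduce (an addition to the script above, reusing res, comm and rank as defined there):
# Sum the per-rank partial result matrices into `total` on rank 0.
total = np.zeros_like(res)
comm.Reduce(res, total, op=MPI.SUM, root=0)
if rank == 0:
    print('Combined result shape:', total.shape)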

Product of tuples in generator

I have a generator that yields millions of tuples (~100 million), and I need the product (np.prod) of each tuple so that I can then sum all the products together.
The following example code works fine for a reasonable number of tuples in the generator, but takes a lot of time when the number gets large. I am working on an instance with 64 cores and ~160 GB of RAM and am looking for a way to optimize my code if possible.
import random
import numpy as np
import multiprocessing as mp
import time
nprocs = mp.cpu_count()
pool = mp.Pool(processes=nprocs)
x = 1000000
mygen = ((random.randint(0, 100)/100, random.randint(0, 100)/100 ) for k in range(x))
start = time.time()
proba_all = sum(pool.map(np.prod, mygen))
print(proba_all)
end = time.time()
print (end-start)
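One simple thing to try before scaling out further (a sketch, not the original poster's solution, assuming each tuple really is this small): the per-item work is tiny, so the serialization cost of pool.map and the overhead of calling np.prod on a 2-tuple dominate, and a single pass with math.prod often ends up faster.
import math
import random
import time

x = 1000000
mygen = ((random.randint(0, 100)/100, random.randint(0, 100)/100) for k in range(x))

start = time.time()
# math.prod on each tiny tuple avoids both the NumPy call overhead and
# the inter-process serialization cost of pool.map.
proba_all = sum(math.prod(t) for t in mygen)
print(proba_all)
print(time.time() - start)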

Can I use multiple processes to read different subsets of a numpy array (or pandas dataframe) safely?

I want to use multiple processes to work on every 2-column combination of a numpy array (or pandas dataframe), such as array[:, 1:3] and array[:, 2:4].
I wonder whether it is safe to read array[:, 1:3] in one process and array[:, 2:4] in another process.
The example code is shown:
import time
import numpy as np
import pandas as pd
from itertools import combinations
from multiprocessing import Pool, Value, Lock, Array

g = np.load('input.npy')
c = Value('i', 0, lock=True)

def count_valid_pairs(i):
    global c
    pair = g[:, i]  # i is a tuple of two column indices
    if pair.max() > 100:
        with c.get_lock():
            c.value += 1
    return

if __name__ == '__main__':
    t_start = time.time()
    cpus = 20
    p = Pool(processes=cpus)
    r = p.imap_unordered(count_valid_pairs, combinations(range(g.shape[1]), 2))
    p.close()
    p.join()
    print("Total {} pairs have max value > 100".format(c.value))

Calculation of distance matrix in a faster approach

I have a dataframe
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd
a = {'b':['cat','bat','cat','cat','bat','No Data','bat','No Data']}
df11 = pd.DataFrame(a,index=['x1','x2','x3','x4','x5','x6','x7','x8'])
and I have a distance function
def distancemetric(x):
    list1 = x['b'].tolist()
    result11 = []
    sortlist11 = [process.extract(ele, list1, limit=11000000, scorer=fuzz.token_set_ratio) for ele in list1]
    d11 = [dict(element) for element in sortlist11]
    finale11 = [(k, element123[k]) for k in list1 for element123 in d11]
    result11.extend([x[1] for x in finale11])
    final_result11 = np.reshape(result11, (len(x.index), len(x.index)))
    return final_result11
I call the function with
values1 = distancemetric(df11)
Here the token_set_ratio method compares only two strings. When I pass an array of strings it gives me an average, which I don't need.
This code works, but it is slow. Is there any way to make it run faster?
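One way to speed this up (a sketch of the idea, not an accepted answer): score each pair directly with fuzz.token_set_ratio and fill both halves of the symmetric matrix, which skips the sorting and dictionary work process.extract does for every element. Caching scores for duplicate strings (the example has only three unique values) would cut the work further.
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz

a = {'b': ['cat', 'bat', 'cat', 'cat', 'bat', 'No Data', 'bat', 'No Data']}
df11 = pd.DataFrame(a, index=['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8'])

def distancemetric_fast(x):
    items = x['b'].tolist()
    n = len(items)
    out = np.zeros((n, n), dtype=int)
    for i in range(n):
        for j in range(i, n):
            # token_set_ratio is symmetric, so each pair is scored only once.
            score = fuzz.token_set_ratio(items[i], items[j])
            out[i, j] = out[j, i] = score
    return out

values1 = distancemetric_fast(df11)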

Submit looping calculation to dask and get back the result

My co-worker and I have been setting up, configuring, and testing Dask for a week or so now, and everything is working great (can't speak highly enough about how easy, straightforward, and powerful it is), but now we are trying to leverage it for more than just testing and are running into an issue. We believe it's a fairly simple one related to syntax and an understanding gap. Any help to get it running is greatly appreciated. Any support in evolving our understanding of more optimal paths is also greatly appreciated.
We got fairly close with these two posts:
Dask: How would I parallelize my code with dask delayed?
Unpacking result of delayed function
High level flow:
Open data in pandas & clean it (we plan on moving this to a pipeline)
From there, convert the cleaned data set for regression into a dask data frame
Set the x & y variables and create all unique x combination sets
Create all unique formulas (y ~ x1 + x2 +0)
Run each individual formula set with the data through a linear lasso lars model to get the AIC for each formula for ranking
Current Issue:
Run each individual formula set (~1700 formulas) with the data (1 single data set which doesn’t vary with each run) on the dask cluster and get the results back
Optimize the calculation & return the final data
Code:
# In[]
# Imports:
import logging as log
import datetime as dat
from itertools import combinations
import numpy as np
import pandas as pd
from patsy import dmatrices
import sklearn as sk
from sklearn.linear_model import LogisticRegression, SGDClassifier, LinearRegression
import dask as dask
import dask.dataframe as dk
from dask.distributed import Client
# In[]
# logging, set the dask client, open & clean the data, pass into a dask dataframe
log.basicConfig(level=log.INFO,
                format='%(asctime)s %(message)s',
                datefmt="%m-%d %H:%M:%S")
c = Client('ip:port')
ST = dat.datetime.now()
data_pd = pd.read_csv('some.txt', sep="\t")
#fill some na/clean up the data a bit
data_pd['V9'] = data_pd.V9.fillna("Declined")
data_pd['y'] = data_pd.y.fillna(0)
data_pd['x1'] = data_pd.x1.fillna(0)
#output the clean data and re-import into dask; we could also use from_pandas to get to dask dataframes
data_pd.to_csv('clean_rr_cp.csv')
data = dk.read_csv(r'C:\path\*.csv', sep=",")
# set x & y variables - the below is truncated
y_var = "y"
x_var = ['x1',
'x2',
'x3',
'x4',......
#list of all variables
all_var = list(y_var) + x_var
#all unique combinations
x_var_combos = [combos for combos in combinations(x_var,2)]
#add single variables for testing as well
for i in x_var:
    x_var_combos.append((i, ""))
# create formulas from our y, x variables
def formula(y_var, combo):
    combo_len = len(combo)
    if combo_len == 2:
        formula = y_var + "~" + combo[0] + "+" + combo[1] + "+0"
    else:
        formula = y_var + "~" + combo[0] + "+0"
    return formula
@dask.delayed
def model_aic(dt, formula):
    k = 2
    y_df, x_df = dmatrices(formula, dt, return_type='dataframe')
    y_df = np.ravel(y_df)
    log.info('dmatrices successful')
    LL_model = sk.linear_model.LassoLarsIC(max_iter=100)
    AIC_Value = min(LL_model.fit(x_df, y_df).criterion_) + ((2*(k**2) + 2*k) / (len(x_df) - k - 1))
    log.info('AIC_Value: %s', AIC_Value)
    oup = [formula, AIC_Value, len(dt) - AIC_Value]
    return oup
# ----------------- here's where we're stuck ---------------------
# ----------------- we think this is correct ----------------------
# ----------------- create a list of all formula to execute -------
# In[]
out = []
for i in x_var_combos:
    var = model_aic(data, formula(y_var, i))
    out.append(var)
# ----------------- but we're stuck figuring out how to -----------
# ------------------make it compute & return the result -----------
ans = c.compute(*out)
ans2 = c.compute(out[1])
print (ans2)
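A minimal sketch of one way to make the loop's output compute and come back (our reading of the distributed Client API, assuming model_aic is decorated with @dask.delayed as above, not the final working code): pass the whole list of delayed calls to client.compute, which returns one future per task, then gather the finished results.
futures = c.compute(out)              # one future per delayed model_aic call
results = c.gather(futures)           # blocks until all formulas have run
results.sort(key=lambda row: row[1])  # rank formulas by AIC value
print(results[:10])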
