Function not working in Python Multiprocessing Pool - python-3.x

This is my code, which works but is a little too slow.
def fsl(Df, p):
    NewDf = Df.copy()
    if p == 'group1':
        try:
            del NewDf['sl']
            del NewDf['Y2030']
        except:
            pass
    selected_clusters = NewDf.loc[(NewDf['Group'] == p) & (NewDf['Selected'] == 'Y'), 'Clusters'].tolist()
    for i in selected_clusters:
        x = 0
        surplus = calc_surplus(x, i, p)
        while (surplus > 0) and (x < 100000):
            x += 100
            surplus = calc_surplus(x, i, p)
        NewDf.loc[(NewDf['Clusters'] == i) & (NewDf['Group'] == p), 'sl'] = x
    if p == 'group1':
        NewDf['sl'] = NewDf['sl'].fillna(0)
    return NewDf
I would like the surplus for each cluster in selected_clusters to be calculated in parallel to speed up the process.
I moved that code into a new function and tried to run it with multiprocessing.Pool like this:
def parallel(i):
    x = 0
    surplus = calc_surplus(x, i, p)
    while (surplus > 0) and (x < 100000):
        x += 100
        surplus = calc_surplus(x, i, p)
    NewDf.loc[(NewDf['Clusters'] == i) & (NewDf['Group'] == p), 'sl'] = x
    if p == 'group1':
        NewDf['sl'] = NewDf['sl'].fillna(0)

def fsl(Df, p):
    NewDf = Df.copy()
    if p == 'group1':
        try:
            del NewDf['sl']
            del NewDf['Y2030']
        except:
            pass
    selected_clusters = NewDf.loc[(NewDf['Group'] == p) & (NewDf['Selected'] == 'Y'), 'Clusters'].tolist()
    if __name__ == '__main__':
        with Pool(4) as pool:
            pool.map(parallel, [i for i in selected_clusters])
    return NewDf
The problem is, the function parallel never runs when the function fsl is called. The column sl is never created. I think the mistake is in pool.map or pool.starmap, but I really can't seem to solve this.
I have seen other threads on multiprocessing, but most of them don't quite apply to this. What is wrong with my code?
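Two things stand out here (what follows is a sketch under stated assumptions, not a verified fix). First, the `if __name__ == '__main__':` check inside fsl is only true when that file is run directly, so the Pool block may be skipped entirely; the guard belongs around the top-level call, not inside the function. Second, even when the pool does run, each worker process operates on its own copy of the parent's data (and p and NewDf are not defined inside parallel at all), so assignments made in the workers never reach the parent's NewDf. The usual pattern is to have the worker return its result and let the parent write it into the DataFrame. A minimal sketch, assuming calc_surplus is defined at module level so the workers can import it:

from functools import partial
from multiprocessing import Pool

def parallel(p, i):
    # compute the break-even x for one cluster and return it;
    # mutating a DataFrame here would only change this worker's copy
    x = 0
    surplus = calc_surplus(x, i, p)
    while (surplus > 0) and (x < 100000):
        x += 100
        surplus = calc_surplus(x, i, p)
    return i, x

def fsl(Df, p):
    NewDf = Df.copy()
    if p == 'group1':
        try:
            del NewDf['sl']
            del NewDf['Y2030']
        except KeyError:
            pass
    selected_clusters = NewDf.loc[(NewDf['Group'] == p) & (NewDf['Selected'] == 'Y'), 'Clusters'].tolist()
    with Pool(4) as pool:
        # partial binds p, so the pool passes each worker just a cluster id
        for i, x in pool.map(partial(parallel, p), selected_clusters):
            NewDf.loc[(NewDf['Clusters'] == i) & (NewDf['Group'] == p), 'sl'] = x
    if p == 'group1':
        NewDf['sl'] = NewDf['sl'].fillna(0)
    return NewDf

# the guard goes around the call, at the top level of the script:
# if __name__ == '__main__':
#     result = fsl(Df, 'group1')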

Related

I have some code that takes under 2 seconds to run 100 iterations, under 8 seconds to run 1,000, and over 11 minutes to run 10,000

I'm a hobbyist programmer, and this is just a little project I set for myself. I know I very likely have something in this code that is inefficient enough not to matter for small loops but that compounds when I scale it up. Any suggestions would be appreciated.
def RndSelection(ProjMatrix):
    percentiles = [0, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 99]
    results = []
    for row in ProjMatrix.itertuples():
        x = npr.randint(1, 100)
        for p in range(3, 16):
            if p < 15:
                a = percentiles[p - 3]
                b = percentiles[p - 2]
                if x in range(a, b):
                    factor = (b - x) / (b - a)
                    r = round((row[p] * factor) + ((row[p + 1]) * (1 - factor)), 2)
                    break
            else:
                r = row[p]
        results.append(r)
    thisrun = pd.DataFrame(results)
    return thisrun

def main():
    ts = datetime.datetime.now()
    print('Run Started: ', ts)
    Matrix = SetMatrix()
    Outcome = Matrix['player_id']
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = [executor.submit(RndSelection, Matrix) for _ in range(10000)]
        for f in concurrent.futures.as_completed(results):
            thisrun = f.result()
            Outcome = pd.concat([Outcome, thisrun], axis=1)
    print(Outcome)
    ts = datetime.datetime.now()
    print('Run Completed: ', ts)

if __name__ == '__main__':
    main()
So the answer, as Jérôme pointed out, was the concat inside the loop: each call to pd.concat copies the ever-growing Outcome frame, so the total cost grows quadratically with the number of runs.
Collecting the output in a list of lists and concatenating just once improved the runtime to 8 seconds for 10,000 iterations and 2 minutes, 34 seconds for 100,000 iterations.
def RndSelection(ProjMatrix):
    percentiles = [0, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 99]
    results = []
    r = ""
    for row in ProjMatrix.itertuples():
        x = npr.randint(1, 100)
        for p in range(3, 16):
            if p < 15:
                a = percentiles[p - 3]
                b = percentiles[p - 2]
                if x in range(a, b):
                    factor = (b - x) / (b - a)
                    r = round((row[p] * factor) + ((row[p + 1]) * (1 - factor)), 2)
                    break
            else:
                r = row[p]
        results.append(r)
    return results

def main():
    ts = datetime.datetime.now()
    print('Run Started: ', ts)
    Matrix = SetMatrix()
    runs = 100000
    s = 0
    Outcome = pd.DataFrame(Matrix['player_id'])
    thisrun = np.empty((runs, 0)).tolist()
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = [executor.submit(RndSelection, Matrix) for _ in range(runs)]
        for f in concurrent.futures.as_completed(results):
            thisrun[s] = f.result()
            s += 1
    allruns = pd.DataFrame(thisrun).transpose()
    Outcome = pd.concat([Outcome, allruns], axis=1)
    ts = datetime.datetime.now()
    print('Run Completed: ', ts)

if __name__ == '__main__':
    main()

Problem with LSH implementation from Datasketch (size of input data > 150000)

I am a beginner data scientist, trying to write a fast duplicate search using the LSH implementation from datasketch. When I run my program on a large input text (number of docs > 250000), step 1 is fine, but then the program hangs at step 2. When I run the program on a small input, everything works fine. Is there any way to fix this problem?
def LSH(data, num_perm=128, threshold=0.5, check_const=0.9):
    vec_unig = CountVectorizer(min_df=50, analyzer='word', stop_words=['_dot_', '_comma_', '_voskl_'], ngram_range=(1, 2))
    X = vec_unig.fit_transform([" ".join(i) for i in data])
    length = X.shape[0]
    array1 = []
    print("Collection:", length)
    print("Step 1:")
    print("Form Minhash")
    start = datetime.now()
    for i in range(len(data)):
        print(i)
        m = MinHash(num_perm=num_perm)
        for d in data[i]:
            m.update(d.encode('utf8'))
        array1.append(m)
    print(datetime.now() - start)
    print("Step 2")
    print("Form potential clusters")
    start = datetime.now()
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    for i in range(len(array1)):
        if (i % 100) == 0:
            print(i)
        lsh.insert(i, array1[i])
    print(datetime.now() - start)
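One cheap way to tell whether step 2 has actually deadlocked or has merely slowed to a crawl is to time the insertions in batches; a per-batch time that keeps climbing suggests memory pressure as the index grows rather than a hang. A rough sketch, using only the calls already present above:

from datetime import datetime

# time lsh.insert in batches of 1000; if each batch takes longer than
# the last, insertion is slowing down progressively, not deadlocked
batch_start = datetime.now()
for i in range(len(array1)):
    lsh.insert(i, array1[i])
    if (i % 1000) == 999:
        print(i + 1, datetime.now() - batch_start)
        batch_start = datetime.now()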

Why is getting the first 30 keys of the dictionary in two statements faster than one statement?

I was doing a benchmark for myself when I encountered this interesting thing. I am trying to get the first 30 keys of a dictionary, and I have written three ways to do it, as follows:
import time
dic = {str(i): i for i in range(10 ** 6)}
start_time = time.time()
x = list(dic.keys())[0:30]
print(time.time() - start_time)
start_time = time.time()
y = list(dic.keys())
x = y[0:30]
print(time.time() - start_time)
start_time = time.time()
z = dic.keys()
y = list(z)
x = y[0:30]
print(time.time() - start_time)
The results are:
0.015970945358276367
0.010970354080200195
0.01691460609436035
Surprisingly, the second method is much faster! Any thoughts on this?
Here is Python's timeit module used to measure the various alternatives. I added mine (f4), which doesn't convert the keys to a list:
from timeit import timeit

dic = {str(i): i for i in range(10 ** 6)}

def f1():
    x = list(dic.keys())[0:30]
    return x

def f2():
    y = list(dic.keys())
    x = y[0:30]
    return x

def f3():
    z = dic.keys()
    y = list(z)
    x = y[0:30]
    return x

def f4():
    x = [k for _, k in zip(range(30), dic.keys())]
    return x

t1 = timeit(lambda: f1(), number=10)
t2 = timeit(lambda: f2(), number=10)
t3 = timeit(lambda: f3(), number=10)
t4 = timeit(lambda: f4(), number=10)

print(t1)
print(t2)
print(t3)
print(t4)
Prints:
0.1911074290110264
0.20418328599771485
0.18727918600779958
3.5186996683478355e-05
Maybe this is due to inaccuracies in your measurement of time. You can use timeit for this kind of thing:
import timeit

dic = {str(i): i for i in range(10 ** 6)}

# 27.5125 / 29.0836 / 26.8525
timeit.timeit("x = list(dic.keys())[0:30]", number=1000, globals={"dic": dic})

# 28.6648 / 26.4684 / 30.9534
timeit.timeit("y = list(dic.keys()); x = y[0:30]", number=1000, globals={"dic": dic})

# 31.7345 / 29.5301 / 30.7541
timeit.timeit("z = dic.keys(); y = list(z); x = y[0:30]", number=1000, globals={"dic": dic})
The comments show the times I got when running the same code three different times. As you can see, even with a large number of repetitions, you can get quite large variations in the measured time. This can be due to several different things:
An item may or may not be in your processor's cache.
Your processor may be busy with other things.
Etc.
As stated by @Andrej Kesely, your bottleneck is the fact that you cast the dictionary keys into a list. By doing so, Python walks through every key, because that is how it builds a list from an iterable. By avoiding that full conversion, you can get much better results.
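For instance, itertools.islice pulls only the first 30 keys and never touches the rest, which is essentially what f4 above does with zip. A minimal sketch:

from itertools import islice

dic = {str(i): i for i in range(10 ** 6)}

# stops after 30 keys instead of materializing all 10**6 of them
x = list(islice(dic.keys(), 30))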

Get output from functions which are run in parallel

I have two functions which I am running asynchronously using Python multiprocessing. I need to store the values from each function in an array, to use later.
I have tried to store the output in a numpy array inside each function and return it. However, I am not getting any output. The print statements work fine.
x = np.array([])
y = np.array([])

def func1():
    for _ in range(1, 150):
        print((rstr.xeger(random.choice(sList))))
        np.append(x, rstr.xeger(random.choice(sList)))
    return x

def func2():
    for _ in range(1, 150):
        print(fake.name())
        np.append(y, fake.name())
    return y

if __name__ == '__main__':
    p1 = multiprocessing.Pool(5)
    p2 = multiprocessing.Pool(5)
    print("Started")
    start = time.time()
    x = p1.apply_async(func1)
    y = p2.apply_async(func2)
    print(time.time() - start)
    print("Ended")
I need the arrays x and y which are being returned from the two functions.
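Two things are likely going wrong here (what follows is a sketch, not a verified fix). apply_async returns an AsyncResult immediately; the actual return values have to be fetched with .get(), which this code never does. Separately, np.append returns a new array rather than modifying x or y in place, so those appends are discarded, and in any case each worker process only ever mutates its own copy of the globals. A minimal sketch that builds the values in local lists and fetches them in the parent, reusing the names from the question (rstr, random, sList, and fake are assumed to be imported and defined as in the original setup):

import multiprocessing

def func1():
    # build the values locally and return them; appending to a
    # module-level array inside a worker is never seen by the parent
    return [rstr.xeger(random.choice(sList)) for _ in range(1, 150)]

def func2():
    return [fake.name() for _ in range(1, 150)]

if __name__ == '__main__':
    with multiprocessing.Pool(5) as pool:
        res1 = pool.apply_async(func1)
        res2 = pool.apply_async(func2)
        x = res1.get()  # blocks until func1 has finished in its worker
        y = res2.get()

A single pool is enough here; apply_async already runs the two functions concurrently in separate workers.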

Python OO, self calling other functions inside class?

I am new to object-oriented concepts, and I've tried solving this problem using the OO technique. I have solved it using a normal programming technique, but I can't get it to work with the OO technique.
Here is the problem:
https://www.hackerrank.com/challenges/30-nested-logic?utm_campaign=30_days_of_code_continuous&utm_medium=email&utm_source=daily_reminder
What I've tried:
At first, I only called student1.print(). That didn't work, so I also called parseDates() and calculateFine().
I put self on all the variables in my student class, as I fail to truly understand why or how self works.
Apologies if I labeled the title incorrectly, but I didn't know what else to write, as I am not certain what exactly the problem in my code is.
class getFine():
    def __init__(self, expectedDate, actualDate):
        self.expectedDate = expectedDate
        self.actualDate = actualDate

    def parseDates(self):
        self.ya = self.actualDate[0]
        self.ma = self.actualDate[1]
        self.da = self.actualDate[0]
        self.ye = self.expectedDate[0]
        self.me = self.expectedDate[1]
        self.de = self.expectedDate[2]

    def calculateFine(self):
        self.fine = 0
        if self.ya > self.ye:
            self.fine = 10000
        elif self.ya == self.ye:
            if self.ma > self.me:
                self.fine = 500 * (self.ma - self.me)
            elif (self.ma == self.me) and (self.da > self.de):
                self.fine = 15 * (self.da - self.de)

    def print(self):
        print(self.fine)

def main():
    expectedDate = str(input().split(" "))
    actualDate = str(input().split(" "))
    student1 = getFine(expectedDate, actualDate)
    student1.parseDates()
    student1.calculateFine()
    student1.print()

if __name__ == "__main__":
    main()
Your dates are strings, which you then want to subtract from each other. First convert them to integers or floats using something like this:
expectedDate = [int(i) for i in (input().split(" "))]
actualDate = [int(i) for i in (input().split(" "))]
Does this solve your problem?
If you want to call only the getFine.print() method and not the other methods, you can call those methods from inside getFine.print(). And since you probably want the year, month, and day separated whenever any method is called, you can move that part to the __init__ method:
class getFine():
    def __init__(self, expectedDate, actualDate):
        self.expectedDate = expectedDate
        self.actualDate = actualDate
        self.ya = self.actualDate[0]
        self.ma = self.actualDate[1]
        self.da = self.actualDate[2]  # typo here: 0 --> 2
        self.ye = self.expectedDate[0]
        self.me = self.expectedDate[1]
        self.de = self.expectedDate[2]

    def calculateFine(self):
        self.fine = 0
        if self.ya > self.ye:
            self.fine = 10000
        elif self.ya == self.ye:
            if self.ma > self.me:
                self.fine = 500 * (self.ma - self.me)
            elif (self.ma == self.me) and (self.da > self.de):
                self.fine = 15 * (self.da - self.de)

    def print(self):
        self.calculateFine()
        print(self.fine)

expectedDate = [int(i) for i in (input().split(" "))]
actualDate = [int(i) for i in (input().split(" "))]

student1 = getFine(expectedDate, actualDate)
student1.print()
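As a quick sanity check of this corrected version: entering 6 6 2015 as the expected date and then 9 6 2015 as the actual date prints 45 (same month and year, three days late, at 15 per day).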
