Issue in modifying a for loop using joblib - python-3.x

I have a sequential set of code which generates a tuple of values for different stocks, which is passed to a multiprocessing pool to apply technical indicators. Below is the sequential piece of code, which is working as expected.
child_fn_arg_tuple_list = []
for stock in m1_ts_consistent_stock_list: # prev_day_stock_list:
f_prev_ts_stock_merged_mdf_row =
m1_df_in_mdf[(m1_df_in_mdf['stock_id']==stock) &
(m1_df_in_mdf['datetimestamp'] == prev_ts)] # previous timestamp
if f_prev_ts_stock_merged_mdf_row.empty:
f_filtered_stock_list.remove(stock)
else:
f_stock_prev_ts_merged_ohlcv_df_list_of_dict =
f_prev_ts_stock_merged_mdf_row['merged_ohlcv_df'].iloc[0]
f_current_ts_stock_ohlcv_row_df =
period_ts_ohlcv_df[(period_ts_ohlcv_df['stock_id'] == stock)].copy()
if f_current_ts_stock_ohlcv_row_df.shape[0] == 1:
pass
else:
error_string = f_current_fn + 'Expected
f_current_ts_stock_ohlcv_row_df shape for stock ' + stock \
+ 'at ts ' + str(m1_time) + ' is not 1 - ' +
str(f_current_ts_stock_ohlcv_row_df.shape[0])
f_current_ts_stock_ohlcv_row_df =
period_ts_ohlcv_df[(period_ts_ohlcv_df['stock_id'] == stock) &
(period_ts_ohlcv_df['datetimestamp'] == (m1_time -
timedelta(minutes=1)))].copy()
fn_arg_tuple = (f_from_date_list,f_run_folder_name,stock,
f_period,m1_time, f_stock_prev_ts_merged_ohlcv_df_list_of_dict,
f_current_ts_stock_ohlcv_row_df,f_grouped_column_list_dict)
child_fn_arg_tuple_list.append(fn_arg_tuple)
result_list = []
pool = multiprocessing.Pool(7)
for result in pool.starmap(single_stock_apply_indicator_df_in_df_v3, child_fn_arg_tuple_list):
result_list.append(result)
pool.close()
Since the for loop runs for around 400 stocks every minute, I am trying to speed up the for loop over stocks, before passing them for applying multiprocessing using python inner function and joblib - parallel , delayed.
def create_child_fn_arg_tuple_list(cp_stock): # cp = child parameter
f_prev_ts_stock_merged_mdf_row = m1_df_in_mdf[
(m1_df_in_mdf['stock_id'] == cp_stock) &
(m1_df_in_mdf['datetimestamp'] == prev_ts)].copy()
if f_prev_ts_stock_merged_mdf_row.empty:
f_filtered_stock_list.remove(cp_stock)
else:
f_stock_prev_ts_merged_ohlcv_df_list_of_dict = \
f_prev_ts_stock_merged_mdf_row['merged_ohlcv_df'].iloc[0]
f_current_ts_stock_ohlcv_row_df = period_ts_ohlcv_df[
(period_ts_ohlcv_df['stock_id'] == cp_stock)].copy()
if f_current_ts_stock_ohlcv_row_df.shape[0] == 1:
pass
else:
error_string = f_current_fn + 'Expected f_current_ts_stock_ohlcv_row_df
shape for stock ' + \
cp_stock + 'at ts ' + str(m1_time) + ' is not 1 - ' + \
str(f_current_ts_stock_ohlcv_row_df.shape[0])
f_current_ts_stock_ohlcv_row_df =
period_ts_ohlcv_df[(period_ts_ohlcv_df['stock_id'] == cp_stock)
& (period_ts_ohlcv_df['datetimestamp'] ==
(m1_time - timedelta(minutes=1)))].copy()
fn_arg_tuple = (f_from_date_list, f_run_folder_name, cp_stock, f_period,
m1_time,f_stock_prev_ts_merged_ohlcv_df_list_of_dict,
f_current_ts_stock_ohlcv_row_df,f_grouped_column_list_dict)
child_fn_arg_tuple_list.append(fn_arg_tuple)
return child_fn_arg_tuple_list
child_fn_arg_tuple_list = Parallel(n_jobs=7, backend='multiprocessing')\
(delayed(create_child_fn_arg_tuple_list)(in_stock) for in_stock in
m1_ts_consistent_stock_list)
result_list = []
pool = multiprocessing.Pool(7)
for result in pool.starmap(single_stock_apply_indicator_df_in_df_v3, child_fn_arg_tuple_list):
result_list.append(result)
pool.close()
I am getting an error -
AttributeError: Can't pickle local object 'multiple_stock_apply_indicator_df_in_df_v6..create_child_fn_arg_tuple_list' and occurs in the line line where I am trying to apply the joblib parallel and delayed.
Please note that there are some common variables between the main function and inner function - m1_df_in_mdf, f_filtered_stock_list
1] m1_df_in_mdf is not affected as it is used only in read only mode
2] f_filtered_stock_list is affected as some stocks are removed
My objective is to get the for loop of stocks run faster, any other approaches are also welcome.

Related

How to iterate over PyTorch tensor

I have a tensor data of size (1000,110) and I want to iterate over the first index of the tensor and calculate the following.
data = torch.randn(size=(1000,110)).to(device)
male_poor = torch.tensor(0).float().to(device)
male_rich = torch.tensor(0).float().to(device)
female_poor = torch.tensor(0).float().to(device)
female_rich = torch.tensor(0).float().to(device)
for i in data:
if torch.argmax(i[64:66]) == 0 and torch.argmax(i[108:110]) == 0:
female_poor += 1
if torch.argmax(i[64:66]) == 0 and torch.argmax(i[108:110]) == 1:
female_rich += 1
if torch.argmax(i[64:66]) == 1 and torch.argmax(i[108:110]) == 0:
male_poor += 1
if torch.argmax(i[64:66]) == 1 and torch.argmax(i[108:110]) == 1:
male_rich += 1
disparity = ((female_rich/(female_rich + female_poor))) / ((male_rich/(male_rich + male_poor)))
Is there a faster way than for loop to do this?
The key in pytorch (as well as numpy) is vectorizataion, that is if you can remove loops by operating on matrices it will be a lot faster. Loops in python are quite slow compared to the loops in the underlying compiled C code. On my machine the execution time for your code was about 0.091s, the following vectorized code was about 0.002s so about x50 faster:
import torch
torch.manual_seed(0)
device = torch.device('cpu')
data = torch.randn(size=(1000, 110)).to(device)
import time
t = time.time()
#vectorize over first dimension
argmax64_0 = torch.argmax(data[:, 64:66], dim=1) == 0
argmax64_1 = torch.argmax(data[:, 64:66], dim=1) == 1
argmax108_0 = torch.argmax(data[:, 108:110], dim=1) == 0
argmax108_1 = torch.argmax(data[:, 108:110], dim=1) == 1
female_poor = (argmax64_0 & argmax108_0).sum()
female_rich = (argmax64_0 & argmax108_1).sum()
male_poor = (argmax64_1 & argmax108_0).sum()
male_rich = (argmax64_1 & argmax108_1).sum()
disparity = ((female_rich / (female_rich + female_poor))) / ((male_rich / (male_rich + male_poor)))
print(time.time()-t)
print(disparity)

Windows 10 Crashes when Running Python Code (PyVisa)

I'm trying to automate data collection from an SR245 Boxcar using Python 3.6 and the PyVisa library (version 1.11.1). 9/10 times, it works great. However, three times over the course of two days it has caused the entire computer to crash and reboot (running on Windows 10). This has resulted in a lot of data loss, and I'm trying to figure out what I'm doing wrong that is leading to the whole system crashing. Code is below (it is part of a larger program, but I also run this piece of code by itself, and it has caused crashes). The data_processing file is not shown, but the functions there are simple calculations (e.g. divide the values in a list by the values in another list, return the average value from a list of integers, etc.)
import pyvisa
from pyvisa.constants import SerialTermination
import time
import numpy as np
from data_processing import *
def connect_boxcar(pNum):
rm = pyvisa.ResourceManager()
port = "COM"+pNum
sr = rm.open_resource(port)
return sr
def config_boxcar(boxcar):
#Configure the boxcar settings
boxcar.write_termination = '\r'
boxcar.read_termination='\r'
boxcar.baud_rate=19200
boxcar.end_output = SerialTermination.termination_char
def preset_scan(boxcar):
#Reset boxcar settings
boxcar.write('MR')
boxcar.write('MS;ET;T1;I2;W0')
def scan(boxcar, num):
#Send the SCAN command to the boxcar, set to the specified number of data points
command = 'SC1,2:' + str(num)
boxcar.write(command)
def read_data(boxcar, num):
#Read the stored scan data and return it as a value list
data_list = []
for x in range(num * 2):
data_list.append(float(boxcar.query('N')))
return data_list
def collect_baseline(boxcar, n):
#Get a baseline signal for later processing
config_boxcar(boxcar)
preset_scan(boxcar)
scan(boxcar, n)
raw_data = read_data(boxcar, n)
chan1 = raw_data[::2]
chan2 = raw_data[1::2]
normal_data = normalize(chan1, chan2, n)
return average_list(normal_data)
def main():
rm = pyvisa.ResourceManager()
n = 10
sleep_timer = 0.1 * n + 0.5
sr245 = rm.open_resource('COM5')
#Configure/preset
config_boxcar(sr245)
preset_scan(sr245)
#Set a timer to measure scanning time
t0 = time.time()
scan(sr245, n)
time.sleep(sleep_timer)
raw_data = read_data(sr245, n)
t1 = time.time()
#Breakdown data by channel and normalize
chan1 = raw_data[::2]
chan2 = raw_data[1::2]
normal_data = normalize(chan1, chan2, n)
elapsed_time = t1 - t0
print('Elapsed time: ', elapsed_time)
print('Channel 1: ', chan1)
print('Channel 2: ', chan2)
print('Normalized Data: ', normal_data)
print('Average Normalized Data: ', average_list(normal_data))
print('Standard Deviation: ', np.std(normal_data))
if __name__ == '__main__':
main()

How to create multiple Python scripts and run them at the same time?

I have a Python script that is used to find some stresses on a structure (a crane boom) when exposed to the wind from all directions. This means it creates 360 text files, 1 for each degree the structure is facing. Instead of doing 360 consecutive loops running on a single core, I want to break the task up into maybe 10 or 20 processes. Is there a way I could modify the following code so it created and ran multiple scripts with different degree ranges i.e. one script would do 0 to 20 degrees, the next 20 to 40 etc.?
import math
import csv
boomDirection = 0
time = 0
maxStress = 650
with open("SomeWindAndHeadingStressMatrix.csv") as f:
data = [row for row in csv.reader(f)]
while boomDirection < 361:
file = open("SomeWindSpeedSourceData.txt", 'r')
data_file = open("Bolt Stress - " + str(boomDirection) + " Degrees.csv", 'w')
line = file.readline()
while line != '':
try:
if len(line.split(','))>1:
windSpeedHigh = int(int(line.split(',')[19])*1.32)
windSpeedLow = int(int(line.split(',')[22])*1.32)
windDirection = int(line.split(',')[14]) - boomDirection
if windDirection < 0:
windDirection += 360
stressHigh = float(data[windSpeedHigh][windDirection])
stressLow = float(data[windSpeedLow][windDirection])
if time % 10080 == 0:
data_file.write(str(time) + ', ' + str(maxStress) + ('\n'))
time += 0.5
else:
data_file.write(str(time) + ', ' + str(round(stressHigh,1)) + ('\n'))
time += 0.5
data_file.write(str(time) + ', ' + str(round(stressLow,1)) + ('\n'))
time += 0.5
except ValueError:
pass
line = file.readline()
data_file.close()
time = 0
boomDirection = boomDirection + 1

Why is this while loop only doing the first loop?

I am trying to produce some mechanical stress info from windspeed data for a crane boom when it is between 0 and 90 degrees, with data from each angle saved into it's own file. I have the script working fine when doing just one file/angle, however when I try and use any sort of loop to do it for all angles it will create the files, but only the first has any data in it. I am a beginner and am not very savvy with Python, so I was hoping someone could spot something simple I have missed. I have included a short example file of the source data: Windspeed source file - cut down
import math
file = open("C:/Users/Jacob/Desktop/BOM Data/HD01D_Data_074272_999999999523840.txt", 'r')
boomDirection = 0
vaneSpeed = 120
maxShear = 75.97043478
maxVonMises = 500.0216811
while boomDirection < 91:
data_file = open("Bolt Stress - " + str(boomDirection) + " Degrees.csv", 'w')
line = file.readline()
line = file.readline()
while line != '':
try:
if len(line.split(','))>1:
windSpeedHigh = int(line.split(',')[19])
windSpeedLow = int(line.split(',')[22])
windDirection = int(line.split(',')[14])
relSpeedHigh = math.sin(math.radians((90-(boomDirection - windDirection))))*windSpeedHigh
relSpeedLow = math.sin(math.radians((90-(boomDirection - windDirection))))*windSpeedLow
VonMisesHigh = (maxVonMises/vaneSpeed)* relSpeedHigh
VonMisesLow = (maxVonMises/vaneSpeed)* relSpeedLow
data_file.write(str(round(VonMisesHigh,1)) + ('\n'))
data_file.write(str(round(VonMisesLow,1)) + ('\n'))
except ValueError:
pass
line = file.readline()
data_file.close()
boomDirection = boomDirection + 1

Variable defined and returned but not defined in other function, no global variables (Python3)

I'm trying to define r2 and r1 in their own functions, I've had it return and print it, so I know it's officially in the main(). The problem is when it tries to get the variable in rectangles() it gives an error that r2 is not defined.
My professor instructed us that if you have a variable defined in your base function(outside of the global space) that it would be readable in functions that stem out of the base function.
Is what I'm trying to do simply not feasible or am I just attacking it incorrectly? We are not allowed to use global variables or the global space for our codes. Below is the code I'm trying to run.
def f1(x):
return 5*x**4 + 3*x**3 - 10*x + 2
def rectangles():
step = (r2-r1)/shapes
total = 0
start = step + r1
for y in range(shapes):
res = f1(start)
start = step + start
total = total + res*step
print("The rectangle approximation is " + str(total))
def trapezoids():
step = (r2-r1)/shapes
total = 0
start = r1
for y in range(shapes):
m1 = f1(start)
total = step * (m1+f1(step+start))/2 + total
start = step + start
print("The trapezoid approximation is " + str(total))
def r1get():
ii=True
while ii == True:
r1 = input("Enter the start of the range: ")
try:
val = float(r1)
ii = 0
except ValueError:
ii== True
r1 = float(r1)
return r1
def r2get():
ii=True
while ii == True:
r2 = input("Enter the end of the range: ")
try:
val = float(r2)
ii = 0
except ValueError:
ii== True
r2 = float(r2)
return r2
#def mode():
def numshapes():
ii = True
while ii == True:
shapes = input("Enter how many shapes you want to use in your approximation: ")
try:
val = int(shapes)
shapes = int(shapes)
if shapes>=0:
ii=0
else:
ii=True
except ValueError:
ii== True
return shapes
def main():
#mode()
#print(mode)
r1 = r1get()
r2 = r2get()
shapes = numshapes()
rectangles()
trapezoids()
ma
in()

Resources