Python: save pandas DataFrames to separate csv files within a for loop - python-3.x

I would like to save some pandas DataFrame downloads separately as csv files, but I am getting an error in the last line. Is the syntax off?
Kind regards
sortedByISIN = pd.DataFrame()
for i in data['isin'].unique():
    print('Adding ' + i)
    d1 = data[data['isin'] == i]
    d1['next_signal'] = d1['signal'].shift(-1)
    # Shift x periods in the future
    d1['futprice'] = d1['mid'].shift(-6)
    d1['futT'] = d1['creationTimeStamp'].shift(-6)
    d1['move'] = d1.apply(lambda row: (row['futprice'] - row['mid'])/row['mid'] * 10000 if row['futT'] - row['creationTimeStamp'] < 300000 else 0, axis=1)
    d1['signal_transition'] = d1['next_signal'] - d1['signal']
    sortedByISIN = sortedByISIN.append(d1)
    sortedByISIN['period'] = np.floor(sortedByISIN.creationTimeStamp/3600000)
    sortedByISIN.to_csv('Book'%i.csv')

Yes, the syntax is off. With %-formatting the whole format string must be quoted:
sortedByISIN.to_csv('Book%s.csv' % i)
or you can also use string concatenation:
sortedByISIN.to_csv('Book' + str(i) + '.csv')

Use format:
sortedByISIN.to_csv('Book{}.csv'.format(i))
And for Python 3.6+ it is possible to use f-strings:
sortedByISIN.to_csv(f'Book{i}.csv')
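As a side note, the whole loop can also be written with groupby, which yields one sub-frame per unique isin. A minimal sketch, assuming data is the same DataFrame as in the question:
# Sketch: groupby yields (key, sub-frame) pairs, one per unique isin,
# so each group can be written straight to its own csv file.
for isin, group in data.groupby('isin'):
    group.to_csv(f'Book{isin}.csv')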

Related

How to sort sub-lists by date using a for loop in python

trading_info = [['1','1000',"symbol_name1",'1000','1100','950','1050','10000','2021-06-08'],
['2','2000',"symbol_name2",'1007','1200','850','1050','2000','2021-06-07'],
['3','3000',"symbol_name3",'1500','1200','850','1050','20000','2021-06-09'],
['4','4000',"symbol_name4",'10007','1200','850','1050','2000','2021-01-05'],
['5','5000',"symbol_name5",'1007','1200','850','1050','2000','2022-07-07'],
['6','6000',"symbol_name6",'1007','1200','850','1050','2000','2021-06-08'],
['7','7000',"symbol_name7",'1007','1200','850','1050','2000','2021-08-02'],
['8','8000',"symbol_name8",'1007','1200','850','1050','2000','2021-01-01'],
['9','9000',"symbol_name9",'1007','1200','850','1050','2000','2021-10-10'],
['10','10000',"symbol_name10",'1007','1200','850','1050','2000','2021-06-14']]
for i in range(0, len(trading_info)):
    for j in range(0, len(trading_info) - i - 1):
        if trading_info[j][8] > trading_info[j + 1][8]:
            trade = trading_info[j]
            trading_info[j] = trading_info[j + 1]
            trading_info[j + 1] = trade
trading_info_sorted = trading_info
df = pd.DataFrame(trading_info_sorted)
df.columns = ["index","symbol_token","symbol_name","opening","high","low","closing","volume","date"]
print(df)
frame = pd.DataFrame(trading_info)
frame = frame.sort_values(8)  # column 8 holds the date; reassign, since sort_values returns a copy
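The ISO-style date strings above happen to sort correctly as plain text; a slightly safer sketch (an assumption on my part, not from the original answer) parses them as real datetimes first:
import pandas as pd

frame = pd.DataFrame(trading_info)
frame[8] = pd.to_datetime(frame[8])  # parse '2021-06-08' style strings into datetimes
frame = frame.sort_values(8).reset_index(drop=True)
print(frame)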

Calculate percentage change in pandas with rows that contain the same values

I am using pandas to calculate the percentage change between values that occur more than once in the column of interest.
I want to compare the values from last week's workout, provided they are the same exercise type, to get the percentage change of weight used and reps accomplished.
I am able to get the percentages for all the rows, which is halfway to what I want, but the conditional part is missing: I only want the percentages when the exercise_name has the same value, since we want to measure how we improve on a weekly or bi-weekly basis.
ids = self.user_data["exercise"].fillna(0)
dups = self.user_data[ids.isin(ids[ids.duplicated()])].sort_values("exercise")
dups['exercise'] = dups['exercise'].astype(str)
dups['set_one_weight'] = pd.to_numeric(dups['set_one_weight'])
dups['set_two_weight'] = pd.to_numeric(dups['set_two_weight'])
dups['set_three_weight'] = pd.to_numeric(dups['set_three_weight'])
dups['set_four_weight'] = pd.to_numeric(dups['set_four_weight'])
dups['set_one'] = pd.to_numeric(dups['set_one'])
dups['set_two'] = pd.to_numeric(dups['set_two'])
dups['set_three'] = pd.to_numeric(dups['set_three'])
dups['set_four'] = pd.to_numeric(dups['set_four'])
percent_change = dups[['set_three_weight']].pct_change()
The last line gets the percentage change across all rows of the set_three_weight column, but it cannot do what I described above: find rows with the same name and take the percentage change within them.
UPDATE
Using a groupby solution
ids = self.user_data["exercise"].fillna(0)
dups = self.user_data[ids.isin(ids[ids.duplicated()])].sort_values("exercise")
dups['exercise'] = dups['exercise'].astype(str)
dups['set_one_weight'] = pd.to_numeric(dups['set_one_weight'])
dups['set_two_weight'] = pd.to_numeric(dups['set_two_weight'])
dups['set_three_weight'] = pd.to_numeric(dups['set_three_weight'])
dups['set_four_weight'] = pd.to_numeric(dups['set_four_weight'])
dups['set_one'] = pd.to_numeric(dups['set_one'])
dups['set_two'] = pd.to_numeric(dups['set_two'])
dups['set_three'] = pd.to_numeric(dups['set_three'])
dups['set_four'] = pd.to_numeric(dups['set_four'])
dups['routine_upload_date'] = pd.to_datetime(dups['routine_upload_date'])
# percent_change = dups[['set_three_weight']].pct_change()
# Group the exercises together and create a new cols that represent the percentage delta variation in percentages
dups.sort_values(['exercise', 'routine_upload_date'], inplace=True, ascending=[True, False])
dups['set_one_weight_delta'] = (dups.groupby('exercise')['set_one_weight'].apply(pd.Series.pct_change) + 1)
dups['set_two_weight_delta'] = (dups.groupby('exercise')['set_two_weight'].apply(pd.Series.pct_change) + 1)
dups['set_three_weight_delta'] = (dups.groupby('exercise')['set_three_weight'].apply(pd.Series.pct_change) + 1)
dups['set_four_weight_delta'] = (dups.groupby('exercise')['set_four_weight'].apply(pd.Series.pct_change) + 1)
dups['set_one_reps_delta'] = (dups.groupby('exercise')['set_one'].apply(pd.Series.pct_change) + 1)
dups['set_two_reps_delta'] = (dups.groupby('exercise')['set_two'].apply(pd.Series.pct_change) + 1)
dups['set_three_reps_delta'] = (dups.groupby('exercise')['set_three'].apply(pd.Series.pct_change) + 1)
dups['set_four_reps_delta'] = (dups.groupby('exercise')['set_four'].apply(pd.Series.pct_change) + 1)
print(dups.head())
I think this gets me the results I want; I would like someone to confirm.
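For confirmation, a minimal sketch of the same computation, assuming dups is prepared and sorted as above: pct_change can be called directly on the GroupBy object, so the apply wrapper is not needed and the deltas should be identical.
# Sketch: same per-exercise deltas as the apply version above, one line per column.
weight_cols = ['set_one_weight', 'set_two_weight', 'set_three_weight', 'set_four_weight']
rep_cols = ['set_one', 'set_two', 'set_three', 'set_four']
for col in weight_cols + rep_cols:
    dups[col + '_delta'] = dups.groupby('exercise')[col].pct_change() + 1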

index 0 is out of bounds for axis 0 with size 0 while converting large batch of csv to xlsx using xlsxwriter

I have a batch of 1k csv files that need converting to xlsx with some conditions. I am running a for loop to select the csv files and name the xlsx files. The code works fine for the first 5-6 csv to xlsx conversions, then I get an error message:
IndexError: index 0 is out of bounds for axis 0 with size 0
for i in range(client_list_items):
    client = client_list[i]
    TradeSignals = pd.read_csv(client + '_TradeSignals.csv', header=None)
    os.remove(client + '_TradeSignals.csv')
    TradeSignals.columns = ['idx','idx','Date','Open','High','Low','Close','Symbol','TrendSELL','TrendBUY']
    TradeSignals = pd.merge(TradeSignals, sector[['Industry','Symbol','M-CapRank']], on='Symbol', how='left')
    TradeSignals['Buy_Signal'] = (TradeSignals['Low'] < TradeSignals['TrendBUY']) & (TradeSignals['High'] < TradeSignals['TrendSELL'])
    TradeSignals['Sell_Signal'] = (TradeSignals['TrendSELL'] < TradeSignals['High']) & (TradeSignals['TrendBUY'] < TradeSignals['Low'])
    TradeSignals = TradeSignals[TradeSignals['Buy_Signal'] | TradeSignals['Sell_Signal']]
    TradeSignals['Trade'] = np.where(TradeSignals['Buy_Signal'], 'Buy', 'Sell')
    TradeSignals = TradeSignals[['Industry','Symbol','Date','Close','M-CapRank','Trade']]
    TradeSignals.sort_values(by=['Industry','M-CapRank'], inplace=True)
    TradeSignals.set_index(['Industry','Symbol'], inplace=True)
    writer = pd.ExcelWriter(client + '_TradeSignal.xlsx', engine='xlsxwriter')
    TradeSignals.to_excel(writer, sheet_name='Sheet1')  # <----- error midway, after a few files are generated (total batch 1k files)
    writer.save()
I need to convert the entire batch of 1k csv files to xlsx.
Resolved: there were a few instances of empty DataFrames that were causing the error.
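For anyone hitting the same error, a minimal sketch of the guard that resolution implies; the write_signals helper is hypothetical, not from the original post:
import pandas as pd

def write_signals(trade_signals: pd.DataFrame, path: str) -> bool:
    # Hypothetical helper: skip empty frames, which (per the resolution
    # above) are what trigger the IndexError during the xlsx write.
    if trade_signals.empty:
        return False
    with pd.ExcelWriter(path, engine='xlsxwriter') as writer:
        trade_signals.to_excel(writer, sheet_name='Sheet1')
    return True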

How can I get a different max for each of several lists in python

I want to get a different max from each list, but the problem is I get the same max every time. This is my code. I get the max of the first list repeated for every list; what do I change to obtain a separate max per list?
def best(contactList_id, ntf_DeliveredCount):
    maxtForEvryDay = []
    yPredMaxForDay = 0
    for day in range(1, 8):
        for marge in range(1, 5):
            result = predictUsingNewSample([[contactList_id, ntf_DeliveredCount, day, marge]])
            if result > yPredMaxForDay:
                yPredMaxForDay = 0
                yPredMaxForDay = result
        maxtForEvryDay.append(yPredMaxForDay)
    return maxtForEvryDay
best(contactList_id = 13.0,ntf_DeliveredCount = 5280.0)
result:
[1669.16010381]
[1708.32915255]
[1747.49820129]
[1786.66725003]
[1570.05500351]
[1609.22405225]
[1648.39310099]
[1687.56214973]
[1491.60792629]
[1510.11895195]
[1549.28800069]
[1588.45704943]
[1402.21845533]
[1420.73953501]
[1450.18290039]
[1489.35194913]
[1367.15490803]
[1356.21411426]
[1345.27532239]
[1390.24684884]
[1378.1190426]
[1367.17824883]
[1419.23588013]
[1486.78241686]
[1450.21261674]
[1516.04342599]
[1581.87423524]
[1647.7050445]
[array([1786.66725003]),
array([1786.66725003]),
array([1786.66725003]),
array([1786.66725003]),
array([1786.66725003]),
array([1786.66725003]),
array([1786.66725003])]
This is my function predictUsingNewSample(X_test):
def predictUsingNewSample(X_test):
    #print(X_test)
    # Load the model from file
    with open("pickle_model.pkl", 'rb') as file:
        pickle_model = pickle.load(file)
    Ypredict = pickle_model.predict(X_test)
    print(Ypredict)
    return Ypredict
Try this:
def best(contactList_id, ntf_DeliveredCount):
    maxtForEvryDay = []
    for day in range(1, 8):
        yPredMaxForDay = 0
        for marge in range(1, 5):
            result = predictUsingNewSample([[contactList_id, ntf_DeliveredCount, day, marge]])
            if result > yPredMaxForDay:
                yPredMaxForDay = result
        maxtForEvryDay.append(yPredMaxForDay)
    return maxtForEvryDay

best(contactList_id=13.0, ntf_DeliveredCount=5280.0)
I think the problem comes from the fact that you never reset your yPredMaxForDay variable for each day, so the maximum carries over from one day to the next.
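A minimal sketch of the same fix written with max() and a list comprehension, assuming predictUsingNewSample returns a comparable value as above; with no accumulator variable, nothing can leak across days:
def best(contactList_id, ntf_DeliveredCount):
    # One max() per day over the four marge values.
    return [
        max(predictUsingNewSample([[contactList_id, ntf_DeliveredCount, day, marge]])
            for marge in range(1, 5))
        for day in range(1, 8)
    ]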

Running an MPI python script in an Azure ML MPI pipeline

I'm trying to run a distributed python job through Azure ML pipelines using the MpiStep pipeline class, referring to the example below - https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/pipeline-style-transfer/pipeline-style-transfer.ipynb
I implemented the same, but even when I change the node count parameter in the MpiStep class, the script always reports the size (i.e. comm.Get_size()) as 1. Can you please help me with what I'm missing here? Is there any specific setup required on the cluster?
Code snippets:
Pipeline code snippet:
model_dir = model_ds.path('./'+saved_model_blob+'/',data_reference_name='saved_model_path').as_mount()
label_dir = model_ds.path('./'+model_label_blob+'/',data_reference_name='model_label_blob').as_mount()
input_images = result_ds.path('./'+score_blob_name+'/',data_reference_name='Input_images').as_mount()
output_container = 'abc'
inti_container = 'xyz'
distributed_batch_score_step = MpiStep(
    name="batch_scoring",
    source_directory=SCRIPT_FOLDER,
    script_name="batch_scoring_script_mpi.py",
    arguments=["--dataset_path", input_images,
               "--model_name", model_dir,
               "--label_dir", label_dir,
               "--intermediate_data_container", inti_container,
               "--output_container", output_container],
    compute_target=gpu_cluster,
    inputs=[input_images, model_dir, label_dir],
    pip_packages=["tensorflow", "tensorflow-gpu==1.13.1", "pillow", "azure-keyvault", "azure-storage-blob"],
    conda_packages=["mesa-libgl-cos6-x86_64", "mpi4py==3.0.2", "opencv=3.4.2", "scikit-learn=0.21.2"],
    use_gpu=True,
    allow_reuse=False,
    node_count=nodecount_param,
    process_count_per_node=1
)
Python Script code snippet:
def run(input_dataset, comm):
    rank = comm.Get_rank()
    size = comm.Get_size()
    print("Rank:", rank)
    print("Size:", size)  # always shows 1, even when the input node count is > 1
    print(MPI.Get_processor_name())
    file_names = sorted(get_file_names(args.dataset_path))  # sorted() returns a new list; assign it
    partition_size = len(file_names) // size
    print("partition_size-->", partition_size)
    partitioned_filenames = file_names[rank * partition_size: (rank + 1) * partition_size]
    print("RANK {} - is processing {} images out of the total {}".format(rank, len(partitioned_filenames), len(file_names)))
    # call to Function 01
    # call to Function 02
    img_names = score_df['image_name'].unique()
    output_batch = pd.DataFrame()
    for i in img_names:
        # call to Function 3
        output_batch = output_batch.append(pp_output, ignore_index=True)
    output_paths_list = comm.gather(output_batch, root=0)
    print("RANK {} - number of pre-aggregated output files {}".format(rank, len(output_batch)))
    print("saved in", currentDT + '\\' + 'data.csv')
    if rank == 0:
        print("RANK {} - number of aggregated output files {}".format(rank, len(output_paths_list)))
        print("RANK {} - end".format(rank))

if __name__ == "__main__":
    with tf.device('/GPU:0'):
        init()
        comm = MPI.COMM_WORLD
        run(args.dataset_path, comm)
Got to know the issue was due to the package version: earlier it was installed via conda with conda_packages=["mpi4py==3.0.2"]; it worked after changing the install to pip - pip_packages=["mpi4py"].
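In other words, the fix is just to move mpi4py between the two package lists in the MpiStep call above. A minimal sketch of the adjusted lists, everything else unchanged:
# Sketch of the fix described above: install mpi4py via pip rather than
# conda, leaving the remaining conda packages in place.
pip_packages = ["tensorflow", "tensorflow-gpu==1.13.1", "pillow",
                "azure-keyvault", "azure-storage-blob", "mpi4py"]
conda_packages = ["mesa-libgl-cos6-x86_64", "opencv=3.4.2",
                  "scikit-learn=0.21.2"]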
