Writing multiple data to a csv, np array gets converted to string - python-3.x

I'm trying to write pixel_array data and the PatientID from multiple DICOM files to a CSV.
The pixel arrays, which are NumPy arrays, get saved like this:
[[0 0 0 ... 0 0 0]\n [0 0 0 ... 0 0 0]\n [0 0 0 ... 0 0 0]\n ...\n [0 0 0 ... 0 0 0]\n [0 0 0 ... 0 0 0]\n [0 0 0 ... 0 0 0]]
How do I get back the original NumPy array? Could you please give me some advice on writing these to CSV properly, so the arrays are preserved as they are?
Below is the loop:
def tryer(a, b):
    if b == 'pixel_array':
        return a.pixel_array
    else:
        try:
            return getattr(a, b)
        except AttributeError:
            return 'No'
With tryer as defined above:
a_list = ['PatientID', 'pixel_array']
with open('imgs_n_patied_train1.csv', 'w', newline='') as f:
    thewriter = writer(f)
    thewriter.writerow(['PatientID', 'pixel_array'])  # writerow adds the line ending itself; no stray '\n' in the header
    for i in imgs:
        # iterate through each file and write its metadata to a CSV row
        a = dicom.read_file(r'pathto\stage_2_train_images' + '\\' + i, force=True)
        app = []
        for j in a_list:
            app.append(tryer(a, j))
        thewriter.writerow(app)
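Note that the ... inside the saved string is NumPy's truncated repr, so the original values cannot be parsed back out of that CSV at all. A minimal sketch of one workaround, assuming imgs and that the question's dicom module is pydicom: save each pixel array to its own .npy file with np.save (lossless, preserving shape and dtype) and store the file path next to the PatientID.
import numpy as np
import pydicom as dicom  # assumption: the question's dicom module is pydicom
from csv import writer

# imgs and the image directory are assumed to be as in the question
with open('imgs_n_patied_train1.csv', 'w', newline='') as f:
    thewriter = writer(f)
    thewriter.writerow(['PatientID', 'pixel_array_path'])
    for i in imgs:
        a = dicom.read_file(r'pathto\stage_2_train_images' + '\\' + i, force=True)
        npy_path = i + '.npy'
        np.save(npy_path, a.pixel_array)  # writes the full array, no truncation
        thewriter.writerow([tryer(a, 'PatientID'), npy_path])
# later, np.load(npy_path) returns the original array unchanged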

Related

Replace Column value in dataframe -1 with 0

I have a column in a pandas dataframe that has multiple values like -1 -1 -1 -1 -1 -1 ... -1, around 1000 of them. What I want is to convert those -1 values to 0, so it shows like 0 0 0 0 0 0 ... 0
df_img_attr_label = pd.read_csv(r'D:\DeepFashion\Category and Attribute Prediction\list_attr_img.txt', names=['Image_Name'], header=None)
df_img_attr_label[['Image_Name', 'attribute_label']] = df_img_attr_label["Image_Name"].str.split(" ", 1, expand=True)
df_img_attr_label["attribute_label"] = df_img_attr_label["attribute_label"]
ret_rows = df_img_attr_label.loc[0:1000, :]
df_2 = ret_rows.replace([-1, 0])
I want the column values -1 -1 -1 -1 -1 -1 ... to become 0 0 0 0 0 ...
This could be similar to Yasir's suggestion below, but without quotes:
df_2 = df_2.replace(-1, 0)
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.replace.html
Try this:
df_2 = df_2.replace('-1', '0')
You can use pandas' Series.map method:
s = df['col_to_be_mapped']
df['col_to_be_mapped'] = s.map(lambda x: 0) # if all values are -1
# if there are other values also you can use this function
df['col_to_be_mapped'] = s.map(lambda x: 0 if x == -1 else 1)
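For a quick sanity check of the replace-based answers, here is a toy frame (hypothetical data, not the DeepFashion file); whether you need the numeric -1 or the string '-1' depends on the column's dtype:
import pandas as pd

# numeric column: replace the number
df_num = pd.DataFrame({'attribute_label': [-1, -1, 1, -1]})
print(df_num.replace(-1, 0)['attribute_label'].tolist())      # [0, 0, 1, 0]

# string column (e.g. fresh from str.split): replace the string
df_str = pd.DataFrame({'attribute_label': ['-1', '-1', '1']})
print(df_str.replace('-1', '0')['attribute_label'].tolist())  # ['0', '0', '1']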

Optimizing using Pandas Data Frame

I have the following function that loads a CSV into a data frame and then does some calculations. It takes about 4-5 minutes to do the calculations on a CSV with a little over 100,000 lines. I was hoping there is a faster way.
import datetime

import pandas as pd
from haversine import haversine

def calculate_adeck_errors(in_file):
    print(f'Starting Data Calculations: {datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")}')
    pd.set_option('display.max_columns', 12)
    # read in the raw csv
    adeck_df = pd.read_csv(in_file)
    # extract only the CARQ items and remove duplicates
    carq_data = adeck_df[(adeck_df.MODEL == 'CARQ') & (adeck_df.TAU == 0)].drop_duplicates(keep='last')
    # remove CARQ items from the original
    final_df = adeck_df[adeck_df.MODEL != 'CARQ']
    row_list = []
    for index, row in carq_data.iterrows():
        position_time = row['POSDATETIME']
        for index, arow in final_df.iterrows():
            if arow['POSDATETIME'] == position_time:
                # match, so do calculations
                storm_id = arow['STORMID']
                model_base_time = arow['MODELDATETIME']
                the_hour = arow['TAU']
                the_model = arow['MODEL']
                point1 = float(row['LAT']), float(row['LON'])
                point2 = float(arow['LAT']), float(arow['LON'])
                if arow['LAT'] == 0.0:
                    dist_error = None
                else:
                    dist_error = int(round(haversine(point1, point2, miles=True)))
                if arow['WIND'] != 0:
                    wind_error = int(abs(int(row['WIND']) - int(arow['WIND'])))
                else:
                    wind_error = None
                if arow['PRES'] != 0:
                    pressure_error = int(abs(int(row['PRES']) - int(arow['PRES'])))
                else:
                    pressure_error = None
                lat_carq = row['LAT']
                lon_carq = row['LON']
                lat_model = arow['LAT']
                lon_model = arow['LON']
                wind_carq = row['WIND']
                wind_model = arow['WIND']
                pres_carq = row['PRES']
                pres_model = arow['PRES']
                row_list.append([storm_id, model_base_time, the_model, the_hour,
                                 lat_carq, lon_carq, lat_model, lon_model, dist_error,
                                 wind_carq, wind_model, wind_error,
                                 pres_carq, pres_model, pressure_error])
    result_df = pd.DataFrame(row_list)
    result_df = result_df.where((pd.notnull(result_df)), None)
    result_cols = ['StormID', 'ModelBasetime', 'Model', 'Tau',
                   'LatCARQ', 'LonCARQ', 'LatModel', 'LonModel', 'DistError',
                   'WindCARQ', 'WindModel', 'WindError',
                   'PresCARQ', 'PresModel', 'PresError']
    result_df.columns = result_cols
    return result_df

calculate_adeck_errors(infile)
To clarify what I'm doing:
1. The CARQ entries are the control (actual).
2. The other models are the guesses.
3. I'm comparing the control (CARQ) to the guesses to see what their errors are.
4. The basis of the comparison is the MODELBASETIME = POSBASETIME.
5. A sample file I'm processing is here: http://vortexweather.com/downloads/adeck/aal062018.csv
I was hoping there is a faster way than what I'm doing, or another pandas method besides iterrows.
Many thanks for any suggestions.
Bryan
This code takes about 10 seconds to run on your entire dataset!
The code looks very similar to what you have written, except that all of the operations within main_function have been vectorized. See Fast, Flexible, Easy and Intuitive: How to Speed Up Your Pandas Projects
2018-09-13_adeck_error_calculations.ipynb
import pandas as pd
import numpy as np
import datetime
from haversine import haversine

def main_function(df, row):
    """
    The main difference here is that everything is vectorized
    Returns: DataFrame
    """
    df_new = pd.DataFrame()
    df_storage = pd.DataFrame()
    pos_datetime = df.POSDATETIME.isin([row['POSDATETIME']])  # creates a Boolean map
    array_len = len(pos_datetime)
    new_index = pos_datetime.index
    df_new['StormID'] = df.loc[pos_datetime, 'STORMID']
    df_new['ModelBaseTime'] = df.loc[pos_datetime, 'MODELDATETIME']
    df_new['Model'] = df.loc[pos_datetime, 'MODEL']
    df_new['Tau'] = df.loc[pos_datetime, 'TAU']

    # Distance
    df_new['LatCARQ'] = pd.DataFrame(np.full((array_len, 1), row['LAT']), index=new_index).loc[pos_datetime, 0]
    df_new['LonCARQ'] = pd.DataFrame(np.full((array_len, 1), row['LON']), index=new_index).loc[pos_datetime, 0]
    df_new['LatModel'] = df.loc[pos_datetime, 'LAT']
    df_new['LonModel'] = df.loc[pos_datetime, 'LON']

    def calc_dist_error(row):
        return round(haversine((row['LatCARQ'], row['LonCARQ']),
                               (row['LatModel'], row['LonModel']), miles=True)) if row['LatModel'] != 0.0 else None

    df_new['DistError'] = df_new.apply(calc_dist_error, axis=1)

    # Wind
    df_new['WindCARQ'] = pd.DataFrame(np.full((array_len, 1), row['WIND']), index=new_index).loc[pos_datetime, 0]
    df_new['WindModel'] = df.loc[pos_datetime, 'WIND']
    df_storage['row_WIND'] = pd.DataFrame(np.full((array_len, 1), row['WIND']), index=new_index).loc[pos_datetime, 0]
    df_storage['df_WIND'] = df.loc[pos_datetime, 'WIND']

    def wind_error_calc(row):
        return (row['row_WIND'] - row['df_WIND']) if row['df_WIND'] != 0 else None

    df_new['WindError'] = df_storage.apply(wind_error_calc, axis=1)

    # Air Pressure
    df_new['PresCARQ'] = pd.DataFrame(np.full((array_len, 1), row['PRES']), index=new_index).loc[pos_datetime, 0]
    df_new['PresModel'] = df.loc[pos_datetime, 'PRES']
    df_storage['row_PRES'] = pd.DataFrame(np.full((array_len, 1), row['PRES']), index=new_index).loc[pos_datetime, 0]
    df_storage['df_PRES'] = df.loc[pos_datetime, 'PRES']

    def pres_error_calc(row):
        return abs(row['row_PRES'] - row['df_PRES']) if row['df_PRES'] != 0 else None

    df_new['PresError'] = df_storage.apply(pres_error_calc, axis=1)
    del df_storage
    return df_new
def calculate_adeck_errors(in_file):
    """
    Returns: DataFrame
    """
    print(f'Starting Data Calculations: {datetime.datetime.now().strftime("%I:%M:%S%p on %B %d, %Y")}')
    pd.set_option('max_columns', 20)
    pd.set_option('max_rows', 300)
    # read in the raw csv
    adeck_df = pd.read_csv(in_file)
    adeck_df['MODELDATETIME'] = pd.to_datetime(adeck_df['MODELDATETIME'], format='%Y-%m-%d %H:%M')
    adeck_df['POSDATETIME'] = pd.to_datetime(adeck_df['POSDATETIME'], format='%Y-%m-%d %H:%M')
    # extract only the CARQ items and remove duplicates
    carq_data = adeck_df[(adeck_df.MODEL == 'CARQ') & (adeck_df.TAU == 0)].drop_duplicates(keep='last')
    print('Len carq_data: ', len(carq_data))
    # remove CARQ items from the original
    final_df = adeck_df[adeck_df.MODEL != 'CARQ']
    print('Len final_df: ', len(final_df))
    df_out_new = pd.DataFrame()
    for index, row in carq_data.iterrows():
        test_df = main_function(final_df, row)  # function call
        df_out_new = df_out_new.append(test_df, sort=False)
    df_out_new = df_out_new.reset_index(drop=True)
    df_out_new = df_out_new.where((pd.notnull(df_out_new)), None)
    print(f'Finishing Data Calculations: {datetime.datetime.now().strftime("%I:%M:%S%p on %B %d, %Y")}')
    return df_out_new

in_file = 'aal062018.csv'
df = calculate_adeck_errors(in_file)
>>>Starting Data Calculations: 02:18:30AM on September 13, 2018
>>>Len carq_data: 56
>>>Len final_df: 137999
>>>Finishing Data Calculations: 02:18:39AM on September 13, 2018
print(len(df))
>>>95630
print(df.head(20))
Please don't forget to check the accepted solution. Enjoy!
It looks like you are creating two dataframes out of the same dataframe and then processing them. Two things may cut your run time.
First, you are iterating over both dataframes and checking for a condition:
for _, row in carq_data.iterrows():
    for _, arow in final_df.iterrows():
        if arow['POSDATETIME'] == row['POSDATETIME']:
            # do something by using both tables
This is essentially an implementation of a join. You are joining carq_data with final_df on 'POSDATETIME'.
As a first step, you should merge the tables:
merged = carq_data.merge(final_df, on=['POSDATETIME'])
At this point you will get multiple rows for each matching 'POSDATETIME'. In the example below, let's assume column a plays the role of POSDATETIME:
>>> a
a b
0 1 11
1 1 33
>>> b
a b
0 1 2
1 1 3
2 1 4
>>> merged = a.merge(b, on=['a'])
>>> merged
a b_x b_y
0 1 11 2
1 1 11 3
2 1 11 4
3 1 33 2
4 1 33 3
5 1 33 4
Now, to do your conditional calculations, you can use the apply() function.
First, define a function:
def calc_dist_error(row):
    return int(round(haversine(row['b_x'], row['b_y'], miles=True))) if row['a'] != 0.0 else None
Then apply it to every row:
merged['dist_error'] = merged.apply(calc_dist_error, axis=1)
Continuing my small example:
>>> merged['c'] = [1, 0, 0, 0, 2, 3]
>>> merged
a b_x b_y c
0 1 11 2 1
1 1 11 3 0
2 1 11 4 0
3 1 33 2 0
4 1 33 3 2
5 1 33 4 3
>>> def foo(row):
...     return row['b_x'] - row['b_y'] if row['c'] != 0 else None
...
>>> merged['dist_error'] = merged.apply(foo, axis=1)
>>> merged
a b_x b_y c dist_error
0 1 11 2 1 9.0
1 1 11 3 0 NaN
2 1 11 4 0 NaN
3 1 33 2 0 NaN
4 1 33 3 2 30.0
5 1 33 4 3 29.0
This should help you reduce run time (see also this for how to check using %timeit). Hope this helps!
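Putting the two ideas together, here is a minimal sketch of the merge-plus-apply version using the question's actual column names (untested against the sample file; the suffix names and the np.where calls are my own additions):
import numpy as np
import pandas as pd
from haversine import haversine

def calculate_errors_merged(adeck_df):
    carq = adeck_df[(adeck_df.MODEL == 'CARQ') & (adeck_df.TAU == 0)].drop_duplicates(keep='last')
    models = adeck_df[adeck_df.MODEL != 'CARQ']
    # one join replaces the nested iterrows loops
    merged = carq.merge(models, on='POSDATETIME', suffixes=('_carq', '_model'))

    def dist_error(r):
        if r['LAT_model'] == 0.0:
            return None
        return round(haversine((r['LAT_carq'], r['LON_carq']),
                               (r['LAT_model'], r['LON_model']), miles=True))

    merged['DistError'] = merged.apply(dist_error, axis=1)
    # wind and pressure errors vectorize fully: no Python-level loop at all
    merged['WindError'] = np.where(merged['WIND_model'] != 0,
                                   (merged['WIND_carq'] - merged['WIND_model']).abs(), np.nan)
    merged['PresError'] = np.where(merged['PRES_model'] != 0,
                                   (merged['PRES_carq'] - merged['PRES_model']).abs(), np.nan)
    return merged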

Pandas taking more time to execute or even no execution when large input values

The Device.csv file has these values (head(5)):
DEVICE_ADDRESS START_TIME UPDATE_TIME
0 00:0A:20:46:86:D2 1528711800 1528764903
1 00:0A:20:6A:17:38 1528659901 1528764905
2 00:0A:20:37:4D:C4 1528578901 1528764901
3 00:0A:20:42:96:E8 1528669200 1528764903
4 00:0A:20:3D:DF:5C 1528728729 1528764906
Each DEVICE_MAC has multiple entries with different START_TIME and UPDATE_TIME values. The CSV file is read into a dataframe, then sorted in ascending order of DEVICE_MAC. Once sorted, we calculate the LATENCY_MIS, LATENCY_RB, and RCOUNT values.
import pandas as pd
from pandas import DataFrame

df = pd.read_csv(r"C:\Tool\Device.csv", names=["DEVICE_MAC", "START_TIME", "UPDATE_TIME"])
df = df.sort_values(['DEVICE_MAC', 'START_TIME', 'UPDATE_TIME'], ascending=[True, True, True])
df['LATENCY_MIS'], df['LATENCY_RB'], df['RCOUNT'], df['PAD'] = 0, 0, 0, 0
mac_ref = df.loc[0, 'DEVICE_MAC']
start_reference_time = df['UPDATE_TIME'].min()
end_reference_time = df['UPDATE_TIME'].max()
for index, row in df.iterrows():
    if mac_ref == row['DEVICE_MAC']:
        if index == 0:  # start of MAC processing
            start_time_ref = row['START_TIME']
            event_time_ref = row['UPDATE_TIME']
            df.loc[index, 'RCOUNT'] = 0
            df.loc[index, 'PAD'] = row['UPDATE_TIME'] - start_reference_time
        elif row['START_TIME'] == start_time_ref:  # the same session prevails
            difference_event_ts = row['UPDATE_TIME'] - event_time_ref
            event_time_ref = row['UPDATE_TIME']
            df.loc[index, 'LATENCY_MIS'] = difference_event_ts - 300
            df.loc[index, 'RCOUNT'] = 0
            if index + 1 in df.index:
                if row['DEVICE_MAC'] != df.loc[index + 1, 'DEVICE_MAC']:
                    df.loc[index, 'PAD'] = end_reference_time - row['UPDATE_TIME']
            if index == df.index[-1]:
                df.loc[index, 'PAD'] = end_reference_time - row['UPDATE_TIME']
        elif row['START_TIME'] != start_time_ref:  # a new session starts
            #difference_event_ts = row['START_TIME']-event_time_ref+(row['UPDATE_TIME']-row['START_TIME']-300)
            df.loc[index, 'LATENCY_RB'] = row['START_TIME'] - event_time_ref
            df.loc[index, 'LATENCY_MIS'] = row['UPDATE_TIME'] - row['START_TIME']  # -300*****
            event_time_ref = row['UPDATE_TIME']
            df.loc[index, 'RCOUNT'] = 1
            start_time_ref = row['START_TIME']
            event_time_ref = row['UPDATE_TIME']
    else:  # start of a new MAC
        mac_ref = row['DEVICE_MAC']
        start_time_ref = row['START_TIME']
        event_time_ref = row['UPDATE_TIME']
        df.loc[index, 'RCOUNT'] = 0
        df.loc[index, 'PAD'] = row['UPDATE_TIME'] - start_reference_time
Each row's LATENCY_MIS, LATENCY_RB, and RCOUNT depend on the previous row's and the next row's START_TIME and UPDATE_TIME values (except for the first and last rows of each DEVICE_MAC group).
The output looks like this:
DEVICE_MAC_ADDRESS START_TIME UPDATE_TIME LATENCY_MIS LATENCY_RB RCOUNT PAD
18228 00:A0:BC:33:04:F0 1527703135 1528787401 1199 0 0 7219
18995 00:A0:BC:33:04:F0 1527703135 1528788601 600 0 0 6019
21007 00:A0:BC:33:04:F0 1527703135 1528791001 1200 0 0 3619
17981 00:A0:BC:37:60:76 1527697084 1528787100 899 0 0 7520
1384 00:A0:BC:3A:91:5C 1528596621 1528766734 599 0 0 27886
2945 00:A0:BC:3A:91:5C 1528596621 1528768533 899 0 0 26087
5832 00:A0:BC:3A:91:5C 1528596621 1528772133 600 0 0 22487
9091 00:A0:BC:3A:91:5C 1528596621 1528776334 600 0 0 18286
11989 00:A0:BC:3A:91:5C 1528596621 1528779934 600 0 0 14686
12880 00:A0:BC:3A:91:5C 1528596621 1528780834 600 0 0 13786
The middle code block that calculates LATENCY_MIS, LATENCY_RB, RCOUNT, and PAD takes a long time to execute, or never finishes, when the input CSV is large.
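The per-row df.loc[index, ...] writes inside a Python-level loop are what make this scale so badly. A minimal sketch of how the same per-MAC, previous-row logic could be expressed with groupby and shift (this is my own reading of the rules above; the PAD column and the edge cases would need checking against the loop version):
import numpy as np
import pandas as pd

df = pd.read_csv(r"C:\Tool\Device.csv", names=["DEVICE_MAC", "START_TIME", "UPDATE_TIME"])
df = df.sort_values(['DEVICE_MAC', 'START_TIME', 'UPDATE_TIME']).reset_index(drop=True)

grp = df.groupby('DEVICE_MAC')
prev_update = grp['UPDATE_TIME'].shift()  # previous row's UPDATE_TIME within each MAC
prev_start = grp['START_TIME'].shift()    # previous row's START_TIME within each MAC

same_session = df['START_TIME'].eq(prev_start)    # session continues
new_session = prev_start.notna() & ~same_session  # a new session starts

df['LATENCY_MIS'] = np.where(same_session, df['UPDATE_TIME'] - prev_update - 300,
                    np.where(new_session, df['UPDATE_TIME'] - df['START_TIME'], 0))
df['LATENCY_RB'] = np.where(new_session, df['START_TIME'] - prev_update, 0)
df['RCOUNT'] = new_session.astype(int)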

Adding Specific Number from a Matrix within a List

I have a list initialized:
best_selected = [[2, 3, 0, 1], [1, 3, 0, 2], [1, 2, 3, 0], [1, 0, 2, 3]]
finalList = [[0, 1308, 17410, 16098], [1246, 0, 17557, 16244], [17675, 18002, 0, 5618], [16257, 16584, 5508, 0]]
The finalList as a matrix looks like this:
[[ 0 1308 17410 16098]
[ 1246 0 17557 16244]
[17675 18002 0 5618]
[16257 16584 5508 0]]
What I'm trying to do is iterate over every list in best_selected and add values from finalList depending on the indices. Let's say the first iteration over best_selected gives [2, 3, 0, 1]. The first element in the list is 2, so I start at row 2:
[[ 0 1308 17410 16098]
[ 1246 0 17557 16244]
--> [17675 18002 0 5618]
[16257 16584 5508 0]]
The number after 2 in this first list from best_selected is 3, so I take finalList[2][3] = 5618 as the first value. Since it stopped at 3, I now check row 3:
[[ 0 1308 17410 16098]
[ 1246 0 17557 16244]
[17675 18002 0 5618]
--> [16257 16584 5508 0]]
The next element after 3 is 0, so I add finalList[3][0] = 16257 to the variable 'index' that already contains 5618, so index now holds 16257 + 5618. I'm trying to do that for each iteration over best_selected, and so far I'm getting errors.
The code I'm trying to work out:
count = len(best_selected)
index = 0
j = 0
val = best_selected[0]
while count != 0:
    index = index + finalList[best_selected[val]]  # fails: 'val' is a list here, not an index
    if counter == 1:  # 'counter' is never defined ('count' was probably intended)
        pass
    else:
        val = best_selected[j + 1]
    j = j + 1
    counter = counter - 1
print(index)
Any fix or easy solution would be highly appreciated!
If I understand you correctly, you want a sliding window of the current item and the next item in each sublist. You could iterate over the sublists and then over range(len(sublist)-1), so that you can assign the first index to i = sublist[indx] and the second to j = sublist[indx+1].
index = 0
for sublist in best_selected:
    for indx in range(len(sublist)-1):
        i = sublist[indx]
        j = sublist[indx+1]
        index += finalList[i][j]
The nicer variant would be to zip the sublist with sublist[1:] to get tuples of the indices, which you can immediately unpack in the head of the loop:
for sublist in best_selected:
    for i, j in zip(sublist, sublist[1:]):
        index += finalList[i][j]
Here's the solution with the separate sublist sums:
sums = []
for sublist in best_selected:
    sublist_sum = 0
    for i, j in zip(sublist, sublist[1:]):
        sublist_sum += finalList[i][j]
    sums.append(sublist_sum)
print(sums)
print(sum(sums))
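For the data in the question this prints [23183, 49911, 39432, 24274] and 136800; worked out by hand from finalList (for [2, 3, 0, 1]: 5618 + 16257 + 1308 = 23183), so treat it as a sanity check rather than verified program output.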

creating objects with attributes from a text file

I'm making a program that reads info about some football teams from a text file (named 30eapril.txt) and uses this data to create team objects. I wonder how I can make the program read the number of teams in the text file and create objects for all of them. The code I've written so far works but has a lot of repetitive parts!
class team:
    def __init__(self, teamdata):
        self.name = teamdata[0]
        self.wins = teamdata[1]
        self.drawn = teamdata[2]
        self.losses = teamdata[3]

    def __repr__(self):
        return self.name.ljust(15) + '{} {} {}'.format(self.wins, self.drawn, self.losses)

laglista = []
with open('30eapril.txt', 'rt') as file:
    for line in file:
        laglista.append(line)

team1data = (laglista[0]).split()
team2data = (laglista[1]).split()
team3data = (laglista[2]).split()
team4data = (laglista[3]).split()

lag1 = team(team1data)
lag2 = team(team2data)
lag3 = team(team3data)
lag4 = team(team4data)

print(lag1)
print(lag2)
print(lag3)
print(lag4)
This is what was in the text file:
Arsenal 2 1 0
Manchester 2 0 0
Liverpool 0 1 2
Newcastle 0 0 2
Hope that someone can help!
//Peter
Shortened code (it could certainly be even better):
#!/usr/bin/env python3

class team:
    def __init__(self, teamdata):
        self.name, self.wins, self.drawn, self.losses = teamdata

    def __repr__(self):
        return self.name.ljust(15) + '{} {} {}'.format(self.wins, self.drawn, self.losses)

lag = []
with open('30eapril.txt', 'rt') as file:
    for line in file:
        lag.append(team(line.split()))

# print("Number of teams: " + str(len(lag)))
for l in lag:
    print(l)
You don't need to know the number of lines of your file.
With the same content in '30eapril.txt', the output is:
$ ./test_script3.py
Arsenal 2 1 0
Manchester 2 0 0
Liverpool 0 1 2
Newcastle 0 0 2
The same script on '30eapril.txt' with an extra line:
$ ./test_script3.py
Arsenal 2 1 0
Manchester 2 0 0
Liverpool 0 1 2
Newcastle 0 0 2
AnotherClub 1 0 2
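The read loop can be compressed one step further with a list comprehension; the if line.strip() guard is my own addition to skip blank lines, which would otherwise crash the tuple unpacking in __init__:
with open('30eapril.txt', 'rt') as file:
    lag = [team(line.split()) for line in file if line.strip()]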
