How to use pandas df column value in if-else expression to calculate additional columns - python-3.x

I am trying to calculate additional metrics from an existing pandas DataFrame by using an if/else condition on existing column values.
if (df['Sell_Ind']=='N').any():
    df['MarketValue'] = df.apply(lambda row: row.SharesUnits * row.CurrentPrice, axis=1).astype(float).round(2)
elif (df['Sell_Ind']=='Y').any():
    df['MarketValue'] = df.apply(lambda row: row.SharesUnits * row.Sold_price, axis=1).astype(float).round(2)
else:
    df['MarketValue'] = df.apply(lambda row: 0)
For the if condition the MarketValue is calculated correctly, but for the elif condition it is not giving the correct value.
Can anyone point out what I am doing wrong in this code?

Your if/elif runs only once for the whole DataFrame, because (df['Sell_Ind']=='N').any() is a single boolean, so one branch is applied to every row. I think you need numpy.select; apply can be removed and the columns multiplied with mul:
m1 = df['Sell_Ind']=='N'
m2 = df['Sell_Ind']=='Y'
a = df.SharesUnits.mul(df.CurrentPrice).astype(float).round(2)
b = df.SharesUnits.mul(df.Sold_price).astype(float).round(2)
df['MarketValue'] = np.select([m1, m2], [a,b], default=0)
Sample:
import numpy as np
import pandas as pd

df = pd.DataFrame({'Sold_price':[7,8,9,4,2,3],
                   'SharesUnits':[1,3,5,7,1,0],
                   'CurrentPrice':[5,3,6,9,2,4],
                   'Sell_Ind':list('NNYYTT')})
#print (df)
m1 = df['Sell_Ind']=='N'
m2 = df['Sell_Ind']=='Y'
a = df.SharesUnits.mul(df.CurrentPrice).astype(float).round(2)
b = df.SharesUnits.mul(df.Sold_price).astype(float).round(2)
df['MarketValue'] = np.select([m1, m2], [a,b], default=0)
print (df)
CurrentPrice Sell_Ind SharesUnits Sold_price MarketValue
0 5 N 1 7 5.0
1 3 N 3 8 9.0
2 6 Y 5 9 45.0
3 9 Y 7 4 28.0
4 2 T 1 2 0.0
5 4 T 0 3 0.0
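If you prefer assignment with boolean masks instead of building the full-length Series a and b, here is a minimal sketch on the sample data above (an alternative, not the original answer):
df['MarketValue'] = 0.0
m1 = df['Sell_Ind'] == 'N'
m2 = df['Sell_Ind'] == 'Y'
# compute each branch only on the rows it applies to
df.loc[m1, 'MarketValue'] = (df.loc[m1, 'SharesUnits'] * df.loc[m1, 'CurrentPrice']).astype(float).round(2)
df.loc[m2, 'MarketValue'] = (df.loc[m2, 'SharesUnits'] * df.loc[m2, 'Sold_price']).astype(float).round(2)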

Related

How to compare / update two different DataFrames?

I'm trying to write code that checks whether data needs to be updated, and updates it if so.
The problem is efficiency. The only idea I have is a nested loop, and there should be a better way to do this.
There are two DataFrames: df_new and df_old.
I want to update df_old's data with the newer one.
Also, I want to write a CHANGELOG entry if there's a change (and if not, just a timestamp).
Here's my sample code:
import pandas as pd
from datetime import datetime

df_new = pd.DataFrame({"id":[11,22,33,44,55], "a2":[2,3,8,9,99], "a3":[2,4,2,5,99]})
df_old = pd.DataFrame({"id":[11,22,33,44], "a2":[2,3,4,7], "a3":[2,2,2,7], "CHANGELOG":["","","",""]})

for row in df_new.itertuples():
    flag = 0
    for row2 in df_old.itertuples():
        if row[1] == row2[1]:
            p = str(datetime.now().date()) + "\n"
            if row[2] != row2[2]:
                p += "a2 : " + str(row[2]) + " -> " + str(row2[2]) + "\n"
                df_old.at[row2[0],"a2"] = str(row[2])
            if row[3] != row2[3]:
                p += "a3 : " + str(row[3]) + " -> " + str(row2[3]) + "\n"
                df_old.at[row2[0],"a3"] = str(row[3])
            df_old.at[row2[0],"CHANGELOG"] = p
            flag = 1
            break
    if flag == 0:
        df_old = df_old.append(pd.DataFrame([row], columns=row._fields), ignore_index=True)
        df_old.at[len(df_old)-1,"CHANGELOG"] = str(datetime.now().date()) + "\n" + "Created"
The code actually works, but only with small datasets. If I run it with tens of thousands of rows (in each frame), as you've probably guessed, it takes too much time.
I've found that there's pd.compare, but it seems to work only on two DataFrames with the same rows/columns. And... I'm stuck now.
Are there any functions or references that I can use?
Thank you in advance.
Indeed, as stated in the docs, pd.compare "[c]an only compare identically-labeled (i.e. same shape, identical row and column labels) DataFrames". So, let's first achieve this:
import pandas as pd
from datetime import date

df_old = pd.DataFrame({"id":[11,22,33,44],
                       "a2":[2,3,4,7],
                       "a3":[2,2,2,7],
                       "CHANGELOG":["","","",""]})
df_new = pd.DataFrame({"id":[11,22,33,44,55],
                       "a2":[2,3,8,9,99],
                       "a3":[2,4,2,5,99]})

# get slice of `df_old` with just columns that need comparing
df_slice = df_old.iloc[:,:3]

# get missing indices from `df_new`, build empty df and append to `df_slice`
missing_indices = set(df_new.index).difference(set(df_slice.index))
df_missing = pd.DataFrame(columns=df_slice.columns, index=missing_indices)
df_slice = pd.concat([df_slice, df_missing], axis=0)

print(df_slice)
id a2 a3
0 11 2 2
1 22 3 2
2 33 4 2
3 44 7 7
4 NaN NaN NaN
Now, we can use pd.compare:
# compare the two dfs, keep_shape=True: same rows remain in result
comp = df_slice.compare(df_new, keep_shape=True)
print(comp)
id a2 a3
self other self other self other
0 NaN NaN NaN NaN NaN NaN
1 NaN NaN NaN NaN 2 4.0
2 NaN NaN 4 8.0 NaN NaN
3 NaN NaN 7 9.0 7 5.0
4 NaN 55.0 NaN 99.0 NaN 99.0
Finally, let's apply a custom function to the comp df to generate the strings for the column CHANGELOG. Something like below:
# create func to build changelog strings per row
def create_change_log(row: pd.Series) -> str:
    """
    Parameters
    ----------
    row : pd.Series
        e.g. comp.iloc[1] with ('id','self') etc. as index

    Returns
    -------
    str
        change_log_string per row.
    """
    # start string for each row
    string = str(date.today()) + "\n"
    # get length of pd.Series
    length = len(row)
    # get range for loop over 'self', so index 0,2,4 if len == 6
    self_iloc = [*range(0, length, 2)]
    # get level 0 from index to retrieve orig col names: ['id'] etc.
    cols = row.index.get_level_values(0)
    # for loop, check scenarios
    for i in self_iloc:
        temp = str()
        # both 'self' and 'other' vals are NaN, nothing changed
        if row.isna().all():
            break
        # all of 'self' == NaN, entire row is new
        if row.iloc[self_iloc].isna().all():
            temp = 'Created\n'
            string += temp
            break
        # set up comp for specific cols: comp 'self.1' == 'other.1' etc.
        self_val, other_val = row[i], row[i+1]
        # add `pd.notna()`, since np.nan == np.nan is actually `False`!
        if self_val != other_val and pd.notna(self_val):
            temp = f'{cols[i]} : {self_val} -> {other_val}\n'
            string += temp
    return string
Applied to comp:
change_log = comp.apply(lambda row: create_change_log(row), axis=1)
change_log.name = 'CHANGELOG'
# result
df_old_adj = pd.concat([df_new,change_log],axis=1)
print(df_old_adj)
id a2 a3 CHANGELOG
0 11 2 2 2022-08-29\n
1 22 3 4 2022-08-29\na3 : 2 -> 4.0\n
2 33 8 2 2022-08-29\na2 : 4 -> 8.0\n
3 44 9 5 2022-08-29\na2 : 7 -> 9.0\na3 : 7 -> 5.0\n
4 55 99 99 2022-08-29\nCreated\n
PS.1: my result has e.g. 2022-08-29\na3 : 2 -> 4.0\n where you generate 2022-08-29\na3 : 4 -> 2\n. The former seems to me correct; you want to convey: value 2 in column a3 has become (->) 4, no? Anyway, you can just switch the vars in {self_val} -> {other_val}, of course.
PS.2: comp is turning ints into floats automatically for other (= df_new). Hence, we end up with 2 -> 4.0 rather than 2 -> 4. I'd say the best solution to 'fix' this depends on the type of values you are expecting.
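If whole numbers are expected, one option (a hypothetical sketch, not part of the answer above) is a small formatting helper used inside create_change_log so whole floats are printed without the trailing .0:

def fmt(val) -> str:
    # hypothetical helper: render whole floats as ints, e.g. 4.0 -> '4'
    if isinstance(val, float) and val.is_integer():
        return str(int(val))
    return str(val)

# then build each changelog entry as:
# temp = f'{cols[i]} : {fmt(self_val)} -> {fmt(other_val)}\n'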

Replace values in a dataset and apply a quartile rule by row in pandas

I have a dataset with lots of variables, so I've extracted the numeric ones:
numeric_columns = transposed_df.select_dtypes(np.number)
Then I want to replace all 0 values with 0.0001:
transposed_df[numeric_columns.columns] = numeric_columns.where(numeric_columns.eq(0, axis=0), 0.0001)
And here is the first problem: this line is not replacing the 0 values with 0.0001, it is replacing all non-zero values with 0.0001.
Also, after replacing the 0 values with 0.0001, I want to replace all values that are less than the first quartile of their row with -1 and leave the others as they were, but I can't work out how.
To answer your first question
In [36]: from pprint import pprint
In [37]: pprint( numeric_columns.where.__doc__)
('\n'
'Replace values where the condition is False.\n'
'\n'
'Parameters\n'
'----------\n'
Because of that, all the values except 0 are getting replaced.
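As a tiny illustration of the difference (a toy Series, not the asker's data): where keeps values where the condition is True and replaces the rest, while mask does the opposite:

import pandas as pd

s = pd.Series([0, 5, 0, 7])

# where: keep values where the condition holds, replace the others
print(s.where(s.eq(0), 0.0001).tolist())   # [0.0, 0.0001, 0.0, 0.0001]

# mask: replace values where the condition holds, keep the others
print(s.mask(s.eq(0), 0.0001).tolist())    # [0.0001, 5.0, 0.0001, 7.0]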
Use DataFrame.mask, and for the second condition compare against DataFrame.quantile:
import numpy as np
import pandas as pd

transposed_df = pd.DataFrame({
    'A':list('abcdef'),
    'B':[0,0.5,4,5,5,4],
    'C':[7,8,9,4,2,3],
    'D':[1,3,0,7,1,0],
    'E':[5,3,6,9,2,4],
    'F':list('aaabbb')
})

numeric_columns = transposed_df.select_dtypes(np.number)

# m1: zero values, m2: values below the first quartile of their row
m1 = numeric_columns.eq(0)
m2 = numeric_columns.lt(numeric_columns.quantile(q=0.25, axis=1), axis=0)

transposed_df[numeric_columns.columns] = numeric_columns.mask(m1, 0.0001).mask(m2, -1)
print (transposed_df)
A B C D E F
0 a -1.0 7 1.0 5 a
1 b -1.0 8 3.0 3 a
2 c 4.0 9 -1.0 6 a
3 d 5.0 -1 7.0 9 b
4 e 5.0 2 -1.0 2 b
5 f 4.0 3 -1.0 4 b
EDIT:
from scipy.stats import zscore
print (transposed_df[numeric_columns.columns].apply(zscore))
B C D E
0 -2.236068 0.570352 -0.408248 0.073521
1 0.447214 0.950586 0.408248 -0.808736
2 0.447214 1.330821 -0.816497 0.514650
3 0.447214 -0.570352 2.041241 1.838037
4 0.447214 -1.330821 -0.408248 -1.249865
5 0.447214 -0.950586 -0.816497 -0.367607
EDIT1:
transposed_df = pd.DataFrame({
    'A':list('abcdef'),
    'B':[0,1,1,1,1,1],
    'C':[1,8,9,4,2,3],
    'D':[1,3,0,7,1,0],
    'E':[1,3,6,9,2,4],
    'F':list('aaabbb')
})

numeric_columns = transposed_df.select_dtypes(np.number)

from scipy.stats import zscore
df1 = pd.DataFrame(numeric_columns.apply(zscore, axis=1).tolist(), index=transposed_df.index)
transposed_df[numeric_columns.columns] = df1
print (transposed_df)
A B C D E F
0 a -1.732051 0.577350 0.577350 0.577350 a
1 b -1.063410 1.643452 -0.290021 -0.290021 a
2 c -0.816497 1.360828 -1.088662 0.544331 a
3 d -1.402136 -0.412393 0.577350 1.237179 b
4 e -1.000000 1.000000 -1.000000 1.000000 b
5 f -0.632456 0.632456 -1.264911 1.264911 b
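As a side note (a sketch, not in the original answer), scipy's zscore takes an axis argument directly, which avoids the apply and the intermediate DataFrame:

from scipy.stats import zscore

# row-wise z-scores in one call (ddof=0 by default, matching the output above)
transposed_df[numeric_columns.columns] = zscore(numeric_columns, axis=1)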

Pandas Aggregate data other than a specific value in specific column

I have my data like this in a pandas DataFrame:
df = pd.DataFrame({
    'ID':range(1, 8),
    'Type':list('XXYYZZZ'),
    'Value':[2,3,2,9,6,1,4]
})
How can I generate the output I want using a pandas DataFrame? I want to include all the Y values of the Type column and do not want to aggregate them.
First filter values by boolean indexing, aggregate, append the filtered-out rows back, and finally sort:
mask = df['Type'] == 'Y'
df1 = (df[~mask].groupby('Type', as_index=False)
.agg({'ID':'first', 'Value':'sum'})
.append(df[mask])
.sort_values('ID'))
print (df1)
ID Type Value
0 1 X 5
2 3 Y 2
3 4 Y 9
1 5 Z 11
If you want a range from 1 to the length of the data for the ID column:
mask = df['Type'] == 'Y'
df1 = (df[~mask].groupby('Type', as_index=False)
.agg({'ID':'first', 'Value':'sum'})
.append(df[mask])
.sort_values('ID')
.assign(ID = lambda x: np.arange(1, len(x) + 1)))
print (df1)
ID Type Value
0 1 X 5
2 2 Y 2
3 3 Y 9
1 4 Z 11
Another idea is to create a helper column with unique values only for the Y rows and aggregate by both columns:
mask = df['Type'] == 'Y'
df['g'] = np.where(mask, mask.cumsum() + 1, 0)
df1 = (df.groupby(['Type','g'], as_index=False)
.agg({'ID':'first', 'Value':'sum'})
.drop('g', axis=1)[['ID','Type','Value']])
print (df1)
ID Type Value
0 1 X 5
1 3 Y 2
2 4 Y 9
3 5 Z 11
A similar alternative uses the array g directly in groupby, so the drop is not necessary:
mask = df['Type'] == 'Y'
g = np.where(mask, mask.cumsum() + 1, 0)
df1 = (df.groupby(['Type',g], as_index=False)
.agg({'ID':'first', 'Value':'sum'})[['ID','Type','Value']])
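Note that DataFrame.append was removed in pandas 2.0, so on recent versions the first approach above can be rewritten with pd.concat; a minimal sketch with the same logic:

mask = df['Type'] == 'Y'
agg = df[~mask].groupby('Type', as_index=False).agg({'ID':'first', 'Value':'sum'})
df1 = pd.concat([agg, df[mask]]).sort_values('ID')[['ID','Type','Value']]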

Concatenating two columns next to each other

I've written a function that collects some data via input(); that part is unimportant to the question at hand. However, at the end I need to concat two columns.
So far I've got it working to an extent, but it's not perfect.
def visualise_country():
    data = pd.read_csv('tas_pr_1991_2015_AC.csv')
    target_frame = get_info()
    df1 = pd.DataFrame(data.loc[data['country'] == target_frame[0]])
    df1 = pd.DataFrame(df1.loc[df1['year'] == int(target_frame[2])])
    df1 = df1[target_frame[4]]
    df2 = pd.DataFrame(data.loc[data['country'] == target_frame[1]])
    df2 = pd.DataFrame(df2.loc[df2['year'] == int(target_frame[3])])
    df2 = df2[target_frame[4]]
    frame_list = [df1, df2]
    df = pd.concat(frame_list, axis=1)
    print("Data for {} in comparison with {}. Comparison years for {}: {} and {}: {}".format(target_frame[0], target_frame[1], target_frame[0], target_frame[2], target_frame[1], target_frame[3]))
    return df
target_frame is just a tuple containing the collected information needed to select the columns.
Output:
1 - NaN
2 - NaN
3 - NaN
4 - NaN
NaN - 5
NaN - 6
NaN - 7
NaN - 8
Desired output:
1 - 5
2 - 6
3 - 7
4 - 8
You need the same index values in all DataFrames:
frame_list = [x.reset_index(drop=True) for x in [df1,df2]]
Or:
df1.index = df2.index
frame_list = [df1,df2]
df = pd.concat(frame_list, axis=1)
Or:
df1 = df1[target_frame[4]].reset_index(drop=True)
df2 = df2[target_frame[4]].reset_index(drop=True)
frame_list = [df1,df2]
df = pd.concat(frame_list, axis=1)
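For context (a toy sketch, not the asker's data): pd.concat with axis=1 aligns on index labels, so two slices with disjoint indices produce the NaN staircase shown above, while resetting the index lines the values up:

import pandas as pd

s1 = pd.Series([1, 2, 3, 4], index=[0, 1, 2, 3])
s2 = pd.Series([5, 6, 7, 8], index=[4, 5, 6, 7])

# disjoint indices -> values land in different rows, padded with NaN
print(pd.concat([s1, s2], axis=1))

# same positional index -> values line up side by side
print(pd.concat([s.reset_index(drop=True) for s in [s1, s2]], axis=1))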

Pandas Pivot Table Count Values (Exclude "NaN")

Given the following data frame:
import numpy as np
import pandas as pd
df = pd.DataFrame({'Site':['a','a','a','b','b','b'],
                   'x':[1,1,0,1,np.nan,0],
                   'y':[1,np.nan,0,1,1,0]
                   })
df
Site y x
0 a 1.0 1
1 a NaN 1
2 a 0.0 0
3 b 1.0 1
4 b 1.0 NaN
5 b 0.0 0
I'd like to pivot this data frame to get the count of values (excluding "NaN") for each column.
I tried what I found in other posts, but nothing seems to work (maybe there was a change in pandas 0.18)?
Desired result:
Item count
Site
a y 2
b y 3
a x 3
b x 2
Thanks in advance!
pvt = pd.pivot_table(df, index = "Site", values = ["x", "y"], aggfunc = "count").stack().reset_index(level = 1)
pvt.columns = ["Item", "count"]
pvt
Out[38]:
Item count
Site
a x 3
a y 2
b x 2
b y 3
You can add pvt.sort_values("Item", ascending = False) if you want y's to appear first.
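An equivalent route (a sketch, not part of the answer above) is groupby with count, which also skips NaN, followed by stack:

out = (df.groupby('Site')[['x', 'y']]
         .count()                      # count() ignores NaN by default
         .stack()
         .rename('count')
         .reset_index(level=1)
         .rename(columns={'level_1': 'Item'}))
print(out)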
