Iterate over a column, check a condition, and carry out calculations with values from other data frames - python-3.x

import pandas as pd
import numpy as np
I have 3 dataframes: df1, df2 and df3.
df1=
data = {'Period': ['2024-04-01', '2024-07-01', '2024-10-01', '2025-01-01', '2025-04-01', '2025-07-01', '2025-10-01', '2026-01-01', '2026-04-01', '2026-07-01', '2026-10-01', '2027-01-01', '2027-04-01', '2027-07-01', '2027-10-01', '2028-01-01', '2028-04-01', '2028-07-01', '2028-10-01'],
        'Price': [np.nan] * 19,
        'years': [2024,2024,2024, 2025,2025,2025,2025, 2026,2026,2026,2026, 2027,2027,2027,2027, 2028,2028,2028,2028],
        'quarters': [2,3,4, 1,2,3,4, 1,2,3,4, 1,2,3,4, 1,2,3,4]
        }
df1 = pd.DataFrame(data=data)
df2=
data = {'price': [473.26, 244, 204, 185, 152, 157],
        'year': [2023, 2024, 2025, 2026, 2027, 2028]
        }
df2 = pd.DataFrame(data=data)
df3=
data = {'quarters': [1, 2, 3, 4],
        'weights': [1.22, 0.81, 0.83, 1.12]
        }
df3 = pd.DataFrame(data=data)
My aim is to compute the Price column of df1. For each row of df1, check the condition and carry out the calculation accordingly. For example, for the first row, check whether df1['years'] == 2024 and df1['quarters'] == 2; then df1['Price'] = df2.loc[df2['year'] == 2024, 'price'] * df3.loc[df3['quarters'] == 2, 'weights'].
===>>> df1['Price'][0] = 473.26 * 0.81
df1['Price'][1] = 473.26 * 0.83
...
...
...
and so on.
I could have used this method, but I want to write the code in a more efficient way. I would like to use the following code structure:
for i in range(len(df1)):
    if (df1['years'].iloc[i] == 2024) & (df1['quarters'].iloc[i] == 2):
        df1.loc[i, 'Price'] = df2.loc[df2['year'] == 2024, 'price'].iat[0] * df3.loc[df3['quarters'] == 2, 'weights'].iat[0]
    elif (df1['years'].iloc[i] == 2024) & (df1['quarters'].iloc[i] == 3):
        df1.loc[i, 'Price'] = df2.loc[df2['year'] == 2024, 'price'].iat[0] * df3.loc[df3['quarters'] == 3, 'weights'].iat[0]
    elif (df1['years'].iloc[i] == 2024) & (df1['quarters'].iloc[i] == 4):
        df1.loc[i, 'Price'] = df2.loc[df2['year'] == 2024, 'price'].iat[0] * df3.loc[df3['quarters'] == 4, 'weights'].iat[0]
...
...
...
Thanks!!!

I think if I understand correctly you can use pd.merge to bring these fields together first.
df1 = df1.merge(df2, how='left' , left_on='years', right_on='year')
df1 = df1.merge(df3, how='left' , left_on='quarters', right_on='quarters')
df1['Price'] = df1['price']*df1['weights']
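Putting it together, here is a minimal end-to-end sketch; dropping the helper lookup columns at the end is my addition, not part of the original answer:
# assuming df1, df2 and df3 are defined as in the question
df1 = df1.merge(df2, how='left', left_on='years', right_on='year')
df1 = df1.merge(df3, how='left', left_on='quarters', right_on='quarters')
df1['Price'] = df1['price'] * df1['weights']
df1 = df1.drop(columns=['price', 'year', 'weights'])  # keep only the original columns plus the computed Price
print(df1.head())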

Related

Is there a way to vectorize adding missing months using resample?

I am trying to add missing months for each ID. Added months should have info on ID and year_month, and NaN for product. My code achieves this using apply(), but it is slow -- I am looking for a vectorized version, which can run significantly faster.
Specifically, df.set_index(df.index).groupby('ID').apply(add_missing_months) takes about 20 seconds on my system with 60 000 rows. I plan to work with data with millions of rows, so I think I need to vectorize the operation. Any help is highly appreciated!
import pandas as pd
df = pd.DataFrame({'ID': [1, 1, 1, 2, 2, 3], 'year_month': ['2020-01-01','2020-08-01','2020-10-01','2020-01-01','2020-07-01','2021-05-01'], 'product':['A','B','C','A','D','C']})
# Enlarge dataset to 60 000 rows
for i in range(9999):
    df2 = df.iloc[-6:].copy()
    df2['ID'] = df2['ID'] + 3
    df = pd.concat([df, df2], axis=0, ignore_index=True)
df['year_month'] = pd.to_datetime(df['year_month'])
df.index = pd.to_datetime(df['year_month'], format = '%Y%m%d')
df = df.drop('year_month', axis = 1)
# The slow function
def add_missing_months(s):
    min_d = s.index.min()
    max_d = s.index.max()
    s = s.reindex(pd.date_range(min_d, max_d, freq='MS'))
    return s
df = df.set_index(df.index).groupby('ID').apply(add_missing_months)
df = df.drop('ID', axis = 1)
df = df.reset_index()
Not sure if it is faster, but simpler code is:
df = df.sort_index().groupby('ID').apply(lambda x: x.asfreq('MS'))
It gives the same result as the original approach:
df1 = df.groupby('ID').apply(lambda x: x.asfreq('MS'))
df2 = df.set_index(df.index).groupby('ID').apply(add_missing_months)
print (df1.equals(df2))
True
EDIT: To improve performance, create month periods with Series.dt.to_period, aggregate the minimal and maximal value per ID, repeat the index with Index.repeat according to their difference, add a counter with GroupBy.cumcount to build the month ranges, convert back to timestamps with Series.dt.to_timestamp and finally use a left join:
df1 = (df.assign(year_month = df['year_month'].dt.to_period('m'))
         .groupby(['ID'])['year_month']
         .agg(['min', 'max']))
diff = df1['max'].astype('int').sub(df1['min'].astype('int')) + 1
df1 = df1.loc[df1.index.repeat(diff)]
df1 = (df1['min'].add(df1.groupby(level=0).cumcount())
                 .dt.to_timestamp()
                 .reset_index(name='year_month'))
df = df1.merge(df.rename_axis(None), how='left')
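To see why the repeat/cumcount step generates the full month range per ID, here is a minimal toy sketch (the toy data and variable names are mine, not from the answer):
# toy frame with two IDs, for illustration only
toy = pd.DataFrame({'ID': [1, 1, 2],
                    'year_month': pd.to_datetime(['2020-01-01', '2020-04-01', '2020-02-01']),
                    'product': ['A', 'B', 'C']})
g = (toy.assign(year_month=toy['year_month'].dt.to_period('m'))
        .groupby('ID')['year_month'].agg(['min', 'max']))
n = g['max'].astype('int').sub(g['min'].astype('int')) + 1  # number of months each ID must cover
g = g.loc[g.index.repeat(n)]                                # one row per target month
months = (g['min'] + g.groupby(level=0).cumcount()).dt.to_timestamp()
print(months.reset_index(name='year_month'))                # ID 1 gets 2020-01..2020-04, ID 2 only 2020-02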
Performance:
In [276]: %timeit jez(df)
126 ms ± 7.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [277]: %timeit vogel(df)
312 ms ± 32.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
df = pd.DataFrame({'ID': [1, 1, 1, 2, 2, 3], 'year_month': ['2020-01-01','2020-08-01','2020-10-01','2020-01-01','2020-07-01','2021-05-01'], 'product':['A','B','C','A','D','C']})
# Enlarge dataset to 60 000 rows
for i in range(9999):
    df2 = df.iloc[-6:].copy()
    df2['ID'] = df2['ID'] + 3
    df = pd.concat([df, df2], axis=0, ignore_index=True)
df['year_month'] = pd.to_datetime(df['year_month'])
df.index = pd.to_datetime(df['year_month'], format = '%Y%m%d')
def jez(df):
    df1 = df.assign(year_month = df['year_month'].dt.to_period('m')).groupby(['ID'])['year_month'].agg(['min', 'max'])
    df1 = df1.loc[df1.index.repeat(df1['max'].astype('int').sub(df1['min'].astype('int')) + 1)]
    df1 = (df1['min'] + df1.groupby(level=0).cumcount()).dt.to_timestamp().reset_index(name='year_month')
    return df1.merge(df.rename_axis(None), how='left')
def vogel(df):
    df_orig = df.rename_axis(None)  # keep the original rows so the product column can be merged back in (fix: the last merge should not be with df itself)
    min_d = df['year_month'].min()
    max_d = df['year_month'].max()
    # generate all possible combinations of date and ID
    df_agg = df.groupby(['ID'])['year_month'].agg(['min', 'max'])
    df = pd.DataFrame(
        index=pd.MultiIndex.from_product(
            [pd.date_range(min_d, max_d, freq='MS'), df_agg.index]
        )
    )
    # reduce to only relevant dates
    df = df.merge(df_agg, left_on='ID', right_index=True)
    df = df.reset_index().rename(columns={'level_0': 'year_month'})
    df = df[df['year_month'].between(df['min'], df['max'])]
    df = df.drop(columns=['min', 'max'])
    # add product information
    df = df.merge(df_orig, how='left')
    return df
The code is a lot more convoluted and could probably be improved a bit, but it does seem to be much faster (down from 16 to 0.2 seconds on my system, starting from # my code starts here):
import pandas as pd
df = pd.DataFrame({'ID': [1, 1, 1, 2, 2, 3], 'year_month': ['2020-01-01','2020-08-01','2020-10-01','2020-01-01','2020-07-01','2021-05-01'], 'product':['A','B','C','A','D','C']})
# Enlarge dataset to 60 000 rows
for i in range(9999):
    df2 = df.iloc[-6:].copy()
    df2['ID'] = df2['ID'] + 3
    df = pd.concat([df, df2], axis=0, ignore_index=True)
df['year_month'] = pd.to_datetime(df['year_month'])
# my code starts here
df_orig = df  # keep the original frame so the product column can be merged back in at the end
# find overall min and max date
min_d = df['year_month'].min()
max_d = df['year_month'].max()
# generate all possible combinations of date and ID
df_agg = df.groupby(['ID'])['year_month'].agg(['min', 'max'])
df = pd.DataFrame(
index=pd.MultiIndex.from_product(
[pd.date_range(min_d, max_d, freq='MS'), df_agg.index]
)
)
# reduce to only relevant dates
df = df.merge(df_agg, left_on='ID', right_index=True)
df = df.reset_index().rename(columns={'level_0': 'year_month'})
df = df[df['year_month'].between(df['min'], df['max'])]
df = df.drop(columns=['min', 'max'])
# add product information
df = df.merge(df_orig, how='left')

Compare two DataFrames and check for changes

I have 2 similar Spark Dataframes df1 and df2 that I want to compare for changes:
df1 and df2 share the same columns
df2 can have more rows than df1, but any additional rows in df2 which are not in df1 can be ignored when comparing
Comparison key columns are PROGRAM_NAME and ACTION
df1 = spark.createDataFrame([
["PROG1","ACTION1","10","NEW"],
["PROG2","ACTION2","12","NEW"],
["PROG3","ACTION1","14","NEW"],
["PROG4","ACTION4","16","NEW"]
],["PROGRAM_NAME", "ACTION", "VALUE1", "STATUS"])
df2 = spark.createDataFrame([
["PROG1","ACTION1","11","IN PROGRESS"],
["PROG2","ACTION2","12","NEW"],
["PROG3","ACTION1","20","FINISHED"],
["PROG4","ACTION4","14","IN PROGRESS"],
["PROG5","ACTION1","20","NEW"]
],["PROGRAM_NAME", "ACTION", "VALUE1", "STATUS"])
Showing below in order df1, df2 and the expected result I want after comparing the 2 DataFrames.
Similar questions have been asked multiple times here in SO.
Use a simple join to get rows that are in df1 and df2 and filter on those that have different values for the 2 other columns:
from pyspark.sql.functions import col
df_final = df2.alias("new").join(
df1.alias("old"),
(col("new.PROGRAM_NAME") == col("old.PROGRAM_NAME")) & (col("new.ACTION") == col("old.ACTION"))
).filter(
(col("new.VALUE1") != col("old.VALUE1")) | (col("new.STATUS") != col("old.STATUS"))
).select("new.*")
df_final.show()
#+------------+-------+------+-----------+
#|PROGRAM_NAME| ACTION|VALUE1| STATUS|
#+------------+-------+------+-----------+
#| PROG3|ACTION1| 20| FINISHED|
#| PROG4|ACTION4| 14|IN PROGRESS|
#| PROG1|ACTION1| 11|IN PROGRESS|
#+------------+-------+------+-----------+
You can also add the filter condition directly to the join condition.
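A sketch of that variant, folding the inequality into the join condition (same column names as above; this exact code is not from the answer):
from pyspark.sql.functions import col
df_final = df2.alias("new").join(
    df1.alias("old"),
    (col("new.PROGRAM_NAME") == col("old.PROGRAM_NAME"))
    & (col("new.ACTION") == col("old.ACTION"))
    & ((col("new.VALUE1") != col("old.VALUE1")) | (col("new.STATUS") != col("old.STATUS")))
).select("new.*")
df_final.show()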
You can achieve the result like this:
import pandas as pd
dict1 = {"PROGRAM_NAME":["PROG1","PROG2","PROG3","PROG4"],
"ACTION":["ACTION1","ACTION2","ACTION1","ACTION4"],
"Value1":[10,12,14,16],
"Status":["NEW","NEW","NEW","NEW"]}
dict2 = {"PROGRAM_NAME":["PROG1","PROG2","PROG3","PROG4","PROG5"],
"ACTION":["ACTION1","ACTION2","ACTION1","ACTION4","ACTION1"],
"Value1":[11,12,20,14,20],
"Status":["IN PROGRES","NEW","FINISHED","IN PROGRES","NEW"]}
DF1 = pd.DataFrame(dict1)
DF2 = pd.DataFrame(dict2)
DF3 = DF2.copy()
DF3 = DF3[DF3["PROGRAM_NAME"].isin(DF1["PROGRAM_NAME"])]
You can merge df1 and df2 and retain the VALUE1 and STATUS values from df2 only.
Keep the suffix of df1's columns as _x and leave df2's columns without a suffix, then retain just the columns of df2:
df1.merge(df2, on=['PROGRAM_NAME', 'ACTION'], suffixes=('_x', ''))[df2.columns]
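If you also want to keep only the rows whose values actually changed, as in the expected result, one possible extension of this merge filters on the _x columns before selecting df2's columns (a hedged sketch assuming pandas DataFrames df1 and df2 with the columns PROGRAM_NAME, ACTION, VALUE1 and STATUS; the variable names are mine):
merged = df1.merge(df2, on=['PROGRAM_NAME', 'ACTION'], suffixes=('_x', ''))
changed = (merged['VALUE1'] != merged['VALUE1_x']) | (merged['STATUS'] != merged['STATUS_x'])
result = merged.loc[changed, df2.columns]  # rows of df2 whose VALUE1 or STATUS differ from df1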
Here's the solution in Spark:
import pyspark.sql.types as T
import pyspark.sql.functions as F
df1 = spark.createDataFrame(
[
('PROG1', 'ACTION1', 10, 'NEW'),
('PROG2', 'ACTION2', 12, 'NEW'),
('PROG3', 'ACTION1', 14, 'NEW'),
('PROG4', 'ACTION4', 16, 'NEW'),
],
['PROGRAM_NAME', 'ACTION', 'Value1', 'Status']
)
df2 = spark.createDataFrame(
[
('PROG1', 'ACTION1', 11, 'IN PROGRESS'),
('PROG2', 'ACTION2', 12, 'NEW'),
('PROG3', 'ACTION1', 20, 'FINISHED'),
('PROG4', 'ACTION4', 14, 'IN PROGRESS'),
('PROG5', 'ACTION1', 20, 'NEW'),
],
['PROGRAM_NAME', 'ACTION', 'Value1', 'Status']
)
df1 = df1.alias('df1')
df2 = df2.alias('df2')
df = df1.join(df2, on=['PROGRAM_NAME', 'ACTION'], how='inner')
df = df.filter(F.col('df1.Status') != F.col('df2.Status'))
df.select(
F.col('PROGRAM_NAME'),
F.col('ACTION'),
*[F.col(f'df2.{col}') for col in df2.columns[2:]]
)

Better way to swap column values and then append them in a pandas dataframe?

Here is my dataframe:
import pandas as pd
data = {'from':['Frida', 'Frida', 'Frida', 'Pablo','Pablo'], 'to':['Vincent','Pablo','Andy','Vincent','Andy'],
'score':[2, 2, 1, 1, 1]}
df = pd.DataFrame(data)
df
I want to swap the values in columns 'from' and 'to' and append them, because these scores work both ways. Here is what I have tried.
df_copy = df.copy()
df_copy.rename(columns={"from":"to","to":"from"}, inplace=True)
df_final = df.append(df_copy)
which works but is there a shorter way to do the same?
One line could be:
df_final = df.append(df.rename(columns={"from":"to","to":"from"}))
You are on the right track. Note that df.copy() already returns a deep copy by default, so the original df is not modified; passing deep=True just makes that explicit.
df_copy = df.copy(deep=True)
df_copy.rename(columns={"from":"to","to":"from"}, inplace=True)
df_final = df.append(df_copy)
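Note that DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so on current versions the same idea can be written with pd.concat; a sketch equivalent to the answers above (not from the original posts):
import pandas as pd
data = {'from': ['Frida', 'Frida', 'Frida', 'Pablo', 'Pablo'],
        'to': ['Vincent', 'Pablo', 'Andy', 'Vincent', 'Andy'],
        'score': [2, 2, 1, 1, 1]}
df = pd.DataFrame(data)
# concatenate the original rows with a copy whose 'from' and 'to' labels are swapped
df_final = pd.concat([df, df.rename(columns={'from': 'to', 'to': 'from'})], ignore_index=True)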

Error: 'BlockManager' object has no attribute 'T' issue while using df.at function in a loop

When I try to use the df.at function without a loop it works fine and changes the data for a particular column, but it gives an error when used in a loop.
Code is here.
import pandas as pd
data1 = {'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
'Height': [5.1, 6.2, 5.1, 5.2]}
df1 = pd.DataFrame(data1)
data2 = {'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
'Height': [4.1, 3.4, 7.1, 9.2]}
df2 = pd.DataFrame(data2)
df3 = pd.concat([df1, df2], axis=1)
for i in range(int(len(df1))):
    for j in range(int(len(df2))):
        if df1['Name'][i] != df2['Name'][j]:
            continue
        else:
            out = (df1['Height'][i] - df2['Height'][j])
            df3.at[i, 'Height_Comparison'] = out
            break
print(df3)
The issue was occurring because of duplicate column names ('Name', 'Height') in DataFrame df3, which were created by the concat operation. concat makes double entries with the same column names ('Name', 'Height') in df3, and that is what causes this error.
Once I changed the column names to Name1, Height1 in df1 and Name2, Height2 in df2, the issue got resolved.
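A minimal sketch of that fix; the renaming here uses add_suffix, which is my choice of helper rather than something from the answer:
import pandas as pd
data1 = {'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'], 'Height': [5.1, 6.2, 5.1, 5.2]}
data2 = {'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'], 'Height': [4.1, 3.4, 7.1, 9.2]}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
# suffix the columns so df3 has unique labels: Name1/Height1 and Name2/Height2
df3 = pd.concat([df1.add_suffix('1'), df2.add_suffix('2')], axis=1)
for i in range(len(df1)):
    for j in range(len(df2)):
        if df1['Name'][i] == df2['Name'][j]:
            df3.at[i, 'Height_Comparison'] = df1['Height'][i] - df2['Height'][j]
            break
print(df3)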

How to obtain the percentage of each value from a pandas table?

I have a table with 5 columns: AWA, REM, S1, S2 and SWS. I computed the sum of each column and of each row, and also the percentages of these values, but in order to get the percentages I repeated the same line 5 times.
Is there a way to improve it in case there were more than 5 columns?
Here is my code and I have also attached an image.
import pandas as pd
df = pd.DataFrame(TABLA, columns=('AWA', 'REM', 'S1', 'S2', 'SWS'))
df.index = 's' + (df.index + 1).astype(str)
df['xSubject'] = df.sum(axis=1)  # row totals: sum across the columns for each subject
######Here starts the repetition:
df['AWA%'] = df['AWA']/df['AWA'].sum()*100
df['REM%'] = df['REM']/df['REM'].sum()*100
df['S1%'] = df['S1']/df['S1'].sum()*100
df['S2%'] = df['S2']/df['S2'].sum()*100
df['SWS%'] = df['SWS']/df['SWS'].sum()*100
df['xSubject%'] = df['xSubject']/df['xSubject'].sum()*100
######Here ends the repetition:
df.loc['xStage'] = df.sum()  # column totals: sum down the rows for each stage
df
Use pd.concat with a reconstruction
pd.concat([df,
pd.DataFrame(df.div(df.sum()).values * 100,
columns=df.columns.values + '%')],
axis=1)
Consider the pd.DataFrame df
import numpy as np
df = pd.DataFrame(np.random.rand(10, 5),
                  columns=('AWA', 'REM', 'S1', 'S2', 'SWS'))
df
and the % calculation
df.div(df.sum())
Then using the above code
pd.concat([df,
pd.DataFrame(df.div(df.sum()).values * 100,
columns=df.columns.values + '%')],
axis=1)
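The same reconstruction can also be written with join and add_suffix; shown only as a hedged alternative, not part of the original answer:
# divide every column by its total, scale to percent, and suffix the new columns with '%'
df_pct = df.join(df.div(df.sum()).mul(100).add_suffix('%'))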
