Is there a way to vectorize adding missing months using resample? - python-3.x

I am trying to add missing months for each ID. Added months should have info on ID and year_month, and NaN for Product. My code achieves this using apply(), but is slow -- I am looking for a vectorized version, which can run significantly faster.
Specifically, df.set_index(df.index).groupby('ID').apply(add_missing_months) takes about 20 seconds on my system with 60 000 rows. I plan to work with data with millions of rows, so I think I need to vectorize the operation. Any help is highly appreciated!
import pandas as pd

df = pd.DataFrame({'ID': [1, 1, 1, 2, 2, 3], 'year_month': ['2020-01-01','2020-08-01','2020-10-01','2020-01-01','2020-07-01','2021-05-01'], 'product':['A','B','C','A','D','C']})
# Enlarge dataset to 60 000 rows.
# The original appended 6 rows at a time with pd.concat inside the loop, which
# is quadratic; replicating the 6-row base 10 000 times and offsetting each
# copy's IDs by 3 produces the identical frame with a single concat.
df = pd.concat([df] * 10000, ignore_index=True)
df['ID'] = df['ID'] + [3 * (i // 6) for i in range(len(df))]
df['year_month'] = pd.to_datetime(df['year_month'])
# `format` is ignored here: the column is already datetime64 after the line above
df.index = pd.to_datetime(df['year_month'], format = '%Y%m%d')
df = df.drop('year_month', axis = 1)
# The slow function
def add_missing_months(s):
    """Reindex one ID group onto its full month-start date range.

    Months absent from the group become rows of NaN values.
    """
    full_range = pd.date_range(s.index.min(), s.index.max(), freq='MS')
    return s.reindex(full_range)
# Expand every ID group to its full monthly range, then flatten the
# (ID, date) result back to ordinary columns.
df = (
    df.set_index(df.index)
      .groupby('ID')
      .apply(add_missing_months)
      .drop('ID', axis=1)
      .reset_index()
)

Not sure if it is faster, but simpler code is:
# Alternative: asfreq('MS') on each sorted group inserts the missing month starts.
df = df.sort_index().groupby('ID').apply(lambda x: x.asfreq('MS'))
# NOTE(review): df was reassigned on the line above, so the comparison below
# runs on the already-expanded frame — presumably both df1 and df2 were meant
# to start from the original df; verify against the surrounding text.
df1 = df.groupby('ID').apply(lambda x: x.asfreq('MS'))
df2 = df.set_index(df.index).groupby('ID').apply(add_missing_months)
print (df1.equals(df2))
True
EDIT: To improve performance, create month periods with Series.dt.to_period, aggregate the minimal and maximal values per ID, and repeat the index by their difference using Index.repeat; then add a counter from GroupBy.cumcount to build the month ranges, convert them to timestamps with Series.dt.to_timestamp, and finally use a left join:
# Per-ID month completion: build min/max month periods, repeat each ID by the
# size of its month span, turn the repeats into consecutive months, then
# left-join the original data back.
# ('M' month alias; the transcription's stray ')' on the cumcount line made
# this a SyntaxError, and lowercase 'm' is a deprecated alias spelling.)
df1 = (df.assign(year_month = df['year_month'].dt.to_period('M'))
         .groupby(['ID'])['year_month']
         .agg(['min', 'max']))
# months per span = difference of the period ordinals + 1
diff = df1['max'].astype('int').sub(df1['min'].astype('int')) + 1
df1 = df1.loc[df1.index.repeat(diff)]
# consecutive months: min period + a running counter within each ID
df1 = (df1['min'].add(df1.groupby(level=0).cumcount())
          .dt.to_timestamp()
          .reset_index(name='year_month'))
# rename_axis(None) avoids the index/column name clash on merge
df = df1.merge(df.rename_axis(None), how='left')
Performance:
In [276]: %timeit jez(df)
126 ms ± 7.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [277]: %timeit vogel(df)
312 ms ± 32.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
df = pd.DataFrame({'ID': [1, 1, 1, 2, 2, 3], 'year_month': ['2020-01-01','2020-08-01','2020-10-01','2020-01-01','2020-07-01','2021-05-01'], 'product':['A','B','C','A','D','C']})
# Enlarge dataset to 60 000 rows: replicate the 6-row base 10 000 times and
# shift each copy's IDs by 3 — a single concat, identical result to the
# original quadratic loop-concat (which was also a SyntaxError as transcribed).
df = pd.concat([df] * 10000, ignore_index=True)
df['ID'] = df['ID'] + [3 * (i // 6) for i in range(len(df))]
df['year_month'] = pd.to_datetime(df['year_month'])
# `format` is ignored: the column is already datetime64
df.index = pd.to_datetime(df['year_month'], format = '%Y%m%d')
def jez(df):
    """Vectorized month completion.

    Builds, per ID, the min..max month span via period ordinals, expands it
    with ``Index.repeat`` + ``cumcount``, and left-joins the original data
    back so missing months carry NaN products.
    """
    # 'M' month alias (lowercase 'm' is a deprecated spelling)
    df1 = (df.assign(year_month=df['year_month'].dt.to_period('M'))
             .groupby(['ID'])['year_month']
             .agg(['min', 'max']))
    # months per ID = difference of the period ordinals + 1
    df1 = df1.loc[df1.index.repeat(
        df1['max'].astype('int').sub(df1['min'].astype('int')) + 1)]
    # consecutive months: min period + a running counter within each ID
    df1 = (df1['min'] + df1.groupby(level=0).cumcount()).dt.to_timestamp().reset_index(name='year_month')
    # rename_axis(None) avoids the index/column name clash on merge
    return df1.merge(df.rename_axis(None), how='left')
def vogel(df):
    """Month completion via a full date x ID cross product, then filtering.

    Returns one row per (ID, month) between each ID's min and max month,
    with product info merged back from the input frame.
    """
    min_d = df['year_month'].min()
    max_d = df['year_month'].max()
    # per-ID month boundaries
    df_agg = df.groupby(['ID'])['year_month'].agg(['min', 'max'])
    # keep a handle on the input: `df` is rebound below, and the final merge
    # must join against the original rows (the transcribed version merged the
    # grid with itself, which loses the product column entirely)
    df_orig = df
    # generate all possible combinations of date and ID
    df = pd.DataFrame(
        index=pd.MultiIndex.from_product(
            [pd.date_range(min_d, max_d, freq='MS'), df_agg.index]
        )
    )
    # reduce to only relevant dates
    df = df.merge(df_agg, left_on='ID', right_index=True)
    df = df.reset_index().rename(columns={'level_0': 'year_month'})
    df = df[df['year_month'].between(df['min'], df['max'])]
    df = df.drop(columns=['min', 'max'])
    # add product information (bug fix: merge with the original frame;
    # reset_index avoids an index/column name clash when the caller set
    # 'year_month' as the index)
    df = df.merge(df_orig.reset_index(drop=True), how='left')
    return df

The code is a lot more convoluted and could probably be improved a bit, but it does seem to be much faster (down from 16 to 0.2 seconds on my system, starting from # my code starts here):
import pandas as pd

df = pd.DataFrame({'ID': [1, 1, 1, 2, 2, 3], 'year_month': ['2020-01-01','2020-08-01','2020-10-01','2020-01-01','2020-07-01','2021-05-01'], 'product':['A','B','C','A','D','C']})
# Enlarge dataset to 60 000 rows: replicate the 6-row base 10 000 times and
# shift each copy's IDs by 3 (single concat; the original loop-concat was
# quadratic and its body lost its indentation in transcription)
df = pd.concat([df] * 10000, ignore_index=True)
df['ID'] = df['ID'] + [3 * (i // 6) for i in range(len(df))]
df['year_month'] = pd.to_datetime(df['year_month'])
# my code starts here
# find overall min and max date
min_d = df['year_month'].min()
max_d = df['year_month'].max()
# generate all possible combinations of date and ID
df_agg = df.groupby(['ID'])['year_month'].agg(['min', 'max'])
# keep a reference to the input rows: `df` is rebound to the grid below, and
# the final merge must use the original data (the transcribed self-merge
# `df.merge(df)` adds nothing and drops the product information)
df_orig = df
df = pd.DataFrame(
    index=pd.MultiIndex.from_product(
        [pd.date_range(min_d, max_d, freq='MS'), df_agg.index]
    )
)
# reduce to only relevant dates
df = df.merge(df_agg, left_on='ID', right_index=True)
df = df.reset_index().rename(columns={'level_0': 'year_month'})
df = df[df['year_month'].between(df['min'], df['max'])]
df = df.drop(columns=['min', 'max'])
# add product information (bug fix: merge with the original frame, not df itself)
df = df.merge(df_orig, how='left')

Related

iterate over a column check condition and carry calculations with values of other data frames

import pandas as pd
import numpy as np
I do have 3 dataframes df1, df2 and df3.
df1=
data = {'Period': ['2024-04-O1', '2024-07-O1', '2024-10-O1', '2025-01-O1', '2025-04-O1', '2025-07-O1', '2025-10-O1', '2026-01-O1', '2026-04-O1', '2026-07-O1', '2026-10-O1', '2027-01-O1', '2027-04-O1', '2027-07-O1', '2027-10-O1', '2028-01-O1', '2028-04-O1', '2028-07-O1', '2028-10-O1'],
'Price': ['NaN','NaN','NaN','NaN', 'NaN','NaN','NaN','NaN', 'NaN','NaN','NaN','NaN',
'NaN','NaN','NaN','NaN', 'NaN','NaN','NaN'],
'years': [2024,2024,2024,2025,2025,2025,2025,2026,2026,2026,2026,2027,2027,2027,2027,2028,
2028,2028,2028],
'quarters':[2,3,4, 1,2,3,4, 1,2,3,4, 1,2,3,4, 1,2,3,4]
}
df1 = pd.DataFrame(data=data)
df2=
data = {'price': [473.26,244,204,185, 152, 157],
'year': [2023, 2024, 2025, 2026, 2027, 2028]
}
df3 = pd.DataFrame(data=data)
df3=
data = {'quarters': [1,2,3,4],
'weights': [1.22, 0.81, 0.83, 1.12]
}
df2 = pd.DataFrame(data=data)
My aim is to compute the price of df1. For each iteration through df1 check condition and carry calculations accordingly. For example for the 1st iteration, check if df1['year']=2024 and df1['quarters']=2. Then df1['price']=df2.loc[df2['year']=='2024', 'price'] * df3.loc[df3['quarters']==2, 'weights'].
===>>> df1['price'][0]=**473.26*0.81**.
df1['price'][1]=**473.26*0.83**.
...
...
...
and so on.
I could have used this method, but I want to write the code in a more efficient way. I would like to use the following code structure.
for i in range(len(df1)):
if (df1['year']==2024) & (df1['quarter']==2):
df1['Price']= df2.loc[df2['year']==2024, 'price'] * df3.loc[df3['quarters']==2, 'weights']
elif (df1['year']==2024) & (df1['quarter']==3):
df1['price']= df2.loc[df2['year']=='2024', 'price'] * df3.loc[df3['quarters']==3, 'weights']
elif (df1['year']==2024) & (df1['quarters']==4):
df1['Price']= df2.loc[df2['year']=='2024', 'price'] * df3.loc[df3['quarters']==4, 'weights']
...
...
...
Thanks!!!
I think if I understand correctly you can use pd.merge to bring these fields together first.
# Attach the yearly price lookup (years -> year) and the quarterly weights first...
df1 = df1.merge(df2, how='left' , left_on='years', right_on='year')
df1 = df1.merge(df3, how='left' , left_on='quarters', right_on='quarters')
# ...then the price is a single vectorized multiply — no row loop needed.
df1['Price'] = df1['price']*df1['weights']

How to iterate over dfs and append data with combine names

i have this problem to solve, this is a continuation of a previus question How to iterate over pandas df with a def function variable function and the given answer worked perfectly, but now i have to append all the data in a 2 columns dataframe (Adduct_name and mass).
This is from the previous question:
My goal: I have to calculate the "adducts" for a given "Compound"; both represent numbers, but for each "Compound" there are 46 different "Adducts".
Each adduct is calculated as follow:
Adduct 1 = [Exact_mass*M/Charge + Adduct_mass]
where exact_mass = number, M and Charge = number (1, 2, 3, etc) according to each type of adduct, Adduct_mass = number (positive or negative) according to each adduct.
My data: 2 data frames. One with the Adducts names, M, Charge, Adduct_mass. The other one correspond to the Compound_name and Exact_mass of the Compounds i want to iterate over (i just put a small data set)
Adducts: df_al
import pandas as pd
data = [["M+3H", 3, 1, 1.007276], ["M+3Na", 3, 1, 22.989], ["M+H", 1, 1,
1.007276], ["2M+H", 1, 2, 1.007276], ["M-3H", 3, 1, -1.007276]]
df_al = pd.DataFrame(data, columns=["Ion_name", "Charge", "M", "Adduct_mass"])
Compounds: df
import pandas as pd
data1 = [[1, "C3H64O7", 596.465179], [2, "C30H42O7", 514.293038], [4,
"C44H56O8", 712.397498], [4, "C24H32O6S", 448.191949], [5, "C20H28O3",
316.203834]]
df = pd.DataFrame(data1, columns=["CdId", "Formula", "exact_mass"])
The solution to this problem was:
# Column views of the adduct table used by the formula below
# (transcription lost the def/loop indentation; restored here)
df_name = df_al["Ion_name"]
df_mass = df_al["Adduct_mass"]
df_div = df_al["Charge"]
df_M = df_al["M"]

# Defining general function: adduct mass = exact_mass * M / Charge + Adduct_mass
def Adduct(x, i):
    return x * df_M[i] / df_div[i] + df_mass[i]

# Applying general function in a range from 0 to 5:
# one new column per ion name, computed from each compound's exact_mass.
for i in range(5):
    df[df_name.loc[i]] = df['exact_mass'].map(lambda x: Adduct(x, i))
Output
Name exact_mass M+3H M+3Na M+H 2M+H M-3H
0 a 596.465179 199.829002 221.810726 597.472455 1193.937634 197.814450
1 b 514.293038 172.438289 194.420013 515.300314 1029.593352 170.423737
2 c 712.397498 238.473109 260.454833 713.404774 1425.802272 236.458557
3 d 448.191949 150.404592 172.386316 449.199225 897.391174 148.390040
4 e 316.203834 106.408554 128.390278 317.211110 633.414944 104.39400
Now those are the right calculations, but I now need a file where:
-only exists 2 columns (Name and mass)
-All the different adducts are appended one after another
desired out put
Name Mass
a_M+3H 199.82902
a_M+3Na 221.810726
a_M+H 597.472455
a_2M+H 1193.937634
a_M-3H 197.814450
b_M+3H 514.293038
.
.
.
c_M+3H
and so on.
Also i need to combine the name of the respective compound with the ion form (M+3H, M+H, etc).
At this point i have no code for that.
I would appreciate any advice and a better approach from the beginning.
This part is an update of the question above:
Is it possible to obtain an output like this one:
Name Mass RT
a_M+3H 199.82902 1
a_M+3Na 221.810726 1
a_M+H 597.472455 1
a_2M+H 1193.937634 1
a_M-3H 197.814450 1
b_M+3H 514.293038 3
.
.
.
c_M+3H 2
The RT is the same value for all forms of a compound, in this example is RT for a =1, b = 3, c =2, etc.
Is it possible to incorporate (keep) this column from the data set df (which I update below)? As you can see, df has more columns like "Formula" and "RT" which disappear after the calculations.
import pandas as pd
data1 = [[a, "C3H64O7", 596.465179, 1], [b, "C30H42O7", 514.293038, 3], [c,
"C44H56O8", 712.397498, 2], [d, "C24H32O6S", 448.191949, 4], [e, "C20H28O3",
316.203834, 1.5]]
df = pd.DataFrame(data1, columns=["Name", "Formula", "exact_mass", "RT"])
Part three! (sorry and thank you)
this is a trial i did on a small data set (df) using the code below, with the same df_al of above.
df=
Code
#Defining variables for calculation
# (transcription lost the def/loop indentation; restored here)
df_name = df_al["Ion_name"]
df_mass = df_al["Adduct_mass"]
df_div = df_al["Charge"]
df_M = df_al["M"]
df_ID = df["Name"]
#Defining the RT dictionary (Name -> RT), captured before the column is dropped
RT = dict(zip(df["Name"], df["RT"]))
#Removing RT column
df = df.drop(columns=["RT"])
#Defining general function: adduct mass = exact_mass * M / Charge + Adduct_mass
def Adduct(x, i):
    return x * df_M[i] / df_div[i] + df_mass[i]
#Applying general function in a range from 0 to 46: one column per ion name.
for i in range(47):
    df[df_name.loc[i]] = df['exact_mass'].map(lambda x: Adduct(x, i))
df
output
#Melting: one row per (compound, adduct) pair
df = pd.melt(df, id_vars=['Name'], var_name = "Adduct", value_name= "Exact_mass", value_vars=[x for x in df.columns if 'Name' not in x and 'exact' not in x])
# combined label "<Name>_<Adduct>" built positionally from the melted row
df['name'] = df.apply(lambda x:x[0] + "_" + x[1], axis=1)
# NOTE(review): RT's keys are full compound names, but x[0] takes only the
# FIRST character of the name — the lookup misses any key longer than one
# character, which looks like the source of the NaNs asked about below; verify.
df['RT'] = df.Name.apply(lambda x: RT[x[0]] if x[0] in RT else np.nan)
del df['Name']
del df['Adduct']
# same first-character lookup, now keyed on the combined "name" label
df['RT'] = df.name.apply(lambda x: RT[x[0]] if x[0] in RT else np.nan)
df
output
Why NaN?
Here is how I will go about it, pandas.melt comes to rescue:
import pandas as pd
import numpy as np
from io import StringIO

# Wide table captured from the earlier output; read_csv treats the unnamed
# leading column as the index.
s = StringIO('''
Name exact_mass M+3H M+3Na M+H 2M+H M-3H
0 a 596.465179 199.829002 221.810726 597.472455 1193.937634 197.814450
1 b 514.293038 172.438289 194.420013 515.300314 1029.593352 170.423737
2 c 712.397498 238.473109 260.454833 713.404774 1425.802272 236.458557
3 d 448.191949 150.404592 172.386316 449.199225 897.391174 148.390040
4 e 316.203834 106.408554 128.390278 317.211110 633.414944 104.39400
''')
# raw string: "\s" in a plain literal is an invalid escape (SyntaxWarning on 3.12+)
df = pd.read_csv(s, sep=r"\s+")
# melt every adduct column into rows, keyed by the compound Name
df = pd.melt(df, id_vars=['Name'], value_vars=[x for x in df.columns if 'Name' not in x and 'exact' not in x])
# combined label "<compound>_<adduct>"; .iloc avoids the deprecated
# positional fallback of x[0] on a label-indexed Series
df['name'] = df.apply(lambda x: x.iloc[0] + "_" + x.iloc[1], axis=1)
del df['Name']
del df['variable']
RT = {'a':1, 'b':2, 'c':3, 'd':5, 'e':1.5}
# RT looked up by the compound letter (first character of the combined name)
df['RT'] = df.name.apply(lambda x: RT[x[0]] if x[0] in RT else np.nan)
df
Here is the output:

ARMA model order selection using arma_order_select_ic from statsmodel

I am using the arma_order_select_ic from the statsmodel library to calculate the (p,q) order for the ARMA model, I am using for loop to loop over the different companies that are in each column of the data-frame. The code is as follows:
import pandas as pd
from statsmodels.tsa.stattools import arma_order_select_ic
df = pd.read_csv("Adjusted_Log_Returns.csv", index_col = 'Date').dropna()
main_df = pd.DataFrame()
for i in range(146):
# NOTE(review): df.iloc[i] selects the i-th ROW, so this fits each row of
# returns, not each company's column series — presumably df[df.columns[i]]
# was intended; this is exactly the bug discussed in the answer below.
order_selection = arma_order_select_ic(df.iloc[i].values, max_ar = 4,
max_ma = 2, ic = "aic")
ticker = [df.columns[i]]
df_aic_min = pd.DataFrame([order_selection["aic_min_order"]], index =
ticker)
# NOTE(review): DataFrame.append was removed in pandas 2.0; pd.concat in a
# list outside the loop is the modern replacement.
main_df = main_df.append(df_aic_min)
main_df.to_csv("aic_min_orders.csv")
The code runs fine and I get all the results in the csv file at the end, but the thing that's confusing me is that when I compute the (p,q) outside the for loop for a single company, I get different results.
order_selection = arma_order_select_ic(df["ABL"].values, max_ar = 4,
max_ma = 2, ic = "aic")
The order for the company ABL is (1,1) when computed in the for loop while its (4,1) when computed outside of it.
So my question is what am I doing wrong or why is it like this? Any help would be appreciated.
Thanks in Advance
It's pretty clear from your code that you're trying to find the parameters for an ARMA model on the columns' data, but it's not what the code is doing: you're finding in the loop the parameters for the rows.
Consider this:
import pandas as pd
df = pd.DataFrame({'a': [3, 4]})
>>> df.iloc[0]
a 3
Name: 0, dtype: int64
>>> df['a']
0 3
1 4
Name: a, dtype: int64
You should probably change your code to
# iterate the company COLUMNS (not rows); transcription lost the loop body's
# indentation, restored here
for c in df.columns:
    order_selection = arma_order_select_ic(df[c].values, max_ar = 4,
                                           max_ma = 2, ic = "aic")
    ticker = [c]

Append to dataframe with for loop. Python3

I'm trying to loop through a list(y) and output by appending a row for each item to a dataframe.
y=[datetime.datetime(2017, 3, 29), datetime.datetime(2017, 3, 30), datetime.datetime(2017, 3, 31)]
Desired Output:
Index Mean Last
2017-03-29 1.5 .76
2017-03-30 2.3 .4
2017-03-31 1.2 1
Here is the first and last part of the code I currently have:
import pandas as pd
import datetime
df5=pd.DataFrame(columns=['Mean','Last'],index=index)
for item0 in y:
.........
.........
df=df.rename(columns = {0:'Mean'})
df4=pd.concat([df, df3], axis=1)
print (df4)
df5.append(df4)
print (df5)
My code only puts one row into the dataframe, as opposed to a row for each item in y:
Index Mean Last
2017-03-29 1.5 .76
Try:
y = [datetime(2017, 3, 29), datetime(2017, 3, 30), datetime(2017, 3, 31)]
m = [1.5, 2.3, 1.2]
l = [0.76, .4, 1]
# Build all rows first and construct the frame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row-by-row is quadratic anyway.
rows = [{'time': y0, 'mean': m0, 'last': l0} for y0, m0, l0 in zip(y, m, l)]
df = pd.DataFrame(rows, columns=['time', 'mean', 'last'])
and if you want y to be the index:
df.index = df.time
There are a few ways to skin this, and it's hard to know which approach makes the most sense with the limited info given. But one way is to start with a dataframe that has only the index, iterate through the dataframe by row and populate the values from some other process. Here's an example of that approach:
import datetime
import numpy as np
import pandas as pd

# Dates that will become the index.
y = [datetime.datetime(2017, 3, day) for day in (29, 30, 31)]
# Start from a frame holding only the dates...
main_df = pd.DataFrame(y, columns=['Index'])
# ...add the value columns as empty placeholders to populate later...
main_df['Mean'] = None
main_df['Last'] = None
# ...and promote the date column to the index.
main_df.set_index(['Index'], inplace=True)
that gives us the following:
Mean Last
Index
2017-03-29 None None
2017-03-30 None None
2017-03-31 None None
Now let's loop and plug in some made up random values:
## loop through main_df and add values
for index, row in main_df.iterrows():
    # .loc assignment: the .ix indexer was removed from pandas 1.0, and
    # attribute-style setting on an indexer slice writes to a copy, not
    # to main_df
    main_df.loc[index, 'Mean'] = np.random.rand()
    main_df.loc[index, 'Last'] = np.random.rand()
this results in the following dataframe which has the None values filled:
Mean Last
Index
2017-03-29 0.174714 0.718738
2017-03-30 0.983188 0.648549
2017-03-31 0.07809 0.47031

How to obtain the percentage of each value from a pandas table?

I have a table with 5 columns: AWA, REM, S1, S2 and SWS. I computed the sum of each column and of each row. And also the percentages of these values. But in order to get the percentages I repeated the same line 5 times.
Is there a way to improve it in case there were more than 5 columns?
Here is my code and I have also attached an image.
import pandas as pd
# NOTE(review): TABLA is defined elsewhere — assumed rows = subjects,
# columns = sleep stages; confirm against the caller.
df = pd.DataFrame(TABLA, columns=('AWA', 'REM', 'S1', 'S2', 'SWS'))
# label rows s1, s2, ... from the original 0-based index
df.index='s' + (df.index+1).astype(str)
df['xSubject'] = df.sum(axis=1) # per-subject total across the stage columns
######Here starts the repetition:
# each '%' column: value as a percentage of its column's total
df['AWA%'] = df['AWA']/df['AWA'].sum()*100
df['REM%'] = df['REM']/df['REM'].sum()*100
df['S1%'] = df['S1']/df['S1'].sum()*100
df['S2%'] = df['S2']/df['S2'].sum()*100
df['SWS%'] = df['SWS']/df['SWS'].sum()*100
df['xSubject%'] = df['xSubject']/df['xSubject'].sum()*100
######Here ends the repetition:
df.loc['xStage'] = df.sum() # per-stage totals, appended as a final row
df
Use pd.concat with a reconstruction
# Build every '%' column in one vectorized operation and lay the result
# beside the original columns.
# NOTE(review): this expression is not assigned — bind it
# (df = pd.concat(...)) to keep the result.
pd.concat([df,
pd.DataFrame(df.div(df.sum()).values * 100,
columns=df.columns.values + '%')],
axis=1)
Consider the pd.DataFrame df
df = pd.DataFrame(np.random.rand(10, 5),
columns=('AWA', 'REM', 'S1', 'S2', 'SWS'))
df
and the % calculation
df.div(df.sum())
Then using the above code
pd.concat([df,
pd.DataFrame(df.div(df.sum()).values * 100,
columns=df.columns.values + '%')],
axis=1)

Resources