Adding a grouped column header to an existing dataframe - python-3.x

How can we add a column header to an existing Pandas DataFrame on a supplementary row above two sub-column headers?
Here's the current code, which adds the CAPITAL header but does not position it correctly:
import pandas as pd
OWNER = 'OWNER'
CAPITAL = 'CAPITAL'
USD = 'USD'
CHF = 'CHF'
YIELD = 'YIELD AMT'
df = pd.DataFrame({
    OWNER: 2*['JOE'] + 3*['ROB'],
    USD: [10000, 30000, 4000, 24000, 16000],
    CHF: [9000, 27000, 3600, 21600, 14400],
    YIELD: [100, 300, 40, 240, 160]
})
print(df)
'''
  OWNER    USD    CHF  YIELD AMT
0   JOE  10000   9000        100
1   JOE  30000  27000        300
2   ROB   4000   3600         40
3   ROB  24000  21600        240
4   ROB  16000  14400        160
'''
df.columns = pd.MultiIndex.from_product([[CAPITAL], df.columns])
print('\nUsing pd.from_product()')
print(df)
'''
  CAPITAL
    OWNER    USD    CHF YIELD AMT
0     JOE  10000   9000       100
1     JOE  30000  27000       300
2     ROB   4000   3600        40
3     ROB  24000  21600       240
4     ROB  16000  14400       160
'''

The solution is to use pd.MultiIndex.from_arrays() instead of pd.MultiIndex.from_product(). Here's the code:
import pandas as pd
OWNER = 'OWNER'
CAPITAL = 'CAPITAL'
USD = 'USD'
CHF = 'CHF'
YIELD = 'YIELD AMT'
df_ok = pd.DataFrame({
    OWNER: 2*['JOE'] + 3*['ROB'],
    USD: [10000, 30000, 4000, 24000, 16000],
    CHF: [9000, 27000, 3600, 21600, 14400],
    YIELD: [100, 300, 40, 240, 160]
})
df_ok.columns = pd.MultiIndex.from_arrays(
    [[' ', ' ', CAPITAL, ' '], df_ok.columns])
print('\nUsing pd.from_arrays()')
print()
print(df_ok)
'''
               CAPITAL
  OWNER    USD     CHF YIELD AMT
0   JOE  10000    9000       100
1   JOE  30000   27000       300
2   ROB   4000    3600        40
3   ROB  24000   21600       240
4   ROB  16000   14400       160
'''
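Note that the array [' ', ' ', CAPITAL, ' '] places CAPITAL above the single CHF column; it only looks centered in the printed output. If the intent is for CAPITAL to genuinely cover both the USD and CHF columns, a variant (a sketch reusing the constants above, not part of the original answer) is to repeat the label, since pandas' sparsified printing shows a repeated adjacent label only once:
# sketch: repeat CAPITAL so it covers both currency columns
df_span = df_ok.copy()
df_span.columns = pd.MultiIndex.from_arrays(
    [['', CAPITAL, CAPITAL, ''], df_span.columns.get_level_values(1)])
print(df_span)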

Related

Using Pandas groupby with total column and total row

The DataFrame used in my code lists capital and yield amounts belonging to owners. The purpose is to group the values by owner, add a TOTAL column to the grouped DataFrame, and then append a global TOTAL row.
Here's the code:
import pandas as pd
OWNER = 'OWNER'
CAPITAL = 'CAPITAL'
YIELD = 'YIELD AMT'
TOTAL = 'TOTAL'
# defining the dataframe
df = pd.DataFrame({OWNER: 2 * ['Joe'] + 3 * ['Carla'] + ['Rob'],
                   CAPITAL: [10000, 5000, 20000, 3000, -4000, 2000],
                   YIELD: [1000, 500, 2000, 300, 400, 200]})
'''
   OWNER  CAPITAL  YIELD AMT
0    Joe    10000       1000
1    Joe     5000        500
2  Carla    20000       2000
3  Carla     3000        300
4  Carla    -4000        400
5    Rob     2000        200
'''
print(df)
print()
# grouping the rows by owner
dfg = df.groupby([OWNER]).sum().reset_index()
'''
   OWNER  CAPITAL  YIELD AMT
0  Carla    19000       2700
1    Joe    15000       1500
2    Rob     2000        200
'''
print(dfg)
print()
# adding a TOTAL column
for index in range(0, len(dfg)):
    dfg.loc[index, TOTAL] = dfg.loc[index, CAPITAL] + dfg.loc[index, YIELD]
'''
   OWNER  CAPITAL  YIELD AMT    TOTAL
0  Carla    19000       2700  21700.0
1    Joe    15000       1500  16500.0
2    Rob     2000        200   2200.0
'''
print(dfg)
print()
# resetting index to OWNER column
dfg = dfg.set_index(OWNER)
'''
       CAPITAL  YIELD AMT    TOTAL
OWNER
Carla    19000       2700  21700.0
Joe      15000       1500  16500.0
Rob       2000        200   2200.0
'''
print(dfg)
print()
# finally, adding a TOTAL row
dfg.loc[TOTAL] = dfg.sum(numeric_only=True, axis=0)[[CAPITAL, YIELD, TOTAL]]
'''
       CAPITAL  YIELD AMT    TOTAL
OWNER
Carla  19000.0     2700.0  21700.0
Joe    15000.0     1500.0  16500.0
Rob     2000.0      200.0   2200.0
TOTAL  36000.0     4400.0  40400.0
'''
print(dfg.fillna(''))
My question is: is there a more concise way of computing the total column or the total row using Pandas agg()/aggregate() and a lambda expression?
df[TOTAL] = df[CAPITAL] + df[YIELD]
output = df.groupby(by=[OWNER]).sum()
is what you are looking for; output is the DataFrame you need.
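If you also want the grand-total row, a concise variant (a sketch, not the only possibility) chains assign() with a lambda for the TOTAL column and a single .loc assignment for the TOTAL row:
# sketch: total column via assign(), grand-total row via .loc
dfg = (df.groupby(OWNER).sum()
         .assign(**{TOTAL: lambda d: d[CAPITAL] + d[YIELD]}))
dfg.loc[TOTAL] = dfg.sum()
print(dfg)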

Adding total rows to a Pandas DataFrame

I define a Pandas DataFrame containing several deposit/withdrawal rows for different owners. I want to add a total row for each owner that totals the deposits/withdrawals as well as the yield amounts generated by each capital amount.
Here's my code:
import pandas as pd
OWNER = 'OWNER'
DEPWITHDR = 'DEP/WITHDR'
DATEFROM = 'DATE FROM'
DATETO = 'DATE TO'
CAPITAL = 'CAPITAL'
YIELD = 'YIELD AMT'
TOTAL = 'TOTAL'
df = pd.DataFrame({
    OWNER: 2*['JOE'] + 3*['ROB'],
    DEPWITHDR: [10000, 20000, 4000, 20000, -8000],
    CAPITAL: [10000, 30000, 4000, 24000, 16000],
    DATEFROM: ['2021-01-01', '2021-01-02', '2021-01-01', '2021-01-03', '2021-01-04'],
    DATETO: ['2021-01-01', '2021-01-05', '2021-01-02', '2021-01-03', '2021-01-05'],
    YIELD: [100, 1200, 80, 240, 320]
})
print('SOURCE DATAFRAME\n')
print(df)
print()
newDf = pd.DataFrame(columns=[OWNER, DEPWITHDR, CAPITAL, DATEFROM, DATETO, YIELD])
currentOwner = df.loc[1, OWNER]
# using groupby function to compute the two columns totals
dfTotal = df.groupby([OWNER]).agg({DEPWITHDR:'sum', YIELD:'sum'}).reset_index()
totalIndex = 0
# deactivating SettingWithCopyWarning caused by totalRow[OWNER] += ' total'
pd.set_option('mode.chained_assignment', None)
for index, row in df.iterrows():
    if currentOwner == row[OWNER]:
        newDf = newDf.append({OWNER: row[OWNER],
                              DEPWITHDR: row[DEPWITHDR],
                              CAPITAL: row[CAPITAL],
                              DATEFROM: row[DATEFROM],
                              DATETO: row[DATETO],
                              YIELD: row[YIELD]}, ignore_index=True)
    else:
        totalRow = dfTotal.loc[totalIndex]
        totalRow[OWNER] += ' total'
        newDf = newDf.append(totalRow, ignore_index=True)
        totalIndex += 1
        newDf = newDf.append({OWNER: '',
                              DEPWITHDR: '',
                              CAPITAL: '',
                              DATEFROM: '',
                              DATETO: '',
                              YIELD: ''}, ignore_index=True)
        newDf = newDf.append({OWNER: row[OWNER],
                              DEPWITHDR: row[DEPWITHDR],
                              CAPITAL: row[CAPITAL],
                              DATEFROM: row[DATEFROM],
                              DATETO: row[DATETO],
                              YIELD: row[YIELD]}, ignore_index=True)
        currentOwner = row[OWNER]
totalRow = dfTotal.loc[totalIndex]
totalRow[OWNER] += ' total'
newDf = newDf.append(totalRow, ignore_index=True)
print('TARGET DATAFRAME\n')
print(newDf.fillna(''))
My question is: what is a better, more Pandas-friendly way to obtain the desired result?
You can use groupby and concat:
df_total = pd.concat((
    df,
    df.replace({o: o + ' total' for o in df[OWNER].unique()})
      .groupby(OWNER).agg({DEPWITHDR: sum, YIELD: sum}).reset_index()
)).fillna('').reset_index().sort_values([OWNER, DATEFROM, DATETO])
In detail:
df.replace({o: o + ' total' for o in df[OWNER].unique()}): replaces each occurrence of an owner's name with the name plus the string ' total' (e.g., 'JOE' -> 'JOE total'), so that the result of the groupby will carry those values in the OWNER column.
groupby(OWNER).agg({DEPWITHDR: sum, YIELD: sum}): computes the sum of the DEPWITHDR and YIELD columns per owner.
pd.concat(...).fillna('').reset_index().sort_values([OWNER, DATEFROM, DATETO]): concatenates the original DataFrame with the totals and sorts the rows by OWNER, then DATEFROM, then DATETO, so that the total row for each owner lands after that owner's rows (because it ends with ' total') and the remaining rows stay chronologically sorted by DATEFROM and DATETO.
Here is df_total:
   index      OWNER  DEP/WITHDR CAPITAL   DATE FROM     DATE TO YIELD AMT
0      0        JOE       10000   10000  2021-01-01  2021-01-01       100
1      1        JOE       20000   30000  2021-01-02  2021-01-05      1200
5      0  JOE total       30000                                      1300
2      2        ROB        4000    4000  2021-01-01  2021-01-02        80
3      3        ROB       20000   24000  2021-01-03  2021-01-03       240
4      4        ROB       -8000   16000  2021-01-04  2021-01-05       320
6      1  ROB total       16000                                       640
IMHO, I'd create a separate DataFrame for each owner, with only his/her data, plus a summary DataFrame with the totals per owner. But maybe, for your use case, this is the best solution.
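For completeness, that alternative might look like the following sketch (the names per_owner and summary are illustrative, not from the original code):
# sketch: one DataFrame per owner, plus a summary DataFrame of totals
per_owner = {owner: grp for owner, grp in df.groupby(OWNER)}
summary = df.groupby(OWNER)[[DEPWITHDR, YIELD]].sum().reset_index()
print(summary)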

How to categorize salary into high/med/low groups in Python?

I have an employee dataset with salary details. I'd like to add a column that shows each employee's salary group (high/med/low):
Data:
Empno   Sal  Deptno
    1   800      20
    2  1600      30
    3  2975      20
    4  1250      30
    5  2850      30
    6  2450      10
    7  3000      20
Expected Output:
Empno   Sal  Deptno  Sal_Group
    1   800      20  low
    2  1600      30  mid
    3  2975      20  ...
    4  1250      30  ...
    5  2850      30  ...
    6  2450      10  ...
    7  3000      20  high
You can try this:
import pandas as pd
import numpy as np
df = pd.read_csv("file.csv")
# three equal-width bins across the salary range
bins = np.linspace(min(df['Sal']), max(df['Sal']), 4)
groupNames = ["low", "med", "high"]
df['Sal_Group'] = pd.cut(df['Sal'], bins, labels=groupNames, include_lowest=True)
print(df)
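If the data is not in a CSV file, the same approach works on a DataFrame built inline; here is a self-contained sketch using the sample data from the question:
import pandas as pd
import numpy as np

df = pd.DataFrame({'Empno': [1, 2, 3, 4, 5, 6, 7],
                   'Sal': [800, 1600, 2975, 1250, 2850, 2450, 3000],
                   'Deptno': [20, 30, 20, 30, 30, 10, 20]})
# three equal-width bins spanning the full salary range
bins = np.linspace(df['Sal'].min(), df['Sal'].max(), 4)
df['Sal_Group'] = pd.cut(df['Sal'], bins,
                         labels=['low', 'med', 'high'], include_lowest=True)
print(df)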

Grouping and merging with pandas

I need some help with this.
I have to transform this DataFrame so that column "name" contains no duplicates;
you can see that there are duplicates in column "name", e.g. John, Joan:
df0 = pd.DataFrame({'name': ['John', 'John', 'Joan', 'Joan', 'Juan'],
                    'time': [20, 10, 11, 18, 15],
                    'amount': [100, 400, 200, 100, 300]})
df0
   name  time  amount
0  John    20     100
1  John    10     400
2  Joan    11     200
3  Joan    18     100
4  Juan    15     300
I need to transform this by grouping the DataFrame as follows; I don't know if this is the right way:
dfend0 = df0.groupby('name').agg(lambda x: x.tolist())
dfend0
            time      amount
name
Joan    [11, 18]  [200, 100]
John    [20, 10]  [100, 400]
Juan        [15]       [300]
The column "name" is now the index, this isn´t the behavior I was looking for
list(dfend0.columns.values)
['time', 'amount']
# Now I need to merge with another dataframe
df1 = pd.DataFrame({
    'name': ['John', 'Joan', 'Juan'],
    'address': ['streetA', 'streetB', 'streetC'],
    'age': [30, 40, 50]
})
df1
   name  address  age
0  John  streetA   30
1  Joan  streetB   40
2  Juan  streetC   50
ender = df1.merge(df0)
ender
   name  address  age  time  amount
0  John  streetA   30    20     100
1  John  streetA   30    10     400
2  Joan  streetB   40    11     200
3  Joan  streetB   40    18     100
4  Juan  streetC   50    15     300
This is not what I'm looking for; this example would be more accurate:
   name  address  age   time   amount
0  John  streetA   30  20,10  100,400
1  Joan  streetB   40  11,18  200,100
2  Juan  streetC   50     15      300
Any clue?
First, use as_index=False if you don't want name as the index after the groupby operation.
Second, there is no need for the lambda; use .agg(list):
df0end = df0.groupby('name', as_index=False).agg(list)
Then merge as usual:
df2 = pd.merge(df1, df0end, on='name')
   name  address  age      time      amount
0  John  streetA   30  [20, 10]  [100, 400]
1  Joan  streetB   40  [11, 18]  [200, 100]
2  Juan  streetC   50      [15]       [300]
Note, if you don't want lists, use the following (not recommended, as you lose the underlying datatype and end up with strings):
df0end = df0.astype(str).groupby('name', as_index=False).agg(','.join)
   name   time   amount
0  Joan  11,18  200,100
1  John  20,10  100,400
2  Juan     15      300
df2 = pd.merge(df1, df0end, on='name')
   name  address  age   time   amount
0  John  streetA   30  20,10  100,400
1  Joan  streetB   40  11,18  200,100
2  Juan  streetC   50     15      300
df = pd.DataFrame({'name': ['John', 'John', 'Joan', 'Joan', 'Juan'],
                   'time': [20, 10, 11, 18, 15],
                   'amount': [100, 400, 200, 100, 300]})
df = df.astype(str).groupby('name').agg({
    'time': lambda x: ','.join(x),
    'amount': lambda x: ','.join(x)
})
print(df)
       time   amount
name
Joan  11,18  200,100
John  20,10  100,400
Juan     15      300
At the end, bring name back to a column and merge with df1: df = df.reset_index().merge(df1, on='name').
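Putting the pieces together, the whole transformation can be written as one chain (a sketch; as_index=False keeps name as a column, so no reset_index() is needed before merging):
result = (df0.astype(str)
             .groupby('name', as_index=False)
             .agg({'time': ','.join, 'amount': ','.join})
             .merge(df1, on='name'))
print(result)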

Collapse/Transpose Columns of a DataFrame Based on Repeating - pandas

I have a data frame sample_df like this:
   id   pd     pd_dt  pd_tp  pd.1   pd_dt.1  pd_tp.1  pd.2   pd_dt.2  pd_tp.2
0   1  100  per year    468   200  per year      400   300  per year      320
1   2  100  per year     60   200  per year      890   300  per year      855
I need my output like this:
id   pd     pd_dt  pd_tp
 1  100  per year    468
 1  200  per year    400
 1  300  per year    320
 2  100  per year     60
 2  200  per year    890
 2  300  per year    855
I tried the following:
sample_df.stack().reset_index().drop('level_1', axis=1)
This does not work. The columns pd, pd_dt, pd_tp repeat with .1, .2, ... suffixes.
How can I achieve this output?
You want pd.wide_to_long, but with some tweak since your first few columns do not share the same patterns with the rest:
# rename
df.columns = [x + '.0' if '.' not in x and x != 'id' else x
              for x in df.columns]
pd.wide_to_long(df, stubnames=['pd', 'pd_dt', 'pd_tp'],
                i='id', j='order', sep='.')
Output:
          pd     pd_dt  pd_tp
id order
1  0     100  per year    468
2  0     100  per year     60
1  1     200  per year    400
2  1     200  per year    890
1  2     300  per year    320
2  2     300  per year    855
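To match the row order of the expected output, the result can then be sorted on the id level and the helper order level dropped (a small follow-up sketch):
out = (pd.wide_to_long(df, stubnames=['pd', 'pd_dt', 'pd_tp'],
                       i='id', j='order', sep='.')
         .sort_index(level=['id', 'order'])
         .reset_index(level='order', drop=True)
         .reset_index())
print(out)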
You can use numpy.split to split it into n arrays and concatenate them back together. Then repeat the id column by the number of rows in the new DataFrame:
import numpy as np

new_df = pd.DataFrame(np.concatenate(np.split(df.iloc[:, 1:].values,
                                              (df.shape[1] - 1) // 3, axis=1)))
new_df.columns = ['pd', 'pd_dt', 'pd_tp']
# one copy of the id column per split block
new_df['id'] = pd.concat([df.id] * (new_df.shape[0] // df.shape[0]), ignore_index=True)
new_df.sort_values('id')
Result:
    pd     pd_dt  pd_tp  id
0  100  per year    468   1
2  200  per year    400   1
4  300  per year    320   1
1  100  per year     60   2
3  200  per year    890   2
5  300  per year    855   2
You can do this (note that id must be the index so the masks only select the pd* columns):
df = df.set_index('id')
dt_mask = df.columns.str.contains('dt')
tp_mask = df.columns.str.contains('tp')
new_df = pd.DataFrame()
new_df['pd'] = df[df.columns[~(dt_mask | tp_mask)]].stack().reset_index(level=1, drop=True)
new_df['pd_dt'] = df[df.columns[dt_mask]].stack().reset_index(level=1, drop=True)
new_df['pd_tp'] = df[df.columns[tp_mask]].stack().reset_index(level=1, drop=True)
new_df.reset_index(inplace=True)
print(new_df)
   id   pd     pd_dt  pd_tp
0   1  100  per year    468
1   1  200  per year    400
2   1  300  per year    320
3   2  100  per year     60
4   2  200  per year    890
5   2  300  per year    855
