From a csv file (initial.csv):
"Id","Name"
1,"CLO"
2,"FEV"
2,"GEN"
3,"HYP"
4,"DIA"
1,"COL"
1,"EOS"
4,"GAS"
1,"AEK"
I am grouping by the Id column and aggregating the Name column values so that each unique Id has all of its Name values appended on the same row (new.csv):
"Id","Name"
1,"CLO","COL","EOS","AEK"
2,"FEV","GEN"
3,"HYP"
4,"DIA","GAS"
Now some rows have extra Name values, so I want to append matching columns according to the maximum number of Name values found on any row, i.e.
"Id","Name","Name2","Name3","Name4"
1,"CLO","COL","EOS","AEK"
2,"FEV","GEN"
3,"HYP"
4,"DIA","GAS"
I do not understand how I can add new columns to the dataframe to match the data.
Below is my code:
import pandas as pd

df = pd.read_csv('initial.csv', delimiter=',')

# Find the largest number of Name values that share one Id; this decides how
# many Name columns the wide table needs.
# The ids are taken from the "Id" column itself (the header in initial.csv is
# "Id", not "ID"), and the loop variable no longer shadows the builtin `id`.
max_names_count = 0
for unique_id in df['Id'].unique():
    mask = df['Id'] == unique_id
    names_count = len(df[mask])
    if names_count > max_names_count:
        max_names_count = names_count

# Collapse every Id to a single row. ','.join produces ONE string per Id, so
# the aggregated frame has exactly one column ("Name"); "Id" is the index.
group_by_id = df.groupby(["Id"]).agg({"Name": ','.join})

# Create new columns 'Id', 'Name', 'Name2', 'Name3', 'Name4'
new_column_names = ["Id", "Name"] + ['Name' + str(i) for i in range(2, max_names_count + 1)]

# This is the statement the question is about. It is kept as-is on purpose:
# five labels are assigned to a frame that has a single column, which raises.
group_by_id.columns = new_column_names  # <-- ValueError: Length mismatch: Expected axis has 1 elements, new values have 5 elements
group_by_id.to_csv('new.csv', encoding='utf-8')
Try:
df = pd.read_csv("initial.csv")


def _column_label(position):
    """Map a 0-based list position to its column name: Name, Name2, Name3, ..."""
    if position == 0:
        return "Name"
    return "Name{}".format(position + 1)


# Step 1: gather the names of every Id into one list per Id.
names_per_id = df.groupby("Id")["Name"].agg(list)

# Step 2: spread each list over positional columns 0..k-1; Ids with fewer
# names leave the trailing columns empty.
wide = names_per_id.apply(pd.Series)

# Step 3: give the positional columns their final labels and turn the Id
# index back into a regular column.
df_out = wide.rename(columns=_column_label).reset_index()

df_out.to_csv("out.csv", index=False)
Creates out.csv:
Id,Name,Name2,Name3,Name4
1,CLO,COL,EOS,AEK
2,FEV,GEN,,
3,HYP,,,
4,DIA,GAS,,
I have a pandas dataframe with the following structure. It can be created using the following code:
import pandas as pd
import numpy as np
# Sample data: one list per column, one entry per row; np.nan means "no value
# on this row".
# np.nan is spelled in lower case because the np.NaN alias was removed in
# NumPy 2.0. The stray 16th entry of `word` is gone: every other list has 15
# entries and zip() stops at the shortest one, so it never reached df1.
word = ['this', 'is', 'a', 'test', 'call', 'this', 'is', 'a', 'test', 'call', 'this', 'is ', 'a', 'test', 'call']
level_3_start = [np.nan, np.nan, '<tyre>', '<steering>', np.nan, np.nan, np.nan, np.nan, '<leg>', np.nan, '<clutch>', np.nan, np.nan, '<break>', np.nan]
level_3_end = [np.nan, np.nan, '</tyre>', np.nan, '</steering>', np.nan, np.nan, np.nan, '</leg>', np.nan, np.nan, np.nan, '</clutch>', '</break>', np.nan]
level_2_start = [np.nan, np.nan, '<car>', np.nan, np.nan, np.nan, np.nan, np.nan, '<dog>', np.nan, '<car>', np.nan, np.nan, '<bus>', np.nan]
level_2_end = [np.nan, np.nan, np.nan, np.nan, '</car>', np.nan, np.nan, np.nan, '</dog>', np.nan, np.nan, np.nan, '</car>', '</bus>', np.nan]
level_1_start = [np.nan, np.nan, '<vehicle>', np.nan, np.nan, np.nan, np.nan, np.nan, '<animal>', np.nan, '<vehicle>', np.nan, np.nan, np.nan, np.nan]
level_1_end = [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, '</vehicle>', np.nan, '</animal>', np.nan, np.nan, np.nan, np.nan, '</vehicle>', np.nan]

# Zip the column lists into row tuples and label the columns.
df1 = pd.DataFrame(
    list(zip(word, level_3_start, level_3_end, level_2_start, level_2_end, level_1_start, level_1_end)),
    columns=['word', 'level_3_start', 'level_3_end', 'level_2_start', 'level_2_end', 'level_1_start', 'level_1_end'],
)
I want to convert the dataframe into a nested JSON structure. The output should look like the one below:
{
"vehicle": {
"car":{
"tyre": True,
"steering": True,
"clutch": True
},
"bus":{
"break": True
}
},
"animal": {
"dog":{
"leg": True
}
}
}
What is the best way to achieve this in pandas?
You are capturing more information than required: the end columns are not needed.
Remove rows that have nothing in them with dropna().
Forward fill the start tags and remove < and > from the strings.
Use a comprehension to build the dictionary from the records returned by the dataframe's to_dict().
df = pd.DataFrame({"word":["this","is","a","test","call","this","is","a","test","call","this","is","a","test","call"],
                   "level_3_start":["","","<tyre>","<steering>","","","","","<leg>","","<clutch>","","","<break>",""],
                   "level_3_end":["","","</tyre>","","</steering>","","","","</leg>","","","","</clutch>","</break>",""],
                   "level_2_start":["","","<car>","","","","","","<dog>","","<car>","","","<bus>",""],
                   "level_2_end":["","","","","</car>","","","","</dog>","","","","</car>","</bus>",""],
                   "level_1_start":["","","<vehicle>","","","","","","<animal>","","<vehicle>","","","",""],
                   "level_1_end":["","","","","","","</vehicle>","","</animal>","","","","","</vehicle>",""]})

# cleanup
# Empty strings become NaN so that rows carrying no tag in any level column
# can be dropped in one go.
df = df.replace({"":np.nan}).dropna(subset=[c for c in df.columns if c!="word"], how="all")

# Forward-fill every start column so each remaining row knows which tag it
# sits under, then strip the angle brackets.
# The result is assigned back to df[c]: an in-place fillna on the df[c]
# selection is a chained operation that is not guaranteed to update df, and
# the method= keyword of fillna is deprecated in favour of ffill().
for c in [c for c in df.columns if "start" in c]:
    df[c] = (
        df[c]
        .ffill()
        .str.replace("<", "", regex=False)
        .str.replace(">", "", regex=False)
    )

# One record (dict) per distinct combination of the level columns.
dfd = df.loc[:,[c for c in df.columns if "level" in c]].drop_duplicates().to_dict(orient="records")

# Nested dictionary level_1 -> level_2 -> level_3 -> True. Each inner
# comprehension only keeps the records that share the enclosing level's tag.
nested = {
    d["level_1_start"]: {
        d2["level_2_start"]: {
            d3["level_3_start"]: True
            for d3 in dfd
            if d3["level_1_start"]==d["level_1_start"] and d3["level_2_start"]==d2["level_2_start"]
        }
        for d2 in dfd
        if d2["level_1_start"]==d["level_1_start"]
    }
    for d in dfd
}
# Bare expression: displayed when the snippet is run in a notebook or REPL.
nested
output
{'vehicle': {'car': {'tyre': True, 'steering': True, 'clutch': True},
'bus': {'break': True}},
'animal': {'dog': {'leg': True}}}
To get the final results, your data has to go through a 3 step process:
step 1: remove all columns that are not required for processing
step 2: clean data to remove tags and sort them in level_1, level_2, level_3 order
step 3: create the nested dictionary
Here's how I have done it. Commented each section to show clearly what we are doing.
import pandas as pd
import numpy as np
import collections
# Sample data: one list per column, one entry per row; np.nan means "no value
# on this row".
# np.nan is spelled in lower case because the np.NaN alias was removed in
# NumPy 2.0. The stray 16th entry of `word` is gone: every other list has 15
# entries and zip() stops at the shortest one, so it never reached df1.
word = ['this', 'is', 'a', 'test', 'call', 'this', 'is', 'a', 'test', 'call', 'this', 'is ', 'a', 'test', 'call']
level_3_start = [np.nan, np.nan, '<tyre>', '<steering>', np.nan, np.nan, np.nan, np.nan, '<leg>', np.nan, '<clutch>', np.nan, np.nan, '<break>', np.nan]
level_3_end = [np.nan, np.nan, '</tyre>', np.nan, '</steering>', np.nan, np.nan, np.nan, '</leg>', np.nan, np.nan, np.nan, '</clutch>', '</break>', np.nan]
level_2_start = [np.nan, np.nan, '<car>', np.nan, np.nan, np.nan, np.nan, np.nan, '<dog>', np.nan, '<car>', np.nan, np.nan, '<bus>', np.nan]
level_2_end = [np.nan, np.nan, np.nan, np.nan, '</car>', np.nan, np.nan, np.nan, '</dog>', np.nan, np.nan, np.nan, '</car>', '</bus>', np.nan]
level_1_start = [np.nan, np.nan, '<vehicle>', np.nan, np.nan, np.nan, np.nan, np.nan, '<animal>', np.nan, '<vehicle>', np.nan, np.nan, np.nan, np.nan]
level_1_end = [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, '</vehicle>', np.nan, '</animal>', np.nan, np.nan, np.nan, np.nan, '</vehicle>', np.nan]

df1 = pd.DataFrame(
    list(zip(word, level_3_start, level_3_end, level_2_start, level_2_end, level_1_start, level_1_end)),
    columns=['word', 'level_3_start', 'level_3_end', 'level_2_start', 'level_2_end', 'level_1_start', 'level_1_end'],
)

#creating df_temp for processing:
#drop columns that are not important for this problem statement
#(drop() returns a new frame, so df1 itself is left untouched)
df_temp = df1.drop(columns=['word', 'level_1_end', 'level_2_end', 'level_3_end'])

#remove all < and >
for col in ['level_1_start', 'level_2_start', 'level_3_start']:
    df_temp[col] = (
        df_temp[col]
        .str.replace("<", "", regex=False)
        .str.replace(">", "", regex=False)
    )

#drop all rows that don't have any value
#(.copy() after each filter so the column assignments below write to an
#independent frame rather than to a filtered result)
df_temp = df_temp.dropna(how='all').copy()

#forwardfill all level_1 columns
df_temp['level_1_start'] = df_temp['level_1_start'].ffill()

#drop rows that have no data in level_2 and level_3
df_temp = df_temp.dropna(subset=['level_3_start', 'level_2_start'], how='all').copy()

#forwardfill all level_2_start columns
df_temp['level_2_start'] = df_temp['level_2_start'].ffill()

#drop rows that have no data in level_3
df_temp = df_temp.dropna(subset=['level_3_start'], how='all')

#now we have the all data ready for processing
#sort them in level_1, level_2, level_3 order
df_temp = df_temp.sort_values(by=['level_1_start', 'level_2_start', 'level_3_start'])

#build the nested dictionary level_1 -> level_2 -> level_3 -> True.
#setdefault() creates the missing level on first sight and returns the
#existing one otherwise, which covers all three cases:
#  level_1 missing            -> new level_1, level_2 and level_3 entries
#  level_1 present, no level_2 -> new level_2 and level_3 entries
#  level_1 and level_2 present -> new level_3 entry only
#A plain dict is enough, so no conversion back from a defaultdict is needed.
df_dict = {}
level_rows = df_temp[['level_1_start', 'level_2_start', 'level_3_start']].itertuples(index=False, name=None)
for lev_1, lev_2, lev_3 in level_rows:
    df_dict.setdefault(lev_1, {}).setdefault(lev_2, {})[lev_3] = True

print(df_dict)
Output will be as follows:
{'animal':
{'dog': {'leg': True}
},
'vehicle':
{'bus': {'break': True},
'car': {'clutch': True, 'steering': True, 'tyre': True}
}
}