How to traverse a pandas dataframe to form a nested JSON? - python-3.x

I have a pandas dataframe with the following structure. It can be created using the following code:
import pandas as pd
import numpy as np
word = ['this','is','a','test','call','this','is','a','test','call','this','is','a','test','call', np.nan]
level_3_start = [np.nan,np.nan,'<tyre>','<steering>',np.nan,np.nan,np.nan,np.nan,'<leg>',np.nan,'<clutch>',np.nan,np.nan,'<break>',np.nan]
level_3_end = [np.nan,np.nan,'</tyre>',np.nan,'</steering>',np.nan,np.nan,np.nan,'</leg>',np.nan,np.nan,np.nan,'</clutch>','</break>',np.nan]
level_2_start = [np.nan,np.nan,'<car>',np.nan,np.nan,np.nan,np.nan,np.nan,'<dog>',np.nan,'<car>',np.nan,np.nan,'<bus>',np.nan]
level_2_end = [np.nan,np.nan,np.nan,np.nan,'</car>',np.nan,np.nan,np.nan,'</dog>',np.nan,np.nan,np.nan,'</car>','</bus>',np.nan]
level_1_start = [np.nan,np.nan,'<vehicle>',np.nan,np.nan,np.nan,np.nan,np.nan,'<animal>',np.nan,'<vehicle>',np.nan,np.nan,np.nan,np.nan]
level_1_end = [np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,'</vehicle>',np.nan,'</animal>',np.nan,np.nan,np.nan,np.nan,'</vehicle>',np.nan]
df1 = pd.DataFrame(list(zip(word, level_3_start, level_3_end, level_2_start, level_2_end, level_1_start, level_1_end)),
                   columns=['word', 'level_3_start', 'level_3_end', 'level_2_start', 'level_2_end', 'level_1_start', 'level_1_end'])
I want to traverse the dataframe and build a nested JSON structure. The output should look like the one below:
{
    "vehicle": {
        "car": {
            "tyre": True,
            "steering": True,
            "clutch": True
        },
        "bus": {
            "break": True
        }
    },
    "animal": {
        "dog": {
            "leg": True
        }
    }
}
What is the best way to achieve this in pandas?

You are capturing more information than required; the end columns are not needed.
Remove rows that have nothing in them with dropna().
Forward fill the tags and strip < and > from the strings.
Use a comprehension to build the dictionary from the dataframe via to_dict().
df = pd.DataFrame({"word":["this","is","a","test","call","this","is","a","test","call","this","is","a","test","call"],
"level_3_start":["","","<tyre>","<steering>","","","","","<leg>","","<clutch>","","","<break>",""],
"level_3_end":["","","</tyre>","","</steering>","","","","</leg>","","","","</clutch>","</break>",""],
"level_2_start":["","","<car>","","","","","","<dog>","","<car>","","","<bus>",""],
"level_2_end":["","","","","</car>","","","","</dog>","","","","</car>","</bus>",""],
"level_1_start":["","","<vehicle>","","","","","","<animal>","","<vehicle>","","","",""],
"level_1_end":["","","","","","","</vehicle>","","</animal>","","","","","</vehicle>",""]})
# cleanup
df = df.replace({"":np.nan}).dropna(subset=[c for c in df.columns if c!="word"], how="all")
for c in [c for c in df.columns if "start" in c]:
df[c].fillna(method="ffill", inplace=True)
df[c] = df[c].str.replace("<","")
df[c] = df[c].str.replace(">","")
dfd = df.loc[:, [c for c in df.columns if "level" in c]].drop_duplicates().to_dict(orient="records")
result = {
    d["level_1_start"]: {
        d2["level_2_start"]: {
            d3["level_3_start"]: True
            for d3 in dfd
            if d3["level_1_start"] == d["level_1_start"] and d3["level_2_start"] == d2["level_2_start"]
        }
        for d2 in dfd if d2["level_1_start"] == d["level_1_start"]
    }
    for d in dfd
}
Output:
{'vehicle': {'car': {'tyre': True, 'steering': True, 'clutch': True},
             'bus': {'break': True}},
 'animal': {'dog': {'leg': True}}}

To get the final result, your data has to go through a 3-step process:
Step 1: remove all columns that are not required for processing.
Step 2: clean the data to remove the tags and sort it in level_1, level_2, level_3 order.
Step 3: create the nested dictionary.
Here's how I have done it, with each section commented to show clearly what it is doing.
import pandas as pd
import numpy as np
import collections
word = ['this','is','a','test','call','this','is','a','test','call','this','is','a','test','call', np.nan]
level_3_start = [np.nan,np.nan,'<tyre>','<steering>',np.nan,np.nan,np.nan,np.nan,'<leg>',np.nan,'<clutch>',np.nan,np.nan,'<break>',np.nan]
level_3_end = [np.nan,np.nan,'</tyre>',np.nan,'</steering>',np.nan,np.nan,np.nan,'</leg>',np.nan,np.nan,np.nan,'</clutch>','</break>',np.nan]
level_2_start = [np.nan,np.nan,'<car>',np.nan,np.nan,np.nan,np.nan,np.nan,'<dog>',np.nan,'<car>',np.nan,np.nan,'<bus>',np.nan]
level_2_end = [np.nan,np.nan,np.nan,np.nan,'</car>',np.nan,np.nan,np.nan,'</dog>',np.nan,np.nan,np.nan,'</car>','</bus>',np.nan]
level_1_start = [np.nan,np.nan,'<vehicle>',np.nan,np.nan,np.nan,np.nan,np.nan,'<animal>',np.nan,'<vehicle>',np.nan,np.nan,np.nan,np.nan]
level_1_end = [np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,'</vehicle>',np.nan,'</animal>',np.nan,np.nan,np.nan,np.nan,'</vehicle>',np.nan]
df1 = pd.DataFrame(list(zip(word, level_3_start, level_3_end, level_2_start, level_2_end, level_1_start, level_1_end)),
                   columns=['word', 'level_3_start', 'level_3_end', 'level_2_start', 'level_2_end', 'level_1_start', 'level_1_end'])
#create df_temp for processing (copy, so df1 is left untouched)
df_temp = df1.copy()
#drop columns that are not important for this problem statement
df_temp = df_temp.drop(columns=['word','level_1_end','level_2_end','level_3_end'])
#remove all < and >
df_temp['level_1_start'] = df_temp['level_1_start'].str.replace("<","").str.replace(">","")
df_temp['level_2_start'] = df_temp['level_2_start'].str.replace("<","").str.replace(">","")
df_temp['level_3_start'] = df_temp['level_3_start'].str.replace("<","").str.replace(">","")
#drop all rows that don't have any value
df_temp.dropna(how='all', inplace = True)
#forwardfill all level_1 columns
df_temp['level_1_start'] = df_temp['level_1_start'].ffill()
#drop rows that have no data in level_2 and level_3
df_temp = df_temp.dropna(subset=['level_3_start','level_2_start'],how='all')
#forwardfill all level_2_start columns
df_temp['level_2_start'] = df_temp['level_2_start'].ffill()
#drop rows that have no data in level_3
df_temp = df_temp.dropna(subset=['level_3_start'],how='all')
#now we have the all data ready for processing
#sort them in level_1, level_2, level_3 order
df_temp = df_temp.sort_values(by=['level_1_start', 'level_2_start','level_3_start'])
#to create nested dictionary, you need to use collections.defaultdict
df_dict = collections.defaultdict(dict)
#iterate through the dataframe. each row will have a unique record for level_3
for idx, row in df_temp.iterrows():
    lev_1 = row['level_1_start']
    lev_2 = row['level_2_start']
    lev_3 = row['level_3_start']
    #if level_1 does not exist, create new entry for level_1, level_2, & level_3 (ex: animal does not exist)
    #if level_1 exists but no level_2, create new entry for level_2 & level_3 (ex: car does not exist but bus exists)
    #if level_1 and level_2 exist, then create a new entry for level_3 (ex: vehicle, car exist, but tyre does not)
    if lev_1 in df_dict:
        if lev_2 in df_dict[lev_1]:
            df_dict[lev_1][lev_2][lev_3] = True
        else:
            df_dict[lev_1][lev_2] = {lev_3: True}
    else:
        df_dict[lev_1] = {lev_2: {lev_3: True}}
#convert collection back to normal dictionary
df_dict = dict(df_dict)
print(df_dict)
Output will be as follows:
{'animal': {'dog': {'leg': True}},
 'vehicle': {'bus': {'break': True},
             'car': {'clutch': True, 'steering': True, 'tyre': True}}}
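Both answers produce a plain Python dict rather than JSON text. If you need actual JSON output, the standard library handles the conversion; a minimal follow-up, assuming df_dict is the dictionary built above:
import json

# Python's True serializes to JSON's true
json_text = json.dumps(df_dict, indent=2, sort_keys=True)
print(json_text)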

Related

Adding new columns in a dataframe gives length mismatch error

From a csv file (initial.csv):
"Id","Name"
1,"CLO"
2,"FEV"
2,"GEN"
3,"HYP"
4,"DIA"
1,"COL"
1,"EOS"
4,"GAS"
1,"AEK"
I am grouping by the Id column and aggregating the Name column values so that each unique Id has all of its Name values appended on the same row (new.csv):
"Id","Name"
1,"CLO","COL","EOS","AEK"
2,"FEV","GEN"
3,"HYP"
4,"DIA","GAS"
Now some rows have extra Name values, for which I want to append corresponding columns according to the maximum count of Name values that exists on any row, i.e.
"Id","Name","Name2","Name3","Name4"
1,"CLO","COL","EOS","AEK"
2,"FEV","GEN"
3,"HYP"
4,"DIA","GAS"
I do not understand how I can add new columns to the dataframe to match the data.
Below is my code:
import pandas as pd

df = pd.read_csv('initial.csv', delimiter=',')
unique_ids_list = df['Id'].unique()
max_names_count = 0
for id in unique_ids_list:
    mask = df['Id'] == id
    names_count = len(df[mask])
    if names_count > max_names_count:
        max_names_count = names_count
group_by_id = df.groupby(["Id"]).agg({"Name": ','.join})
# Create new columns 'Id', 'Name', 'Name2', 'Name3', 'Name4'
new_column_names = ["Id", "Name"] + ['Name' + str(i) for i in range(2, max_names_count+1)]
group_by_id.columns = new_column_names  # <-- ValueError: Length mismatch: Expected axis has 1 elements, new values have 5 elements
group_by_id.to_csv('new.csv', encoding='utf-8')
Try:
df = pd.read_csv("initial.csv")
df_out = (
df.groupby("Id")["Name"]
.agg(list)
.to_frame()["Name"]
.apply(pd.Series)
.rename(columns=lambda x: "Name" if x == 0 else "Name{}".format(x + 1))
.reset_index()
)
df_out.to_csv("out.csv", index=False)
Creates out.csv:
Id,Name,Name2,Name3,Name4
1,CLO,COL,EOS,AEK
2,FEV,GEN,,
3,HYP,,,
4,DIA,GAS,,
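As a side note, .apply(pd.Series) can be slow on large frames. A sketch of a faster equivalent (same assumptions as above: initial.csv with Id and Name columns) that expands the lists directly:
import pandas as pd

df = pd.read_csv("initial.csv")
# one list of names per Id
s = df.groupby("Id")["Name"].agg(list)
# pd.DataFrame pads shorter lists with NaN, giving one column per list position
df_out = pd.DataFrame(s.tolist(), index=s.index)
df_out.columns = ["Name" if i == 0 else "Name{}".format(i + 1) for i in df_out.columns]
df_out.reset_index().to_csv("out.csv", index=False)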

Filter dataframe based on groupby sum()

I want to filter my dataframe based on a groupby sum(). I am looking for rows where the amounts for a specific date sum to zero.
I have solved this by creating a for loop, but I suspect this will hurt performance if the dataframe is large.
It also seems clunky.
newdf = pd.DataFrame()
newdf['name'] = ('leon','eurika','monica','wian')
newdf['surname'] = ('swart','swart','swart','swart')
newdf['birthdate'] = ('14051981','198001','20081012','20100621')
newdf['tdate'] = ('13/05/2015','14/05/2015','15/05/2015', '13/05/2015')
newdf['tamount'] = (100.10, 111.11, 123.45, -100.10)
df = newdf.groupby(['tdate'])[['tamount']].sum().reset_index()
df2 = df.loc[df["tamount"] == 0, "tdate"]
df3 = pd.DataFrame()
for i in df2:
    # DataFrame.append was removed in pandas 2.0; concat does the same job
    df3 = pd.concat([df3, newdf.loc[newdf["tdate"] == i]])
print(df3)
The code above outputs the two rows whose amounts cancel to zero when combined on the same date:
name surname birthdate tdate tamount
0 leon swart 1981-05-14 13/05/2015 100.1
3 wian swart 2010-06-21 13/05/2015 -100.1
Just use basic numpy :)
import numpy as np

df = newdf.groupby(['tdate'])[['tamount']].sum().reset_index()
dates = df['tdate'][np.where(df['tamount'] == 0)[0]]
newdf[np.isin(newdf['tdate'], dates)]
Hope this helps; let me know if you have any questions.
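Staying within pandas, a transform-based alternative (a sketch, not part of the answer above) avoids the numpy indexing entirely: groupby().transform('sum') broadcasts each date's total back onto its rows, so a single boolean mask does the filtering:
# keep only the rows whose date's total amount is zero
mask = newdf.groupby('tdate')['tamount'].transform('sum') == 0
df3 = newdf[mask]
print(df3)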

How to merge two dataframes and return data from another column in a new column only if there is a match?

I have two dataframes that look like this:
df1:
id
1
2
df2:
id value
2 a
3 b
How do I merge these two dataframes and only return the data from the value column in a new column if there is a match?
new_merged_df
id value new_value
1
2 a a
3 b
You can try this using @JJFord3's setup:
import pandas
df1 = pandas.DataFrame(index=[1,2])
df2 = pandas.DataFrame({'value' : ['a','b']},index=[2,3])
#Use isin to create new_value
df2['new_value'] = df2['value'].where(df2.index.isin(df1.index))
#Use reindex with union to rebuild dataframe with both indexes
df2.reindex(df1.index.union(df2.index))
Output:
value new_value
1 NaN NaN
2 a a
3 b NaN
import pandas
df1 = pandas.DataFrame(index=[1,2])
df2 = pandas.DataFrame({'value' : ['a','b']},index=[2,3])
new_merged_df_outer = df1.merge(df2,how='outer',left_index=True,right_index=True)
new_merged_df_inner = df1.merge(df2,how='inner',left_index=True,right_index=True)
new_merged_df_inner = new_merged_df_inner.rename(columns={'value':'new_value'})
new_merged_df = new_merged_df_outer.merge(new_merged_df_inner,how='left',left_index=True,right_index=True)
First, create an outer merge to keep all indexes.
Then create an inner merge to only get the overlap.
Then merge the inner merge back to the outer merge to get the desired column setup.
You can use a full outer join. (This answer uses Spark Datasets in Scala rather than pandas.)
Let's model your data with case classes:
case class MyClass1(id: String)
case class MyClass2(id: String, value: String)
// this one for the result type
case class MyClass3(id: String, value: Option[String] = None, value2: Option[String] = None)
Creating some inputs:
val input1: Dataset[MyClass1] = ...
val input2: Dataset[MyClass2] = ...
Joining your data:
import spark.implicits._
val joined = input1.as("1").joinWith(input2.as("2"), $"1.id" === $"2.id", "full_outer")
joined map {
  case (left, null) if left != null => MyClass3(left.id)
  case (null, right) if right != null => MyClass3(right.id, Some(right.value))
  case (left, right) => MyClass3(left.id, Some(right.value), Some(right.value))
}
DataFrame.merge has an indicator parameter:
If True, adds a column to output DataFrame called “_merge” with information on the source of each row.
This can be used to check whether there is a match.
import pandas as pd
df1 = pd.DataFrame(index=[1,2])
df2 = pd.DataFrame({'value' : ['a','b']},index=[2,3])
# creates a new column `_merge` with values `right_only`, `left_only` or `both`
merged = df1.merge(df2, how='outer', right_index=True, left_index=True, indicator=True)
merged['new_value'] = merged.loc[(merged['_merge'] == 'both'), 'value']
merged = merged.drop('_merge', axis=1)
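For reference, this is what the indicator column holds before it is dropped (a quick sketch with the same frames):
import pandas as pd

df1 = pd.DataFrame(index=[1, 2])
df2 = pd.DataFrame({'value': ['a', 'b']}, index=[2, 3])
merged = df1.merge(df2, how='outer', right_index=True, left_index=True, indicator=True)
print(merged)
#   value      _merge
# 1   NaN   left_only
# 2     a        both
# 3     b  right_only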
Use merge and isin:
df1 = pd.DataFrame({'id': [1, 2]})
df2 = pd.DataFrame({'id': [2, 3], 'value': ['a', 'b']})
df = df1.merge(df2, on='id', how='outer')
id_value = df2.loc[df2['id'].isin(df1.id.tolist()), 'id'].unique()
mask = df['id'].isin(id_value)
df.loc[mask, 'new_value'] = df.loc[mask, 'value']
# alternative: df['new_value'] = np.where(mask, df['value'], np.nan)
print(df)
id value new_value
0 1 NaN NaN
1 2 a a
2 3 b NaN

Subtract a single value from columns in pandas

I have two data frames, df and df_test. I am trying to create a new dataframe for each df_test row that will include the difference between the x coordinates and the y coordinates. I would also like to create a new column that gives the magnitude of this distance between objects. Below is my code.
import pandas as pd
import numpy as np
# Create Dataframe
index_numbers = np.linspace(0, 10, 11, dtype=int)
index_ = ['OP_%s' % number for number in index_numbers]
header = ['X', 'Y', 'D']
# print(index_)
data = np.round(np.random.uniform(low=0, high=10, size=(len(index_), 3)), decimals=0)
# print(data)
df = pd.DataFrame(data=data, index=index_, columns=header)
df_test = df.sample(3)
# print(df)
# print(df_test)
for index, row in df_test.iterrows():
    print(index)
    print(row)
    df_(index) = df
    df_(index)['X'] = df['X'] - df_test['X'][row]
    df_(index)['Y'] = df['Y'] - df_test['Y'][row]
    df_(index)['Dist'] = np.sqrt(df_(index)['X']**2 + df_(index)['Y']**2)
    print(df_(index))
Better for loop:
for index, row in df_test.iterrows():
    # print(index)
    # print(row)
    # print("df_{0}".format(index))
    df_temp = df.copy()
    df_temp['X'] = df_temp['X'] - df_test['X'][index]
    df_temp['Y'] = df_temp['Y'] - df_test['Y'][index]
    df_temp['Dist'] = np.sqrt(df_temp['X']**2 + df_temp['Y']**2)
    print(df_temp)
I have written a for loop to run through each row of the df_test dataframe and "try" to create the columns. The (index) in each loop is the name of the new data frame based on the test row used. Once each dataframe is created with the modified and new columns, I would need to save the data frames to a dictionary. The new loop produces each of the new dataframes I need, but what is the best way to save each new dataframe? Any help in creating these columns would be greatly appreciated.
Please comment with any questions so that I can make it easier to understand, if need be.
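One straightforward way to save the frames (a minimal sketch, assuming the df and df_test frames built above, with each result keyed by its df_test row label):
import numpy as np

# collect one shifted dataframe per df_test row, keyed by that row's index label
results = {}
for index, row in df_test.iterrows():
    df_temp = df.copy()
    df_temp['X'] = df_temp['X'] - row['X']
    df_temp['Y'] = df_temp['Y'] - row['Y']
    df_temp['Dist'] = np.sqrt(df_temp['X']**2 + df_temp['Y']**2)
    results[index] = df_temp

# look up a single frame by the sampled row's label, e.g. results['OP_3']
# (the exact labels depend on which rows df.sample(3) picked)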

Calling a data frame via a string

I have a list of countries such as:
country = ["Brazil", "Chile", "Colombia", "Mexico", "Panama", "Peru", "Venezuela"]
I created data frames using the names from the country list:
for c in country:
    c = pd.read_excel(str(c + ".xls"), skiprows=1)
    c = pd.to_datetime(c.Date, infer_datetime_format=True)
    c = c[["Date", "spreads"]]
Now I want to merge all the country dataframes using the Date column as the key. The idea is to create a loop like the following:
df = Brazil  # the first dataframe, corresponding to the first element of the country list
for i in range(len(country)-1):
    df = df.merge(country[i+1], on="Date", how="inner")
df.set_index("Date", inplace=True)
I got the error ValueError: can not merge DataFrame with instance of type <class 'str'>. It seems Python is not resolving dataframes from the names in the country list. How can I call those data frames starting from the country list?
Thanks, masters!
Your loop doesn't modify the contents of the country list, so country is still a list of strings.
Consider building a new list of dataframes and looping over that:
country_dfs = []
for c in country:
    df = pd.read_excel(c + ".xls", skiprows=1)
    # parse the Date column in place (assign pd.to_datetime's result back to
    # the column, not to the whole frame)
    df["Date"] = pd.to_datetime(df["Date"])
    df = df[["Date", "spreads"]]
    # add the new dataframe to our list of dataframes
    country_dfs.append(df)
Then, to merge:
merged_df = country_dfs[0]
for df in country_dfs[1:]:
    merged_df = merged_df.merge(df, on='Date', how='inner')
merged_df.set_index('Date', inplace=True)
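The pairwise loop can also be written as a single fold. A sketch using functools.reduce, equivalent to the loop above (not part of the original answer):
from functools import reduce

# fold the list of dataframes into one frame, inner-joining on Date at each step
merged_df = reduce(lambda left, right: left.merge(right, on='Date', how='inner'), country_dfs)
merged_df.set_index('Date', inplace=True)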
