Using pandas to flatten a dict - python-3.x

data = [{'name': 'Test Item1',
         'column_values': [{'title': 'col2', 'text': 'Oladimeji Olaolorun'},
                           {'title': 'col3', 'text': 'Working on it'},
                           {'title': 'col4', 'text': '2019-09-17'},
                           {'title': 'col5', 'text': '1'}],
         'group': {'title': 'Group 1'}},
        {'name': 'Test Item2',
         'column_values': [{'title': 'col2', 'text': 'Lucie Phillips'},
                           {'title': 'col3', 'text': 'Done'},
                           {'title': 'col4', 'text': '2019-09-20'},
                           {'title': 'col5', 'text': '2'}],
         'group': {'title': 'Group 1'}},
        {'name': 'Test Item3',
         'column_values': [{'title': 'col2', 'text': 'David Binns'},
                           {'title': 'col3', 'text': None},
                           {'title': 'col4', 'text': '2019-09-25'},
                           {'title': 'col5', 'text': '3'}],
         'group': {'title': 'Group 1'}},
        {'name': 'Item 4',
         'column_values': [{'title': 'col2', 'text': 'Lucie Phillips'},
                           {'title': 'col3', 'text': 'Stuck'},
                           {'title': 'col4', 'text': '2019-09-06'},
                           {'title': 'col5', 'text': '4'}],
         'group': {'title': 'Group 2'}},
        {'name': 'Item 5',
         'column_values': [{'title': 'col2', 'text': 'David Binns'},
                           {'title': 'col3', 'text': 'Done'},
                           {'title': 'col4', 'text': '2019-09-28'},
                           {'title': 'col5', 'text': '5'}],
         'group': {'title': 'Group 2'}},
        {'name': 'item 6',
         'column_values': [{'title': 'col2', 'text': 'Lucie Phillips'},
                           {'title': 'col3', 'text': 'Done'},
                           {'title': 'col4', 'text': '2020-03-05'},
                           {'title': 'col5', 'text': '76'}],
         'group': {'title': 'Group 2'}}]
I'm currently extracting data from Monday.com's API; the call returns the list of dicts above, and I'm trying to find the best way to flatten it into a DataFrame.
I'm currently using json_normalize(results['data']['boards'][0]['items']), but the nested column_values lists stay packed into a single column rather than being spread out.
The desired output is a flat table with one row per item and one column per title.
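For reference, a minimal sketch (assuming data is the list above and a recent pandas) of how json_normalize can flatten this directly: record_path expands the nested column_values list and meta carries the item-level fields along.
import pandas as pd

# sketch: one row per column_values entry, with name and group.title as meta
flat = pd.json_normalize(data, record_path='column_values',
                         meta=['name', ['group', 'title']])
# pivot the titles into columns, one row per (name, group.title) pair
flat.pivot(index=['name', 'group.title'], columns='title', values='text')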

Using the glom module, it becomes easy to extract the required 'text' keys from the nested list. Read the data into a pandas DataFrame, split the names column, and finally merge it back into the parent DataFrame.
import pandas as pd
from glom import glom

spec = {'names': ('column_values', ['text']),
        'group': 'group.title',
        'Name': 'name'}

# apply the spec to every entry in the list from the question
M = glom(data, [spec])

The function below replaces each None entry with the string 'None':
def replace_none(val_list):
    return ['None' if v is None else v for v in val_list]

for i in M:
    i['names'] = replace_none(i['names'])

df = pd.DataFrame(M)
df_split = df['names'].str.join(',').str.split(',', expand=True).add_prefix('Col')
df = df.drop('names', axis=1)
pd.concat([df, df_split], axis=1)
group Name Col0 Col1 Col2 Col3
0 Group 1 Test Item1 Oladimeji Olaolorun Working on it 2019-09-17 1
1 Group 1 Test Item2 Lucie Phillips Done 2019-09-20 2
2 Group 1 Test Item3 David Binns None 2019-09-25 3
3 Group 2 Item 4 Lucie Phillips Stuck 2019-09-06 4
4 Group 2 Item 5 David Binns Done 2019-09-28 5
5 Group 2 item 6 Lucie Phillips Done 2020-03-05 76
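One caveat about the join/split step above: it would break if any text value contained a comma. A sketch of a safer expansion that builds the columns straight from the lists (same df as above, before the drop):
# no join/split round-trip; each list becomes one row of Col0..Col3
df_split = pd.DataFrame(df['names'].tolist(), index=df.index).add_prefix('Col')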
Update: all of the code above is unnecessary. The code below is simpler, less verbose, and clearer.
d = []
for ent in data:
    for entry in ent['column_values']:
        # tag each column entry with its parent item's name and group title
        entry.update({'name': ent['name']})
        entry.update({'group': ent['group']['title']})
        d.append(entry)

res = pd.DataFrame(d)
res.set_index(['name', 'group', 'title']).unstack()
text
title col2 col3 col4 col5
name group
Item 4 Group 2 Lucie Phillips Stuck 2019-09-06 4
Item 5 Group 2 David Binns Done 2019-09-28 5
Test Item1 Group 1 Oladimeji Olaolorun Working on it 2019-09-17 1
Test Item2 Group 1 Lucie Phillips Done 2019-09-20 2
Test Item3 Group 1 David Binns None 2019-09-25 3
item 6 Group 2 Lucie Phillips Done 2020-03-05 76
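A design note on the loop above: entry.update mutates the dicts inside the original response in place. A sketch of a comprehension that builds new dicts instead, leaving data untouched:
# same flatten, but without mutating the source dicts
d = [{**entry, 'name': ent['name'], 'group': ent['group']['title']}
     for ent in data for entry in ent['column_values']]
res = pd.DataFrame(d)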

There are many ways to do this, but this is the one I like most; you can adapt the dictionary to your needs.
In the code below, we delete the unnecessary data and update the keys and values as needed; the resulting dicts can then be converted into a DataFrame.
import pandas as pd

# d is the list of dicts from the question
for i in range(len(d)):
    item = d[i]
    # promote each {'title': ..., 'text': ...} pair to a top-level key
    for value in item['column_values']:
        item[value['title']] = value['text']
    item['group'] = item['group']['title']
    del item['column_values']

data = pd.DataFrame(d)
data.head()

I think this will help.
First, convert each list into a dict with title as key and text as value, then convert it into a Series; applying this across the column builds a DataFrame:
import pandas as pd

def solve(list_d: list) -> pd.Series:
    data = dict()
    for item in list_d:
        # take each entry in the list: title becomes the key, text the value
        data[item['title']] = item['text']
    return pd.Series(data)

# expand the nested lists into columns, then join back the other columns
df['column_values'].apply(solve).join(df)
Drop the unnecessary columns and your dataset is ready; a sketch of that cleanup follows. If you have any difficulty understanding this, feel free to ping me.
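For instance, the cleanup might look like this (a sketch; it assumes df still holds the raw column_values and nested group columns from the question):
# join the expanded columns back, then drop/flatten the nested originals
result = df['column_values'].apply(solve).join(df).drop(columns=['column_values'])
result['group'] = result['group'].map(lambda g: g['title'])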

Related

De-duplication with merge of data

I have a dataset with duplicates, triplicates and more, and I want to keep only one record per unique id, merging their data, for example:
id  name  address  age  city
1   Alex  123,blv
1   Alex           13
3   Alex           24   Florida
1   Alex                Miami
Merging data using the id field:
Output:
id  name  address  age  city
1   Alex  123,blv  13   Miami
3   Alex           24   Florida
I've changed the code a bit from this answer.
Code to create the initial dataframe:
import pandas as pd
import numpy as np

d = {'id': [1, 1, 3, 1],
     'name': ["Alex", "Alex", "Alex", "Alex"],
     'address': ["123,blv", None, None, None],
     'age': [None, 13, 24, None],
     'city': [None, None, "Florida", "Miami"]}
df = pd.DataFrame(data=d, index=d["id"])
print(df)
Output:
id name address age city
1 1 Alex 123,blv NaN None
1 1 Alex None 13.0 None
3 3 Alex None 24.0 Florida
1 1 Alex None NaN Miami
Aggregation code:
def get_notnull(x):
    # return the group's non-null values, or NaN if there are none
    if x.notnull().any():
        return x[x.notnull()]
    else:
        return np.nan

aggregation_functions = {'name': 'first',
                         'address': get_notnull,
                         'age': get_notnull,
                         'city': get_notnull}
df = df.groupby(df['id']).aggregate(aggregation_functions)
print(df)
Output:
name address age city
id
1 Alex 123,blv 13.0 Miami
3 Alex NaN 24.0 Florida
(
    df
    .reset_index(drop=True)   # set a unique index for each record
    .groupby('id')            # group by the 'id' column
    .agg(
        # return the first non-NA/None value for each column
        lambda s: s.get(s.first_valid_index())
    )
    .reset_index()            # get back the 'id' value for each record
)
P.S. As an option:
df.replace([None, ''], pd.NA).groupby('id').first().reset_index()

Filter integer column endswith hundreds in Python

Given a small dataset df as follows:
[{'id': 110000, 'name': 'Derek Wood'},
{'id': 110101, 'name': 'Thomas Butler'},
{'id': 110105, 'name': 'Nicholas Crawford'},
{'id': 120000, 'name': 'Brian Jenkins'},
{'id': 120101, 'name': 'Eric Stokes'},
{'id': 220000, 'name': 'Christopher Mccarty'},
{'id': 220100, 'name': 'Christian Griffith'},
{'id': 220102, 'name': 'Antonio Webb'}]
or:
id name
0 110000 Derek Wood
1 110101 Thomas Butler
2 110105 Nicholas Crawford
3 120000 Brian Jenkins
4 120101 Eric Stokes
5 220000 Christopher Mccarty
6 220100 Christian Griffith
7 220102 Antonio Webb
How could I filter for ids that end with hundreds (i.e., are divisible by 100)? The expected result will look like:
id name
0 110000 Derek Wood
3 120000 Brian Jenkins
5 220000 Christopher Mccarty
6 220100 Christian Griffith
My trial code works, but I'm looking for an alternative solution that avoids converting the dtype of id:
df['id'] = df['id'].astype(str)
df[df['id'].str.endswith('00')]
Try using the modulo operator, %, which returns the remainder after division. For your use case, you want the rows where id divided by 100 leaves a remainder of 0.
condition = (df["id"] % 100 == 0)
resulted_df = df[condition]
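For reference, the same filter can also be written with query, which likewise leaves the dtype of id untouched:
# equivalent one-liner; id stays an integer column
resulted_df = df.query("id % 100 == 0")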

Python - Transpose/Pivot a column based based on a different column

I searched it and indeed I found a lot of similar questions but none of those seemed to answer my case.
I have a pd Dataframe which is a joined table consist of products and the countries in which they are sold.
It's 3000 rows and 50 columns in size.
I'm uploading a photo (only part of the df) of the current situation and of the expected result I want to achieve.
I want to pivot the 'Country name' values into separate columns, grouped by 'Product code' and 'Product name'. Please note that the number of new country columns is not fixed (some products have 3 countries, some 40).
Thank you!
Use .cumcount() to number the countries within each product.
Then use .pivot() to get your dataframe into the right shape:
df = pd.DataFrame({
    'Country': ['NL', 'Poland', 'Spain', 'Sweden', 'China', 'Egypt'],
    'Product Code': ['123', '123', '115', '115', '117', '118'],
    'Product Name': ['X', 'X', 'Y', 'Y', 'Z', 'W'],
})

df['cumcount'] = df.groupby(['Product Code', 'Product Name'])['Country'].cumcount() + 1
df_pivot = df.pivot(
    index=['Product Code', 'Product Name'],
    columns='cumcount',
    values='Country',
).add_prefix('country_')
Resulting dataframe:
cumcount country_1 country_2
Product Code Product Name
115 Y Spain Sweden
117 Z China NaN
118 W Egypt NaN
123 X NL Poland
Try this:
df_out = df.set_index(['Product code',
                       'Product name',
                       df.groupby('Product code').cumcount() + 1]).unstack()
df_out.columns = [f'Country_{j}' for _, j in df_out.columns]
df_out.reset_index()
Output:
Product code Product name Country_1 Country_2 Country_3
0 AAA115 Y Sweden China NaN
1 AAA117 Z Egypt Greece NaN
2 AAA118 W France Italy NaN
3 AAA123 X Netherlands Poland Spain
Details:
Reshape the dataframe with set_index and unstack, using cumcount to create the country columns. Then flatten the MultiIndex header using a list comprehension.

How to insert the last row of each group after each rows

My need is to duplicate the last row of each id group (the row with max num) after each row of the same group.
import pandas as pd
data = [{'id': 110, 'val1': 'A', 'num': 0},
{'id': 110, 'val1': 'B', 'num': 1},
{'id': 110, 'val1': 'C', 'num': 2},
{'id': 220, 'val1': 'E', 'num': 0},
{'id': 220, 'val1': 'F', 'num': 1},
{'id': 220, 'val1': 'G', 'num': 2},
{'id': 220, 'val1': 'X', 'num': 3},
{'id': 300, 'val1': 'H', 'num': 0},
{'id': 300, 'val1': 'I', 'num': 1}]
df = pd.DataFrame(data)
df
My dataframe is created above. What I'm looking for: each row of a group followed by that group's last row, as in the output below.
Here is one way: merge combined with wide_to_long. The drop_duplicates call assumes the data frame is well ordered; if not, use sort_values first (see the sketch after the output).
s = df.merge(df.drop_duplicates('id', keep='last'), on='id').query('val1_x != val1_y').reset_index()
newdf = pd.wide_to_long(s, ['val1', 'num'], i=['index', 'id'], j='drop', suffix='\\w+').\
    reset_index('id').reset_index(drop=True)
newdf
id val1 num
0 110 A 0
1 110 C 2
2 110 B 1
3 110 C 2
4 220 E 0
5 220 X 3
6 220 F 1
7 220 X 3
8 220 G 2
9 220 X 3
10 300 H 0
11 300 I 1
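If the frame isn't already ordered, the preliminary sort could look like this (a sketch; it assumes num defines the order within each id):
# ensure drop_duplicates(keep='last') really picks the max-num row per id
df = df.sort_values(['id', 'num']).reset_index(drop=True)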

how to make multiple columns into list of key value in python

I have the data frame below and I want to create a list of key/value pairs from its columns. How can I do this in Python?
df=
city code qty1 type
hyd 1 10 a
hyd 2 12 b
ban 2 15 c
ban 4 25 d
pune 1 10 e
pune 3 12 f
I want to create a new data frame as below:
df1 =
city list
hyd [{"1":"10","type":"a"},{"2":"12","type":"b"}]
ban [{"2":"15","type":"c"},{"4":"25","type":"d"}]
pune [{"1":"10","type":"e"},{"3":"12","type":"f"}]
defaultdict
from collections import defaultdict

d = defaultdict(list)
for t in df.itertuples():
    # one {code: qty1, 'type': type} dict per row, grouped by city
    d[t.city].append({t.code: t.qty1, 'type': t.type})

pd.Series(d).rename_axis('city').to_frame('list')
list
city
ban [{2: 15, 'type': 'c'}, {4: 25, 'type': 'd'}]
hyd [{1: 10, 'type': 'a'}, {2: 12, 'type': 'b'}]
pune [{1: 10, 'type': 'e'}, {3: 12, 'type': 'f'}]
groupby
pd.Series([
    {c: q, 'type': t}
    for c, q, t in zip(df.code, df.qty1, df.type)
]).groupby(df.city).apply(list).to_frame('list')
list
city
ban [{2: 15, 'type': 'c'}, {4: 25, 'type': 'd'}]
hyd [{1: 10, 'type': 'a'}, {2: 12, 'type': 'b'}]
pune [{1: 10, 'type': 'e'}, {3: 12, 'type': 'f'}]
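If the string keys and values shown in the desired df1 are needed (the outputs above keep the original int dtypes), casting before building the dicts is one option; a sketch based on the groupby variant:
# cast code and qty1 to str so the dicts match the desired output exactly
pd.Series([
    {c: q, 'type': t}
    for c, q, t in zip(df.code.astype(str), df.qty1.astype(str), df.type)
]).groupby(df.city).apply(list).to_frame('list')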
