How to create a weighted edgelist directed from a pandas dataframe with weights in two columns? - python-3.x

I have the following pandas DataFrame (df):
>>> import pandas as pd
>>> df = pd.DataFrame([
['A', 'B', '1'],
['A', 'B', '2'],
['B', 'A', '41'],
['A', 'C', '11'],
['C', 'B', '3'],
['B', 'D', '4'],
['D', 'B','51']
], columns=('station_i', 'station_j','UID'))
I used
>>> df2=df.groupby(by=['station_i', 'station_j']).size().to_frame(name = 'counts_ij').reset_index()
to obtain the dataframe df2:
>>> print(df2)
station_i station_j counts_ij
0 A B 2
1 A C 1
2 B A 1
3 B D 1
4 C B 1
5 D B 1
Now, I would like to obtain the dataframe df3, built as shown below, where couples with the same values, but reversed, are dropped and counted in an extra column as shown below:
>>>print(df3)
station_i station_j counts_ij counts_ji
0 A B 2 1
1 A C 1 0
2 C B 1 0
3 B D 1 1
Would really appreciate some suggestions

import pandas as pd
import numpy as np
# find out indices that are reverse duplicated
dupe = pd.concat([
np.maximum(df2.station_i, df2.station_j),
np.minimum(df2.station_i, df2.station_j)
], axis=1).duplicated()
df2[dupe]
station_i station_j counts_ij
2 B A 1
5 D B 1
df2[~dupe]
station_i station_j counts_ij
0 A B 2
1 A C 1
3 B D 1
4 C B 1
# split by dupe, reverse the dupe station and merge with the non dupe
df2[~dupe].merge(
df2[dupe].rename(
columns={'station_i': 'station_j', 'station_j': 'station_i', 'counts_ij': 'counts_ji'}
), how='left'
).fillna(0).astype({'counts_ji': int})
station_i station_j counts_ij counts_ji
0 A B 2 1
1 A C 1 0
2 B D 1 1
3 C B 1 0

Related

Pandas: categorical column and insertion of rows for every category

I seem unable to achieve inserting rows with missing values, while having one column as Categorical.
Assume the following dataframe df, where column B is categorical and categories should appear in the order of 'd', 'b', 'c', 'a'.
df= pd.DataFrame({'A':['i', 'i', 'i', 'j', 'k'], \
'B':pd.Categorical(['d', 'c', 'b','b', 'a'], \
categories= ['d', 'b', 'c', 'a'], \
ordered=True), \
'C':[1, 0, 3 ,2, np.nan]})
I need to convert df into the following format:
A B C
0 i d 1.0
1 i b 0.0
2 i c 3.0
3 i a NaN
4 j d NaN
5 j b 2.0
6 j c NaN
7 j a NaN
8 k d NaN
9 k b NaN
10 k c NaN
11 k a NaN
Thank you in advance!
You could set the dataframe index to column B, this way we can use the reindex later on to fill the missing categorical values for each group. Use groupby column A and select the column C, then apply the reindex function as mention before, using now the desired category sequence. Afterwards, use reset_index to insert the indices (A and B) back into dataframe columns.
import pandas as pd
import numpy as np
df = pd.DataFrame({'A':['i', 'i', 'i', 'j', 'k'], \
'B':pd.Categorical(['d', 'c', 'b','b', 'a'], \
categories= ['d', 'b', 'c', 'a'], \
ordered=True), \
'C':[1, 0, 3 ,2, np.nan]})
print(df)
df = df.set_index('B')
df = df.groupby('A')['C']\
.apply(lambda x: x.reindex(['d', 'b', 'c', 'a']))\
.reset_index()
df.B = pd.Categorical(df.B)
print(df)
Output from df
A B C
0 i d 1.0
1 i b 3.0
2 i c 0.0
3 i a NaN
4 j d NaN
5 j b 2.0
6 j c NaN
7 j a NaN
8 k d NaN
9 k b NaN
10 k c NaN
11 k a NaN

Unique values across columns row-wise in pandas with missing values

I have a dataframe like
import pandas as pd
import numpy as np
df = pd.DataFrame({"Col1": ['A', np.nan, 'B', 'B', 'C'],
"Col2": ['A', 'B', 'B', 'A', 'C'],
"Col3": ['A', 'B', 'C', 'A', 'C']})
I want to get the unique combinations across columns for each row and create a new column with those values, excluding the missing values.
The code I have right now to do this is
def handle_missing(s):
    """Return the sorted unique non-null values of the row Series *s*."""
    return np.unique(s.dropna())

def unique_across_rows(data):
    """Join each row's unique non-null values into a single '_'-separated string.

    Returns a Series aligned with *data*'s index.
    """
    # One call per row; np.unique already sorts, so the joined
    # pieces come out in alphabetical order.
    row_uniques = data.apply(handle_missing, axis=1)
    return row_uniques.apply(
        lambda vals: vals[0] if len(vals) == 1 else '_'.join(vals)
    )
df['Combos'] = unique_across_rows(df)
This returns the expected output:
Col1 Col2 Col3 Combos
0 A A A A
1 NaN B B B
2 B B C B_C
3 B A A A_B
4 C C C C
It seems to me that there should be a more vectorized approach that exists within Pandas to do this: how could I do that?
You can try a simple list comprehension which might be more efficient for larger dataframes:
df['combos'] = ['_'.join(sorted(k for k in set(v) if pd.notnull(k))) for v in df.values]
Or you can wrap the above list comprehension in a more readable function:
def combos():
    """Yield, for each row of the global ``df``, the sorted unique
    non-null values joined with '_'."""
    for row in df.values:
        present = {cell for cell in row if pd.notnull(cell)}
        yield '_'.join(sorted(present))
df['combos'] = list(combos())
Col1 Col2 Col3 combos
0 A A A A
1 NaN B B B
2 B B C B_C
3 B A A A_B
4 C C C C
You can also use agg/apply on axis=1 like below:
df['Combos'] = df.agg(lambda x: '_'.join(sorted(x.dropna().unique())),axis=1)
print(df)
Col1 Col2 Col3 Combos
0 A A A A
1 NaN B B B
2 B B C B_C
3 B A A A_B
4 C C C C
Try (explanation to follow)
df['Combos'] = (df.stack() # this removes NaN values
.sort_values() # so we have A_B instead of B_A in 3rd row
.groupby(level=0) # group by original index
.agg(lambda x: '_'.join(x.unique())) # join the unique values
)
Output:
Col1 Col2 Col3 Combos
0 A A A A
1 NaN B B B
2 B B C B_C
3 B A A A_B
4 C C C C
Fill the NaN with a string placeholder '-'. Create a unique array from the col1, col2, col3 list and remove the placeholder. Join the unique array values with a '-'.
import pandas as pd
import numpy as np
def unique(list1):
    """Return the sorted unique values of *list1*, excluding the '-' placeholder.

    Fixes two defects in the original:
    - ``list1.remove('-')`` deleted only the FIRST placeholder, so a row
      with two or more NaNs leaked '-' into the result;
    - the input list was mutated in place, surprising the caller.
    """
    # Filter out every placeholder without touching the caller's list.
    cleaned = [v for v in list1 if v != '-']
    return np.unique(np.array(cleaned))
df = pd.DataFrame({"Col1": ['A', np.nan, 'B', 'B', 'C'],
"Col2": ['A', 'B', 'B', 'A', 'C'],
"Col3": ['A', 'B', 'C', 'A', 'C']}).fillna('-')
s="-"
for key,row in df.iterrows():
df.loc[key,'combos']=s.join(unique([row.Col1, row.Col2, row.Col3]))
print(df.head())

my pandas dataframe is not filterable by a column condition

I am trying to only show rows where values in column A are greater than 0. I applied the following code but I am not getting the right returned dataframe. Why?
in: df.info()
out:
A non-null int64
B non-null int64
in:df['A']>0
out:
A B
5 1
0 0
Obviously, the second row should NOT show. What is going on here?
The way you wrote the condition it's actually a filter (aka mask or predicate). You can take that filter and apply it to the DataFrame to get the actual rows:
In [1]: from pandas import DataFrame
In [2]: df = DataFrame({'A': range(5), 'B': ['a', 'b', 'c', 'd', 'e']})
In [3]: df
Out[3]:
A B
0 0 a
1 1 b
2 2 c
3 3 d
4 4 e
In [4]: df['A'] > 2
Out[4]:
0 False
1 False
2 False
3 True
4 True
Name: A, dtype: bool
In [5]: df[df['A'] > 2]
Out[5]:
A B
3 3 d
4 4 e
Another way to do the same thing is to use query():
In [6]: df.query('A > 2')
Out[6]:
A B
3 3 d
4 4 e

append one dataframe column value to another dataframe

I have two dataframes. df1 is empty dataframe and df2 is having some data as shown. There are few columns common in both dfs. I want to append df2 dataframe columns data into df1 dataframe's column. df3 is expected result.
I have referred to Python + Pandas + dataframe : couldn't append one dataframe to another, but it is not working. It gives the following error:
ValueError: Plan shapes are not aligned
df1:
Empty DataFrame
Columns: [a, b, c, d, e]
Index: [] `
df2:
c e
0 11 55
1 22 66
df3 (expected output):
a b c d e
0 11 55
1 22 66
tried with append but not getting desired result
import pandas as pd
l1 = ['a', 'b', 'c', 'd', 'e']
l2 = []
df1 = pd.DataFrame(l2, columns=l1)
l3 = ['c', 'e']
l4 = [[11, 55],
[22, 66]]
df2 = pd.DataFrame(l4, columns=l3)
print("concat","\n",pd.concat([df1,df2])) # columns will be inplace
print("merge Nan","\n",pd.merge(df2, df1,how='left', on=l3)) # columns occurence is not preserved
#### Output ####
#concat
a b c d e
0 NaN NaN 11 NaN 55
1 NaN NaN 22 NaN 66
#merge
c e a b d
0 11 55 NaN NaN NaN
1 22 66 NaN NaN NaN
Append seems to work for me. Does this not do what you want?
df1 = pd.DataFrame(columns=['a', 'b', 'c'])
print("df1: ")
print(df1)
df2 = pd.DataFrame(columns=['a', 'c'], data=[[0, 1], [2, 3]])
print("df2:")
print(df2)
print("df1.append(df2):")
print(df1.append(df2, ignore_index=True, sort=False))
Output:
df1:
Empty DataFrame
Columns: [a, b, c]
Index: []
df2:
a c
0 0 1
1 2 3
df1.append(df2):
a b c
0 0 NaN 1
1 2 NaN 3
Have you tried pd.concat ?
pd.concat([df1,df2])

Pandas dataframe merge by function on column names

I have two dataframes.
df_A has columns A__a, B__b, C. (shape 5,3)
df_B has columns A_a, B_b, D. (shape 4,3)
How can I unify them (without having to iterate over all columns) to get one df with columns A,B ? (shape 9,2) - meaning A__a and A_a should be unified to the same column.
I need to use merge with applying the function lambda x: x.replace("_",""). Is it possible?
import pandas as pd
df = pd.DataFrame(np.random.randint(0,5,size=(5, 3)), columns=['A__a', 'B__b', 'C'])
df:
A__a B__b C
0 3 0 2
1 0 3 4
2 0 4 4
3 4 2 1
4 3 4 3
df2:
df2 = pd.DataFrame(np.random.randint(0,4,size=(4, 3)), columns=['A__a', 'B__b', 'D'])
A__a B__b D
0 3 2 0
1 3 1 1
2 0 2 0
3 3 2 0
df3 = pd.concat([df, df2], join='inner', ignore_index=True)
df_final = df3.rename(lambda x: str(x).split("__")[0],axis='columns')
df_final
df_final:
A B
0 3 0
1 0 3
2 0 4
3 4 2
4 3 4
5 3 2
6 3 1
7 0 2
8 3 2
A simple concatenation will do
pd.concat([df_A, df_B], join='outer')[['A', 'B']].copy()
or
pd.concat([df_A, df_B], join='inner')
You have to merge Dataframe using 'outer'
import pandas as pd
import numpy as np
df_A = pd.DataFrame(np.random.randint(10,size=(5,3)), columns=['A','B','C'])
df_B = pd.DataFrame(np.random.randint(10,size=(4,3)), columns=['A','B','D'])
print(df_A.shape,df_B.shape)
#(5, 3) (4, 3)
new_df = df_A.merge(df_B , how= 'outer', on = ['A','B'])[['A','B']]
print(new_df.shape)
#(9,2)
If you cant change the name of the columns in advance and you want to use lambda x: x.replace("_",""), this is a way:
df = pd.concat([df1.rename_axis(lambda x: str(x).replace("_",""),axis='columns'), df2.rename_axis(lambda x: str(x).replace("_",""),axis='columns')], join='inner', ignore_index=True)
Example:
d1 = {'A__a' : ('A', 'B', 'C', 'D', 'E') , 'B__b' : ('a', 'b', 'c', 'd', 'e') ,'C': (1,2,3,4,5)}
df1 = pd.DataFrame(d1)
A__a B__b C
0 A a 1
1 B b 2
2 C c 3
3 D d 4
4 E e 5
d2 = {'A_a' : ('B', 'C', 'D','G') , 'B_b' : ('l','m','n','o') ,'D': (6,7,8,9)}
df2=pd.DataFrame(d2)
A_a B_b D
0 B l 6
1 C m 7
2 D n 8
3 G o 9
Output:
Aa Bb
0 A a
1 B b
2 C c
3 D d
4 E e
5 B l
6 C m
7 D n
8 G o
Alternative with:
df = pd.concat([df1.rename(columns={'A__a':'A', 'B__b':'B'}), df2.rename(columns={'A_a':'A', 'B_b':'B'})], join='inner', ignore_index=True)

Resources