Pandas, concatenating values of columns. - python-3.x

I have found answers to this question on here before, but none of them seem to work for me. Right now I have a data frame with a list of clients and their address. However, each address is separated into many columns and i'm trying to put them all under one.
The code I have so far read as so:
data1_df['Address'] = data1_df['Address 1'].map(str) + ", " + data1_df['Address 2'].map(str) + ", " + data1_df['Address 3'].map(str) + ", " + data1_df['city'].map(str) + ", " + data1_df['city'].map(str) + ", " + data1_df['Province/State'].map(str) + ", " + data1_df['Country'].map(str) + ", " + data1_df['Postal Code'].map(str)
However, the error I get is:
TypeError: Unary plus expects numeric dtype, not object
I'm not sure why it's not accepting the strings as they are and using the + operator. Shouldn't the plus accommodate objects?

Hopefully you'll find this example helpful:
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1,2,3],
'B': list('ABC'),
'C': [4,5,np.nan],
'D': ['One', np.nan, 'Three']})
addColumns = ['B', 'C', 'D']
df['Address'] = df[addColumns].astype(str).apply(lambda x: ', '.join([i for i in x if i != 'nan']), axis=1)
df
# A B C D Address
#0 1 A 4.0 One A, 4.0, One
#1 2 B 5.0 NaN B, 5.0
#2 3 C NaN Three C, Three
The above will work as str representation of NaN is nan.
Or you can make it with filling NaN with empty strings:
df['Address'] = df[addColumns].fillna('').astype(str).apply(lambda x: ', '.join([i for i in x if i]), axis=1)

In the case of columns with NaN values that you need to add together, here's some logic:
def add_cols_w_nan(df, col_list, space_char, new_col_name):
""" Add together multiple columns where some of the columns
may contain NaN, with the appropriate amount of spacing between columns.
Examples:
'Mr.' + NaN + 'Smith' becomes 'Mr. Smith'
'Mrs.' + 'J.' + 'Smith' becomes 'Mrs. J. Smith'
NaN + 'J.' + 'Smith' becomes 'J. Smith'
Args:
df: pd.DataFrame
DataFrame for which strings are added together.
col_list: ORDERED list of column names, eg. ['first_name',
'middle_name', 'last_name']. The columns will be added in order.
space_char: str
Character to insert between concatenation of columns.
new_col_name: str
Name of the new column after adding together strings.
Returns: pd.DataFrame with a string addition column
"""
df2 = df[col_list].copy()
# Convert to strings, leave nulls alone
df2 = df2.where(df2.isnull(), df2.astype('str'))
# Add space character, NaN remains NaN, which is important
df2.loc[:, col_list[1:]] = space_char + df2.loc[:, col_list[1:]]
# Fix rows where leading columns are null
to_fix = df2.notnull().idxmax(1)
for col in col_list[1:]:
m = to_fix == col
df2.loc[m, col] = df2.loc[m, col].str.replace(space_char, '')
# So that summation works
df2[col_list] = df2[col_list].replace(np.NaN, '')
# Add together all columns
df[new_col_name] = df2[col_list].sum(axis=1)
# If all are missing replace with missing
df[new_col_name] = df[new_col_name].replace('', np.NaN)
del df2
return df
Sample Data:
import pandas as pd
import numpy as np
df = pd.DataFrame({'Address 1': ['AAA', 'ABC', np.NaN, np.NaN, np.NaN],
'Address 2': ['foo', 'bar', 'baz', None, np.NaN],
'Address 3': [np.NaN, np.NaN, 17, np.NaN, np.NaN],
'city': [np.NaN, 'here', 'there', 'anywhere', np.NaN],
'state': ['NY', 'TX', 'WA', 'MI', np.NaN]})
# Address 1 Address 2 Address 3 city state
#0 AAA foo NaN NaN NY
#1 ABC bar NaN here TX
#2 NaN baz 17.0 there WA
#3 NaN None NaN anywhere MI
#4 NaN NaN NaN NaN NaN
df = add_cols_w_nan(
df,
col_list = ['Address 1', 'Address 2', 'Address 3', 'city', 'state'],
space_char = ', ',
new_col_name = 'full_address')
df.full_address.tolist()
#['AAA, foo, NY',
# 'ABC, bar, here, TX',
# 'baz, 17.0, there, WA',
# 'anywhere, MI',
# nan]

Related

Identify the latest series of Continuous same value in Python Pandas DataFrame

I have the following DataFrame (Date in dd-mm-yyyy format):
import pandas as pd
data={'Id':['A', 'B', 'C', 'A', 'B', 'C', 'B', 'C', 'A', 'C', 'B', 'C', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C'],
'Date':['20-10-2022', '20-10-2022', '20-10-2022', '21-10-2022', '21-10-2022', '21-10-2022',
'22-10-2022', '22-10-2022', '23-10-2022', '23-10-2022', '24-10-2022', '24-10-2022',
'25-10-2022', '25-10-2022', '26-10-2022', '26-10-2022', '26-10-2022', '27-10-2022',
'27-10-2022', '27-10-2022']}
df=pd.DataFrame.from_dict(data)
df
Id Date
0 A 20-10-2022
1 B 20-10-2022
2 C 20-10-2022
3 A 21-10-2022
4 B 21-10-2022
5 C 21-10-2022
6 B 22-10-2022
7 C 22-10-2022
8 A 23-10-2022
9 C 23-10-2022
10 B 24-10-2022
11 C 24-10-2022
12 B 25-10-2022
13 C 25-10-2022
14 A 26-10-2022
15 B 26-10-2022
16 C 26-10-2022
17 A 27-10-2022
18 B 27-10-2022
19 C 27-10-2022
This is the Final DataFrame that I want:
I have tried the following code:
# Find first occurance and last occurance of any given Id.
df_first_duplicate = df.drop_duplicates(subset=['Id'], keep='first')
df_first_duplicate.rename(columns = {'Date':'DateOfFirstOccurance'}, inplace = True)
df_first_duplicate.reset_index(inplace = True, drop = True)
df_last_duplicate = df.drop_duplicates(subset=['Id'], keep='last')
df_last_duplicate.rename(columns = {'Date':'DateOfLastOccurance'}, inplace = True)
df_last_duplicate.reset_index(inplace = True, drop = True)
# Merge the above two df's on key
df_merged = pd.merge(df_first_duplicate, df_last_duplicate, on='Id')
df_merged
But this is the output that I get:
Id DateOfFirstOccurance DateOfLastOccurance
0 A 20-10-2022 27-10-2022
1 B 20-10-2022 27-10-2022
2 C 20-10-2022 27-10-2022
What should I do to get the desired output?
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')
records = []
for key, group in df.groupby(by='Id'):
filt = group['Date'].diff(-1).dt.days >= -1
filt.iloc[filt.shape[0]-1] = True
max_false_index = filt[~filt].index.max()
min_date = group['Date'].min() if type(max_false_index) == float else group.loc[max_false_index+1:, 'Date'].min()
records.append([key, min_date, group['Date'].max()])
pd.DataFrame(records, columns=['Id', 'DateOfFirstOccurance', 'DateOfLastOccurance'])
Here is one way to do it.
Sort your data by Id and Date. Use pandas.Series.diff to get the difference of each row compared to the last one, change it with dt.days to a floating number and create a boolean Series by comparing each value if it is greater/equal to 1. Convert the boolean Series from True/False to 1/0 with astype(int) and build the cumulative sum. The idx with the biggest value is the first/last occurence of your data.
df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
df = df.sort_values(['Id', 'Date'])
out = (
df
.groupby('Id')['Date']
.agg(
first_occurence = lambda x: x[
(x.diff().dt.days>1)
.astype(int)
.cumsum()
.idxmax()
],
last_occurence = lambda x: x[
(x.diff().dt.days==1)
.astype(int)
.cumsum()
.idxmax()
],
)
)
print(out)

Python Pandas groupby with agg() nth() and/or iloc()

Given this DF:
df = pd.DataFrame({'Col1':['A','A','A','B','B','B','B']
, 'Col2':['i', 'j', 'k', 'l', 'm', 'n', 'o']
, 'Col3':['Apple', 'Peach', 'Apricot', 'Dog', 'Cat', 'Mouse', 'Horse']
,})
df
And then using this code:
df1 = df.groupby('Col1').agg({'Col2':'count', 'Col3': lambda x: x.iloc[2]})
df1
I got this result:
What I would like now:
Being able to make the lambda function 'Col3': lambda x: x.iloc[0] to print('Not enough data') when dealing with error for example if I change "x.iloc[0]" to "x.iloc[3]" which raised an error because there's not enough data in "Col1['A'] compared to "Col1['B']".
!! Don't want to use 'last' because this is a simplified and shortened DF for purpose !!
You can use nth that will give you a NaN if the value is missing. Unfortunately, nth is not handled by agg so you need to compute it separately and join:
g = df.groupby('Col1')
df1 = g.agg({'Col2':'count'}).join(g['Col3'].nth(3))
output:
Col2 Col3
Col1
A 3 NaN
B 4 Horse
You can try with a slice object which will return empty Series if none value.
df1 = df.groupby('Col1').agg({'Col2':'count',
'Col3': lambda x: x.iloc[3:4] if len(x.iloc[3:4]) else pd.NA})
print(df1)
Col2 Col3
Col1
A 3 <NA>
B 4 Horse
You can save typing with named expression if your Python version is greater than 3.8
df1 = df.groupby('Col1').agg({'Col2':'count',
'Col3': lambda x: v if len(v := x.iloc[3:4]) else pd.NA})

changing values in data frame based on duplicates - python

I have a quite large data set of over 100k rows with many duplicates and some missing or faulty values. Trying to simplify the problem in the snippet below.
sampleData = {
'BI Business Name' : ['AAA', 'BBB', 'CCC', 'DDD','DDD'],
'BId Postcode' : ['NW1 8NZ', 'NW1 8NZ', 'WC2N 4AA', 'CV7 9JY', 'CV7 9JY'],
'BI Website' : ['www#1', 'www#1', 'www#2', 'www#3', np.nan],
'BI Telephone' : ['999', '999', '666', np.nan, '12345']
}
df = pd.DataFrame(sampleData)
I'm trying to change the values based on duplicate rows so if any three fields are matching then the forth one should match as well. I should get outcome like this:
result = {
'BI Business Name' : ['AAA', 'AAA', 'CCC', 'DDD','DDD'],
'BId Postcode' : ['NW1 8NZ', 'NW1 8NZ', 'WC2N 4AA', 'CV7 9JY', 'CV7 9JY'],
'BI Website' : ['www#1', 'www#1', 'www#2', 'www#3', 'www#3'],
'BI Telephone' : ['999', '999', '666', '12345', '12345']
}
df = pd.DataFrame(result)
I have found extremely long winded method - here showing just the part for changing the name.
df['Phone_code_web'] = df['BId Postcode'] + df['BI Website'] + df['BI Telephone']
reference_name = df[['BI Business Name', 'BI Telephone', 'BId Postcode','BI Website']]
reference_name = reference_name.dropna()
reference_name['Phone_code_web'] = reference_name['BId Postcode'] + reference_name['BI Website'] +
reference_name['BI Telephone']
duplicate_ref = reference_name[reference_name['Phone_code_web'].duplicated()]
reference_name = pd.concat([reference_name,duplicate_ref]).drop_duplicates(keep=False)
reference_name
def replace_name(row):
try:
old_name = row['BI Business Name']
reference = row['Phone_code_web']
new_name = reference_name[reference_name['Phone_code_web']==reference].iloc[0,0]
print(new_name)
return new_name
except Exception as e:
return old_name
df['BI Business Name']=df.apply(replace_name, axis=1)
df
Is there easier way of doing this?
You can try this:
import pandas as pd
sampleData = {
'BI Business Name': ['AAA', 'BBB', 'CCC', 'DDD','DDD'],
'BId Postcode': ['NW1 8NZ', 'NW1 8NZ', 'WC2N 4AA', 'CV7 9JY', 'CV7 9JY'],
'BI Website': ['www#1', 'www#1', 'www#2', 'www#3', np.nan],
'BI Telephone': ['999', '999', '666', np.nan, '12345']
}
df = pd.DataFrame(sampleData)
print(df)
def fill_gaps(_df, _x): # _df and _x are local variables that represent the dataframe and one of its rows, respectively
# pd.isnull(_x) = list of Booleans indicating which columns have NaNs
# df.columns[pd.isnull(_x)] = list of columns whose value is a NaN
for col in df.columns[pd.isnull(_x)]:
# len(set(y) & set(_x)) = length of the intersection of the row being considered (_x) and each of the other rows in turn (y)
# the mask is a list of Booleans which are True if:
# 1) y[col] is not Null (e.g. for row 3 we need to replace (BI Telephone = NaN) with a non-NaN 'BI Telephone' value)
# 2) and the length of the intersection above is at least 3 (as required)
mask = df.apply(lambda y: pd.notnull(y[col]) and len(set(y) & set(_x)) == 3, axis=1)
# if the mask has at least one "True" value, select the value in the corresponding column (if there are several possible values, select the first one)
_x[col] = df[mask][col].iloc[0] if any(mask) else _x[col]
return _x
# Apply the logic described above to each row in turn (x = each row)
df = df.apply(lambda x: fill_gaps(df, x), axis=1)
print(df)
Output:
BI Business Name BId Postcode BI Website BI Telephone
0 AAA NW1 8NZ www#1 999
1 BBB NW1 8NZ www#1 999
2 CCC WC2N 4AA www#2 666
3 DDD CV7 9JY www#3 NaN
4 DDD CV7 9JY NaN 12345
BI Business Name BId Postcode BI Website BI Telephone
0 AAA NW1 8NZ www#1 999
1 BBB NW1 8NZ www#1 999
2 CCC WC2N 4AA www#2 666
3 DDD CV7 9JY www#3 12345
4 DDD CV7 9JY www#3 12345

How to iterate over dfs and append data with combine names

i have this problem to solve, this is a continuation of a previus question How to iterate over pandas df with a def function variable function and the given answer worked perfectly, but now i have to append all the data in a 2 columns dataframe (Adduct_name and mass).
This is from the previous question:
My goal: i have to calculate the "adducts" for a given "Compound", both represents numbes, but for eah "Compound" there are 46 different "Adducts".
Each adduct is calculated as follow:
Adduct 1 = [Exact_mass*M/Charge + Adduct_mass]
where exact_mass = number, M and Charge = number (1, 2, 3, etc) according to each type of adduct, Adduct_mass = number (positive or negative) according to each adduct.
My data: 2 data frames. One with the Adducts names, M, Charge, Adduct_mass. The other one correspond to the Compound_name and Exact_mass of the Compounds i want to iterate over (i just put a small data set)
Adducts: df_al
import pandas as pd
data = [["M+3H", 3, 1, 1.007276], ["M+3Na", 3, 1, 22.989], ["M+H", 1, 1,
1.007276], ["2M+H", 1, 2, 1.007276], ["M-3H", 3, 1, -1.007276]]
df_al = pd.DataFrame(data, columns=["Ion_name", "Charge", "M", "Adduct_mass"])
Compounds: df
import pandas as pd
data1 = [[1, "C3H64O7", 596.465179], [2, "C30H42O7", 514.293038], [4,
"C44H56O8", 712.397498], [4, "C24H32O6S", 448.191949], [5, "C20H28O3",
316.203834]]
df = pd.DataFrame(data1, columns=["CdId", "Formula", "exact_mass"])
The solution to this problem was:
df_name = df_al["Ion_name"]
df_mass = df_al["Adduct_mass"]
df_div = df_al["Charge"]
df_M = df_al["M"]
#Defining general function
def Adduct(x,i):
return x*df_M[i]/df_div[i] + df_mass[i]
#Applying general function in a range from 0 to 5.
for i in range(5):
df[df_name.loc[i]] = df['exact_mass'].map(lambda x: Adduct(x,i))
Output
Name exact_mass M+3H M+3Na M+H 2M+H M-3H
0 a 596.465179 199.829002 221.810726 597.472455 1193.937634 197.814450
1 b 514.293038 172.438289 194.420013 515.300314 1029.593352 170.423737
2 c 712.397498 238.473109 260.454833 713.404774 1425.802272 236.458557
3 d 448.191949 150.404592 172.386316 449.199225 897.391174 148.390040
4 e 316.203834 106.408554 128.390278 317.211110 633.414944 104.39400
Now that is the rigth calculations but i need now a file where:
-only exists 2 columns (Name and mass)
-All the different adducts are appended one after another
desired out put
Name Mass
a_M+3H 199.82902
a_M+3Na 221.810726
a_M+H 597.472455
a_2M+H 1193.937634
a_M-3H 197.814450
b_M+3H 514.293038
.
.
.
c_M+3H
and so on.
Also i need to combine the name of the respective compound with the ion form (M+3H, M+H, etc).
At this point i have no code for that.
I would apprecitate any advice and a better approach since the begining.
This part is an update of the question above:
Is posible to obtain and ouput like this one:
Name Mass RT
a_M+3H 199.82902 1
a_M+3Na 221.810726 1
a_M+H 597.472455 1
a_2M+H 1193.937634 1
a_M-3H 197.814450 1
b_M+3H 514.293038 3
.
.
.
c_M+3H 2
The RT is the same value for all forms of a compound, in this example is RT for a =1, b = 3, c =2, etc.
Is posible to incorporate (Keep this column) from the data set df (which i update here below)?. As you can see that df has more columns like "Formula" and "RT" which desapear after calculations.
import pandas as pd
data1 = [[a, "C3H64O7", 596.465179, 1], [b, "C30H42O7", 514.293038, 3], [c,
"C44H56O8", 712.397498, 2], [d, "C24H32O6S", 448.191949, 4], [e, "C20H28O3",
316.203834, 1.5]]
df = pd.DataFrame(data1, columns=["Name", "Formula", "exact_mass", "RT"])
Part three! (sorry and thank you)
this is a trial i did on a small data set (df) using the code below, with the same df_al of above.
df=
Code
#Defining variables for calculation
df_name = df_al["Ion_name"]
df_mass = df_al["Adduct_mass"]
df_div = df_al["Charge"]
df_M = df_al["M"]
df_ID= df["Name"]
#Defining the RT dictionary
RT = dict(zip(df["Name"], df["RT"]))
#Removing RT column
df=df.drop(columns=["RT"])
#Defining general function
def Adduct(x,i):
return x*df_M[i]/df_div[i] + df_mass[i]
#Applying general function in a range from 0 to 46.
for i in range(47):
df[df_name.loc[i]] = df['exact_mass'].map(lambda x: Adduct(x,i))
df
output
#Melting
df = pd.melt(df, id_vars=['Name'], var_name = "Adduct", value_name= "Exact_mass", value_vars=[x for x in df.columns if 'Name' not in x and 'exact' not in x])
df['name'] = df.apply(lambda x:x[0] + "_" + x[1], axis=1)
df['RT'] = df.Name.apply(lambda x: RT[x[0]] if x[0] in RT else np.nan)
del df['Name']
del df['Adduct']
df['RT'] = df.name.apply(lambda x: RT[x[0]] if x[0] in RT else np.nan)
df
output
Why NaN?
Here is how I will go about it, pandas.melt comes to rescue:
import pandas as pd
import numpy as np
from io import StringIO
s = StringIO('''
Name exact_mass M+3H M+3Na M+H 2M+H M-3H
0 a 596.465179 199.829002 221.810726 597.472455 1193.937634 197.814450
1 b 514.293038 172.438289 194.420013 515.300314 1029.593352 170.423737
2 c 712.397498 238.473109 260.454833 713.404774 1425.802272 236.458557
3 d 448.191949 150.404592 172.386316 449.199225 897.391174 148.390040
4 e 316.203834 106.408554 128.390278 317.211110 633.414944 104.39400
''')
df = pd.read_csv(s, sep="\s+")
df = pd.melt(df, id_vars=['Name'], value_vars=[x for x in df.columns if 'Name' not in x and 'exact' not in x])
df['name'] = df.apply(lambda x:x[0] + "_" + x[1], axis=1)
del df['Name']
del df['variable']
RT = {'a':1, 'b':2, 'c':3, 'd':5, 'e':1.5}
df['RT'] = df.name.apply(lambda x: RT[x[0]] if x[0] in RT else np.nan)
df
Here is the output:

Pandas - Fastest way indexing with 2 dataframes

I am developing a software in Python 3 with Pandas library.
Time is very important but memory not so much.
For better visualization I am using the names a and b with few values, although there are many more:
a -> 50000 rows
b -> 5000 rows
I need to select from dataframe a and b (using multiples conditions)
a = pd.DataFrame({
'a1': ['x', 'y', 'z'] ,
'a2': [1, 2, 3],
'a3': [3.14, 2.73, -23.00],
'a4': [pd.np.nan, pd.np.nan, pd.np.nan]
})
a
a1 a2 a3 a4
0 x 1 3.14 NaN
1 y 2 2.73 NaN
2 z 3 -23.00 NaN
b = pd.DataFrame({
'b1': ['x', 'y', 'z', 'k', 'l'],
'b2': [2018, 2019, 2020, 2015, 2012]
})
b
b1 b2
0 x 2018
1 y 2019
2 z 2020
3 k 2015
4 l 2012
So far my code is like this:
for index, row in a.iterrows():
try:
# create a key
a1 = row["a1"]
mask = b.loc[(b['b1'] == a1) & (b['b2'] != 2019)]
# check if exists
if (len(mask.index) != 0): #not empty
a.loc[[index], ['a4']] = mask.iloc[0]['b2']
except KeyError: #not found
pass
But as you can see, I'm using for iterrows that is very slow compared to other methods and I'm changing the value of the DataFrame I'm iterating, that is not recommended.
Could you help me find a better way? The results should be like this:
a
a1 a2 a3 a4
0 x 1 3.14 2018
1 y 2 2.73 NaN
2 z 3 -23.00 2020
I tried things like this below, but I didnt made it work.
a.loc[ (a['a1'] == b['b1']) , 'a4'] = b.loc[b['b2'] != 2019]
*the real code has more conditions
Thanks!
EDIT
I benchmark using: iterrows, merge, set_index/loc. Here is the code:
import timeit
import pandas as pd
def f_iterrows():
for index, row in a.iterrows():
try:
# create a key
a1 = row["a1"]
a3 = row["a3"]
mask = b.loc[(b['b1'] == a1) & (b['b2'] != 2019)]
# check if exists
if len(mask.index) != 0: # not empty
a.loc[[index], ['a4']] = mask.iloc[0]['b2']
except: # not found
pass
def f_merge():
a.merge(b[b.b2 != 2019], left_on='a1', right_on='b1', how='left').drop(['a4', 'b1'], 1).rename(columns={'b2': 'a4'})
def f_lock():
df1 = a.set_index('a1')
df2 = b.set_index('b1')
df1.loc[:, 'a4'] = df2.b2[df2.b2 != 2019]
#variables for testing
number_rows = 100
number_iter = 100
a = pd.DataFrame({
'a1': ['x', 'y', 'z'] * number_rows,
'a2': [1, 2, 3] * number_rows,
'a3': [3.14, 2.73, -23.00] * number_rows,
'a4': [pd.np.nan, pd.np.nan, pd.np.nan] * number_rows
})
b = pd.DataFrame({
'b1': ['x', 'y', 'z', 'k', 'l'] * number_rows,
'b2': [2018, 2019, 2020, 2015, 2012] * number_rows
})
print('For: %s s' % str(timeit.timeit(f_iterrows, number=number_iter)))
print('Merge: %s s' % str(timeit.timeit(f_merge, number=number_iter)))
print('Loc: %s s' % str(timeit.timeit(f_iterrows, number=number_iter)))
They all worked :) and the time to run is:
For: 277.9994369489998 s
Loc: 274.04929955067564 s
Merge: 2.195712725706926 s
So far Merge is the fastest.
If another option appears I will update here, thanks again.
IIUC
a.merge(b[b.b2!=2019],left_on='a1',right_on='b1',how='left').drop(['a4','b1'],1).rename(columns={'b2':'a4'})
Out[263]:
a1 a2 a3 a4
0 x 1 3.14 2018.0
1 y 2 2.73 NaN
2 z 3 -23.00 2020.0

Resources