Matching subset of two columns of two different dataframes - python-3.x

Comparing specific columns from two different dataframes. Counting if subset of both dataframe is matching or not matching.
Condition:
If any element of file small['genes of cluster'] is matching with the big['genes of cluster'], output should be: match: 1.
For below example only OR4F16 is matching to both dataframes.
So Output: match: 1; unmatch: 3.
file1: big <tab separated>
cl nP genes of cluster
1 11 DDX11L1, MIR6859-3, WASH7P, MIR1302-2, FAM138C, FAM138F, FAM138A, OR4F5, LOC729737, LOC102725121, FAM138D
2 4 OR4F16, OR4F3, OR4F29, LOC100132287
3 64 LOC100133331, LOC100288069, FAM87B, LINC00115, LINC01128, FAM41C, LINC02593, SAMD11
4 7 GNB1, CALML6, TMEM52, CFAP74, GABRD, LOC105378591, PRKCZ
file2: small <tab separated>
cl nP genes of cluster
1 11 A, B, C, D
2 4 OR4F16, X, Y, Z
My Code: Python3
def genes_coordinates(big, small):
b = pd.read_csv(big, header=0, sep="\t")
s = pd.read_csv(small, header=0, sep="\t")
match = 0
unmatch = 0
for index, row in b.iterrows():
if row[row['genes of cluster'].isin(s['genes of cluster'])]:
match+1
else:
unmatch+1
print("match: ", match, "\nunmatch: ", unmatch)
genes_coordinates('big','small')

I would go with a pandas.merge() followed by counting by list comprehension.
import pandas as pd
df1 = pd.DataFrame({'cl':[1,2], 'nP':[11,4], 'gene of cluster':[['A', 'B', 'C', 'D'], ['OR4F16', 'X', 'Y', 'Z']]})
df2 = pd.DataFrame({'cl':[1,2,3,4], 'nP':[11,4,64,7], 'gene of cluster':[['DDX11L1', 'MIR6859-3', 'WASH7P', 'MIR1302-2', 'FAM138C', 'FAM138F', 'FAM138A', 'OR4F5', 'LOC729737', 'LOC102725121', 'FAM138D'], ['OR4F16', 'OR4F3', 'OR4F29', 'LOC100132287'], ['LOC100133331', 'LOC100288069', 'FAM87B', 'LINC00115', 'LINC01128', 'FAM41C', 'LINC02593', 'SAMD11'], ['GNB1', 'CALML6', 'TMEM52', 'CFAP74', 'GABRD', 'LOC105378591', 'PRKCZ']]})
df_m = df1.merge(df2, on=['cl', 'nP'], how='outer')
>>>df_m
cl nP gene of cluster_x gene of cluster_y
0 1 11 [A, B, C, D] [DDX11L1, MIR6859-3, WASH7P, MIR1302-2, FAM138...
1 2 4 [OR4F16, X, Y, Z] [OR4F16, OR4F3, OR4F29, LOC100132287]
2 3 64 NaN [LOC100133331, LOC100288069, FAM87B, LINC00115...
3 4 7 NaN [GNB1, CALML6, TMEM52, CFAP74, GABRD, LOC10537...
# An np.nan value is an outright 'unmatch'
found = []
for x in df_m.index:
if isinstance(df_m.iloc[x]['gene of cluster_x'], float):
found.append(0)
else:
if isinstance(df_m.iloc[x]['gene of cluster_y'], float):
found.append(0)
elif any([y in df_m.iloc[x]['gene of cluster_y'] for y in df_m.iloc[x]['gene of cluster_x']]):
found.append(1)
else:
found.append(0)
# The counts
match = sum(found)
unmatch = len(found) - match

Related

Identify the latest series of Continuous same value in Python Pandas DataFrame

I have the following DataFrame (Date in dd-mm-yyyy format):
import pandas as pd
data={'Id':['A', 'B', 'C', 'A', 'B', 'C', 'B', 'C', 'A', 'C', 'B', 'C', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C'],
'Date':['20-10-2022', '20-10-2022', '20-10-2022', '21-10-2022', '21-10-2022', '21-10-2022',
'22-10-2022', '22-10-2022', '23-10-2022', '23-10-2022', '24-10-2022', '24-10-2022',
'25-10-2022', '25-10-2022', '26-10-2022', '26-10-2022', '26-10-2022', '27-10-2022',
'27-10-2022', '27-10-2022']}
df=pd.DataFrame.from_dict(data)
df
Id Date
0 A 20-10-2022
1 B 20-10-2022
2 C 20-10-2022
3 A 21-10-2022
4 B 21-10-2022
5 C 21-10-2022
6 B 22-10-2022
7 C 22-10-2022
8 A 23-10-2022
9 C 23-10-2022
10 B 24-10-2022
11 C 24-10-2022
12 B 25-10-2022
13 C 25-10-2022
14 A 26-10-2022
15 B 26-10-2022
16 C 26-10-2022
17 A 27-10-2022
18 B 27-10-2022
19 C 27-10-2022
This is the Final DataFrame that I want:
I have tried the following code:
# Find first occurance and last occurance of any given Id.
df_first_duplicate = df.drop_duplicates(subset=['Id'], keep='first')
df_first_duplicate.rename(columns = {'Date':'DateOfFirstOccurance'}, inplace = True)
df_first_duplicate.reset_index(inplace = True, drop = True)
df_last_duplicate = df.drop_duplicates(subset=['Id'], keep='last')
df_last_duplicate.rename(columns = {'Date':'DateOfLastOccurance'}, inplace = True)
df_last_duplicate.reset_index(inplace = True, drop = True)
# Merge the above two df's on key
df_merged = pd.merge(df_first_duplicate, df_last_duplicate, on='Id')
df_merged
But this is the output that I get:
Id DateOfFirstOccurance DateOfLastOccurance
0 A 20-10-2022 27-10-2022
1 B 20-10-2022 27-10-2022
2 C 20-10-2022 27-10-2022
What should I do to get the desired output?
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')
records = []
for key, group in df.groupby(by='Id'):
filt = group['Date'].diff(-1).dt.days >= -1
filt.iloc[filt.shape[0]-1] = True
max_false_index = filt[~filt].index.max()
min_date = group['Date'].min() if type(max_false_index) == float else group.loc[max_false_index+1:, 'Date'].min()
records.append([key, min_date, group['Date'].max()])
pd.DataFrame(records, columns=['Id', 'DateOfFirstOccurance', 'DateOfLastOccurance'])
Here is one way to do it.
Sort your data by Id and Date. Use pandas.Series.diff to get the difference of each row compared to the last one, change it with dt.days to a floating number and create a boolean Series by comparing each value if it is greater/equal to 1. Convert the boolean Series from True/False to 1/0 with astype(int) and build the cumulative sum. The idx with the biggest value is the first/last occurence of your data.
df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
df = df.sort_values(['Id', 'Date'])
out = (
df
.groupby('Id')['Date']
.agg(
first_occurence = lambda x: x[
(x.diff().dt.days>1)
.astype(int)
.cumsum()
.idxmax()
],
last_occurence = lambda x: x[
(x.diff().dt.days==1)
.astype(int)
.cumsum()
.idxmax()
],
)
)
print(out)

The output of my code comes out too slowly.. How can i speed up my process

Thanks to the help from some users of this sites.
My code seems to work fine, but it's taking too long..
I'm trying to compare two data frames.(df1 has 1,291,250 rows / df2 has 1,286,692 rows)
if df1.iloc[0,0] == df2.iloc[0,0] and df1.iloc[0,1] == df2.iloc[0,1], then compare df1.iloc[0,2], df2.iloc[0,2].
If the first(df1.iloc[0,2]) is larger, I want to put the first index into the list, and if the second(df2.iloc[0,2]) is larger, I want to put the second index into the list.
Example DataFrame
In [1]: df1 = pd.DataFrame([[0, 1, 98], [1, 1, 198], [2, 2, 228]], columns = ['A1', 'B1', 'C1'])
In [2]: df1
Out[3]:
A1 B1 C1
0 0 1 98
1 1 1 198
2 2 2 228
In [4]: df2 = pd.DataFrame([[0, 1, 228], [1, 2, 110], [2, 2, 130]], columns = ['A2', 'B2', 'C2'])
In [5]: df2
Out[6]:
A2 B2 C2
0 0 1 228
1 1 2 110
2 2 2 130
In [7]: def find_high(df1, df2) # def function code is below
Out[8]: ([2], [0]) # The result what i want
This is just simple example. my data is bigger than this
my code is:
for i in range(60):
setattr(mod, f'df_1_{i}', np.array_split(df1, 60)[i])
getattr(mod, f'df_1_{i}').to_pickle(f'df_1_{i}')
import glob
files = glob.glob('df_1_*')
def find_high_pre(df1 = files, df2):
subtract_df2 = []
subtract_df1 = []
same_data = []
for df1_index, line in enumerate(df1.to_numpy()):
for df2_idx, row in enumerate(df2.to_numpy()):
if (line[0:2] == row[0:2]).all():
if line[2] < row[2]:
subtract_df2.append(df2_idx)
break
elif line[2] > row[2]:
subtract_df1.append(df1_idx)
break
else:
continue
break
return df1.iloc[subtract_df1].index.tolist(), df2.iloc[subtract_df2].index.tolist(), df1.iloc[same_data].index.to_list();
data_1 = []
for i in files:
e_data = pd.read_pickle(i)
num_cores = 30
df_split = np.array_split(e_data, num_cores)
data_1 += parmap.map(find_high_pre, df_split, pm_pbar=True, pm_processes =num_cores)
My code seems to work fine, but it's taking too long..
Chances are that replacing your nested for loops with a DataFrame.merge operation will take less time:
keys = ['A', 'B']
df1.columns = [*keys, 'C1']
df2.columns = [*keys, 'C2']
df = df1.reset_index().set_index(keys).merge(
df2.reset_index().set_index(keys), on=keys)
# now we have a merged dataframe like this:
# index_x C1 index_y C2
# A B
# 0 1 0 98 0 228
# 2 2 2 228 2 130
# therefrom we can easily extract the wanted indexes
data = [df.loc[df['C1'] > df['C2'], 'index_x'].values,
df.loc[df['C1'] < df['C2'], 'index_y'].values]

Python Passing Dynamic Table Name in For Loop

table_name = []
counter=0
for year in ['2017', '2018', '2019']:
table_name.append(f'temp_df_{year}')
print(table_name[counter])
table_name[counter] = pd.merge(table1, table2.loc[table2.loc[:, 'year'] == year, :], left_on='col1', right_on='col1', how='left')
counter += 1
temp_df_2017
The print statement outputs are correct:
temp_df_2017,
temp_df_2018,
temp_df_2019
However, when I try to see what's in temp_df_2017, I get an error: name 'temp_df_2017' is not defined
I would like to create those three tables. How can I make this work?
PS: ['2017', '2018', '2019'] list will vary. It can be a list of quarters. That's why I want to do this in a loop, instead of using the merge statement 3x.
I think the easiest/most practical approach would be to create a dictionary to store names/df.
import pandas as pd
import numpy as np
# Create dummy data
data = np.arange(9).reshape(3,3)
df = pd.DataFrame(data, columns=['a', 'b', 'c'])
df
Out:
a b c
0 0 1 2
1 3 4 5
2 6 7 8
df_year_names = ['2017', '2018', '2019']
dict_of_dfs = {}
for year in df_year_names:
df_name = f'some_name_year_{year}'
dict_of_dfs[df_name] = df
dict_of_dfs.keys()
Out:
dict_keys(['some_name_year_2017', 'some_name_year_2018', 'some_name_year_2019'])
Then to access a particular year:
dict_of_dfs['some_name_year_2018']
Out:
a b c
0 0 1 2
1 3 4 5
2 6 7 8

How to iterate over dfs and append data with combine names

i have this problem to solve, this is a continuation of a previus question How to iterate over pandas df with a def function variable function and the given answer worked perfectly, but now i have to append all the data in a 2 columns dataframe (Adduct_name and mass).
This is from the previous question:
My goal: i have to calculate the "adducts" for a given "Compound", both represents numbes, but for eah "Compound" there are 46 different "Adducts".
Each adduct is calculated as follow:
Adduct 1 = [Exact_mass*M/Charge + Adduct_mass]
where exact_mass = number, M and Charge = number (1, 2, 3, etc) according to each type of adduct, Adduct_mass = number (positive or negative) according to each adduct.
My data: 2 data frames. One with the Adducts names, M, Charge, Adduct_mass. The other one correspond to the Compound_name and Exact_mass of the Compounds i want to iterate over (i just put a small data set)
Adducts: df_al
import pandas as pd
data = [["M+3H", 3, 1, 1.007276], ["M+3Na", 3, 1, 22.989], ["M+H", 1, 1,
1.007276], ["2M+H", 1, 2, 1.007276], ["M-3H", 3, 1, -1.007276]]
df_al = pd.DataFrame(data, columns=["Ion_name", "Charge", "M", "Adduct_mass"])
Compounds: df
import pandas as pd
data1 = [[1, "C3H64O7", 596.465179], [2, "C30H42O7", 514.293038], [4,
"C44H56O8", 712.397498], [4, "C24H32O6S", 448.191949], [5, "C20H28O3",
316.203834]]
df = pd.DataFrame(data1, columns=["CdId", "Formula", "exact_mass"])
The solution to this problem was:
df_name = df_al["Ion_name"]
df_mass = df_al["Adduct_mass"]
df_div = df_al["Charge"]
df_M = df_al["M"]
#Defining general function
def Adduct(x,i):
return x*df_M[i]/df_div[i] + df_mass[i]
#Applying general function in a range from 0 to 5.
for i in range(5):
df[df_name.loc[i]] = df['exact_mass'].map(lambda x: Adduct(x,i))
Output
Name exact_mass M+3H M+3Na M+H 2M+H M-3H
0 a 596.465179 199.829002 221.810726 597.472455 1193.937634 197.814450
1 b 514.293038 172.438289 194.420013 515.300314 1029.593352 170.423737
2 c 712.397498 238.473109 260.454833 713.404774 1425.802272 236.458557
3 d 448.191949 150.404592 172.386316 449.199225 897.391174 148.390040
4 e 316.203834 106.408554 128.390278 317.211110 633.414944 104.39400
Now that is the rigth calculations but i need now a file where:
-only exists 2 columns (Name and mass)
-All the different adducts are appended one after another
desired out put
Name Mass
a_M+3H 199.82902
a_M+3Na 221.810726
a_M+H 597.472455
a_2M+H 1193.937634
a_M-3H 197.814450
b_M+3H 514.293038
.
.
.
c_M+3H
and so on.
Also i need to combine the name of the respective compound with the ion form (M+3H, M+H, etc).
At this point i have no code for that.
I would apprecitate any advice and a better approach since the begining.
This part is an update of the question above:
Is posible to obtain and ouput like this one:
Name Mass RT
a_M+3H 199.82902 1
a_M+3Na 221.810726 1
a_M+H 597.472455 1
a_2M+H 1193.937634 1
a_M-3H 197.814450 1
b_M+3H 514.293038 3
.
.
.
c_M+3H 2
The RT is the same value for all forms of a compound, in this example is RT for a =1, b = 3, c =2, etc.
Is posible to incorporate (Keep this column) from the data set df (which i update here below)?. As you can see that df has more columns like "Formula" and "RT" which desapear after calculations.
import pandas as pd
data1 = [[a, "C3H64O7", 596.465179, 1], [b, "C30H42O7", 514.293038, 3], [c,
"C44H56O8", 712.397498, 2], [d, "C24H32O6S", 448.191949, 4], [e, "C20H28O3",
316.203834, 1.5]]
df = pd.DataFrame(data1, columns=["Name", "Formula", "exact_mass", "RT"])
Part three! (sorry and thank you)
this is a trial i did on a small data set (df) using the code below, with the same df_al of above.
df=
Code
#Defining variables for calculation
df_name = df_al["Ion_name"]
df_mass = df_al["Adduct_mass"]
df_div = df_al["Charge"]
df_M = df_al["M"]
df_ID= df["Name"]
#Defining the RT dictionary
RT = dict(zip(df["Name"], df["RT"]))
#Removing RT column
df=df.drop(columns=["RT"])
#Defining general function
def Adduct(x,i):
return x*df_M[i]/df_div[i] + df_mass[i]
#Applying general function in a range from 0 to 46.
for i in range(47):
df[df_name.loc[i]] = df['exact_mass'].map(lambda x: Adduct(x,i))
df
output
#Melting
df = pd.melt(df, id_vars=['Name'], var_name = "Adduct", value_name= "Exact_mass", value_vars=[x for x in df.columns if 'Name' not in x and 'exact' not in x])
df['name'] = df.apply(lambda x:x[0] + "_" + x[1], axis=1)
df['RT'] = df.Name.apply(lambda x: RT[x[0]] if x[0] in RT else np.nan)
del df['Name']
del df['Adduct']
df['RT'] = df.name.apply(lambda x: RT[x[0]] if x[0] in RT else np.nan)
df
output
Why NaN?
Here is how I will go about it, pandas.melt comes to rescue:
import pandas as pd
import numpy as np
from io import StringIO
s = StringIO('''
Name exact_mass M+3H M+3Na M+H 2M+H M-3H
0 a 596.465179 199.829002 221.810726 597.472455 1193.937634 197.814450
1 b 514.293038 172.438289 194.420013 515.300314 1029.593352 170.423737
2 c 712.397498 238.473109 260.454833 713.404774 1425.802272 236.458557
3 d 448.191949 150.404592 172.386316 449.199225 897.391174 148.390040
4 e 316.203834 106.408554 128.390278 317.211110 633.414944 104.39400
''')
df = pd.read_csv(s, sep="\s+")
df = pd.melt(df, id_vars=['Name'], value_vars=[x for x in df.columns if 'Name' not in x and 'exact' not in x])
df['name'] = df.apply(lambda x:x[0] + "_" + x[1], axis=1)
del df['Name']
del df['variable']
RT = {'a':1, 'b':2, 'c':3, 'd':5, 'e':1.5}
df['RT'] = df.name.apply(lambda x: RT[x[0]] if x[0] in RT else np.nan)
df
Here is the output:

Pandas - Fastest way indexing with 2 dataframes

I am developing a software in Python 3 with Pandas library.
Time is very important but memory not so much.
For better visualization I am using the names a and b with few values, although there are many more:
a -> 50000 rows
b -> 5000 rows
I need to select from dataframe a and b (using multiples conditions)
a = pd.DataFrame({
'a1': ['x', 'y', 'z'] ,
'a2': [1, 2, 3],
'a3': [3.14, 2.73, -23.00],
'a4': [pd.np.nan, pd.np.nan, pd.np.nan]
})
a
a1 a2 a3 a4
0 x 1 3.14 NaN
1 y 2 2.73 NaN
2 z 3 -23.00 NaN
b = pd.DataFrame({
'b1': ['x', 'y', 'z', 'k', 'l'],
'b2': [2018, 2019, 2020, 2015, 2012]
})
b
b1 b2
0 x 2018
1 y 2019
2 z 2020
3 k 2015
4 l 2012
So far my code is like this:
for index, row in a.iterrows():
try:
# create a key
a1 = row["a1"]
mask = b.loc[(b['b1'] == a1) & (b['b2'] != 2019)]
# check if exists
if (len(mask.index) != 0): #not empty
a.loc[[index], ['a4']] = mask.iloc[0]['b2']
except KeyError: #not found
pass
But as you can see, I'm using for iterrows that is very slow compared to other methods and I'm changing the value of the DataFrame I'm iterating, that is not recommended.
Could you help me find a better way? The results should be like this:
a
a1 a2 a3 a4
0 x 1 3.14 2018
1 y 2 2.73 NaN
2 z 3 -23.00 2020
I tried things like this below, but I didnt made it work.
a.loc[ (a['a1'] == b['b1']) , 'a4'] = b.loc[b['b2'] != 2019]
*the real code has more conditions
Thanks!
EDIT
I benchmark using: iterrows, merge, set_index/loc. Here is the code:
import timeit
import pandas as pd
def f_iterrows():
for index, row in a.iterrows():
try:
# create a key
a1 = row["a1"]
a3 = row["a3"]
mask = b.loc[(b['b1'] == a1) & (b['b2'] != 2019)]
# check if exists
if len(mask.index) != 0: # not empty
a.loc[[index], ['a4']] = mask.iloc[0]['b2']
except: # not found
pass
def f_merge():
a.merge(b[b.b2 != 2019], left_on='a1', right_on='b1', how='left').drop(['a4', 'b1'], 1).rename(columns={'b2': 'a4'})
def f_lock():
df1 = a.set_index('a1')
df2 = b.set_index('b1')
df1.loc[:, 'a4'] = df2.b2[df2.b2 != 2019]
#variables for testing
number_rows = 100
number_iter = 100
a = pd.DataFrame({
'a1': ['x', 'y', 'z'] * number_rows,
'a2': [1, 2, 3] * number_rows,
'a3': [3.14, 2.73, -23.00] * number_rows,
'a4': [pd.np.nan, pd.np.nan, pd.np.nan] * number_rows
})
b = pd.DataFrame({
'b1': ['x', 'y', 'z', 'k', 'l'] * number_rows,
'b2': [2018, 2019, 2020, 2015, 2012] * number_rows
})
print('For: %s s' % str(timeit.timeit(f_iterrows, number=number_iter)))
print('Merge: %s s' % str(timeit.timeit(f_merge, number=number_iter)))
print('Loc: %s s' % str(timeit.timeit(f_iterrows, number=number_iter)))
They all worked :) and the time to run is:
For: 277.9994369489998 s
Loc: 274.04929955067564 s
Merge: 2.195712725706926 s
So far Merge is the fastest.
If another option appears I will update here, thanks again.
IIUC
a.merge(b[b.b2!=2019],left_on='a1',right_on='b1',how='left').drop(['a4','b1'],1).rename(columns={'b2':'a4'})
Out[263]:
a1 a2 a3 a4
0 x 1 3.14 2018.0
1 y 2 2.73 NaN
2 z 3 -23.00 2020.0

Resources