How do I mask two different ranges of values in Seaborn - python-3.x

I want to color red the values in the heatmap that fall between 2.3e-6 and 0.05, and I wanted to do that by plotting one heatmap on top of another. But I can't seem to find a way to mask values in different ranges. Here is my attempt:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

N = 10
data = np.random.uniform(0, 45, size=(N, N))
for x, y in np.random.randint(0, N, 50).reshape(-1, 2):
    data[x, y] = np.nan  # fill in some nans at random places
df = pd.DataFrame(data)

def pearsonr_pval(x, y):
    return pearsonr(x, y)[1]

data = df.loc[:, (df != 0).any(axis=0)]
data = data.iloc[:, 3:50]
to_log = data.columns
df_log = data[to_log].applymap(lambda x: np.log(x + 1))
X = df_log.corr(method=pearsonr_pval)

sns.set_style("darkgrid")
mask = np.zeros_like(X)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(20, 20))
    ax = sns.heatmap(X,
                     mask=mask,
                     vmax=1,
                     vmin=0,
                     square=True,
                     cmap="YlGnBu",
                     annot_kws={"size": 1})
    ax = sns.heatmap(X,
                     mask=(X.values < 2.3e-6) & (0.05 < X.values) & mask.astype(bool),
                     vmax=1,
                     vmin=0,
                     square=True,
                     cmap="rocket",
                     annot_kws={"size": 1})
But I get an error: TypeError: ufunc 'bitwise_and' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''
Edit: with the code above I get: (heatmap image)

As explained in this answer, for element-wise Boolean comparisons in Pandas you need to use & and |, and to enclose each condition in parentheses. So to combine your three conditions, you would need
mask=(X<2.3e-6) | (0.05<X) | mask.astype(bool),
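Putting it together, here is a minimal sketch of the corrected two-layer overlay (illustrative only, assuming the imports and the p-value matrix X from the question above; the second heatmap re-draws only the cells in the 2.3e-6 to 0.05 range):

# Mask for the redundant upper triangle of the symmetric matrix.
triu_mask = np.zeros_like(X, dtype=bool)
triu_mask[np.triu_indices_from(triu_mask)] = True

fig, ax = plt.subplots(figsize=(20, 20))
# Base layer: everything below the diagonal.
sns.heatmap(X, mask=triu_mask, vmin=0, vmax=1, square=True, cmap="YlGnBu", ax=ax)
# Second layer: additionally mask everything *outside* [2.3e-6, 0.05],
# so only the p-values in that range are drawn again with the second colormap.
range_mask = (X < 2.3e-6) | (X > 0.05) | triu_mask
sns.heatmap(X, mask=range_mask, vmin=0, vmax=1, square=True, cmap="rocket",
            cbar=False, ax=ax)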

Related

How to merge multiple tuples or lists into a dictionary using loops?

Here is my code to merge all the tuples into a dictionary:
x = (1, 2, 3)
y = ("car", "truck", "plane")
z = ("merc", "scania", "boeing")
products = {}
for i in x, y, z:
    products[x[i]] = {y[i]: z[i]}
Output (error):
6 for i in x,y,z:
----> 7 products[x[i]]= {y[i]:z[i]}
8
9 print(products)
TypeError: tuple indices must be integers or slices, not a tuple
Now, if I use the indexing method inside the loop to identify positions, as in the code below,
for i in x, y, z:
    products[x[0]] = {y[0]: z[0]}
print(products)
Output:
{1: {'car': 'merc'}}
Here, I could only create what I need for a single specified index. How do I create the complete dictionary using multiple lists/tuples?
Is it also possible to use the zip and map functions?
Use zip to iterate over your separate iterables/tuples in parallel:
list(zip(x, y, z)) # [(1, 'car', 'merc'), (2, 'truck', 'scania'), (3, 'plane', 'boeing')]
x = (1, 2, 3)
y = ("car", "truck", "plane")
z = ("merc", "scania", "boeing")
products = {i: {k: v} for i, k, v in zip(x, y, z)}
print(products) # {1: {'car': 'merc'}, 2: {'truck': 'scania'}, 3: {'plane': 'boeing'}}
You should use integers as indices.
x = (1, 2, 3)
y = ("car", "truck", "plane")
z = ("merc", "scania", "boeing")
products = {}
for i in range(len(x)):
    products[x[i]] = {y[i]: z[i]}
This should solve your problem.
To add to the above answers, I'm posting a solution using map:
x = (1, 2, 3)
y = ("car", "truck", "plane")
z = ("merc", "scania", "boeing")
products = dict(map(lambda x, y, z: (x, {y: z}), x, y, z))
print(products)

Replace specific column values with pd.NA

I am working on a data set that contains longitude and latitude values.
I converted those values to clusters using DBSCAN.
Then I plotted the clusters just as a sanity check.
I get this: (scatter plot image)
The point at (0, 0) is obviously an issue.
So I ran this code to capture which row(s) are a problem.
a = df3.loc[(df3['latitude'] < 0.01) & (df3['longitude'] < 0.01)].index
print(a) # 1812 rows with 0.0 longitude and -2e-08 latitude
I have 1812 rows with missing data all represented as 0.0 longitude and -2e-08 latitude in the source file.
I am debating some imputation strategies, but first I want to replace the 0.0 and -2e-08 values
with pd.NA or np.nan so that I can then use fillna() with whatever I ultimately decide on.
I have tried both:
df3.replace((df3['longitude'] == 0.0), pd.NA, inplace=True)
df3.replace((df3['latitude'] == -2e-08), pd.NA, inplace=True)
print(df3['longitude'].value_counts(dropna=False), '\n')
print(df3['latitude'].value_counts(dropna=False))
and
df3.replace((df3['longitude'] < 0.01), pd.NA, inplace=True)
df3.replace((df3['latitude'] < 0.01), pd.NA, inplace=True)
print(df3['longitude'].value_counts(dropna=False), '\n')
print(df3['latitude'].value_counts(dropna=False))
In both cases the existing values remain in place, i.e., the desired substitution with pd.NA
is not occurring.
What would be the correct procedure to replace the unwanted 1812 values in both the latitude and longitude columns with pd.NA or np.nan? I simply plan to impute something to replace the null values afterwards.
Try this one out:
df3['longitude'] = df3['longitude'].apply(lambda x: np.nan if x == 0.0 else x)
df3['latitude'] = df3['latitude'].apply(lambda x: np.nan if x == -2e-08 else x)
print(df3['longitude'].value_counts(dropna=False), '\n')
print(df3['latitude'].value_counts(dropna=False))
With an example:
import numpy as np
import pandas as pd
a = [1, 2, 0.0, -2e-08]
b = [1, 2, 0.0, -2e-08]
df = pd.DataFrame(zip(a, b))
df.columns = ['lat', 'long']
df.long = df.long.apply(lambda x:np.nan if x == 0.0 else x)
df.lat = df.lat.apply(lambda x:np.nan if x==-2e-08 else x)
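An alternative worth noting (a sketch, not part of the original answer): df.replace expects values or a mapping to search for, not a boolean filter, which is why the attempts in the question leave the data unchanged. Building a boolean mask and assigning through .loc does the row-wise replacement directly. The df3 below is a made-up three-row frame just for illustration:

import numpy as np
import pandas as pd

# Made-up frame standing in for df3, only to illustrate the pattern.
df3 = pd.DataFrame({'latitude': [45.52, -2e-08, 40.71],
                    'longitude': [-122.68, 0.0, -74.01]})

# Boolean mask for the rows holding the placeholder coordinates.
bad = (df3['latitude'] < 0.01) & (df3['longitude'] < 0.01)

# Set both columns to NaN for those rows in one assignment.
df3.loc[bad, ['latitude', 'longitude']] = np.nan
print(df3['latitude'].value_counts(dropna=False))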

How to insert z values onto x,y coordinates?

I have three lists, some of which contain repeated values. I am using the first two as x and y coordinates, and the third list holds the values that I want to plot as a heatmap.
For now, I can only assign the z values for one list; how can I write a for loop that does the same for the rest of the grid?
x = [1, 1, 1, 2, 2, 2, 3, 3, 3]
y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
z = [5.9617e-09, 6.3562e-09, 6.819e-09, 7.3562e-09, 7.989e-09, 8.6735e-09,
     9.3898e-09, 1.0139e-08, 1.0912e-08, 1.0912e-08]
xs = len(set(x))
ys = len(set(y))
grid = []
counter = 0
for row in range(ys):
    rows = []  # creating the rows of the grid
    if len(rows) < ys:  # I want to loop over ys and assign the values of z to each coordinate
        grid.append(z[counter])
        counter = counter + 1
print(grid)
Once I have a 2D array, I can use a heatmap to plot it nicely.
The easiest way is to use numpy:
In [1]: z = [5.9617e-09, 6.3562e-09, 6.819e-09, 7.3562e-09,
   ...:      7.989e-09, 8.6735e-09, 9.3898e-09, 1.0139e-08,
   ...:      1.0912e-08, 1.0912e-08]
In [2]: len(z)
Out[2]: 10
In [3]: import numpy as np
Ten numbers don't fit in a 3x3 grid, so skip the last one.
In [4]: nz = np.array(z[:-1]); nz
Out[4]:
array([5.9617e-09, 6.3562e-09, 6.8190e-09, 7.3562e-09, 7.9890e-09,
8.6735e-09, 9.3898e-09, 1.0139e-08, 1.0912e-08])
In [5]: nz.reshape((3,3))
Out[5]:
array([[5.9617e-09, 6.3562e-09, 6.8190e-09],
[7.3562e-09, 7.9890e-09, 8.6735e-09],
[9.3898e-09, 1.0139e-08, 1.0912e-08]])
A plain Python solution using itertools and functools:
In [6]: import itertools as it
...: import functools as ft
In [7]: def chunked(iterable, n):
   ...:     def take(n, iterable):
   ...:         return list(it.islice(iterable, n))
   ...:     return iter(ft.partial(take, n, iter(iterable)), [])
   ...:
In [8]: list(chunked(z[:-1], 3))
Out[8]:
[[5.9617e-09, 6.3562e-09, 6.819e-09],
[7.3562e-09, 7.989e-09, 8.6735e-09],
[9.3898e-09, 1.0139e-08, 1.0912e-08]]
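To follow up on "Once I have a 2D array, I can use a heatmap to plot it nicely", here is a minimal plotting sketch (an assumption about the intended plot, reusing the first nine z values reshaped as in the answers above):

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

x = [1, 1, 1, 2, 2, 2, 3, 3, 3]
y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
z = [5.9617e-09, 6.3562e-09, 6.819e-09, 7.3562e-09, 7.989e-09,
     8.6735e-09, 9.3898e-09, 1.0139e-08, 1.0912e-08]

# One row per distinct y value, one column per distinct x value.
grid = np.array(z).reshape(len(set(y)), len(set(x)))
sns.heatmap(grid, annot=True, xticklabels=sorted(set(x)), yticklabels=sorted(set(y)))
plt.show()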

How to iterate over dfs and append data with combined names

I have this problem to solve. It is a continuation of a previous question, How to iterate over pandas df with a def function variable function, and the given answer worked perfectly, but now I have to append all the data into a two-column dataframe (Adduct_name and mass).
This is from the previous question:
My goal: I have to calculate the "adducts" for a given "Compound"; both represent numbers, but for each "Compound" there are 46 different "Adducts".
Each adduct is calculated as follows:
Adduct = Exact_mass * M / Charge + Adduct_mass
where Exact_mass is a number, M and Charge are numbers (1, 2, 3, etc.) according to each type of adduct, and Adduct_mass is a number (positive or negative) according to each adduct.
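For example, taking the first compound and the M+3H adduct from the tables below (M = 1, Charge = 3, Adduct_mass = 1.007276):

596.465179 * 1 / 3 + 1.007276  # = 199.829002, the first M+3H value in the output below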
My data: two data frames. One holds the adduct names, M, Charge, and Adduct_mass. The other one corresponds to the Compound_name and Exact_mass of the compounds I want to iterate over (I just put a small data set).
Adducts: df_al
import pandas as pd

data = [["M+3H", 3, 1, 1.007276], ["M+3Na", 3, 1, 22.989],
        ["M+H", 1, 1, 1.007276], ["2M+H", 1, 2, 1.007276],
        ["M-3H", 3, 1, -1.007276]]
df_al = pd.DataFrame(data, columns=["Ion_name", "Charge", "M", "Adduct_mass"])
Compounds: df
import pandas as pd

data1 = [[1, "C3H64O7", 596.465179], [2, "C30H42O7", 514.293038],
         [4, "C44H56O8", 712.397498], [4, "C24H32O6S", 448.191949],
         [5, "C20H28O3", 316.203834]]
df = pd.DataFrame(data1, columns=["CdId", "Formula", "exact_mass"])
The solution to this problem was:
df_name = df_al["Ion_name"]
df_mass = df_al["Adduct_mass"]
df_div = df_al["Charge"]
df_M = df_al["M"]

# Defining the general function
def Adduct(x, i):
    return x * df_M[i] / df_div[i] + df_mass[i]

# Applying the general function in a range from 0 to 5
for i in range(5):
    df[df_name.loc[i]] = df['exact_mass'].map(lambda x: Adduct(x, i))
Output
Name exact_mass M+3H M+3Na M+H 2M+H M-3H
0 a 596.465179 199.829002 221.810726 597.472455 1193.937634 197.814450
1 b 514.293038 172.438289 194.420013 515.300314 1029.593352 170.423737
2 c 712.397498 238.473109 260.454833 713.404774 1425.802272 236.458557
3 d 448.191949 150.404592 172.386316 449.199225 897.391174 148.390040
4 e 316.203834 106.408554 128.390278 317.211110 633.414944 104.39400
Those are the right calculations, but now I need a file where:
- only 2 columns exist (Name and Mass)
- all the different adducts are appended one after another
Desired output:
Name Mass
a_M+3H 199.82902
a_M+3Na 221.810726
a_M+H 597.472455
a_2M+H 1193.937634
a_M-3H 197.814450
b_M+3H 514.293038
.
.
.
c_M+3H
and so on.
Also, I need to combine the name of the respective compound with the ion form (M+3H, M+H, etc.).
At this point I have no code for that.
I would appreciate any advice and a better approach from the beginning.
This part is an update of the question above:
Is it possible to obtain an output like this one:
Name Mass RT
a_M+3H 199.82902 1
a_M+3Na 221.810726 1
a_M+H 597.472455 1
a_2M+H 1193.937634 1
a_M-3H 197.814450 1
b_M+3H 514.293038 3
.
.
.
c_M+3H 2
The RT is the same value for all forms of a compound; in this example the RT for a = 1, b = 3, c = 2, etc.
Is it possible to incorporate (keep) this column from the data set df (which I update here below)? As you can see, df has more columns, like "Formula" and "RT", which disappear after the calculations.
import pandas as pd

data1 = [["a", "C3H64O7", 596.465179, 1], ["b", "C30H42O7", 514.293038, 3],
         ["c", "C44H56O8", 712.397498, 2], ["d", "C24H32O6S", 448.191949, 4],
         ["e", "C20H28O3", 316.203834, 1.5]]
df = pd.DataFrame(data1, columns=["Name", "Formula", "exact_mass", "RT"])
Part three! (sorry and thank you)
This is a trial I did on a small data set (df, shown as an image) using the code below, with the same df_al as above.
Code:
# Defining variables for the calculation
df_name = df_al["Ion_name"]
df_mass = df_al["Adduct_mass"]
df_div = df_al["Charge"]
df_M = df_al["M"]
df_ID = df["Name"]

# Defining the RT dictionary
RT = dict(zip(df["Name"], df["RT"]))

# Removing the RT column
df = df.drop(columns=["RT"])

# Defining the general function
def Adduct(x, i):
    return x * df_M[i] / df_div[i] + df_mass[i]

# Applying the general function in a range from 0 to 46
for i in range(47):
    df[df_name.loc[i]] = df['exact_mass'].map(lambda x: Adduct(x, i))
df
Output: (image)
# Melting
df = pd.melt(df, id_vars=['Name'], var_name="Adduct", value_name="Exact_mass",
             value_vars=[x for x in df.columns if 'Name' not in x and 'exact' not in x])
df['name'] = df.apply(lambda x: x[0] + "_" + x[1], axis=1)
df['RT'] = df.Name.apply(lambda x: RT[x[0]] if x[0] in RT else np.nan)
del df['Name']
del df['Adduct']
df['RT'] = df.name.apply(lambda x: RT[x[0]] if x[0] in RT else np.nan)
df
Output: (image)
Why NaN?
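A likely cause of those NaN values (an assumption, since the real df is only shown as an image): both RT lookups use x[0], i.e. only the first character of the name. That happens to work when compound names are single letters such as "a", but for longer names the truncated key is not in RT, so np.nan is returned. Splitting the combined name on "_" recovers the original key, for example:

# Hypothetical fix: use the part of "name" before the first "_" as the RT key.
df['RT'] = df['name'].apply(lambda s: RT.get(s.split('_')[0], np.nan))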
Here is how I would go about it; pandas.melt comes to the rescue:
import pandas as pd
import numpy as np
from io import StringIO
s = StringIO('''
Name exact_mass M+3H M+3Na M+H 2M+H M-3H
0 a 596.465179 199.829002 221.810726 597.472455 1193.937634 197.814450
1 b 514.293038 172.438289 194.420013 515.300314 1029.593352 170.423737
2 c 712.397498 238.473109 260.454833 713.404774 1425.802272 236.458557
3 d 448.191949 150.404592 172.386316 449.199225 897.391174 148.390040
4 e 316.203834 106.408554 128.390278 317.211110 633.414944 104.39400
''')
df = pd.read_csv(s, sep="\s+")
df = pd.melt(df, id_vars=['Name'], value_vars=[x for x in df.columns if 'Name' not in x and 'exact' not in x])
df['name'] = df.apply(lambda x:x[0] + "_" + x[1], axis=1)
del df['Name']
del df['variable']
RT = {'a':1, 'b':2, 'c':3, 'd':5, 'e':1.5}
df['RT'] = df.name.apply(lambda x: RT[x[0]] if x[0] in RT else np.nan)
df
Here is the output: (image)
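As a small follow-up (an assumption about the desired column names, not part of the original answer), the melted frame can be renamed and reordered to match the Name/Mass/RT layout asked for:

# Rename the melt's value column and put the columns in the requested order.
df = df.rename(columns={'value': 'Mass', 'name': 'Name'})[['Name', 'Mass', 'RT']]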

add secondary description in axis values, plotly

I am using a dataframe which includes the following columns:
Country, GNI, CarSalesPerCap. I am using kmeans to create clusters. To the algorithm I pass the dataframe with the two numeric columns, 'GNI' and 'CarSalesPerCap'.
Then I am using plotly to create a scatter plot, where the x-axis is CarSalesPerCap and the y-axis is GNI. My question is, how do I add to the plot the corresponding country for each point plotted on the graph?
df = pd.read_sql_query(query, conn)
df = df.dropna()

# Cluster the data
kmeans = KMeans(n_clusters=6, random_state=0).fit(df1)  # df1 holds the two numeric columns
labels = kmeans.labels_

# Glue back to original data
df['clusters'] = labels

# Let's analyze the clusters
print(df)
cluster0 = df.loc[df['clusters'] == 0]
cluster1 = df.loc[df['clusters'] == 1]
cluster2 = df.loc[df['clusters'] == 2]
cluster3 = df.loc[df['clusters'] == 3]
cluster4 = df.loc[df['clusters'] == 4]
cluster5 = df.loc[df['clusters'] == 5]
p0 = go.Scatter(x=cluster0['CarSalesPerCap'],
                y=cluster0['GNI'],
                mode='markers',
                marker=dict(color='black'))
p1 = go.Scatter(x=cluster1['CarSalesPerCap'],
                y=cluster1['GNI'],
                mode='markers',
                marker=dict(color='teal'))
p2 = go.Scatter(x=cluster2['CarSalesPerCap'],
                y=cluster2['GNI'],
                mode='markers',
                marker=dict(color='grey'))
p3 = go.Scatter(x=cluster3['CarSalesPerCap'],
                y=cluster3['GNI'],
                mode='markers',
                marker=dict(color='pink'))
p4 = go.Scatter(x=cluster4['CarSalesPerCap'],
                y=cluster4['GNI'],
                mode='markers',
                marker=dict(color='purple'))
p5 = go.Scatter(x=cluster5['CarSalesPerCap'],
                y=cluster5['GNI'],
                mode='markers',
                marker=dict(color='orange'))

layout = go.Layout(xaxis=dict(ticks='',
                              showticklabels=True,
                              zeroline=True,
                              title='CarSalesPerCap'),
                   yaxis=dict(ticks='',
                              showticklabels=True,
                              zeroline=True,
                              title='GNI'),
                   showlegend=False, hovermode='closest')

fig = go.Figure(data=[p0, p1, p2, p3, p4, p5], layout=layout)
py.offline.plot(fig)
You can add a text element to your trace, and it will allow you to overlay anything you want. If you add your country column, it will be displayed on hover. If you want a permanent label, you can add annotations.
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import pandas as pd
df = pd.DataFrame({'country': ["USA", "MEXICO", "CANADA"], 'x': [1, 2, 4], 'y': [5, 6, 7]})
p0 = go.Scatter(
    x=df.x,
    y=df.y,
    mode='markers',
    marker=dict(
        color='#E90',
        size=15
    ),
    text=df.country,
)
data = [p0]
iplot(data)
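If a permanent label is preferred over hover text, one option (a sketch, not part of the original answer) is to draw the country name next to each marker with mode='markers+text':

p1 = go.Scatter(
    x=df.x,
    y=df.y,
    mode='markers+text',        # marker plus a visible text label
    text=df.country,            # one label per point
    textposition='top center',  # place the label above the marker
    marker=dict(color='#E90', size=15),
)
iplot([p1])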
