Python: how to create a smoothed version of a 2D binned "color map"? - python-3.x

I would like to create a version of this 2D binned "color map" with smoothed colors.
I am not even sure this would be the correct nomenclature for the plot, but, essentially, I want my figure to be color coded by the median values of a third variable for points that reside in each defined bin of my (X, Y) space.
Even though I am able to accomplish that to a certain degree (see example), I would like to find a way to create a version of the same plot with a smoothed color gradient. That would allow me to visualize the overall behavior of my distribution.
I tried ideas described here: Smoothing 2D map in python
and here: Python: binned_statistic_2d mean calculation ignoring NaNs in data
as well as links therein, but could not find a clear solution to the problem.
This is what I have so far:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from scipy.stats import binned_statistic_2d
np.random.seed(999)  # seed numpy's generator; seeding the stdlib random module would not affect np.random draws
x = np.random.normal(0, 10, 5000)
y = np.random.normal(0, 10, 5000)
z = np.random.uniform(0, 10, 5000)
fig = plt.figure(figsize=(20, 20))
plt.rcParams.update({'font.size': 10})
ax = fig.add_subplot(3,3,1)
ax.set_axisbelow(True)
plt.grid(b=True, lw=0.5, zorder=-1)
x_bins = np.arange(-50., 50.5, 1.)
y_bins = np.arange(-50., 50.5, 1.)
cmap = plt.cm.get_cmap('jet_r',1000) #just a colormap
ret = binned_statistic_2d(x, y, z, statistic=np.median, bins=[x_bins, y_bins]) # Bin (X, Y) and create a map of the medians of "Colors"
plt.imshow(ret.statistic.T, origin='lower', extent=(-50, 50, -50, 50), cmap=cmap)
plt.xlim(-40,40)
plt.ylim(-40,40)
plt.xlabel("X", fontsize=15)
plt.ylabel("Y", fontsize=15)
ax.set_yticks([-40,-30,-20,-10,0,10,20,30,40])
bounds = np.arange(2.0, 20.0, 1.0)
plt.colorbar(ticks=bounds, label="Color", fraction=0.046, pad=0.04)
# save plots
plt.savefig("Whatever_name.png", bbox_inches='tight')
Which produces the following image (from random data):
Therefore, the simple question would be: how to smooth these colors?
Thanks in advance!
PS: sorry for excessive coding, but I believe a clear visualization is crucial for this particular problem.

Thanks to everyone who viewed this issue and tried to help!
I ended up being able to solve my own problem. In the end, it was all about image smoothing with a Gaussian kernel.
This link: Gaussian filtering a image with Nan in Python gave me the insight for the solution.
I basically implemented the exact same code, but at the end I mapped the previously known NaN pixels from the original 2D array back onto the smoothed version. Unlike the solution from the link, my version does not leave NaN pixels filled with values derived from the surrounding pixels: the filter does fill them, but I then erase them again.
Here is the final figure produced for the example I provided:
Final code, for reference, for those who might need in the future:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from scipy.stats import binned_statistic_2d
import scipy.stats as st
import scipy.ndimage
import scipy as sp
np.random.seed(999)  # seed numpy's generator directly
x = np.random.normal(0, 10, 5000)
y = np.random.normal(0, 10, 5000)
z = np.random.uniform(0, 10, 5000)
fig = plt.figure(figsize=(20, 20))
plt.rcParams.update({'font.size': 10})
ax = fig.add_subplot(3,3,1)
ax.set_axisbelow(True)
plt.grid(b=True, lw=0.5, zorder=-1)
x_bins = np.arange(-50., 50.5, 1.)
y_bins = np.arange(-50., 50.5, 1.)
cmap = plt.cm.get_cmap('jet_r',1000) #just a colormap
ret = binned_statistic_2d(x, y, z, statistic=np.median, bins=[x_bins, y_bins]) # Bin (X, Y) and create a map of the medians of "Colors"
sigma = 1       # standard deviation for Gaussian kernel
truncate = 5.0  # truncate filter at this many sigmas
U = ret.statistic.T.copy()
V = U.copy()
V[np.isnan(U)] = 0
VV = sp.ndimage.gaussian_filter(V, sigma=sigma, truncate=truncate)
W = 0 * U.copy() + 1
W[np.isnan(U)] = 0
WW = sp.ndimage.gaussian_filter(W, sigma=sigma, truncate=truncate)
np.seterr(divide='ignore', invalid='ignore')
Z = VV / WW
# restore the original NaN pixels that the filter filled in
for i in range(len(Z)):
    for j in range(len(Z[0])):
        if np.isnan(U[i][j]):
            Z[i][j] = np.nan
plt.imshow(Z, origin='lower', extent=(-50, 50, -50, 50), cmap=cmap)
plt.xlim(-40,40)
plt.ylim(-40,40)
plt.xlabel("X", fontsize=15)
plt.ylabel("Y", fontsize=15)
ax.set_yticks([-40,-30,-20,-10,0,10,20,30,40])
bounds = np.arange(2.0, 20.0, 1.0)
plt.colorbar(ticks=bounds, label="Color", fraction=0.046, pad=0.04)
# save plots
plt.savefig("Whatever_name.png", bbox_inches='tight')
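As a side note, the nested loop that restores the NaN mask is equivalent to a single vectorized assignment:
# re-mask every pixel that was NaN in the original binned array
Z[np.isnan(U)] = np.nan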

Related

Is there a library that will help me fit data easily? I found fitter and I will provide the code, but it shows some errors

So, here is my code:
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator
from fitter import Fitter, get_common_distributions
df = pd.read_csv("project3.csv")
bins = [282.33, 594.33, 906.33, 1281.33, 15030.33, 1842.33, 2154.33, 2466.33, 2778.33, 3090.33, 3402.33]
#declaring
facecolor = '#EAEAEA'
color_bars = '#3475D0'
txt_color1 = '#252525'
txt_color2 = '#004C74'
fig, ax = plt.subplots(1, figsize=(16, 6), facecolor=facecolor)
ax.set_facecolor(facecolor)
n, bins, patches = plt.hist(df.City1, color=color_bars, bins=10)
#grid
minor_locator = AutoMinorLocator(2)
plt.gca().xaxis.set_minor_locator(minor_locator)
plt.grid(which='minor', color=facecolor, lw = 0.5)
xticks = [(bins[idx+1] + value)/2 for idx, value in enumerate(bins[:-1])]
xticks_labels = [ "{:.0f}-{:.0f}".format(value, bins[idx+1]) for idx, value in enumerate(bins[:-1])]
plt.xticks(xticks, labels=xticks_labels, c=txt_color1, fontsize=13)
#beautify
ax.tick_params(axis='x', which='both',length=0)
plt.yticks([])
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
for idx, value in enumerate(n):
    if value > 0:
        plt.text(xticks[idx], value+5, int(value), ha='center', fontsize=16, c=txt_color1)
plt.title('Histogram of rainfall in City1\n', loc = 'right', fontsize = 20, c=txt_color1)
plt.xlabel('\nCentimeters of rainfall', c=txt_color2, fontsize=14)
plt.ylabel('Frequency of occurrence', c=txt_color2, fontsize=14)
plt.tight_layout()
#plt.savefig('City1_Raw.png', facecolor=facecolor)
plt.show()
city1 = df['City1'].values
f = Fitter(city1, distributions=get_common_distributions())
f.fit()
fig = f.plot_pdf(names=None, Nbest=4, lw=1, method='sumsquare_error')
plt.show()
print(f.get_best(method = 'sumsquare_error'))
The issue is with the plots it shows. The first histogram it generates is
Next I get another graph with best fitted distributions which is
Then an output statement
{'chi2': {'df': 10.692966790090342, 'loc': 16.690849400411103, 'scale': 118.71595997157786}}
Process finished with exit code 0
I have a couple of questions. Why is chi2, the best fitted distribution, not plotted on the graph?
How do I plot these distributions on top of the histogram rather than separately? The hist() function in the fitter library can do that, but there I don't get to control the bins, so I end up with something like 100 bins and some flat-looking data.
How do I solve this issue? I need to plot the best-fit curve on a histogram that looks like image1. Can I use any other module/package to get this done in a similar way? This uses a least-squares fit, but I am OK with maximum likelihood or log likelihood too.
Simple way of plotting things on top of each other (using some properties of the Fitter class)
import scipy.stats as st
import matplotlib.pyplot as plt
from fitter import Fitter, get_common_distributions
from scipy import stats
numberofpoints=50000
df = stats.norm.rvs( loc=1090, scale=500, size=numberofpoints)
fig, ax = plt.subplots(1, figsize=(16, 6))
n, bins, patches = ax.hist( df, bins=30, density=True)
f = Fitter(df, distributions=get_common_distributions())
f.fit()
errorlist = sorted(
    [[f._fitted_errors[dist], dist] for dist in get_common_distributions()]
)[:4]
for err, dist in errorlist:
    ax.plot(f.x, f.fitted_pdf[dist])
plt.show()
Since the histogram here is drawn with density=True, the fitted pdfs overlay directly; with a raw-count histogram one would need to play with scaling to generalize again, as sketched below.
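A minimal sketch of that rescaling, assuming the f, ax, bins, df, and errorlist objects from the snippet above and equal-width bins:
# counts ≈ pdf * (number of samples) * (bin width), for equal-width bins
bin_width = bins[1] - bins[0]
scale = len(df) * bin_width
for err, dist in errorlist:
    ax.plot(f.x, f.fitted_pdf[dist] * scale)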

Need to force overlapping for seaborn's heatmap and kdeplot

I'm trying to combine seaborn's heatmap and kdeplot in one figure, but so far the result is not very promising since I cannot find a way to make them overlap. As a result, the heatmap is just squeezed to the left side of the figure.
I think the reason is that seaborn doesn't seem to recognize the x-axis as the same one in two charts (see picture below), although the data points are exactly the same. The only difference is that for heatmap I needed to pivot them, while for the kdeplot pivoting is not needed.
Therefore, the data for the axes come from the same dataset, but in different forms, as can be seen in the code below.
The dataset sample looks something like this:
X Y Z
7,75 280 52,73
3,25 340 54,19
5,75 340 53,61
2,5 180 54,67
3 340 53,66
1,75 340 54,81
4,5 380 55,18
4 240 56,49
4,75 380 55,17
4,25 180 55,40
2 420 56,42
2,25 380 54,90
My code:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
f, ax = plt.subplots(figsize=(11, 9), dpi=300)
plt.tick_params(bottom='on')
# dataset is just a pandas frame with data
X1 = dataset.iloc[:, :3].pivot("X", "Y", "Z")
X2 = dataset.iloc[:, :2]
ax = sns.heatmap(X1, cmap="Spectral")
ax.invert_yaxis()
ax2 = plt.twinx()
sns.kdeplot(X2.iloc[:, 1], X2.iloc[:, 0], ax=ax2, zorder=2)
ax.axis('tight')
plt.show()
Please help me with placing kdeplot on top of the heatmap. Ideally, I would like my final plot to look something like this:
Any tips or hints will be greatly appreciated!
The question can be a bit hard to understand, because the dataset can't be "just some data". The X and Y values need to lie on a very regular grid. No X,Y combination is repeated, but not all grid combinations appear. The kdeplot will then show where the used X,Y values are concentrated.
Such a dataset can be simulated by first generating dummy data for a full grid and then taking a subset.
Now, a seaborn heatmap uses categorical X and Y axes. Such axes are very hard to align with the kdeplot. To obtain a similar heatmap with numerical axes, ax.pcolor() can be used.
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
xs = np.arange(2, 10, 0.25)
ys = np.arange(150, 400, 10)
# first create a dummy dataset over a full grid
dataset = pd.DataFrame({'X': np.repeat(xs, len(ys)),
'Y': np.tile(ys, len(xs)),
'Z': np.random.uniform(50, 60, len(xs) * len(ys))})
# take a random subset of the rows
dataset = dataset.sample(200)
fig, ax = plt.subplots(figsize=(11, 9), dpi=300)
X1 = dataset.pivot(index="X", columns="Y", values="Z")
collection = ax.pcolor(X1.columns, X1.index, X1, shading='nearest', cmap="Spectral")
plt.colorbar(collection, ax=ax, pad=0.02)
# default, cut=3, which causes a lot of surrounding whitespace
sns.kdeplot(x=dataset["Y"], y=dataset["X"], cut=1.5, ax=ax)
fig.tight_layout()
plt.show()
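If a filled overlay reads better, newer seaborn versions (0.11+) should also accept a variant of the same kdeplot call (an assumption worth checking against your installed version):
# semi-transparent filled density on top of the pcolor heatmap
sns.kdeplot(x=dataset["Y"], y=dataset["X"], cut=1.5, fill=True, alpha=0.3, ax=ax)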

When plotting the Wigner function of a coherent state using QuTiP strange patterns appear

I noticed something strange today when I plotted the Wigner function of a coherent state using the open-source quantum toolbox QuTiP in Python.
When I do the plot I notice these strange patterns just around the edge of the plot that are not supposed to be there. I believe it's just some sort of numerical error, but I don't know how I can get rid of or minimize it, or, most important, what's causing it.
Here is the code
# import packages
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib as mpl
from matplotlib import cm
from qutip import *
N = 60 # number of levels in Hilbert space
# density matrix of a coherent state
rho_coherent = coherent_dm(N, 1-1j)
X = np.linspace(-3, 3, 300)
Y = np.linspace(-3, 3, 300)
# Wigner function
W = wigner(rho_coherent, X, Y, 'iterative', 2)
X, Y = np.meshgrid(X, Y)
# Color Normalization
class MidpointNormalize(colors.Normalize):
    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        colors.Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y))
# contour plot
plt.subplot(111, aspect='equal')
plt.contourf(X, Y, W, 100, cmap = cm.RdBu_r, norm = MidpointNormalize(midpoint=0.))
plt.show()
and here is the plot
As you can clearly see, the blue spots around the edges are not supposed to be there! A blue spot indicates that the Wigner function is negative at that point, but a coherent state should have a Wigner function that is positive everywhere!
I also noticed that when I reduce the linspace steps from 300 to 100 the blue parts disappear.
I would very much appreciate it if someone could explain what's causing this problem to appear.
This is simply due to truncation. When using a finite number of modes (in your case N=60), the Wigner function will go negative at some point.
Reducing the number of linspace steps brings the negative regions you see on the plot within the zero-value increment, so they display as zero. Reducing the linspace steps is probably the best solution to your problem: your plot will only ever be as accurate as the truncation error allows, so simply reduce the resolution until those errors disappear.
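One quick way to confirm the truncation explanation is to raise the Hilbert-space cutoff and watch the most negative value shrink; a minimal sketch under the same setup as the question:
import numpy as np
from qutip import coherent_dm, wigner

xv = np.linspace(-3, 3, 300)
# the same coherent state at two different Hilbert-space cutoffs
W_60 = wigner(coherent_dm(60, 1 - 1j), xv, xv)
W_120 = wigner(coherent_dm(120, 1 - 1j), xv, xv)
# the spurious negative fringes should shrink in magnitude as N grows
print(W_60.min(), W_120.min())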

Using python and networkx to find the probability density function

I'm struggling to draw a power law graph for Facebook Data that I found online. I'm using Networkx and I've found how to draw a Degree Histogram and a degree rank. The problem that I'm having is I want the y axis to be a probability so I'm assuming I need to sum up each y value and divide by the total number of nodes? Can anyone please help me do this? Once I've got this I'd like to draw a log-log graph to see if I can obtain a straight line. I'd really appreciate it if anyone could help! Here's my code:
import collections
import networkx as nx
import matplotlib.pyplot as plt
from networkx.algorithms import community
import math
import pylab as plt
g = nx.read_edgelist("/Users/Michael/Desktop/anaconda3/facebook_combined.txt")  # note: a second positional argument "r" would be taken as the comments marker, not a file mode
nx.info(g)
degree_sequence = sorted([d for n, d in g.degree()], reverse=True)
degreeCount = collections.Counter(degree_sequence)
deg, cnt = zip(*degreeCount.items())
fig, ax = plt.subplots()
plt.bar(deg, cnt, width=0.80, color='b')
plt.title("Degree Histogram for Facebook Data")
plt.ylabel("Count")
plt.xlabel("Degree")
ax.set_xticks([d + 0.4 for d in deg])
ax.set_xticklabels(deg)
plt.show()
plt.loglog(degree_sequence, 'b-', marker='o')
plt.title("Degree rank plot")
plt.ylabel("Degree")
plt.xlabel("Rank")
plt.show()
You seem to be on the right track, but some simplifications will likely help you. The code below uses only 2 libraries.
Without access to your graph, we can use some graph generators instead. I've chosen 2 qualitatively different types here, and deliberately chosen different sizes so that the normalization of the histogram is needed.
import networkx as nx
import matplotlib.pyplot as plt
g1 = nx.scale_free_graph(1000)
g2 = nx.watts_strogatz_graph(2000, 6, p=0.8)
# we don't need to sort the values since the histogram will handle it for us
deg_g1 = [d for _, d in g1.degree()]
deg_g2 = [d for _, d in g2.degree()]
# there are smarter ways to choose bin locations, but since
# degrees must be discrete, we can be lazy...
max_degree = max(deg_g1 + deg_g2)
# plot different styles to see both
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(deg_g1, bins=range(max_degree + 1), density=True, histtype='bar', rwidth=0.8, label='scale-free')
ax.hist(deg_g2, bins=range(max_degree + 1), density=True, histtype='step', lw=3, label='small-world')
# setup the axes to be log/log scaled
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('degree')
ax.set_ylabel('relative density')
ax.legend()
plt.show()
This produces an output plot like this (both g1,g2 are randomised so won't be identical):
Here we can see that g1 has an approximately straight line decay in the degree distribution -- as expected for scale-free distributions on log-log axes. Conversely, g2 does not have a scale-free degree distribution.
To say anything more formal, you could look at the toolboxes from Aaron Clauset: http://tuvalu.santafe.edu/~aaronc/powerlaws/ which implement model fitting and statistical testing of power-law distributions.
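As a minimal sketch of that kind of analysis, assuming the third-party powerlaw package (pip install powerlaw) and the g1 graph from the snippet above:
import powerlaw

degrees = [d for _, d in g1.degree()]
fit = powerlaw.Fit(degrees, discrete=True)
print(fit.power_law.alpha, fit.power_law.xmin)
# log-likelihood ratio test: positive R favors the power law over the exponential
R, p = fit.distribution_compare('power_law', 'exponential')
print(R, p)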

Smooth curves in Python Plots [duplicate]

I've got the following simple script that plots a graph:
import matplotlib.pyplot as plt
import numpy as np
T = np.array([6, 7, 8, 9, 10, 11, 12])
power = np.array([1.53E+03, 5.92E+02, 2.04E+02, 7.24E+01, 2.72E+01, 1.10E+01, 4.70E+00])
plt.plot(T,power)
plt.show()
As it is now, the line goes straight from point to point which looks ok, but could be better in my opinion. What I want is to smooth the line between the points. In Gnuplot I would have plotted with smooth cplines.
Is there an easy way to do this in PyPlot? I've found some tutorials, but they all seem rather complex.
You could use scipy.interpolate.spline to smooth out your data yourself:
from scipy.interpolate import spline
# 300 represents number of points to make between T.min and T.max
xnew = np.linspace(T.min(), T.max(), 300)
power_smooth = spline(T, power, xnew)
plt.plot(xnew,power_smooth)
plt.show()
spline is deprecated in scipy 0.19.0, use BSpline class instead.
Switching from spline to BSpline isn't a straightforward copy/paste and requires a little tweaking:
from scipy.interpolate import make_interp_spline, BSpline
# 300 represents number of points to make between T.min and T.max
xnew = np.linspace(T.min(), T.max(), 300)
spl = make_interp_spline(T, power, k=3) # type: BSpline
power_smooth = spl(xnew)
plt.plot(xnew, power_smooth)
plt.show()
Before:
After:
For this example spline works well, but if the function is not inherently smooth and you want a smoothed version, you can also try:
from scipy.ndimage import gaussian_filter1d  # scipy.ndimage.filters is the deprecated path

# here x, y are your data arrays (e.g. T and power from the question)
ysmoothed = gaussian_filter1d(y, sigma=2)
plt.plot(x, ysmoothed)
plt.show()
If you increase sigma, you get a smoother function.
Proceed with caution with this one. It modifies the original values and may not be what you want.
See the scipy.interpolate documentation for some examples.
The following example demonstrates its use, for linear and cubic spline interpolation:
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import interp1d
# Define x, y, and xnew to resample at.
x = np.linspace(0, 10, num=11, endpoint=True)
y = np.cos(-x**2/9.0)
xnew = np.linspace(0, 10, num=41, endpoint=True)
# Define interpolators.
f_linear = interp1d(x, y)
f_cubic = interp1d(x, y, kind='cubic')
# Plot.
plt.plot(x, y, 'o', label='data')
plt.plot(xnew, f_linear(xnew), '-', label='linear')
plt.plot(xnew, f_cubic(xnew), '--', label='cubic')
plt.legend(loc='best')
plt.show()
Slightly modified for increased readability.
One of the easiest implementations I found was to use the exponential moving average that TensorBoard uses:
from typing import List

def smooth(scalars: List[float], weight: float) -> List[float]:  # weight between 0 and 1
    last = scalars[0]  # first value in the plot (first timestep)
    smoothed = list()
    for point in scalars:
        smoothed_val = last * weight + (1 - weight) * point  # calculate smoothed value
        smoothed.append(smoothed_val)  # save it
        last = smoothed_val  # anchor the last smoothed value
    return smoothed
ax.plot(x_labels, smooth(train_data, .9), x_labels, train_data)
I presume you mean curve-fitting and not anti-aliasing from the context of your question. PyPlot doesn't have any built-in support for this, but you can easily implement some basic curve-fitting yourself, like the code seen here, or if you're using GuiQwt it has a curve fitting module. (You could probably also steal the code from SciPy to do this as well).
Here is a simple solution for dates:
from scipy.interpolate import make_interp_spline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as dates
from datetime import datetime
data = {
    datetime(2016, 9, 26, 0, 0): 26060, datetime(2016, 9, 27, 0, 0): 23243,
    datetime(2016, 9, 28, 0, 0): 22534, datetime(2016, 9, 29, 0, 0): 22841,
    datetime(2016, 9, 30, 0, 0): 22441, datetime(2016, 10, 1, 0, 0): 23248
}
#create data
date_np = np.array(list(data.keys()))
value_np = np.array(list(data.values()))
date_num = dates.date2num(date_np)
# smooth
date_num_smooth = np.linspace(date_num.min(), date_num.max(), 100)
spl = make_interp_spline(date_num, value_np, k=3)
value_np_smooth = spl(date_num_smooth)
# print
plt.plot(date_np, value_np)
plt.plot(dates.num2date(date_num_smooth), value_np_smooth)
plt.show()
It's worth your time looking at seaborn for plotting smoothed lines.
The seaborn lmplot function will plot data and regression model fits.
The following illustrates both polynomial and lowess fits:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
T = np.array([6, 7, 8, 9, 10, 11, 12])
power = np.array([1.53E+03, 5.92E+02, 2.04E+02, 7.24E+01, 2.72E+01, 1.10E+01, 4.70E+00])
df = pd.DataFrame(data = {'T': T, 'power': power})
sns.lmplot(x='T', y='power', data=df, ci=None, order=4, truncate=False)
sns.lmplot(x='T', y='power', data=df, ci=None, lowess=True, truncate=False)
The order = 4 polynomial fit is overfitting this toy dataset. I don't show it here but order = 2 and order = 3 gave worse results.
The lowess = True fit is underfitting this tiny dataset but may give better results on larger datasets.
Check the seaborn regression tutorial for more examples.
Another way to go, which slightly modifies the function depending on the parameters you use:
from statsmodels.nonparametric.smoothers_lowess import lowess
def smoothing(x, y):
    lowess_frac = 0.15  # size of data (%) for estimation =~ smoothing window
    lowess_it = 0
    x_smooth = x
    y_smooth = lowess(y, x, is_sorted=False, frac=lowess_frac, it=lowess_it, return_sorted=False)
    return x_smooth, y_smooth
That was better suited than other answers for my specific application case.
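A hypothetical usage example with the T and power arrays from the question (note that the hard-coded lowess_frac window is very narrow for only 7 points, so a larger fraction may suit tiny datasets):
x_s, y_s = smoothing(T, power)
plt.plot(T, power, 'o', label='data')
plt.plot(x_s, y_s, '-', label='lowess')
plt.legend(loc='best')
plt.show()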
