where does pyplot define the number of levels - python-3.x

I'm trying to understand how levels, values, boundaries are created in the contourf-toolchain.
import matplotlib.pyplot as plt
import matplotlib as mpl
t=np.array([[-5,0,5,0,0,0],[0,0,5,0,0,0],[0,0,5,0,0,0],[5,5.2,10,5,5,0],
[0,0,5,0,0,0]], dtype=np.double)
print ("print as is")
print (t)
print ("appearing in a contourf as:")
print (np.flipud((t)))
# assuming indices: array[x,y]
# value 10 supposed to be at 0,0
# contourf's pos. x-axis is pointing up!!
xmin, xmax=-2, 3
ymin, ymax=-3, 1
fig, axs = plt.subplots(nrows=1, ncols=2)
ax0 = axs[0]
ax1 = axs[1]
# plot and add colorbar
set0 = ax0.contourf(t, extent=(xmin,xmax,ymin,ymax), cmap="brg")
#colorbar with coarse steps
cba = plt.colorbar(set0,ax=ax0)
# # This makes the colorbar "appear continuos"
# cba.boundaries=None
# cba.values = None
set0.changed()
# define colorbar and then plot
cNorm = mpl.colors.Normalize(vmin=np.min(t), vmax=np.max(t))
sm = mpl.cm.ScalarMappable(cNorm, cmap="brg")
cbb = plt.colorbar(sm,ax=ax1)
set1 = ax1.contourf(t, extent=(ymin,ymax,xmin,xmax), cmap="brg")
plt.show()
print("left .boundaries: {:s}".format(str(cba.boundaries)))
print("left ._boundaries: {:s}".format(str(cba._boundaries)))
print("right .boundaries: {:s}".format(str(cbb.boundaries)))
print("right len(._boundaries): {:s}".format(str(len(cbb._boundaries))))
For the left plot the colorbar has 9 boundaries and 8 discrete color values for the right one there are 257 boundaries.
I did a couple of vars(...) and dir(...) on the various objects. Where is decision made on the amount of boundaries and values made?
What is the deviation of the right plots colorbar creation from the default behaviour?

Related

Add percentage axis to Seaborn catplot with correct axis tick labels

I'm plotting the counts of a categorical variable and want to add a second y-axis that shows the percentage of the total number of samples.
import matplotlib.pyplot as plt
import seaborn as sns
titanic = sns.load_dataset("titanic")
g = sns.catplot(x="alive", col="embark_town", col_wrap=4,
data=titanic[titanic.deck.notnull()],
kind="count", height=4, aspect=.8)
for i, ax in enumerate(g.axes.flat):
# Create second y-axis for the percentages on the right
ax1 = ax.twinx()
### Attempt to fix percentages by plotting the bars over
#g = sns.catplot(x="alive", col="embark_town", col_wrap=4,
# data=titanic[titanic.deck.notnull()],
# kind="count", height=4, aspect=.8,
# ax = ax1)
# Label by the percentages
ax1.set_ylim(ax.get_ylim())
ax1.set_yticklabels(np.round(ax.get_yticks()/titanic[titanic.deck.notnull()].shape[0],1))
ax1.set_ylabel('Percentage')
# Rotate x-labels
labels = ax.get_xticklabels() # get x labels
ax.set_xticklabels(labels, rotation=90)
# Ensure good spacing
g.fig.tight_layout()
Right, now my issue is that the percentages are being duplicated on the right y-axis, as show in the image below
I've tried to correct this by plotting the counts on the new axis, but that adds another row of subplots (see commented out code in the for loop). How can I get the right y-axis labels to not have duplicate values and actually reflect the percentages of the total count?
import matplotlib.pyplot as plt
import seaborn as sns
titanic = sns.load_dataset("titanic")
g = sns.catplot(x="alive", col="embark_town", col_wrap=4,
data=titanic[titanic.deck.notnull()],
kind="count", height=4, aspect=.8)
# calculate numbre of samples:
total_samples = len(titanic[titanic.deck.notnull()])
for i, ax in enumerate(g.axes.flat):
# bounds of the left y-axis:
ymin, ymax = ax.get_ylim()
# # Create second y-axis for the percentages on the right
ax1 = ax.twinx()
# scale right axis labels to total samples and mutliply with 100 for percentages
ax1.set_ylim(100*ymin/total_samples, 100*ymax/total_samples)
# Ensure good spacing
g.fig.tight_layout()

How to make the fluctuation range of three polylines all obvious in same figure by matplotlib? [duplicate]

I'm trying to create a plot using pyplot that has a discontinuous x-axis. The usual way this is drawn is that the axis will have something like this:
(values)----//----(later values)
where the // indicates that you're skipping everything between (values) and (later values).
I haven't been able to find any examples of this, so I'm wondering if it's even possible. I know you can join data over a discontinuity for, eg, financial data, but I'd like to make the jump in the axis more explicit. At the moment I'm just using subplots but I'd really like to have everything end up on the same graph in the end.
Paul's answer is a perfectly fine method of doing this.
However, if you don't want to make a custom transform, you can just use two subplots to create the same effect.
Rather than put together an example from scratch, there's an excellent example of this written by Paul Ivanov in the matplotlib examples (It's only in the current git tip, as it was only committed a few months ago. It's not on the webpage yet.).
This is just a simple modification of this example to have a discontinuous x-axis instead of the y-axis. (Which is why I'm making this post a CW)
Basically, you just do something like this:
import matplotlib.pylab as plt
import numpy as np
# If you're not familiar with np.r_, don't worry too much about this. It's just
# a series with points from 0 to 1 spaced at 0.1, and 9 to 10 with the same spacing.
x = np.r_[0:1:0.1, 9:10:0.1]
y = np.sin(x)
fig,(ax,ax2) = plt.subplots(1, 2, sharey=True)
# plot the same data on both axes
ax.plot(x, y, 'bo')
ax2.plot(x, y, 'bo')
# zoom-in / limit the view to different portions of the data
ax.set_xlim(0,1) # most of the data
ax2.set_xlim(9,10) # outliers only
# hide the spines between ax and ax2
ax.spines['right'].set_visible(False)
ax2.spines['left'].set_visible(False)
ax.yaxis.tick_left()
ax.tick_params(labeltop='off') # don't put tick labels at the top
ax2.yaxis.tick_right()
# Make the spacing between the two axes a bit smaller
plt.subplots_adjust(wspace=0.15)
plt.show()
To add the broken axis lines // effect, we can do this (again, modified from Paul Ivanov's example):
import matplotlib.pylab as plt
import numpy as np
# If you're not familiar with np.r_, don't worry too much about this. It's just
# a series with points from 0 to 1 spaced at 0.1, and 9 to 10 with the same spacing.
x = np.r_[0:1:0.1, 9:10:0.1]
y = np.sin(x)
fig,(ax,ax2) = plt.subplots(1, 2, sharey=True)
# plot the same data on both axes
ax.plot(x, y, 'bo')
ax2.plot(x, y, 'bo')
# zoom-in / limit the view to different portions of the data
ax.set_xlim(0,1) # most of the data
ax2.set_xlim(9,10) # outliers only
# hide the spines between ax and ax2
ax.spines['right'].set_visible(False)
ax2.spines['left'].set_visible(False)
ax.yaxis.tick_left()
ax.tick_params(labeltop='off') # don't put tick labels at the top
ax2.yaxis.tick_right()
# Make the spacing between the two axes a bit smaller
plt.subplots_adjust(wspace=0.15)
# This looks pretty good, and was fairly painless, but you can get that
# cut-out diagonal lines look with just a bit more work. The important
# thing to know here is that in axes coordinates, which are always
# between 0-1, spine endpoints are at these locations (0,0), (0,1),
# (1,0), and (1,1). Thus, we just need to put the diagonals in the
# appropriate corners of each of our axes, and so long as we use the
# right transform and disable clipping.
d = .015 # how big to make the diagonal lines in axes coordinates
# arguments to pass plot, just so we don't keep repeating them
kwargs = dict(transform=ax.transAxes, color='k', clip_on=False)
ax.plot((1-d,1+d),(-d,+d), **kwargs) # top-left diagonal
ax.plot((1-d,1+d),(1-d,1+d), **kwargs) # bottom-left diagonal
kwargs.update(transform=ax2.transAxes) # switch to the bottom axes
ax2.plot((-d,d),(-d,+d), **kwargs) # top-right diagonal
ax2.plot((-d,d),(1-d,1+d), **kwargs) # bottom-right diagonal
# What's cool about this is that now if we vary the distance between
# ax and ax2 via f.subplots_adjust(hspace=...) or plt.subplot_tool(),
# the diagonal lines will move accordingly, and stay right at the tips
# of the spines they are 'breaking'
plt.show()
I see many suggestions for this feature but no indication that it's been implemented. Here is a workable solution for the time-being. It applies a step-function transform to the x-axis. It's a lot of code, but it's fairly simple since most of it is boilerplate custom scale stuff. I have not added any graphics to indicate the location of the break, since that is a matter of style. Good luck finishing the job.
from matplotlib import pyplot as plt
from matplotlib import scale as mscale
from matplotlib import transforms as mtransforms
import numpy as np
def CustomScaleFactory(l, u):
class CustomScale(mscale.ScaleBase):
name = 'custom'
def __init__(self, axis, **kwargs):
mscale.ScaleBase.__init__(self)
self.thresh = None #thresh
def get_transform(self):
return self.CustomTransform(self.thresh)
def set_default_locators_and_formatters(self, axis):
pass
class CustomTransform(mtransforms.Transform):
input_dims = 1
output_dims = 1
is_separable = True
lower = l
upper = u
def __init__(self, thresh):
mtransforms.Transform.__init__(self)
self.thresh = thresh
def transform(self, a):
aa = a.copy()
aa[a>self.lower] = a[a>self.lower]-(self.upper-self.lower)
aa[(a>self.lower)&(a<self.upper)] = self.lower
return aa
def inverted(self):
return CustomScale.InvertedCustomTransform(self.thresh)
class InvertedCustomTransform(mtransforms.Transform):
input_dims = 1
output_dims = 1
is_separable = True
lower = l
upper = u
def __init__(self, thresh):
mtransforms.Transform.__init__(self)
self.thresh = thresh
def transform(self, a):
aa = a.copy()
aa[a>self.lower] = a[a>self.lower]+(self.upper-self.lower)
return aa
def inverted(self):
return CustomScale.CustomTransform(self.thresh)
return CustomScale
mscale.register_scale(CustomScaleFactory(1.12, 8.88))
x = np.concatenate((np.linspace(0,1,10), np.linspace(9,10,10)))
xticks = np.concatenate((np.linspace(0,1,6), np.linspace(9,10,6)))
y = np.sin(x)
plt.plot(x, y, '.')
ax = plt.gca()
ax.set_xscale('custom')
ax.set_xticks(xticks)
plt.show()
Check the brokenaxes package:
import matplotlib.pyplot as plt
from brokenaxes import brokenaxes
import numpy as np
fig = plt.figure(figsize=(5,2))
bax = brokenaxes(
xlims=((0, .1), (.4, .7)),
ylims=((-1, .7), (.79, 1)),
hspace=.05
)
x = np.linspace(0, 1, 100)
bax.plot(x, np.sin(10 * x), label='sin')
bax.plot(x, np.cos(10 * x), label='cos')
bax.legend(loc=3)
bax.set_xlabel('time')
bax.set_ylabel('value')
A very simple hack is to
scatter plot rectangles over the axes' spines and
draw the "//" as text at that position.
Worked like a charm for me:
# FAKE BROKEN AXES
# plot a white rectangle on the x-axis-spine to "break" it
xpos = 10 # x position of the "break"
ypos = plt.gca().get_ylim()[0] # y position of the "break"
plt.scatter(xpos, ypos, color='white', marker='s', s=80, clip_on=False, zorder=100)
# draw "//" on the same place as text
plt.text(xpos, ymin-0.125, r'//', fontsize=label_size, zorder=101, horizontalalignment='center', verticalalignment='center')
Example Plot:
For those interested, I've expanded upon #Paul's answer and added it to the matplotlib wrapper proplot. It can do axis "jumps", "speedups", and "slowdowns".
There is no way currently to add "crosses" that indicate the discrete jump like in Joe's answer, but I plan to add this in the future. I also plan to add a default "tick locator" that sets sensible default tick locations depending on the CutoffScale arguments.
Adressing Frederick Nord's question how to enable parallel orientation of the diagonal "breaking" lines when using a gridspec with ratios unequal 1:1, the following changes based on the proposals of Paul Ivanov and Joe Kingtons may be helpful. Width ratio can be varied using variables n and m.
import matplotlib.pylab as plt
import numpy as np
import matplotlib.gridspec as gridspec
x = np.r_[0:1:0.1, 9:10:0.1]
y = np.sin(x)
n = 5; m = 1;
gs = gridspec.GridSpec(1,2, width_ratios = [n,m])
plt.figure(figsize=(10,8))
ax = plt.subplot(gs[0,0])
ax2 = plt.subplot(gs[0,1], sharey = ax)
plt.setp(ax2.get_yticklabels(), visible=False)
plt.subplots_adjust(wspace = 0.1)
ax.plot(x, y, 'bo')
ax2.plot(x, y, 'bo')
ax.set_xlim(0,1)
ax2.set_xlim(10,8)
# hide the spines between ax and ax2
ax.spines['right'].set_visible(False)
ax2.spines['left'].set_visible(False)
ax.yaxis.tick_left()
ax.tick_params(labeltop='off') # don't put tick labels at the top
ax2.yaxis.tick_right()
d = .015 # how big to make the diagonal lines in axes coordinates
# arguments to pass plot, just so we don't keep repeating them
kwargs = dict(transform=ax.transAxes, color='k', clip_on=False)
on = (n+m)/n; om = (n+m)/m;
ax.plot((1-d*on,1+d*on),(-d,d), **kwargs) # bottom-left diagonal
ax.plot((1-d*on,1+d*on),(1-d,1+d), **kwargs) # top-left diagonal
kwargs.update(transform=ax2.transAxes) # switch to the bottom axes
ax2.plot((-d*om,d*om),(-d,d), **kwargs) # bottom-right diagonal
ax2.plot((-d*om,d*om),(1-d,1+d), **kwargs) # top-right diagonal
plt.show()
This is a hacky but pretty solution for x-axis breaks.
The solution is based on https://matplotlib.org/stable/gallery/subplots_axes_and_figures/broken_axis.html, which gets rid of the problem with positioning the break above the spine, solved by How can I plot points so they appear over top of the spines with matplotlib?
from matplotlib.patches import Rectangle
import matplotlib.pyplot as plt
def axis_break(axis, xpos=[0.1, 0.125], slant=1.5):
d = slant # proportion of vertical to horizontal extent of the slanted line
anchor = (xpos[0], -1)
w = xpos[1] - xpos[0]
h = 1
kwargs = dict(marker=[(-1, -d), (1, d)], markersize=12, zorder=3,
linestyle="none", color='k', mec='k', mew=1, clip_on=False)
axis.add_patch(Rectangle(
anchor, w, h, fill=True, color="white",
transform=axis.transAxes, clip_on=False, zorder=3)
)
axis.plot(xpos, [0, 0], transform=axis.transAxes, **kwargs)
fig, ax = plt.subplots(1,1)
plt.plot(np.arange(10))
axis_break(ax, xpos=[0.1, 0.12], slant=1.5)
axis_break(ax, xpos=[0.3, 0.31], slant=-10)
if you want to replace an axis label, this would do the trick:
from matplotlib import ticker
def replace_pos_with_label(fig, pos, label, axis):
fig.canvas.draw() # this is needed to set up the x-ticks
labs = axis.get_xticklabels()
labels = []
locs = []
for text in labs:
x = text._x
lab = text._text
if x == pos:
lab = label
labels.append(lab)
locs.append(x)
axis.xaxis.set_major_locator(ticker.FixedLocator(locs))
axis.set_xticklabels(labels)
fig, ax = plt.subplots(1,1)
plt.plot(np.arange(10))
replace_pos_with_label(fig, 0, "-10", axis=ax)
replace_pos_with_label(fig, 6, "$10^{4}$", axis=ax)
axis_break(ax, xpos=[0.1, 0.12], slant=2)

Control marker properties in seaborn pairwise boxplot

I'm trying to plot a boxplot for two different datasets on the same plot. The x axis are the hours in a day, while the y axis goes from 0 to 1 (let's call it Efficiency). I would like to have different markers for the means of each dataset' boxes. I use the 'meanprops' for seaborn but that changes the marker style for both datasets at the same time. I've added 2000 lines of data in the excel that can be downloaded here. The values might not coincide with the ones in the picture but should be enough.
Basically I want the red squares to be blue on the orange boxplot, and red on the blue boxplot. Here is what I managed to do so far:
I tried changing the meanprops by using a dictionary with the labels as keys , but it seems to be entering a loop (in PyCharm is says Evaluating...)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
#make sure you have your path sorted out
group1 = pd.read_excel('group1.xls')
ax,fig = plt.subplots(figsize = (20,10))
#does not work
#ax = sns.boxplot(data=group1, x='hour', y='M1_eff', hue='labels',showfliers=False, showmeans=True,\
# meanprops={"marker":{'7':"s",'8':'s'},"markerfacecolor":{'7':"white",'8':'white'},
#"markeredgecolor":{'7':"blue",'8':'red'})
#works but produces similar markers
ax = sns.boxplot(data=group1, x='hour', y='M1_eff', hue='labels',showfliers=False, showmeans=True,\
meanprops={"marker":"s","markerfacecolor":"white", "markeredgecolor":"blue"})
plt.legend(title='Groups', loc=2, bbox_to_anchor=(1, 1),borderaxespad=0.5)
# Add transparency to colors
for patch in ax.artists:
r, g, b, a = patch.get_facecolor()
patch.set_facecolor((r, g, b, .4))
ax.set_xlabel("Hours",fontsize=14)
ax.set_ylabel("M1 Efficiency",fontsize=14)
ax.tick_params(labelsize=10)
plt.show()
I also tried the FacetGrid but to no avail (Stops at 'Evaluating...'):
g = sns.FacetGrid(group1, col="M1_eff", hue="labels",hue_kws=dict(marker=["^", "v"]))
g = (g.map(plt.boxplot, "hour", "M1_eff")
.add_legend())
g.show()
Any help is appreciated!
I don't think you can do this using sns.boxplot() directly. I think you'll have to draw the means "by hand"
N=100
df = pd.DataFrame({'hour':np.random.randint(0,3,size=(N,)),
'M1_eff': np.random.random(size=(N,)),
'labels':np.random.choice([7,8],size=(N,))})
x_col = 'hour'
y_col = 'M1_eff'
hue_col = 'labels'
width = 0.8
hue_order=[7,8]
marker_colors = ['red','blue']
# get the offsets used by boxplot when hue-nesting is used
# https://github.com/mwaskom/seaborn/blob/c73055b2a9d9830c6fbbace07127c370389d04dd/seaborn/categorical.py#L367
n_levels = len(hue_order)
each_width = width / n_levels
offsets = np.linspace(0, width - each_width, n_levels)
offsets -= offsets.mean()
fig, ax = plt.subplots()
ax = sns.boxplot(data=df, x=x_col, y=y_col, hue=hue_col, hue_order=hue_order, showfliers=False, showmeans=False)
means = df.groupby([hue_col,x_col])[y_col].mean()
for (gr,temp),o,c in zip(means.groupby(level=0),offsets,marker_colors):
ax.plot(np.arange(temp.values.size)+o, temp.values, 's', c=c)

Bar Plot with inverted y axis and bars attached to bottom

The code below creates a bar plot with an inverted y-axis. What I don't manage yet is that the bars do not "hang from above" but start at the bottom. In other words, I like the bars to start at the maximum value of the y axis (i.e. at the x-axis) and ending at the value of df['y']. How can I do that?
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(data={'x_cat': ['aaaaa',
'bvvvvvv',
'deeeee',
'qqqqqqq',
'rr rrrrrrrr',
'rss sdasr',
'cccccccccccc',
'aarrrrrrrrrrra'
],
'y': [11.91,
35.19,
43.61,
46.12,
75.03,
81.39,
83.28,
89.20]
})
df['rank'] = df['y'].rank(method='dense') - 1
fig = plt.figure()
ax = fig.add_subplot(111)
# increase space below subplot
fig.subplots_adjust(bottom=0.3)
ax.bar(df['rank'],
df['y'],
width=0.8,
)
# invert y axis
ax.invert_yaxis()
# label x axis
ax.set_xticks(range(len(df)))
ax.set_xticklabels(df['x_cat'],
fontdict={'fontsize': 14})
for tick in ax.get_xticklabels():
tick.set_rotation(90)
You would need to calculate the new bottom. (Note that
because the axis is inverted, the "bottom" becomes the visual top of the bars.) The bottom is the value, the height is maximum minus the value itself.
I changed some other aspects of your plot, e.g. if your values are not sorted, calculating the rank and using it for plotting would result in wrong labelling. Hence better sort the dataframe beforehands (and forget about the rank).
Finally, we would need to adjust the "sticky edges" of the bars, because they should sit tight to the bottom of the figure (i.e. the top of the axis).
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame({'x_cat': ['aaaaa', 'bvvvvvv', 'deeeee', 'qqqqqqq', 'rr rrrrrrrr',
'rss sdasr', 'cccccccccccc', 'aarrrrrrrrrrra'],
'y': [11.91, 35.19, 43.61, 46.12, 75.03, 81.39, 83.28, 89.20]})
df.sort_values("y", inplace=True)
fig = plt.figure()
ax = fig.add_subplot(111)
# increase space below subplot
fig.subplots_adjust(bottom=0.3)
bars = ax.bar(df['x_cat'], df['y'].max()-df['y'], bottom=df['y'], width=0.8, )
# invert y axis
ax.invert_yaxis()
ax.tick_params(axis="x", rotation=90, labelsize=14)
for bar in bars:
bar.sticky_edges.y[:] = [df['y'].values.max()]
ax.autoscale()
plt.show()

Recreating decision-boundary plot in python with scikit-learn and matplotlib

I found this wonderful graph in post here Variation on "How to plot decision boundary of a k-nearest neighbor classifier from Elements of Statistical Learning?". In this example K-NN is used to clasify data into three classes. I especially enjoy that it features the probability of class membership as a indication of the "confidence".
r and ggplot seem to do a great job.I wonder, whether this can be re-created in python? My initial thought tends to scikit-learn and matplotlib. Here is the iris example from scikit:
print(__doc__)
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
n_neighbors = 15
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features. We could
# avoid this ugly slicing by using a two-dim dataset
y = iris.target
h = .02 # step size in the mesh
# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
for weights in ['uniform', 'distance']:
# we create an instance of Neighbours Classifier and fit the data.
clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
clf.fit(X, y)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("3-Class classification (k = %i, weights = '%s')"
% (n_neighbors, weights))
plt.show()
This produces a graph in a sense very similar:
I have three questions:
How can I introduce the confidence to the plot?
How can I plot the decision-boundaries with a connected line?
Let's say I have a new observation, how can I introduce it to the plot and plot if it is classified correctly?
I stumbled upon your question about a year ago, and loved the plot -- I just never got around to answering it, until now. Hopefully the code comments below are self-explanitory enough (I also blogged about, if you want more details). Maybe four years too late, haha.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from matplotlib.lines import Line2D
from matplotlib.ticker import MaxNLocator
from sklearn import neighbors
iris = datasets.load_iris()
x = iris.data[:,0:2]
y = iris.target
# create the x0, x1 feature
x0 = x[:,0]
x1 = x[:,1]
# set main parameters for KNN plot
N_NEIGHBORS = 15 # KNN number of neighbors
H = 0.1 # mesh stepsize
PROB_DOT_SCALE = 40 # modifier to scale the probability dots
PROB_DOT_SCALE_POWER = 3 # exponential used to increase/decrease size of prob dots
TRUE_DOT_SIZE = 50 # size of the true labels
PAD = 1.0 # how much to "pad" around the true labels
clf = neighbors.KNeighborsClassifier(N_NEIGHBORS, weights='uniform')
clf.fit(x, y)
# find the min/max points for both x0 and x1 features
# these min/max values will be used to set the bounds
# for the plot
x0_min, x0_max = np.round(x0.min())-PAD, np.round(x0.max()+PAD)
x1_min, x1_max = np.round(x1.min())-PAD, np.round(x1.max()+PAD)
# create 1D arrays representing the range of probability data points
# on both the x0 and x1 axes.
x0_axis_range = np.arange(x0_min,x0_max, H)
x1_axis_range = np.arange(x1_min,x1_max, H)
# create meshgrid between the two axis ranges
xx0, xx1 = np.meshgrid(x0_axis_range, x1_axis_range)
# put the xx in the same dimensional format as the original x
# because it's easier to work with that way (at least for me)
# * shape will be: [no_dots, no_dimensions]
# where no_dimensions = 2 (x0 and x1 axis)
xx = np.reshape(np.stack((xx0.ravel(),xx1.ravel()),axis=1),(-1,2))
yy_hat = clf.predict(xx) # prediction of all the little dots
yy_prob = clf.predict_proba(xx) # probability of each dot being
# the predicted color
yy_size = np.max(yy_prob, axis=1)
# make figure
plt.style.use('seaborn-whitegrid') # set style because it looks nice
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8,6), dpi=150)
# establish colors and colormap
# * color blind colors, from
# https://towardsdatascience.com/two-simple-steps-to-create-colorblind-friendly-data-visualizations-2ed781a167ec
redish = '#d73027'
orangeish = '#fc8d59'
yellowish = '#fee090'
blueish = '#4575b4'
colormap = np.array([redish,blueish,orangeish])
# plot all the little dots, position defined by the xx values, color
# defined by the knn predictions (yy_hat), and size defined by the
# probability of that color (yy_prob)
# * because the yy_hat values are either 0, 1, 2, we can use
# these as values to index into the colormap array
# * size of dots (the probability) increases exponentially (^3), so that there is
# a nice difference between different probabilities. I'm sure there is a more
# elegant way to do this though...
# * linewidths=0 so that there are no "edges" around the dots
ax.scatter(xx[:,0], xx[:,1], c=colormap[yy_hat], alpha=0.4,
s=PROB_DOT_SCALE*yy_size**PROB_DOT_SCALE_POWER, linewidths=0,)
# plot the contours
# * we have to reshape the yy_hat to get it into a
# 2D dimensional format, representing both the x0
# and x1 axis
# * the number of levels and color scheme was manually tuned
# to make sense for this data. Would probably change, for
# instance, if there were 4, or 5 (etc.) classes
ax.contour(x0_axis_range, x1_axis_range,
np.reshape(yy_hat,(xx0.shape[0],-1)),
levels=3, linewidths=1,
colors=[redish,blueish, blueish,orangeish,])
# plot the original x values.
# * zorder is 3 so that the dots appear above all the other dots
ax.scatter(x[:,0], x[:,1], c=colormap[y], s=TRUE_DOT_SIZE, zorder=3,
linewidths=0.7, edgecolor='k')
# create legends
x_min, x_max = ax.get_xlim()
y_min, y_max = ax.get_ylim()
# set x-y labels
ax.set_ylabel(r"$x_1$")
ax.set_xlabel(r"$x_0$")
# create class legend
# Line2D properties: https://matplotlib.org/stable/api/_as_gen/matplotlib.lines.Line2D.html
# about size of scatter plot points: https://stackoverflow.com/a/47403507/9214620
legend_class = []
for flower_class, color in zip(['c', 's', 'v'], [blueish, redish, orangeish]):
legend_class.append(Line2D([0], [0], marker='o', label=flower_class,ls='None',
markerfacecolor=color, markersize=np.sqrt(TRUE_DOT_SIZE),
markeredgecolor='k', markeredgewidth=0.7))
# iterate over each of the probabilities to create prob legend
prob_values = [0.4, 0.6, 0.8, 1.0]
legend_prob = []
for prob in prob_values:
legend_prob.append(Line2D([0], [0], marker='o', label=prob, ls='None', alpha=0.8,
markerfacecolor='grey',
markersize=np.sqrt(PROB_DOT_SCALE*prob**PROB_DOT_SCALE_POWER),
markeredgecolor='k', markeredgewidth=0))
legend1 = ax.legend(handles=legend_class, loc='center',
bbox_to_anchor=(1.05, 0.35),
frameon=False, title='class')
legend2 = ax.legend(handles=legend_prob, loc='center',
bbox_to_anchor=(1.05, 0.65),
frameon=False, title='prob', )
ax.add_artist(legend1) # add legend back after it disappears
ax.set_yticks(np.arange(x1_min,x1_max, 1)) # I don't like the decimals
ax.grid(False) # remove gridlines (inherited from 'seaborn-whitegrid' style)
# only use integers for axis tick labels
# from: https://stackoverflow.com/a/34880501/9214620
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
# set the aspect ratio to 1, for looks
ax.set_aspect(1)
# remove first ticks from axis labels, for looks
# from: https://stackoverflow.com/a/19503828/9214620
ax.set_xticks(ax.get_xticks()[1:-1])
ax.set_yticks(np.arange(x1_min,x1_max, 1)[1:])
plt.show()

Resources