How to plot cdf on histogram in matplotlib - python-3.x

I currently have a script that will plot a histogram of relative frequency, given a pandas series. The code is:
def to_percent3(y, position):
s = str(100 * y)
if matplotlib.rcParams['text.usetex'] is True:
return s + r'$\%$'
else:
return s + '%'
df = pd.read_csv('mycsv.csv')
waypointfreq = df['Waypoint Frequency(Secs)']
cumfreq = df['Waypoint Frequency(Secs)']
perctile = np.percentile(waypointfreq, 95) # claculates 95th percentile
bins = np.arange(0,perctile+1,1) # creates list increasing by 1 to 96th percentile
plt.hist(waypointfreq, bins = bins, normed=True)
formatter = FuncFormatter(to_percent3) #changes y axis to percent
plt.gca().yaxis.set_major_formatter(formatter)
plt.axis([0, perctile, 0, 0.03]) #Defines the axis' by the 95th percentile and 10%Relative frequency
plt.xlabel('Waypoint Frequency(Secs)')
plt.xticks(np.arange(0, perctile, 15.0))
plt.title('Relative Frequency of Average Waypoint Frequency')
plt.grid(True)
plt.show()
It produces a plot that looks like this:
What I'd like is to overlay this plot with a line showing the cdf, plotted against a secondary axis. I know that I can create the cumulative graph with the command:
waypointfreq = df['Waypoint Frequency(Secs)']
perctile = np.percentile(waypointfreq, 95) # claculates 90th percentile
bins = np.arange(0,perctile+5,1) # creates list increasing by 2 to 90th percentile
plt.hist(waypointfreq, bins = bins, normed=True, histtype='stepfilled',cumulative=True)
formatter = FuncFormatter(to_percent3) #changes y axis to percent
plt.gca().yaxis.set_major_formatter(formatter)
plt.axis([0, perctile, 0, 1]) #Defines the axis' by the 90th percentile and 10%Relative frequency
plt.xlabel('Waypoint Frequency(Secs)')
plt.xticks(np.arange(0, perctile, 15.0))
plt.title('Cumulative Frequency of Average Waypoint Frequency')
plt.grid(True)
plt.savefig(r'output\4 Cumulative Frequency of Waypoint Frequency.png', bbox_inches='tight')
plt.show()
However, this is plotted on a separate graph, instead of over the previous one. Any help or insight would be appreciated.

Maybe this code snippet helps:
import numpy as np
from scipy.integrate import cumtrapz
from scipy.stats import norm
from matplotlib import pyplot as plt
n = 1000
x = np.linspace(-3,3, n)
data = norm.rvs(size=n)
data = data + abs(min(data))
data = np.sort(data)
cdf = cumtrapz(x=x, y=data )
cdf = cdf / max(cdf)
fig, ax = plt.subplots(ncols=1)
ax1 = ax.twinx()
ax.hist(data, normed=True, histtype='stepfilled', alpha=0.2)
ax1.plot(data[1:],cdf)
If your CDF is not smooth, you could fit a distribution

Related

How to draw vertical average lines for overlapping histograms in a loop

I'm trying to draw with matplotlib two average vertical line for every overlapping histograms using a loop. I have managed to draw the first one, but I don't know how to draw the second one. I'm using two variables from a dataset to draw the histograms. One variable (feat) is categorical (0 - 1), and the other one (objective) is numerical. The code is the following:
for chas in df[feat].unique():
plt.hist(df.loc[df[feat] == chas, objective], bins = 15, alpha = 0.5, density = True, label = chas)
plt.axvline(df[objective].mean(), linestyle = 'dashed', linewidth = 2)
plt.title(objective)
plt.legend(loc = 'upper right')
I also have to add to the legend the mean and standard deviation values for each histogram.
How can I do it? Thank you in advance.
I recommend you using axes to plot your figure. Pls see code below and the artist tutorial here.
import numpy as np
import matplotlib.pyplot as plt
# Fixing random state for reproducibility
np.random.seed(19680801)
mu1, sigma1 = 100, 8
mu2, sigma2 = 150, 15
x1 = mu1 + sigma1 * np.random.randn(10000)
x2 = mu2 + sigma2 * np.random.randn(10000)
fig, ax = plt.subplots(1, 1, figsize=(7.2, 7.2))
# the histogram of the data
lbs = ['a', 'b']
colors = ['r', 'g']
for i, x in enumerate([x1, x2]):
n, bins, patches = ax.hist(x, 50, density=True, facecolor=colors[i], alpha=0.75, label=lbs[i])
ax.axvline(bins.mean())
ax.legend()

Plotting Multiple Plots on a single figure from within a for loop - Python

I have reviewed the response to this question: How would I iterate over a list of files and plot them as subplots on a single figure?
But am none the wiser on how to achieve my goal. I would like to plot multiple data sets, with differing x axes, onto a single figure in Python. I have included a snippet of my code below, which performs an FFT on a dataset, then calculates 3 Butterworth filter outputs. Ideally I would like to have all plotted on a single figure, which I have attempted to achieve in the code below.
The for loop calculates the 3 Butterworth filter outputs, the code above - the FFT and the code directly below attempts to append the FFT curve and sqrt(0.5) line to the previously generated plots for display.
Any Direction or advice would be appreciated.
"""Performs a Fast Fourier Transform on the data specified at the base of the code"""
def FFT(col):
x = io2.loc[1:,'Time']
y = io2.loc[1:,col]
# Number of samplepoints
#N = 600
N = pd.Series.count(x)
N2 = int(N/2)
# sample spacing
#T = 1.0 / 800.0
T = 1/(io2.loc[2,'Time'] - io2.loc[1,'Time'])
#x = np.linspace(0.0, N*T, N)
#y = np.sin(50.0 * 2.0*np.pi*x) + 0.5*np.sin(80.0 * 2.0*np.pi*x)
yf = scipy.fftpack.fft(y)
xf = np.linspace(0.0, 1.0/(2.0*T), N2)
fig=plt.figure()
plt.clf()
i=1
for order in [3, 6, 9]:
ax=fig.add_subplot(111, label="order = %d" % order)
b, a = butter_lowpass(cutoff, fs, order=order)
w, h = freqz(b, a, worN=2000)
ax.plot((fs * 0.5 / np.pi) * w, abs(h))
i=i+1
ax4=fig.add_subplot(111, label='sqrt(0.5)', frame_on=False)
ax5=fig.add_subplot(111, label="FFT of "+col, frame_on=False)
ax4.plot([0, 0.5 * fs], [np.sqrt(0.5), np.sqrt(0.5)], '--')
ax5.plot(xf, 2.0/N * np.abs(yf[:N2]))
plt.xlabel('Frequency (Hz)')
plt.ylabel('Gain')
plt.grid(True)
plt.legend(loc='best')
#fig, ax = plt.subplots()
#ax.plot(xf, 2.0/N * np.abs(yf[:N2]), label="FFT of "+col)
plt.axis([0,5000,0,0.1])
#plt.xlabel('Frequency (Hz)')
#plt.ylabel('Amplitude (mm)')
#plt.legend(loc=0)
plt.show()
return
Kind Regards,
Here you can find a minimal example of how to plot multiple lines with different x and y datasets. You are recreating the plot every time you type add_subplot(111). Instead, you should call plot multiple times. I have added an example for a single plot with multiple lines, as well as an example for one subplot per line.
import numpy as np
import matplotlib.pyplot as plt
x1 = np.arange(0, 10, 1)
x2 = np.arange(3, 12, 0.1)
x3 = np.arange(2, 8, 0.01)
y1 = np.sin(x1)
y2 = np.cos(x2**0.8)
y3 = np.sin(4.*x3)**3
data = []
data.append((x1, y1, 'label1'))
data.append((x2, y2, 'label2'))
data.append((x3, y3, 'label3'))
# All lines in one plot.
plt.figure()
for n in data:
plt.plot(n[0], n[1], label=n[2])
plt.legend(loc=0, frameon=False)
# One subplot per data set.
cols = 2
rows = len(data)//2 + len(data)%2
plt.figure()
gs = plt.GridSpec(rows, cols)
for n in range(len(data)):
i = n%2
j = n//2
plt.subplot(gs[j,i])
plt.plot(data[n][0], data[n][1])
plt.title(data[n][2])
plt.tight_layout()
plt.show()

Plotting multiple density curves on the same plot: weighting the subset categories in Python 3

I am trying to recreate this density plot in python 3: math.stackexchange.com/questions/845424/the-expected-outcome-of-a-random-game-of-chess
End Goal: I need my density plot to look like this
The area under the blue curve is equal to that of the red, green, and purple curves combined because the different outcomes (Draw, Black wins, and White wins) are the subset of the total (All).
How do I have python realize and plot this accordingly?
Here is the .csv file of results_df after 1000 simulations pastebin.com/YDVMx2DL
from matplotlib import pyplot as plt
import seaborn as sns
black = results_df.loc[results_df['outcome'] == 'Black']
white = results_df.loc[results_df['outcome'] == 'White']
draw = results_df.loc[results_df['outcome'] == 'Draw']
win = results_df.loc[results_df['outcome'] != 'Draw']
Total = len(results_df.index)
Wins = len(win.index)
PercentBlack = "Black Wins ≈ %s" %('{0:.2%}'.format(len(black.index)/Total))
PercentWhite = "White Wins ≈ %s" %('{0:.2%}'.format(len(white.index)/Total))
PercentDraw = "Draw ≈ %s" %('{0:.2%}'.format(len(draw.index)/Total))
AllTitle = 'Distribution of Moves by All Outcomes (nSample = %s)' %(workers)
sns.distplot(results_df.moves, hist=False, label = "All")
sns.distplot(black.moves, hist=False, label=PercentBlack)
sns.distplot(white.moves, hist=False, label=PercentWhite)
sns.distplot(draw.moves, hist=False, label=PercentDraw)
plt.title(AllTitle)
plt.ylabel('Density')
plt.xlabel('Number of Moves')
plt.legend()
plt.show()
The code above produces density curves without weights, which I really need to figure out how to generate density curve weights accordingly as well as preserve my labels in the legend
density curves, no weights; help
I also tried frequency histograms, that scaled the distribution heights correctly but I would rather keep the 4 curves overlaid on top of each other for a "cleaner" look...I don't like this frequency plot but this is my current fix at the moment.
results_df.moves.hist(alpha=0.4, bins=range(0, 700, 10), label = "All")
draw.moves.hist(alpha=0.4, bins=range(0, 700, 10), label = PercentDraw)
white.moves.hist(alpha=0.4, bins=range(0, 700, 10), label = PercentWhite)
black.moves.hist(alpha=0.4, bins=range(0, 700, 10), label = PercentBlack)
plt.title(AllTitle)
plt.ylabel('Frequency')
plt.xlabel('Number of Moves')
plt.legend()
plt.show()
If anyone can write the python 3 code that outputs the first plot with 4 density curves with correct subset weights as well as preserves the custom legend that show percentages, that would be much appreciated.
Once the density curves are plotted with the correct subset weights, I am also interested in the python 3 code in finding the max point coordinates of each density curve that shows max frequency of moves once I scale it up to 500,000 iterations.
Thanks
You need to be careful. The plot that you have produced is correct. All the curves shown are probability density functions of the underlying distributions.
In the plot that you want to have, only the curve labeled "All" is a probability density function. The other curves are not.
In any case, you will need to calculate the kernel density estimate yourself, if you want to scale it like shown in the desired plot. This can be done using scipy.stats.gaussial_kde().
In order to reproduce the desired plot, I see two options.
Calculate the kde for all involved cases and scale them with the number of samples.
import numpy as np; np.random.seed(0)
import matplotlib.pyplot as plt
import scipy.stats
a = np.random.gumbel(80, 25, 1000).astype(int)
b = np.random.gumbel(200, 46, 4000).astype(int)
kdea = scipy.stats.gaussian_kde(a)
kdeb = scipy.stats.gaussian_kde(b)
both = np.hstack((a,b))
kdeboth = scipy.stats.gaussian_kde(both)
grid = np.arange(500)
#weighted kde curves
wa = kdea(grid)*(len(a)/float(len(both)))
wb = kdeb(grid)*(len(b)/float(len(both)))
print "a.sum ", wa.sum()
print "b.sum ", wb.sum()
print "total.sum ", kdeb(grid).sum()
fig, ax = plt.subplots()
ax.plot(grid, wa, lw=1, label = "weighted a")
ax.plot(grid, wb, lw=1, label = "weighted b")
ax.plot(grid, kdeboth(grid), color="crimson", lw=2, label = "pdf")
plt.legend()
plt.show()
Calculate the kde for all individual cases, normalize their sum to obtain the total.
import numpy as np; np.random.seed(0)
import matplotlib.pyplot as plt
import scipy.stats
a = np.random.gumbel(80, 25, 1000).astype(int)
b = np.random.gumbel(200, 46, 4000).astype(int)
kdea = scipy.stats.gaussian_kde(a)
kdeb = scipy.stats.gaussian_kde(b)
grid = np.arange(500)
#weighted kde curves
wa = kdea(grid)*(len(a)/float(len(a)+len(b)))
wb = kdeb(grid)*(len(b)/float(len(a)+len(b)))
total = wa+wb
fig, ax = plt.subplots(figsize=(5,3))
ax.plot(grid, wa, lw=1, label = "weighted a")
ax.plot(grid, wb, lw=1, label = "weighted b")
ax.plot(grid, total, color="crimson", lw=2, label = "pdf")
plt.legend()
plt.show()

Can't add matplotlib colorbar ticks

I am trying to add ticks and labels to a color bar, but it just doesn't seem to show up in the output. I have tried two approaches(as shown in the code below). Second appraoch was to do as shown in another question on Stack Overflow here: How to add Matplotlib Colorbar Ticks.
I must be overlooking something very simple here as I am a beginner in Matplotlib and Python.
I have managed to obtain the color bar, but the ticks I want just don't show up. Any help here will be greatly appreciated as I have been stuck at it for hours after trying and searching.
Here is the code I used to generate a heatmap using hexbin over a basemap.
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from matplotlib.colors import LinearSegmentedColormap
from matplotlib import cm
#Loading data from CSV file
DATA_FILE = '....../Population_data.csv'
roc_data = pd.read_csv(DATA_FILE)
roc_data.head()
#Creating figure window
fig = plt.figure(figsize=(14,10))
ax = fig.add_subplot(111)
#Drawing the basemap
m = Basemap(projection='merc', lat_0=43.12, lon_0=-77.626,
resolution = 'i',llcrnrlon=-78.236,
llcrnrlat=42.935,
urcrnrlon=-77.072,
urcrnrlat=43.349)
m.drawcoastlines()
m.drawcounties(zorder=20, color='red')
m.drawcountries()
m.drawmapboundary()
#plotting the heatmap using hexbin
x, y = m(roc_data['Longitude'].values, roc_data['Latitude'].values)
values = roc_data['Total(20-64)']
m.hexbin(x, y, gridsize = 125, bins = 'log', C = values, cmap = cm.Reds)
#Defining minimum, mean and maximum population values
max_p = roc_data['Total(20-64)'].max()
min_p = roc_data['Total(20-64)'].min()
mean_p = roc_data['Total(20-64)'].mean()
#Adding Colorbar
cb = m.colorbar(location = 'bottom', format = '%d', label = 'Population by Census Blocks')
#setting ticks
#cb.set_ticks([48, 107, 1302]) #First approach, didn't work
#cb.set_ticklabels(['Min', 'Mean', 'Max'])
cb.set_ticks([min_p, mean_p, max_p]) #Second appraoch, assumed ticks and tick labels should be same
cb.set_ticklabels([min_p, mean_p, max_p]) #from the above mentioned stackoverflow question, but did't work
plt.show()
The output I get by using the first or second approach for colorbar ticks is the same. It is as here:
Heatmap and colorbar with no ticks and labels
I want the minimum, median and maximum population values (48, 107 and 1302) to be shown on the colorbar with the labels Min, Mean and Max. Thank you for your time
When plotting the hexbin plot with mode bins = 'log', the colors will be plotted with a logarithmic scaling. This means that if the data minimum, mean and maximum are min, mean and max, their values on the logarithmically scaled colorbar are log10(min), log10(mean), log10(max).
The ticks on the colorbar therefore needs to be set with the log values. The ticklabels can be set to any value. However I would think that simply putting something like "mean" on a logarithmic scale may not be too informative.
A particularity is that the minimum of the colorbar is actually log10(min+1). The +1 is due to the log which is negative below 1.
Here is a complete example.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(42)
from mpl_toolkits.basemap import Basemap
from matplotlib import cm
lon = -78.236+np.random.rand(1000)*(-77.072+78.236)
lat = 42.935 + np.random.rand(1000)*(43.349-42.935)
t = 99+np.random.normal(10,20,1000)
t[:50] = np.linspace(48,1302)
roc_data = pd.DataFrame({'Longitude':lon, 'Latitude':lat, "T":t })
#Creating figure window
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111)
#Drawing the basemap
m = Basemap(projection='merc', lat_0=43.12, lon_0=-77.626,
resolution = 'i',llcrnrlon=-78.236,
llcrnrlat=42.935,
urcrnrlon=-77.072,
urcrnrlat=43.349)
m.drawcoastlines()
m.drawcounties(zorder=20, color='red')
m.drawcountries()
m.drawmapboundary()
#plotting the heatmap using hexbin
x, y = m(roc_data['Longitude'].values, roc_data['Latitude'].values)
values = roc_data['T']
m.hexbin(x, y, gridsize = 125, bins = 'log', C = values, cmap = cm.Reds) #bins = 'log',
#Defining minimum, mean and maximum population values
max_p = roc_data['T'].max()
min_p = roc_data['T'].min()
mean_p = roc_data['T'].mean()
print [min_p, mean_p, max_p]
print [np.log10(min_p), np.log10(mean_p), np.log10(max_p)]
#Adding Colorbar
cb = m.colorbar(location = 'bottom', format = '%d', label = 'Population by Census Blocks') #format = '%d',
#setting ticks
cb.set_ticks([np.log10(min_p+1), np.log10(mean_p), np.log10(max_p)])
cb.set_ticklabels(['Min\n({:.1f})'.format(min_p), 'Mean\n({:.1f})'.format(mean_p), 'Max\n({:.1f})'.format(max_p)])
plt.tight_layout()
plt.show()

Recreating decision-boundary plot in python with scikit-learn and matplotlib

I found this wonderful graph in post here Variation on "How to plot decision boundary of a k-nearest neighbor classifier from Elements of Statistical Learning?". In this example K-NN is used to clasify data into three classes. I especially enjoy that it features the probability of class membership as a indication of the "confidence".
r and ggplot seem to do a great job.I wonder, whether this can be re-created in python? My initial thought tends to scikit-learn and matplotlib. Here is the iris example from scikit:
print(__doc__)
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
n_neighbors = 15
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features. We could
# avoid this ugly slicing by using a two-dim dataset
y = iris.target
h = .02 # step size in the mesh
# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
for weights in ['uniform', 'distance']:
# we create an instance of Neighbours Classifier and fit the data.
clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
clf.fit(X, y)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("3-Class classification (k = %i, weights = '%s')"
% (n_neighbors, weights))
plt.show()
This produces a graph in a sense very similar:
I have three questions:
How can I introduce the confidence to the plot?
How can I plot the decision-boundaries with a connected line?
Let's say I have a new observation, how can I introduce it to the plot and plot if it is classified correctly?
I stumbled upon your question about a year ago, and loved the plot -- I just never got around to answering it, until now. Hopefully the code comments below are self-explanitory enough (I also blogged about, if you want more details). Maybe four years too late, haha.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from matplotlib.lines import Line2D
from matplotlib.ticker import MaxNLocator
from sklearn import neighbors
iris = datasets.load_iris()
x = iris.data[:,0:2]
y = iris.target
# create the x0, x1 feature
x0 = x[:,0]
x1 = x[:,1]
# set main parameters for KNN plot
N_NEIGHBORS = 15 # KNN number of neighbors
H = 0.1 # mesh stepsize
PROB_DOT_SCALE = 40 # modifier to scale the probability dots
PROB_DOT_SCALE_POWER = 3 # exponential used to increase/decrease size of prob dots
TRUE_DOT_SIZE = 50 # size of the true labels
PAD = 1.0 # how much to "pad" around the true labels
clf = neighbors.KNeighborsClassifier(N_NEIGHBORS, weights='uniform')
clf.fit(x, y)
# find the min/max points for both x0 and x1 features
# these min/max values will be used to set the bounds
# for the plot
x0_min, x0_max = np.round(x0.min())-PAD, np.round(x0.max()+PAD)
x1_min, x1_max = np.round(x1.min())-PAD, np.round(x1.max()+PAD)
# create 1D arrays representing the range of probability data points
# on both the x0 and x1 axes.
x0_axis_range = np.arange(x0_min,x0_max, H)
x1_axis_range = np.arange(x1_min,x1_max, H)
# create meshgrid between the two axis ranges
xx0, xx1 = np.meshgrid(x0_axis_range, x1_axis_range)
# put the xx in the same dimensional format as the original x
# because it's easier to work with that way (at least for me)
# * shape will be: [no_dots, no_dimensions]
# where no_dimensions = 2 (x0 and x1 axis)
xx = np.reshape(np.stack((xx0.ravel(),xx1.ravel()),axis=1),(-1,2))
yy_hat = clf.predict(xx) # prediction of all the little dots
yy_prob = clf.predict_proba(xx) # probability of each dot being
# the predicted color
yy_size = np.max(yy_prob, axis=1)
# make figure
plt.style.use('seaborn-whitegrid') # set style because it looks nice
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8,6), dpi=150)
# establish colors and colormap
# * color blind colors, from
# https://towardsdatascience.com/two-simple-steps-to-create-colorblind-friendly-data-visualizations-2ed781a167ec
redish = '#d73027'
orangeish = '#fc8d59'
yellowish = '#fee090'
blueish = '#4575b4'
colormap = np.array([redish,blueish,orangeish])
# plot all the little dots, position defined by the xx values, color
# defined by the knn predictions (yy_hat), and size defined by the
# probability of that color (yy_prob)
# * because the yy_hat values are either 0, 1, 2, we can use
# these as values to index into the colormap array
# * size of dots (the probability) increases exponentially (^3), so that there is
# a nice difference between different probabilities. I'm sure there is a more
# elegant way to do this though...
# * linewidths=0 so that there are no "edges" around the dots
ax.scatter(xx[:,0], xx[:,1], c=colormap[yy_hat], alpha=0.4,
s=PROB_DOT_SCALE*yy_size**PROB_DOT_SCALE_POWER, linewidths=0,)
# plot the contours
# * we have to reshape the yy_hat to get it into a
# 2D dimensional format, representing both the x0
# and x1 axis
# * the number of levels and color scheme was manually tuned
# to make sense for this data. Would probably change, for
# instance, if there were 4, or 5 (etc.) classes
ax.contour(x0_axis_range, x1_axis_range,
np.reshape(yy_hat,(xx0.shape[0],-1)),
levels=3, linewidths=1,
colors=[redish,blueish, blueish,orangeish,])
# plot the original x values.
# * zorder is 3 so that the dots appear above all the other dots
ax.scatter(x[:,0], x[:,1], c=colormap[y], s=TRUE_DOT_SIZE, zorder=3,
linewidths=0.7, edgecolor='k')
# create legends
x_min, x_max = ax.get_xlim()
y_min, y_max = ax.get_ylim()
# set x-y labels
ax.set_ylabel(r"$x_1$")
ax.set_xlabel(r"$x_0$")
# create class legend
# Line2D properties: https://matplotlib.org/stable/api/_as_gen/matplotlib.lines.Line2D.html
# about size of scatter plot points: https://stackoverflow.com/a/47403507/9214620
legend_class = []
for flower_class, color in zip(['c', 's', 'v'], [blueish, redish, orangeish]):
legend_class.append(Line2D([0], [0], marker='o', label=flower_class,ls='None',
markerfacecolor=color, markersize=np.sqrt(TRUE_DOT_SIZE),
markeredgecolor='k', markeredgewidth=0.7))
# iterate over each of the probabilities to create prob legend
prob_values = [0.4, 0.6, 0.8, 1.0]
legend_prob = []
for prob in prob_values:
legend_prob.append(Line2D([0], [0], marker='o', label=prob, ls='None', alpha=0.8,
markerfacecolor='grey',
markersize=np.sqrt(PROB_DOT_SCALE*prob**PROB_DOT_SCALE_POWER),
markeredgecolor='k', markeredgewidth=0))
legend1 = ax.legend(handles=legend_class, loc='center',
bbox_to_anchor=(1.05, 0.35),
frameon=False, title='class')
legend2 = ax.legend(handles=legend_prob, loc='center',
bbox_to_anchor=(1.05, 0.65),
frameon=False, title='prob', )
ax.add_artist(legend1) # add legend back after it disappears
ax.set_yticks(np.arange(x1_min,x1_max, 1)) # I don't like the decimals
ax.grid(False) # remove gridlines (inherited from 'seaborn-whitegrid' style)
# only use integers for axis tick labels
# from: https://stackoverflow.com/a/34880501/9214620
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
# set the aspect ratio to 1, for looks
ax.set_aspect(1)
# remove first ticks from axis labels, for looks
# from: https://stackoverflow.com/a/19503828/9214620
ax.set_xticks(ax.get_xticks()[1:-1])
ax.set_yticks(np.arange(x1_min,x1_max, 1)[1:])
plt.show()

Resources