I have generated this scatter plot by plotting the first two PCA components from a feature extraction: PCA1 and PCA2.
The plot shows 3 classes, with PCA1 on the x-axis and PCA2 on the y-axis. I generated it as follows:
target_names = ['class_1', 'class_2', 'class_3']
plt.figure(figsize=(11, 8))
Xt = pca.fit_transform(X)
plot = plt.scatter(Xt[:,0], Xt[:,1], c=y, cmap=plt.cm.jet,
s=30, linewidths=0, alpha=0.7)
#centers = kmeans.cluster_centers_
#plt.scatter(centers[:, 0], centers[:, 1], c=['black', 'green', 'red'], marker='^', s=100, alpha=0.5)
plt.legend(handles=plot.legend_elements()[0], labels=list(target_names))
plt.show()
I want to know how to correctly get the centroid of each class from the plot.
Here are the first few rows of the data:
Xt1 Xt2 y
-107.988187 -23.70121 1
-128.578852 -20.222378 1
-124.522967 -25.298283 1
-96.222918 -25.028239 1
-95.152954 -23.94496 1
-113.275804 -26.563129 1
-101.803 -24.22359 1
-94.662469 -22.94211 1
-104.118882 -24.037226 1
439.765098 -101.532469 2
50.100362 -34.278841 2
-69.229603 62.178599 2
-60.915475 53.296491 2
64.797364 91.991527 2
-112.815192 0.263505 0
-91.287067 -25.207217 0
-74.181941 -2.457892 0
-83.273718 -0.608004 0
-100.881393 -22.387571 0
-107.861711 -15.848869 0
-85.866992 -18.79126 0
-53.96314 -28.885316 0
-59.195432 -3.373361 0
Any help will be greatly appreciated.
Assuming that y is an array of labels corresponding to the rows of X (and therefore Xt), we can create a data frame with Xt[:, :2] and y and then use groupby('y') to aggregate the mean values for Xt[:, 0] and Xt[:, 1] for each value of y:
import pandas as pd
df = pd.DataFrame(Xt[:, :2], columns=['Xt1', 'Xt2'])
df['y'] = y
df.groupby('y').mean()
This will produce the means of Xt[:, 0] and Xt[:, 1] for each label in y, which are the centroid coordinates of each class in the first two principal components of the data.
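For completeness, the same centroids can be computed without pandas; a minimal NumPy sketch, assuming Xt and y are NumPy arrays as above:
import numpy as np
y = np.asarray(y)
# one row of (Xt1, Xt2) means per class label -> the class centroids
centroids = np.array([Xt[y == label, :2].mean(axis=0) for label in np.unique(y)])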
With the snippet of data that the OP provided, the following script computes the centroids and overlays them on the plot as 'X's of the same color as the data:
df = pd.DataFrame(Xt[:, :2], columns=['Xt1', 'Xt2'])
df['y'] = y
df_centroid = df.groupby('y').mean().reset_index()
target_names = ['class_1', 'class_2', 'class_3']
plt.figure(figsize=(11, 8))
plot = plt.scatter(Xt[:, 0], Xt[:, 1], c=y, cmap=plt.cm.jet,
s=30, linewidths=0, alpha=0.5)
# Overlays the centroids on the plot as 'X'
plt.scatter(df_centroid.Xt1, df_centroid.Xt2, marker='x', s=60,
c=df_centroid.y, cmap=plt.cm.jet)
plt.legend(handles=plot.legend_elements()[0], labels=list(target_names))
plt.show()
I am reading a CSV file:
Notation Level RFResult PRIResult PDResult Total Result
AAA 1 1.23 0 2 3.23
AAA 1 3.4 1 0 4.4
BBB 2 0.26 1 1.42 2.68
BBB 2 0.73 1 1.3 3.03
CCC 3 0.30 0 2.73 3.03
DDD 4 0.25 1 1.50 2.75
AAA 5 0.25 1 1.50 2.75
FFF 6 0.26 1 1.42 2.68
...
...
Here is the code
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('home\NewFiles\Files.csv')
Notation = df['Notation']
Level = df['Level']
RFResult = df['RFResult']
PRIResult = df['PRIResult']
PDResult = df['PDResult']
fig, axes = plt.subplots(nrows=7, ncols=1)
ax1, ax2, ax3, ax4, ax5, ax6, ax7 = axes.flatten()
n_bins = 13
ax1.hist(df['Total'], n_bins, histtype='bar')  # currently this shows all Total Results in one plot
plt.show()
I want to show the Total Result for each Level on a separate axis, as follows:
ax1 will show Level 1 Total Result
ax2 will show Level 2 Total Result
ax3 will show Level 3 Total Result
ax4 will show Level 4 Total Result
ax5 will show Level 5 Total Result
ax6 will show Level 6 Total Result
ax7 will show Level 7 Total Result
You can select a filtered part of a dataframe just by indexing: df[df['Level'] == level]['Total']. You can loop through the axes using for ax in axes.flatten(). To also get the index, use for ind, ax in enumerate(axes.flatten()). Note that Python starts counting from 0, so adding 1 to the index is a good way to indicate the level.
Note that when you have backslashes in a string, you can escape them using an r-string: r'home\NewFiles\Files.csv'.
The default ylim is from 0 to the maximum bar height, plus some padding. This can be changed for each ax separately. In the example below a list of ymax values is used to show the principle.
ax.grid(True, axis='both') sets the grid on for that ax. Instead of 'both', 'x' or 'y' can also be used to set the grid for only that axis. A grid line is drawn for each tick value. (The example below tries to use little space, so only a few gridlines are visible.)
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
N = 1000
df = pd.DataFrame({'Level': np.random.randint(1, 6, N), 'Total': np.random.uniform(1, 5, N)})
fig, axes = plt.subplots(nrows=5, ncols=1, sharex=True)
ymax_per_level = [27, 29, 28, 26, 27]
for ind, (ax, lev_ymax) in enumerate(zip(axes.flatten(), ymax_per_level)):
    level = ind + 1
    n_bins = 13
    ax.hist(df[df['Level'] == level]['Total'], bins=n_bins, histtype='bar')
    ax.set_ylabel(f'TL={level}')  # to add the level in the ylabel
    ax.set_ylim(0, lev_ymax)
    ax.grid(True, axis='both')
plt.show()
PS: A stacked histogram with custom legend and custom vertical lines could be created as:
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import pandas as pd
import numpy as np
N = 1000
df = pd.DataFrame({'Level': np.random.randint(1, 6, N),
'RFResult': np.random.uniform(1, 5, N),
'PRIResult': np.random.uniform(1, 5, N),
'PDResult': np.random.uniform(1, 5, N)})
df['Total'] = df['RFResult'] + df['PRIResult'] + df['PDResult']
fig, axes = plt.subplots(nrows=5, ncols=1, sharex=True)
colors = ['crimson', 'limegreen', 'dodgerblue']
column_names = ['RFResult', 'PRIResult', 'PDResult']
level_vertical_line = [1, 2, 3, 4, 5]
for level, (ax, vertical_line) in enumerate(zip(axes.flatten(), level_vertical_line), start=1):
    n_bins = 13
    level_data = df[df['Level'] == level][column_names].to_numpy()
    # vertical_line = level_data.mean()
    ax.hist(level_data, bins=n_bins,
            histtype='bar', stacked=True, color=colors)
    ax.axvline(vertical_line, color='gold', ls=':', lw=2)
    ax.set_ylabel(f'TL={level}')  # to add the level in the ylabel
    ax.margins(x=0.01)
    ax.grid(True, axis='both')
legend_handles = [Patch(color=color) for color in colors]
axes[0].legend(legend_handles, column_names, ncol=len(column_names), loc='lower center', bbox_to_anchor=(0.5, 1.02))
plt.show()
So I'm a newbie in Python and I have a project making a contour plot map. I have data in an xlsx file containing x, y, and z values. x and y are the coordinates and z are the measurement values.
x ; y ; z
110.4482 ; 7.04428 ; 0.177
110.4451 ; 7.04366 ; 0.102
110.4432 ; 7.04432 ; 0.482
110.4407 ; 7.04434 ; 0.504
I want to make a contour plot like the one in the linked example image.
I tried to make a contour, but when I run my code, a blank picture appears along with this warning:
/usr/local/lib/python3.6/dist-packages/matplotlib/contour.py:1243: UserWarning: No contour levels were found within the data range.
  warnings.warn("No contour levels were found"
RuntimeWarning: invalid value encountered in less
  xa[xa < 0] = -1
This is the code I used:
import numpy as np
import pandas as pd
import matplotlib as ml
import matplotlib.pyplot as plt
from scipy.interpolate import griddata
xlsx_path =('Book1.xlsx')
df = pd.read_excel(xlsx_path)
x = df.iloc[1:20,0]
y = df.iloc[1:20,1]
z = df.iloc[1:20,2]
xi = np.linspace(6, 8, 20)
yi = np.linspace(109, 111, 20)
zi = griddata((x, y), z, (xi[None,:], yi[:,None]), method='cubic')
plt.contour(xi, yi, zi)
plt.show()
How can I fix it?
Q : How can I fix it?
Just swap and adapt the x- and y-ranges over which you try to plot the cubic contours derived from the tabulated z-values.
You are plotting the wrong (ill-defined) region of the data, where no z-level contours can be computed, let alone displayed.
xi = np.linspace( 6, 8, 20) # actual data-values are ~ ( 110.437 : 110.448 )
yi = np.linspace(109, 111, 20) # actual data-values are ~ ( 7.042 : 7.050 )
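A minimal corrected sketch, assuming the same x, y and z as in the question; deriving the grid from the data itself avoids hard-coding bounds that miss the data entirely:
xi = np.linspace(x.min(), x.max(), 20)  # x values are ~110.44
yi = np.linspace(y.min(), y.max(), 20)  # y values are ~7.04
zi = griddata((x, y), z, (xi[None, :], yi[:, None]), method='cubic')
plt.contour(xi, yi, zi)
plt.show()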
I've almost reached my goal because of the great help of this community. I explained my goal here before: matplotlib: assign color to a radius
I now have exactly the plot I wanted. My code for it looks like this:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Circle
import mpl_toolkits.mplot3d.art3d as art3d
from matplotlib import cm
ri = 100
ra = 300
h=20
# input xy coordinates
xy = np.array([[ri,0],[ra,0],[ra,h],[ri,h],[ri,0]])
# radial component is x values of input
r = xy[:,0]
# angular component is one revolution of 50 steps
phi = np.linspace(0, 2*np.pi, 50)
# create grid
R,Phi = np.meshgrid(r,phi)
# transform to cartesian coordinates
X = R*np.cos(Phi)
Y = R*np.sin(Phi)
# Z values are y values, repeated 50 times (once per angle)
Z = np.tile(xy[:,1],len(Y)).reshape(Y.shape)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.set_zlim(0,200)
ax.plot_surface(X, Y, Z, alpha=0.5, color='grey', rstride=1, cstride=1)
#here are the values which I want to visualize
arr = np.array([[ 114.28, 40],
[ 128.57, 16],
[ 142.85,19],
[ 157.13,20],
[ 171.41,21],
[ 185.69,22],
[ 199.97,24],
[ 214.25,16],
[ 228.53,29],
[ 242.81,30],
[ 257.09,31],
[ 271.37,34],
[ 288.65,35],
[ 299.93,36],
[ 300,38]])
#interpolating between the single values of the arrays
new_x = np.concatenate([np.linspace(arr[i,0],arr[i+1,0], num=20)
for i in range(len(arr)-1)])
new_y = np.interp(new_x, arr[:,0], arr[:,1])
#connecting new_x and new_y to one new array
arr = np.vstack((new_x, new_y)).T
a_min = min(arr[:,1]) # minimum level
a_max = max(arr[:,1]) # maximum level
# Levels rescaled to the range (0, 1) using the minimum and maximum levels.
arr_norm = [(i - a_min)/(a_max - a_min) for i in arr[:,1]]
# Color scheme 'jet' mapped between `0` and `1`.
colors = [cm.jet(i) for i in arr_norm]
# Plot circle with radius from `arr` and rescaled color between 0 and 1.
for i, radius in enumerate(arr[:,0]):
    p = Circle((0, 0), radius, fc='None', ec=colors[i])
    ax.add_patch(p)
    art3d.pathpatch_2d_to_3d(p, z=20, zdir="z")
plt.show()
The last thing I need now is a colorbar that shows which color corresponds to which value, just like in a contour plot:
I already tried colorbar(), but either there was an error, nothing happened, or there was a colorbar with range (0 to 1) but it was empty (white).
This should do it:
import matplotlib as mpl
cax, _ = mpl.colorbar.make_axes(plt.gca(), shrink=0.8)
cbar = mpl.colorbar.ColorbarBase(cax, cmap='jet', label='some label',
norm=mpl.colors.Normalize(vmin=0., vmax=1.))
Result:
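If you want the colorbar ticks to show the original data values instead of the normalized 0-1 range, you can normalize to the data's minimum and maximum instead; a minimal sketch, assuming the fig, ax, a_min and a_max already defined in the question's code:
import matplotlib as mpl
from matplotlib import cm
# map the colorbar to the actual data range (a_min..a_max) instead of 0..1
norm = mpl.colors.Normalize(vmin=a_min, vmax=a_max)
sm = cm.ScalarMappable(norm=norm, cmap=cm.jet)
sm.set_array([])  # dummy array so that fig.colorbar accepts the mappable
fig.colorbar(sm, ax=ax, shrink=0.8, label='value')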
I found this wonderful graph in the post Variation on "How to plot decision boundary of a k-nearest neighbor classifier from Elements of Statistical Learning?". In this example K-NN is used to classify data into three classes. I especially enjoy that it features the probability of class membership as an indication of the "confidence".
R and ggplot seem to do a great job. I wonder whether this can be re-created in Python? My initial thought tends toward scikit-learn and matplotlib. Here is the iris example from scikit-learn:
print(__doc__)
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
n_neighbors = 15
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features. We could
# avoid this ugly slicing by using a two-dim dataset
y = iris.target
h = .02 # step size in the mesh
# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
for weights in ['uniform', 'distance']:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"
              % (n_neighbors, weights))
plt.show()
This produces a graph that is, in a sense, very similar:
I have three questions:
How can I introduce the confidence to the plot?
How can I plot the decision-boundaries with a connected line?
Let's say I have a new observation, how can I introduce it to the plot and plot if it is classified correctly?
I stumbled upon your question about a year ago, and loved the plot -- I just never got around to answering it, until now. Hopefully the code comments below are self-explanatory enough (I also blogged about it, if you want more details). Maybe four years too late, haha.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from matplotlib.lines import Line2D
from matplotlib.ticker import MaxNLocator
from sklearn import neighbors
iris = datasets.load_iris()
x = iris.data[:,0:2]
y = iris.target
# create the x0, x1 feature
x0 = x[:,0]
x1 = x[:,1]
# set main parameters for KNN plot
N_NEIGHBORS = 15 # KNN number of neighbors
H = 0.1 # mesh stepsize
PROB_DOT_SCALE = 40 # modifier to scale the probability dots
PROB_DOT_SCALE_POWER = 3 # exponential used to increase/decrease size of prob dots
TRUE_DOT_SIZE = 50 # size of the true labels
PAD = 1.0 # how much to "pad" around the true labels
clf = neighbors.KNeighborsClassifier(N_NEIGHBORS, weights='uniform')
clf.fit(x, y)
# find the min/max points for both x0 and x1 features
# these min/max values will be used to set the bounds
# for the plot
x0_min, x0_max = np.round(x0.min())-PAD, np.round(x0.max()+PAD)
x1_min, x1_max = np.round(x1.min())-PAD, np.round(x1.max()+PAD)
# create 1D arrays representing the range of probability data points
# on both the x0 and x1 axes.
x0_axis_range = np.arange(x0_min,x0_max, H)
x1_axis_range = np.arange(x1_min,x1_max, H)
# create meshgrid between the two axis ranges
xx0, xx1 = np.meshgrid(x0_axis_range, x1_axis_range)
# put the xx in the same dimensional format as the original x
# because it's easier to work with that way (at least for me)
# * shape will be: [no_dots, no_dimensions]
# where no_dimensions = 2 (x0 and x1 axis)
xx = np.reshape(np.stack((xx0.ravel(),xx1.ravel()),axis=1),(-1,2))
yy_hat = clf.predict(xx) # prediction of all the little dots
yy_prob = clf.predict_proba(xx) # probability of each dot being
# the predicted color
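# yy_size: the highest class probability at each grid point, used below to scale the dot size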
yy_size = np.max(yy_prob, axis=1)
# make figure
plt.style.use('seaborn-whitegrid') # set style because it looks nice
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8,6), dpi=150)
# establish colors and colormap
# * color blind colors, from
# https://towardsdatascience.com/two-simple-steps-to-create-colorblind-friendly-data-visualizations-2ed781a167ec
redish = '#d73027'
orangeish = '#fc8d59'
yellowish = '#fee090'
blueish = '#4575b4'
colormap = np.array([redish,blueish,orangeish])
# plot all the little dots, position defined by the xx values, color
# defined by the knn predictions (yy_hat), and size defined by the
# probability of that color (yy_prob)
# * because the yy_hat values are either 0, 1, 2, we can use
# these as values to index into the colormap array
# * size of dots (the probability) increases exponentially (^3), so that there is
# a nice difference between different probabilities. I'm sure there is a more
# elegant way to do this though...
# * linewidths=0 so that there are no "edges" around the dots
ax.scatter(xx[:,0], xx[:,1], c=colormap[yy_hat], alpha=0.4,
s=PROB_DOT_SCALE*yy_size**PROB_DOT_SCALE_POWER, linewidths=0,)
# plot the contours
# * we have to reshape the yy_hat to get it into a
# 2D dimensional format, representing both the x0
# and x1 axis
# * the number of levels and color scheme was manually tuned
# to make sense for this data. Would probably change, for
# instance, if there were 4, or 5 (etc.) classes
ax.contour(x0_axis_range, x1_axis_range,
np.reshape(yy_hat,(xx0.shape[0],-1)),
levels=3, linewidths=1,
colors=[redish,blueish, blueish,orangeish,])
# plot the original x values.
# * zorder is 3 so that the dots appear above all the other dots
ax.scatter(x[:,0], x[:,1], c=colormap[y], s=TRUE_DOT_SIZE, zorder=3,
linewidths=0.7, edgecolor='k')
# create legends
x_min, x_max = ax.get_xlim()
y_min, y_max = ax.get_ylim()
# set x-y labels
ax.set_ylabel(r"$x_1$")
ax.set_xlabel(r"$x_0$")
# create class legend
# Line2D properties: https://matplotlib.org/stable/api/_as_gen/matplotlib.lines.Line2D.html
# about size of scatter plot points: https://stackoverflow.com/a/47403507/9214620
legend_class = []
for flower_class, color in zip(['c', 's', 'v'], [blueish, redish, orangeish]):
    legend_class.append(Line2D([0], [0], marker='o', label=flower_class, ls='None',
                                markerfacecolor=color, markersize=np.sqrt(TRUE_DOT_SIZE),
                                markeredgecolor='k', markeredgewidth=0.7))
# iterate over each of the probabilities to create prob legend
prob_values = [0.4, 0.6, 0.8, 1.0]
legend_prob = []
for prob in prob_values:
    legend_prob.append(Line2D([0], [0], marker='o', label=prob, ls='None', alpha=0.8,
                              markerfacecolor='grey',
                              markersize=np.sqrt(PROB_DOT_SCALE*prob**PROB_DOT_SCALE_POWER),
                              markeredgecolor='k', markeredgewidth=0))
legend1 = ax.legend(handles=legend_class, loc='center',
bbox_to_anchor=(1.05, 0.35),
frameon=False, title='class')
legend2 = ax.legend(handles=legend_prob, loc='center',
bbox_to_anchor=(1.05, 0.65),
frameon=False, title='prob', )
ax.add_artist(legend1) # add legend back after it disappears
ax.set_yticks(np.arange(x1_min,x1_max, 1)) # I don't like the decimals
ax.grid(False) # remove gridlines (inherited from 'seaborn-whitegrid' style)
# only use integers for axis tick labels
# from: https://stackoverflow.com/a/34880501/9214620
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
# set the aspect ratio to 1, for looks
ax.set_aspect(1)
# remove first ticks from axis labels, for looks
# from: https://stackoverflow.com/a/19503828/9214620
ax.set_xticks(ax.get_xticks()[1:-1])
ax.set_yticks(np.arange(x1_min,x1_max, 1)[1:])
plt.show()