Sklearn BIC criterion: different optimal values of k for clustering - python-3.x

I want to determine the best value of k (number of clusters) for the KMeans algorithm and a dataset.
I found a resource in the Sklearn documentation: Gaussian Mixture Model Selection using the BIC criterion.
I found an example of code on the site that I adapted to my dataset.
But each run of this code gives a different optimal value of k. Why?
Here is the code:
import numpy as np
import pandas as pd
import itertools
from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import mixture
print(__doc__)
# Number of samples per component
n_samples = 440
path = 'C:/Users/Lionel/Downloads'
file = 'Wholesale customers data.csv'
data = pd.read_csv(path + '/'+file)
X = np.array(data.iloc[:,2 :])
lowest_bic = np.infty
bic = []
n_components_range = range(1, 12)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a Gaussian mixture with EM
        gmm = mixture.GaussianMixture(n_components=n_components,
                                      covariance_type=cv_type)
        gmm.fit(X)
        bic.append(gmm.bic(X))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm
bic = np.array(bic)
color_iter = itertools.cycle(['navy', 'turquoise', 'cornflowerblue',
                              'darkorange'])
clf = best_gmm
print(clf)
bars = []
# Plot the BIC scores
spl = plt.subplot(2, 1, 1)
#spl = plt.plot()
for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
    xpos = np.array(n_components_range) + .2 * (i - 2)
    bars.append(plt.bar(xpos, bic[i * len(n_components_range):
                                  (i + 1) * len(n_components_range)],
                        width=.2, color=color))
plt.xticks(n_components_range)
plt.ylim([bic.min() * 1.01 - .01 * bic.max(), bic.max()])
plt.title('BIC score per model')
xpos = np.mod(bic.argmin(), len(n_components_range)) + .65 + \
    .2 * np.floor(bic.argmin() / len(n_components_range))
plt.text(xpos, bic.min() * 0.97 + .03 * bic.max(), '*', fontsize=14)
spl.set_xlabel('Number of components')
spl.legend([b[0] for b in bars], cv_types)
# Plot the winner
splot = plt.subplot(2, 1, 2)
Y_ = clf.predict(X)
for i, (mean, cov, color) in enumerate(zip(clf.means_, clf.covariances_,
                                           color_iter)):
    v, w = linalg.eigh(cov)
    if not np.any(Y_ == i):
        continue
    plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
    # Plot an ellipse to show the Gaussian component
    angle = np.arctan2(w[0][1], w[0][0])
    angle = 180. * angle / np.pi  # convert to degrees
    v = 2. * np.sqrt(2.) * np.sqrt(v)
    ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color)
    ell.set_clip_box(splot.bbox)
    ell.set_alpha(.5)
    splot.add_artist(ell)
plt.xticks(())
plt.yticks(())
plt.title('Selected GMM: full model, 2 components')
plt.subplots_adjust(hspace=.35, bottom=.02)
plt.show()
Here is the link to my dataset:
https://drive.google.com/open?id=1yMw1rMh12ml6Lh3yrL6WDLbEnLM-SmiN
Do you have an explanation for this behaviour?
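One likely source of the variability (my reading of the code, not stated in the original post): GaussianMixture initialises the EM algorithm randomly, so each fit can converge to a different local optimum, a different BIC, and therefore a different winning k. Passing random_state (and optionally a larger n_init) to GaussianMixture inside the loop should make the selected k reproducible. A minimal sketch of that change, with arbitrary example values:
# reproducible fit: fix the seed and keep the best of several EM restarts
gmm = mixture.GaussianMixture(n_components=n_components,
                              covariance_type=cv_type,
                              n_init=10,        # keep the best of 10 EM runs
                              random_state=0)   # fixed seed for reproducibility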

Related

How to compute the distance of data points to decision boundary when using the EllipticEnvelope of sklearn?

How can I compute the Euclidean distance to the decision boundary of the EllipticEnvelope? Here is my code:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.model_selection import train_test_split
feature, output = "temperature", "consumption"
data = pd.DataFrame(np.random.normal(0,15, size=(2355,2)), columns=[feature, output])
X = data[[feature, output]]
X_train, X_test = train_test_split(X, shuffle=True, train_size=0.8)
model = EllipticEnvelope(contamination=0.18)
model.fit(X_train)
# extract the model predictions
y_pred = pd.Series(model.predict(X), index=X.index, name="anomaly")
# define the meshgrid : X = (u,v).T
u_min, u_max = X_train.iloc[:, 0].min() - 1.5, X_train.iloc[:, 0].max() + 1.5
v_min, v_max = X_train.iloc[:, 1].min() - 1.5, X_train.iloc[:, 1].max() + 1.5
n_points = 500
u = np.linspace(u_min, u_max, n_points)
v = np.linspace(v_min, v_max, n_points)
U, V = np.meshgrid(u, v)
# evaluate the decision function on the meshgrid
W = model.decision_function(np.c_[U.ravel(), V.ravel()])
W = W.reshape(U.shape)
plt.figure(figsize=(20,6))
a = plt.contour(U, V, W, levels=[0], linewidths=2, colors="black")
b = plt.scatter(X.loc[y_pred == 1].iloc[:, 0], X.loc[y_pred == 1].iloc[:, 1], c="yellowgreen", edgecolors='k')
c = plt.scatter(X.loc[y_pred == -1].iloc[:, 0], X.loc[y_pred == -1].iloc[:, 1], c="tomato", edgecolors='k')
plt.legend([a.collections[0], b, c], ['learned frontier', 'regular observations', 'abnormal observations'], bbox_to_anchor=(1.05, 1))
plt.axis('tight')
plt.show()
Edits
I am able to get the decision boundary points using the following code. Now the problem can be solved by computing the distance numerically.
for item in a.collections:
    for i in item.get_paths():
        v = i.vertices
        x = v[:, 0]
        y = v[:, 1]
I have an obvious solution: take every data point d and compute the Euclidean distance between d and each boundary point e = (x, y), keeping the minimum. But it is a brute-force technique. I will continue my research!
Another solution would be to fit an ellipse and compute the distance using the formula described by @epiliam here: https://math.stackexchange.com/questions/3670465/calculate-distance-from-point-to-ellipse-edge
I will provide one solution tomorrow based on the brute-force approach. It seems to work well for small datasets (n_rows < 10000). I did not test it on larger ones.
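A rough sketch of that brute-force idea (my own illustration, assuming the contour object a and the DataFrame X from the code above; boundary_pts and dist_to_boundary are names introduced here):
import numpy as np
# collect all vertices of the learned frontier (the level-0 contour)
boundary_pts = np.vstack([p.vertices for item in a.collections
                          for p in item.get_paths()])
# distance from every sample to every boundary vertex, then keep the minimum
diffs = X.to_numpy()[:, None, :] - boundary_pts[None, :, :]
dist_to_boundary = np.sqrt((diffs ** 2).sum(axis=-1)).min(axis=1)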

Using the linear_model perceptron module from sklearn to separate points

I am trying to use this sklearn module for a binary classification problem, and my data is clearly linearly separable.
What I don't understand is why the green area of my plot does not include the five red circles.
I have tried varying the number-of-iterations parameter (max_iter) from 100 to 10000, but it does not make any difference.
Here is my code:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Perceptron
def learn_and_display_Perceptron(datafile):
    # load the data from the given file
    data = np.loadtxt(datafile)
    n, d = data.shape
    x = data[:, 0:2]
    y = data[:, 2]
    clf = Perceptron(max_iter=10000)
    clf.fit(x, y)
    sv = np.zeros(n, dtype=bool)    # all-False array
    notsv = np.logical_not(sv)      # all-True array
    # Determine the x1- and x2- limits of the plot
    x1min = min(x[:, 0]) - 1
    x1max = max(x[:, 0]) + 1
    x2min = min(x[:, 1]) - 1
    x2max = max(x[:, 1]) + 1
    plt.xlim(x1min, x1max)
    plt.ylim(x2min, x2max)
    # Plot the data points, enlarging those that are support vectors
    plt.plot(x[(y==1)*notsv, 0], x[(y==1)*notsv, 1], 'ro')
    plt.plot(x[(y==1)*sv, 0], x[(y==1)*sv, 1], 'ro', markersize=10)
    plt.plot(x[(y==-1)*notsv, 0], x[(y==-1)*notsv, 1], 'k^')
    plt.plot(x[(y==-1)*sv, 0], x[(y==-1)*sv, 1], 'k^', markersize=10)
    # Construct a grid of points and evaluate the classifier at each grid point
    grid_spacing = 0.05
    xx1, xx2 = np.meshgrid(np.arange(x1min, x1max, grid_spacing), np.arange(x2min, x2max, grid_spacing))
    grid = np.c_[xx1.ravel(), xx2.ravel()]
    Z = clf.predict(grid)
    # Quantize the values to -1, -0.5, 0, 0.5, 1 for display purposes
    for i in range(len(Z)):
        Z[i] = min(Z[i], 1.0)
        Z[i] = max(Z[i], -1.0)
        if (Z[i] > 0.0) and (Z[i] < 1.0):
            Z[i] = 0.5
        if (Z[i] < 0.0) and (Z[i] > -1.0):
            Z[i] = -0.5
    # Show boundary and margin using a color plot
    Z = Z.reshape(xx1.shape)
    plt.pcolormesh(xx1, xx2, Z, cmap=plt.cm.PRGn, vmin=-2, vmax=2, shading='auto')
    plt.show()
My data file, data_1.txt, can be found here: https://github.com/bluetail14/MyCourserapractice/tree/main/Edx
What can I change in my code to adjust the green/purple borderline to include the five red circles?
Nice code. You need to change the eta0 value (the constant by which the updates are multiplied; it defaults to 1.0):
clf = Perceptron(max_iter=10000, eta0=0.1)

Scikit Spectral Clustering fails to classify concentric circles

Here is some code to set up the clustering problem:
import numpy as np
import matplotlib.pyplot as plt
# KMeans
# # Class=2
# Center(2.5,2.5), r1 = 2, r2 = 1
X1 = np.zeros(500*4)
X2 = np.zeros(500*4)
r1 = 2; r2 = 1; a = 2.5; b = 2.5 # generate circle
h = np.random.uniform(0, 2*np.pi, 1000)
noise = np.random.normal(0, 0.1, 1000)
X1[:1000] = np.cos(h) * r1 + a + noise
noise = np.random.normal(0, 0.1, 1000)
X2[:1000] = np.sin(h) * r1 + a + noise
h = np.random.uniform(0, 2*np.pi, 1000)
noise = np.random.normal(0, 0.1, 1000)
X1[1000:] = np.cos(h) * r2 + b + noise
noise = np.random.normal(0, 0.1, 1000)
X2[1000:] = np.sin(h) * r2 + b + noise
X = np.array([X1,X2]).T
plt.figure(figsize=(4,4))
plt.scatter(X[:,0],X[:,1])
From the resulting scatter plot, we assume that there are two clusters: all points in the inner circle should belong to one, and all points in the outer circle to the other.
With scikit-learn, we have this code using the RBF kernel:
from sklearn.cluster import SpectralClustering
clustering = SpectralClustering(n_clusters=2,assign_labels='kmeans', affinity='rbf',random_state=0).fit(X)
print(clustering.labels_)
plt.figure(figsize=(4,4))
X_C1 = np.array([X[i,:] for i in range(len(clustering.labels_)) if clustering.labels_[i] == 1])
X_C2 = np.array([X[i,:] for i in range(len(clustering.labels_)) if clustering.labels_[i] == 0])
plt.scatter(X_C1[:,0],X_C1[:,1],c="blue")
plt.scatter(X_C2[:,0],X_C2[:,1],c="red")
plt.show()
But it seems that the spectral clustering doesn't work (it performs as badly as KMeans clustering). So what is the problem here?
The default gamma=1.0 parameter is not high enough for this application.
Try gamma=6.0:
from sklearn.cluster import SpectralClustering
clustering = SpectralClustering(n_clusters=2, gamma=6.0).fit(X)
plt.scatter(X[:, 0], X[:, 1], c=clustering.labels_)
plt.show()
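For intuition (my own note, not part of the original answer): with affinity='rbf' the affinity between two points is exp(-gamma * ||x - y||^2), so a larger gamma makes the affinity decay faster with distance and keeps the graph connections local to each ring. A small sketch with sklearn's rbf_kernel, using two points roughly one ring-gap apart:
from sklearn.metrics.pairwise import rbf_kernel
import numpy as np
p_outer = np.array([[4.5, 2.5]])   # roughly on the outer circle
p_inner = np.array([[3.5, 2.5]])   # roughly on the inner circle
print(rbf_kernel(p_outer, p_inner, gamma=1.0))   # ~0.37: rings still strongly connected
print(rbf_kernel(p_outer, p_inner, gamma=6.0))   # ~0.0025: rings effectively disconnected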

Heat map for Irregularly Spaced Data with No Interpolation

I would like to plot a heatmap where the input data is not on the typical rectangular grid. Here is some sample data:
import numpy as np
xmin = 6
xmax= 12
ymin = 0
x = np.linspace(xmin, xmax, 100)
ymax = x**2
final = []
for i in range(len(ymax)):
    yrange = np.linspace(0, ymax[i], 100)
    for j in range(len(yrange)):
        intensity = np.random.rand()
        final.append([x[i], yrange[j], intensity])
data_for_plotting = np.asarray(final) # (10000, 3) shaped array
I would like to plot intensity (on the colorbar) as a function of the position (x, y), and I would like to do this without interpolation.
Here is my current solution, which uses matplotlib's griddata and linear interpolation.
import matplotlib.pyplot as plt
from matplotlib.mlab import griddata
total_length = 100
x1 = np.linspace(min(data_for_plotting[:,0]), max(data_for_plotting[:,0]), total_length)
y1 = np.linspace(min(data_for_plotting[:,1]), max(data_for_plotting[:,1]), total_length)
z1 = griddata(data_for_plotting[:,0], data_for_plotting[:,1], data_for_plotting[:,2], x1, y1, interp='linear')
p=plt.pcolormesh(x1, y1, z1, vmin = 0. , vmax=1.0, cmap='viridis')
clb = plt.colorbar(p)
plt.show()
I am looking for an alternate solution without interpolation as I would like to see the smallest unit of measurement in my x and y position (pixel size/rectangle). Based on the sample data given above I expect the height of the pixel to increase for large values of x.
I'm unsure which matplotlib.mlab.griddata you mean; it only exists in very old matplotlib versions (it has since been removed).
You could use scipy.interpolate.griddata, which needs its parameters in a slightly different format. method='nearest' switches off the interpolation (the default is method='linear').
Here is how it could look with your test data (see griddata's documentation for more explanation and examples):
import matplotlib.pyplot as plt
from scipy.interpolate import griddata
import numpy as np
xmin = 6
xmax = 12
ymin = 0
x = np.linspace(xmin, xmax, 100)
ymax = x ** 2
final = []
for i in range(len(ymax)):
    yrange = np.linspace(0, ymax[i], 100)
    for j in range(len(yrange)):
        intensity = np.random.rand()
        final.append([x[i], yrange[j], intensity])
data_for_plotting = np.asarray(final) # (10000, 3) shaped array
total_length = 100
x1 = np.linspace(min(data_for_plotting[:, 0]), max(data_for_plotting[:, 0]), total_length)
y1 = np.linspace(min(data_for_plotting[:, 1]), max(data_for_plotting[:, 1]), total_length)
grid_x, grid_y = np.meshgrid(x1, y1)
z1 = griddata(data_for_plotting[:, :2], data_for_plotting[:, 2], (grid_x, grid_y), method='nearest')
img = plt.imshow(z1, extent=[x1[0], x1[-1], y1[0], y1[-1]], origin='lower',
                 vmin=0, vmax=1, cmap='inferno', aspect='auto')
cbar = plt.colorbar(img)
plt.show()
An alternative is to create one rectangle for each of the elongated pixels. Beware that this can be a rather slow operation. If really needed, one could create a pcolormesh for each column, as in the sketch after this code.
import matplotlib.pyplot as plt
from matplotlib.cm import ScalarMappable
import numpy as np
# ... create x and data_for_plotting as before
fig, ax = plt.subplots()
cmap = plt.get_cmap('inferno')
norm = plt.Normalize(0, 1)
x_step = x[1] - x[0]
y_step = 0
for i, (xi, yi, intensity_i) in enumerate(data_for_plotting):
    if i + 1 < len(data_for_plotting) and data_for_plotting[i + 1, 0] == xi:  # when False, the last y_step is reused
        y_step = data_for_plotting[i + 1, 1] - yi
    ax.add_artist(plt.Rectangle((xi, yi), x_step, y_step, color=cmap(norm(intensity_i))))
cbar = plt.colorbar(ScalarMappable(cmap=cmap, norm=norm), ax=ax)  # pass ax so newer matplotlib knows where to place the colorbar
ax.set_xlim(x[0], x[-1])
ax.set_ylim(0, data_for_plotting[:, 1].max())
plt.tight_layout()
plt.show()
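If speed matters, here is a rough sketch of the per-column pcolormesh idea mentioned above (my own illustration, assuming x and data_for_plotting exactly as generated in the question, with 100 y-values per x column):
import matplotlib.pyplot as plt
import numpy as np
# ... create x and data_for_plotting as before
fig, ax = plt.subplots()
x_step = x[1] - x[0]
n_y = 100  # points per column in the sample data
for i in range(len(x)):
    col = data_for_plotting[i * n_y:(i + 1) * n_y]   # rows belonging to column i
    y_vals = col[:, 1]
    y_step = y_vals[1] - y_vals[0]
    # cell edges: 2 x-edges, n_y + 1 y-edges, colours shaped (n_y, 1)
    x_edges = np.array([x[i], x[i] + x_step])
    y_edges = np.append(y_vals, y_vals[-1] + y_step)
    ax.pcolormesh(x_edges, y_edges, col[:, 2].reshape(-1, 1),
                  vmin=0, vmax=1, cmap='inferno')
ax.set_xlim(x[0], x[-1] + x_step)
ax.set_ylim(0, data_for_plotting[:, 1].max())
plt.show()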

Poor GMM fit in sklearn from 2 Gaussians

I want to fit a 2-component mixture model with sklearn and then calculate back the posterior probabilities. But with the code I have so far, the fit for one of the two distributions is perfect (overfitting?) and the other one is very poor. I made a dummy example by sampling from 2 Gaussians:
import numpy as np
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
def calc_pdf():
    """
    calculate gauss mixture modelling for 2 comp
    return pdfs
    """
    d = np.random.normal(-0.1, 0.07, 5000)
    t = np.random.normal(0.2, 0.13, 10000)
    pool = np.concatenate([d, t]).reshape(-1, 1)
    label = ['d'] * d.shape[0] + ['t'] * t.shape[0]
    X = pool[pool > 0].reshape(-1, 1)
    X = np.log(X)
    clf = GaussianMixture(
        n_components=2,
        covariance_type='full',
        tol=1e-24,
        max_iter=1000
    )
    logprob = clf.fit(X).score_samples(X)
    responsibilities = clf.predict_proba(X)
    pdf = np.exp(logprob)
    pdf_individual = responsibilities * pdf[:, np.newaxis]
    plot_gauss(np.log(d), np.log(t), pdf_individual, X)
    return pdf_individual[0], pdf_individual[1]
def plot_gauss(d, t, pdf_individual, x):
    fig, ax = plt.subplots(figsize=(12, 9), facecolor='white')
    ax.hist(d, 30, density=True, histtype='stepfilled', alpha=0.4)
    ax.hist(t, 30, density=True, histtype='stepfilled', alpha=0.4)
    ax.plot(x, pdf_individual, '.')
    ax.set_xlabel('$x$')
    ax.set_ylabel('$p(x)$')
    plt.show()
calc_pdf()
which produces the plot shown here.
Is there something obvious that I am missing?
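One thing that may explain the poor fit of one component (my own observation, not a confirmed answer): X = pool[pool > 0] throws away all negative samples before the mixture is fitted, which discards the vast majority of the d component (mean -0.1) while keeping most of t, and plot_gauss then compares the fit against histograms of the full np.log(d) and np.log(t) (where the logs of negative values become NaN). A quick diagnostic sketch of how asymmetric that filter is:
import numpy as np
d = np.random.normal(-0.1, 0.07, 5000)
t = np.random.normal(0.2, 0.13, 10000)
# fraction of each sample that survives the pool > 0 filter
print((d > 0).mean())   # roughly 0.08 -- almost all of d is discarded
print((t > 0).mean())   # roughly 0.94 -- most of t is kept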
