How to compute the distance of data points to decision boundary when using the EllipticEnvelope of sklearn? - python-3.x

How can I compute the Euclidean distance of each data point to the decision boundary of the EllipticEnvelope? Here is my code:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.model_selection import train_test_split
feature, output = "temperature", "consumption"
data = pd.DataFrame(np.random.normal(0,15, size=(2355,2)), columns=[feature, output])
X = data[[feature, output]]
X_train, X_test = train_test_split(X, shuffle=True, train_size=0.8)
model = EllipticEnvelope(contamination=0.18)
model.fit(X_train)
# extract the model predictions
y_pred = pd.Series(model.predict(X), index=X.index, name="anomaly")
# define the meshgrid : X = (u,v).T
u_min, u_max = X_train.iloc[:, 0].min() - 1.5, X_train.iloc[:, 0].max() + 1.5
v_min, v_max = X_train.iloc[:, 1].min() - 1.5, X_train.iloc[:, 1].max() + 1.5
n_points = 500
u = np.linspace(u_min, u_max, n_points)
v = np.linspace(v_min, v_max, n_points)
U, V = np.meshgrid(u, v)
# evaluate the decision function on the meshgrid
W = model.decision_function(np.c_[U.ravel(), V.ravel()])
W = W.reshape(U.shape)
plt.figure(figsize=(20,6))
a = plt.contour(U, V, W, levels=[0], linewidths=2, colors="black")
b = plt.scatter(X.loc[y_pred == 1].iloc[:, 0], X.loc[y_pred == 1].iloc[:, 1], c="yellowgreen", edgecolors='k')
c = plt.scatter(X.loc[y_pred == -1].iloc[:, 0], X.loc[y_pred == -1].iloc[:, 1], c="tomato", edgecolors='k')
plt.legend([a.collections[0], b, c], ['learned frontier', 'regular observations', 'abnormal observations'], bbox_to_anchor=(1.05, 1))
plt.axis('tight')
plt.show()
Edits
I am able to get the decision boundary points using the following code. Now the problem can be solved by computing the distance numerically.
for item in a.collections:
    for i in item.get_paths():
        v = i.vertices
        x = v[:, 0]
        y = v[:, 1]
I have an obvious solution: take every data point d and compute the Euclidean distance between d and each boundary point e = (x, y). But that is a brute-force technique. I will continue my research!
Another solution would be to fit an ellipse and compute the distance using the formula described by @epiliam here: https://math.stackexchange.com/questions/3670465/calculate-distance-from-point-to-ellipse-edge
I will provide one solution tomorrow based on the brute-force approach. It seems to work well for small datasets (n_rows < 10000); I did not test it on larger ones.
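Here is a rough, untested sketch of the brute-force idea (it assumes the contour object a, the data X and the predictions y_pred from the code above): for every data point, take the minimum Euclidean distance to the sampled frontier vertices.
from scipy.spatial.distance import cdist

# collect the sampled frontier points into one array of shape (m, 2)
boundary = np.vstack([path.vertices for item in a.collections for path in item.get_paths()])

# pairwise Euclidean distances between the n data points and the m frontier points,
# then keep the minimum per data point
dist_to_frontier = cdist(X.to_numpy(), boundary).min(axis=1)   # shape (n,)

# optional: sign the distance with the model prediction (-1 = predicted outlier)
signed_dist = dist_to_frontier * y_pred.to_numpy()
Note that the result is only as accurate as the contour sampling (n_points above), since the frontier is approximated by a polyline.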

Related

Using the linear_model perceptron module from sklearn to separate points

I am trying to use this sklearn module for a binary classification problem and my data is clearly linearly separable.
What I don't understand is why the green area of my plot does not include the five red circles.
I have tried varying the number-of-iterations parameter (max_iter) from 100 to 10000, but it does not make any difference.
Here is my code:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Perceptron
def learn_and_display_Perceptron(datafile):
    #taking data reading this from the above functions
    data = np.loadtxt(datafile)
    n,d = data.shape
    x = data[:,0:2]
    y = data[:,2]
    clf = Perceptron(max_iter=10000)
    clf.fit(x, y)
    sv = np.zeros(n,dtype=bool) ## all False array
    notsv = np.logical_not(sv) # all True array
    # Determine the x1- and x2- limits of the plot
    x1min = min(x[:,0]) - 1
    x1max = max(x[:,0]) + 1
    x2min = min(x[:,1]) - 1
    x2max = max(x[:,1]) + 1
    plt.xlim(x1min,x1max)
    plt.ylim(x2min,x2max)
    # Plot the data points, enlarging those that are support vectors
    plt.plot(x[(y==1)*notsv,0], x[(y==1)*notsv,1], 'ro')
    plt.plot(x[(y==1)*sv,0], x[(y==1)*sv,1], 'ro', markersize=10)
    plt.plot(x[(y==-1)*notsv,0], x[(y==-1)*notsv,1], 'k^')
    plt.plot(x[(y==-1)*sv,0], x[(y==-1)*sv,1], 'k^', markersize=10)
    # Construct a grid of points and evaluate classifier at each grid point
    grid_spacing = 0.05
    xx1, xx2 = np.meshgrid(np.arange(x1min, x1max, grid_spacing), np.arange(x2min, x2max, grid_spacing))
    grid = np.c_[xx1.ravel(), xx2.ravel()]
    Z = clf.predict(grid)
    # Quantize the values to -1, -0.5, 0, 0.5, 1 for display purposes
    for i in range(len(Z)):
        Z[i] = min(Z[i],1.0)
        Z[i] = max(Z[i],-1.0)
        if (Z[i] > 0.0) and (Z[i] < 1.0):
            Z[i] = 0.5
        if (Z[i] < 0.0) and (Z[i] > -1.0):
            Z[i] = -0.5
    # Show boundary and margin using a color plot
    Z = Z.reshape(xx1.shape)
    plt.pcolormesh(xx1, xx2, Z, cmap=plt.cm.PRGn, vmin=-2, vmax=2, shading='auto')
    plt.show()
My datafile, data_1.txt, can be found here: https://github.com/bluetail14/MyCourserapractice/tree/main/Edx
What can I change in my code to adjust the green/purple borderline to include the five red circles?
Nice code. You need to change the eta0 value:
clf = Perceptron(max_iter=10000, eta0=0.1)
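For reference, a minimal sketch (assuming data_1.txt is in the working directory) comparing the suggested learning rate with sklearn's default of 1.0; only the eta0 value changes between the two fits:
import numpy as np
from sklearn.linear_model import Perceptron

data = np.loadtxt('data_1.txt')
x, y = data[:, 0:2], data[:, 2]

for eta0 in (1.0, 0.1):   # 1.0 is the sklearn default learning rate
    clf = Perceptron(max_iter=10000, eta0=eta0)
    clf.fit(x, y)
    print(f"eta0={eta0}: train accuracy={clf.score(x, y):.3f}, "
          f"w={clf.coef_.ravel()}, b={clf.intercept_}")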

Fitting data with a double Gaussian

I am attempting to fit some data with a double Gaussian profile. The data looks almost perfectly Gaussian, but try as I might, I can't get a fit better than a certain shape, regardless of the initial guesses I input. I've tried to use the two gaussian equations listed below, but neither fit quite right. Overall I'd like it to be flatter on the continuum (no 'wings') and have a smoother, closer fit to the actual shape if possible.
Due to the nature of the follow-up analysis, the fit needs to be a double Gaussian, as I require the fitting parameters, and thus I can't consider other fitting methods. The data can be found here:
https://docs.google.com/spreadsheets/d/1kMO2ogAL8ZCiDeY29kBvv5lzMfAD7dLj-5rKW8kW9Go/edit?usp=sharing
Below is an example of the code I've been using to try and fit the data, as well as the output figure.
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
from scipy.optimize import curve_fit
from lmfit import Model
with open("data.txt","r") as f:
    content=[i.strip() for i in f.readlines()]
vel=[]
I=[]
dI=[]
for i in range(8,len(content)):
    line=content[i].split()
    vel.append(float(line[0]))
    I.append(float(line[1]))
    dI.append(float(line[2]))
def gaussian(x, A, x0, sig):
    return A*np.exp(-(x-x0)**2/(2*sig**2))
def gaussian2(x, amp, cen, wid):
    return (amp/(np.sqrt(2*np.pi)*wid))*np.exp(-(x-cen)**2/(2*wid**2))
def multi_gaussian(x, *pars):
    offset = pars[-1]
    g1 = gaussian(x, pars[1], pars[0], pars[2])
    g2 = gaussian(x, pars[3], pars[0], pars[4])
    return g1 + g2 + offset
def multi_gaussian2(x, *pars):
    offset = pars[-1]
    g1 = gaussian2(x, pars[1], pars[0], pars[2])
    g2 = gaussian2(x, pars[3], pars[0], pars[4])
    return g1 + g2 + offset
offset=1
guess = [-15,-0.01,10,-0.01,10,1]
popt, pcov = curve_fit(multi_gaussian, vel, I, guess)
popt2, pcov2 = curve_fit(multi_gaussian2, vel, I, guess)
x=np.linspace(np.min(vel),np.max(vel), 2000)
plt.figure()
plt.scatter(vel,I,s=0.1,c='b')
plt.plot(x, multi_gaussian(x, *popt), 'r--', linewidth=1,label='Gaussian1')
plt.plot(x, multi_gaussian2(x, *popt2), 'g--', linewidth=1,label='Gaussian2')
plt.legend(loc='best')
plt.show()
The data in your linked spreadsheet only has 2 significant digits for velocity and intensity. That makes it basically impossible to "refine" your fit to get a better result. That said, I highly recommend using an lmfit script like this, which will include your intensity uncertainties in the fit:
import matplotlib.pyplot as plt
import numpy as np
from lmfit.models import GaussianModel, ConstantModel
data = np.loadtxt('ddata.txt', skiprows=1)
v = data[:, 0]
i = data[:, 1]
di = data[:, 2]
model = (ConstantModel(prefix='offset_') +
         GaussianModel(prefix='p1_') +
         GaussianModel(prefix='p2_'))
params = model.make_params(offset_c=1,
                           p1_amplitude=-1., p1_sigma=100, p1_center=25,
                           p2_amplitude=-1., p2_sigma=100, p2_center=-25)
init = model.eval(params, x=v)
result = model.fit(i, params, weights=1.0/(di+1.e-9), x=v)
print(result.fit_report())
plt.figure()
plt.scatter(v, i, s=0.5, label='data')
plt.plot(v, init, label='init')
plt.plot(v, result.best_fit, label='fit')
plt.legend()
plt.xlabel('velocity (mm/s)')
plt.ylabel('intensity')
plt.show()
For the data you supplied, this will print out a fit report like this:
[[Model]]
    ((Model(constant, prefix='offset_') + Model(gaussian, prefix='p1_')) + Model(gaussian, prefix='p2_'))
[[Fit Statistics]]
    # fitting method   = leastsq
    # function evals   = 128
    # data points      = 191
    # variables        = 7
    chi-square         = 654.770994
    reduced chi-square = 3.55853801
    Akaike info crit   = 249.314315
    Bayesian info crit = 272.080229
[[Variables]]
    offset_c:      1.00013943 +/- 5.1045e-05 (0.01%) (init = 1)
    p1_amplitude: -1.36807407 +/- 0.08677931 (6.34%) (init = -1)
    p1_center:     46.8019583 +/- 3.77807981 (8.07%) (init = 25)
    p1_sigma:      57.3859589 +/- 2.39823612 (4.18%) (init = 100)
    p2_amplitude: -1.16999330 +/- 0.08533205 (7.29%) (init = -1)
    p2_center:    -76.1117581 +/- 3.49975073 (4.60%) (init = -25)
    p2_sigma:      51.7080694 +/- 2.08860434 (4.04%) (init = 100)
    p1_fwhm:       135.133604 +/- 5.64741436 (4.18%) == '2.3548200*p1_sigma'
    p1_height:    -0.00951073 +/- 2.6406e-04 (2.78%) == '0.3989423*p1_amplitude/max(2.220446049250313e-16, p1_sigma)'
    p2_fwhm:       121.763196 +/- 4.91828727 (4.04%) == '2.3548200*p2_sigma'
    p2_height:    -0.00902683 +/- 3.5183e-04 (3.90%) == '0.3989423*p2_amplitude/max(2.220446049250313e-16, p2_sigma)'
[[Correlations]] (unreported correlations are < 0.100)
    C(p1_center, p2_amplitude)    = -0.967
    C(p1_amplitude, p2_center)    = 0.959
    C(p1_center, p2_center)       = 0.956
    C(p1_amplitude, p2_amplitude) = -0.946
    C(p1_amplitude, p1_center)    = 0.943
    C(p2_amplitude, p2_center)    = -0.943
and a plot of the data, the initial model, and the best fit.
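If you need the individual fitting parameters for the follow-up analysis, they can be read off the result object. A small sketch, assuming the result from the script above:
# pull best-fit values and 1-sigma uncertainties out of the lmfit result
for name, par in result.params.items():
    err = par.stderr if par.stderr is not None else float('nan')
    print(f"{name:15s} = {par.value:.6g} +/- {err:.2g}")

# individual parameters can also be accessed directly, e.g.
p1_center = result.params['p1_center'].value
p1_center_err = result.params['p1_center'].stderr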

How to plot support vectors for support vector regression?

I am trying to solve hard-margin support vector regression and plot the hyperplane and support vectors for a dataset.
As you know, the hard-margin formulation assumes that every data point satisfies -epsilon <= y_i - (w.x_i + b) <= epsilon.
I solved the problem, but when I plot the decision boundaries and support vectors I run into the issue below: all points should lie between the two decision boundaries, and the support vectors should be drawn on those boundaries. Can you help me find the problem?
Here is the full code:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn import metrics
Data = pd.read_csv("Data.txt",delimiter="\t")
X=Data['waterlevel(x)'].values
y=Data['Area(y)'].values
# Plot Data
import matplotlib.pyplot as plt
fig,ax = plt.subplots(1, 1,constrained_layout=True,figsize=(8, 4))
ax.plot(X, y,'k.')
ax.set_title('Urmia lake Area versus Level')
ax.set_xlabel('Water level (M)',fontsize=15)
ax.set_ylabel('Area (km^2)',fontsize=15)
#plt.axis([0, 25, 0, 25])
plt.grid(True)
plt.show()
# find max and min values of predictor variables (here X) to use it to specify initial values of w and b
max_feature_value=np.amax(X)
min_feature_value=np.amin(X)
w_optimum = max_feature_value*0.5
w = [w_optimum for i in range(1)] # w should be a vector with the dimension of the independent features (here: 1)
wt_b=w
b_sum=0
for i in range(X.shape[0]):
    b_sum+=y[i]-np.dot(wt_b,X[i])
b_ini=b_sum/len(X)
b_step_size_lower = 0.9
b_step_size_upper = 0.2
b_multiple = 500 # step size for b
b_range = np.arange((b_ini*b_step_size_lower), -b_ini*b_step_size_upper, b_multiple)
print(len(b_range))
# Estimate w and b using stochastic gradient descent and trial and error
l_rate=0.1
n_epoch = 250
epsilon=150 # acceptable error
length_Wvector_list=[]
for i in range (len(b_range)):
    correctly_regressed = True
    for j in range(X.shape[0]):
        print(i,j,wt_b,b_range[i])
        if (y[j]-(np.dot(wt_b,X[j])+b_range[i]) > epsilon) or (y[j]-(np.dot(wt_b,X[j])+b_range[i]) < -epsilon)==True:
            correctly_regressed = False
            wt_b = np.asarray(wt_b) - l_rate
    if correctly_regressed==True:
        length_Wvector_list.append([wt_b[0],wt_b,b_range[i]])
    if wt_b[0] < 0:
        wt_b=w
        break
norms = sorted([n for n in length_Wvector_list])
wt_b=norms[0][1]
b=norms[0][2]
# Predict using the optimized values of w and b
y_predict=[]
for i in range (X.shape[0]):
    y_hat=np.dot(wt_b,X[i])+b
    y_predict.append(y_hat)
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y, y_predict)))
print('Coefficient of determination (R2):', metrics.r2_score(y, y_predict))
# plot
fig,ax = plt.subplots(1, 1,figsize=(8, 5.2))
ax.scatter(y, y_predict, cmap='K', edgecolor='b',linewidth='0.5',alpha=1, label='testing points',marker='o', s=12)
ax.set_xlabel('Observed Area(km $^{2}$)',fontsize=14)
ax.set_ylabel('Simulated Area(km $^{2}$)',fontsize=14)
# find support vectors
positive_instances=[]
negative_instances=[]
for i in range(X.shape[0]):
    y_pre=(np.dot(wt_b,X[i]))+b
    if y[i]-y_pre<=epsilon:
        positive_instances.append([y[i]-y_pre,[X[i],y[i]]])
    elif y[i]-y_pre>=-epsilon:
        negative_instances.append([y[i]-y_pre,[X[i],y[i]]])
len(positive_instances)+len(negative_instances)
sort_positive=sorted([n for n in positive_instances])
sort_negative=sorted([n for n in negative_instances])
positive_support_vector=sort_positive[0][1]
negative_support_vector=sort_negative[-1][1]
model_support_vectors=np.stack((positive_support_vector,negative_support_vector),axis=-1)
# visualize the data-set
colors = {1:'r',-1:'b'}
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
plt.scatter(X,y,marker='o',c=y)
# plot support vectors
ax.scatter(model_support_vectors[0, :],model_support_vectors[1, :],s=200, linewidth=1,facecolors='none', edgecolors='b')
# hyperplane = x.w+b
# 0 = x.w+b
# psv = epsilon
# nsv = -epsilon
# dec = 0
def hyperplane_value(x,w,b,e):
    return (np.dot(w,x)+b+e)
datarange = (min_feature_value*1.,max_feature_value*1.)
hyp_x_min = datarange[0]
hyp_x_max = datarange[1]
# (w.x+b) = epsilon
# positive support vector hyperplane
psv1 = hyperplane_value(hyp_x_min, wt_b, b, epsilon)
psv2 = hyperplane_value(hyp_x_max, wt_b, b, epsilon)
ax.plot([hyp_x_min,hyp_x_max],[psv1,psv2], 'k')
# (w.x+b) = -epsilon
# negative support vector hyperplane
nsv1 = hyperplane_value(hyp_x_min, wt_b, b, -epsilon)
nsv2 = hyperplane_value(hyp_x_max, wt_b, b, -epsilon)
ax.plot([hyp_x_min,hyp_x_max],[nsv1,nsv2], 'k')
# (w.x+b) = 0
# positive support vector hyperplane
db1 = hyperplane_value(hyp_x_min, wt_b, b, 0)
db2 = hyperplane_value(hyp_x_max, wt_b, b, 0)
ax.plot([hyp_x_min,hyp_x_max],[db1,db2], 'y--')
#plt.axis([-5,10,-12,-1])
plt.show()
I improved the program and the problem was solved: the decision boundaries and support vectors are now drawn in the correct position.
Here is the full code:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn import metrics
Data = pd.read_csv("Data.txt",delimiter="\t")
X=Data['waterlevel(x)'].values
y=Data['Area(y)'].values
# Plot Data
import matplotlib.pyplot as plt
fig,ax = plt.subplots(1, 1,constrained_layout=True,figsize=(8, 4))
ax.plot(X, y,'k.')
ax.set_title('Urmia lake Area versus Level')
ax.set_xlabel('Water level (M)',fontsize=15)
ax.set_ylabel('Area (km^2)',fontsize=15)
#plt.axis([0, 25, 0, 25])
plt.grid(True)
plt.show()
# find max and min values of predictor variables (here X) to use it to specify initial values of w and b
max_feature_value=np.amax(X)
min_feature_value=np.amin(X)
w_optimum = max_feature_value*0.5
w = [w_optimum for i in range(1)] # w should be a vector with the dimension of the independent features (here: 1)
wt_b=w
b_sum=0
for i in range(X.shape[0]):
    b_sum+=y[i]-np.dot(wt_b,X[i])
b_ini=b_sum/len(X)
b_step_size_lower = 0.9
b_step_size_upper = 0.1
b_multiple = 500 # step size for b
b_range = np.arange((b_ini*b_step_size_lower), -b_ini*b_step_size_upper, b_multiple)
print(len(b_range))
# Estimate w and b using stochastic gradient descent and trial and error
l_rate=0.1
n_epoch = 250
epsilon=500 # acceptable error
length_Wvector_list=[]
for i in range (len(b_range)):
    print(i)
    optimized = False
    while not optimized:
        correctly_regressed = True
        for j in range(X.shape[0]):
            # every data point should satisfy the constraint yi-(np.dot(w_t,xi)+b) <= epsilon or yi-(np.dot(w_t,xi)+b) >= -epsilon
            if (y[j]-(np.dot(wt_b,X[j])+b_range[i]) > epsilon) or (y[j]-(np.dot(wt_b,X[j])+b_range[i]) < -epsilon)==True:
                correctly_regressed = False
                wt_b = np.asarray(wt_b) - l_rate
        if correctly_regressed==True:
            length_Wvector_list.append([wt_b[0],wt_b,b_range[i]]) # store w, b for minimum magnitude; the magnitude (length) of the vector w_t is called the norm
            optimized = True
        if wt_b[0] < 0:
            optimized = True
            wt_b_temp=wt_b
    wt_b=w
norms = sorted([n for n in length_Wvector_list])
wt_b=norms[0][1]
b=norms[0][2]
# Predict using the optimized values of w and b
y_predict=[]
for i in range (X.shape[0]):
    y_hat=np.dot(wt_b,X[i])+b
    y_predict.append(y_hat)
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y, y_predict)))
print('Coefficient of determination (R2):', metrics.r2_score(y, y_predict))
# plot
fig,ax = plt.subplots(1, 1,figsize=(8, 5.2))
ax.scatter(y, y_predict, cmap='K', edgecolor='b',linewidth='0.5',alpha=1, label='testing points',marker='o', s=12)
ax.set_xlabel('Observed Area(km $^{2}$)',fontsize=14)
ax.set_ylabel('Simulated Area(km $^{2}$)',fontsize=14)
ax.set_xlim([min(y)-100, max(y)+100])
ax.set_ylim([min(y)-100, max(y)+100])
# find support vectors
positive_instances=[]
negative_instances=[]
for i in range(X.shape[0]):
    y_pre=(np.dot(wt_b,X[i]))+b
    if ((y[i]-y_pre>0) and (y[i]-y_pre<=epsilon))==True:
        positive_instances.append([y[i]-y_pre,[X[i],y[i]]])
    elif ((y[i]-y_pre<0) and (y[i]-y_pre>=-epsilon))==True:
        negative_instances.append([y[i]-y_pre,[X[i],y[i]]])
len(positive_instances)+len(negative_instances)
sort_positive=sorted([n for n in positive_instances])
sort_negative=sorted([n for n in negative_instances])
positive_support_vector=sort_positive[-1][1]
negative_support_vector=sort_negative[0][1]
model_support_vectors=np.stack((positive_support_vector,negative_support_vector),axis=-1)
# visualize the data-set
colors = {1:'r',-1:'b'}
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
plt.scatter(X,y,marker='o',c=y)
# plot support vectors
ax.scatter(model_support_vectors[0, :],model_support_vectors[1, :],s=200, linewidth=1,facecolors='none', edgecolors='b')
# hyperplane = x.w+b
# 0 = x.w+b
# psv = epsilon
# nsv = -epsilon
# dec = 0
def hyperplane_value(x,w,b,e):
    return (np.dot(w,x)+b+e)
datarange = (min_feature_value*1.,max_feature_value*1.)
hyp_x_min = datarange[0]
hyp_x_max = datarange[1]
# (w.x+b) = epsilon
# positive support vector hyperplane
psv1 = hyperplane_value(hyp_x_min, wt_b, b, epsilon)
psv2 = hyperplane_value(hyp_x_max, wt_b, b, epsilon)
ax.plot([hyp_x_min,hyp_x_max],[psv1,psv2], 'k')
# (w.x+b) = -epsilon
# negative support vector hyperplane
nsv1 = hyperplane_value(hyp_x_min, wt_b, b, -epsilon)
nsv2 = hyperplane_value(hyp_x_max, wt_b, b, -epsilon)
ax.plot([hyp_x_min,hyp_x_max],[nsv1,nsv2], 'k')
# (w.x+b) = 0
# positive support vector hyperplane
db1 = hyperplane_value(hyp_x_min, wt_b, b, 0)
db2 = hyperplane_value(hyp_x_max, wt_b, b, 0)
ax.plot([hyp_x_min,hyp_x_max],[db1,db2], 'y--')
#plt.axis([-5,10,-12,-1])
plt.show()
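As a cross-check (not part of the original solution), a similar epsilon-insensitive fit can be obtained with sklearn's built-in SVR, where a hard margin is approximated by a very large C. A minimal sketch, assuming the same X, y and epsilon as above:
from sklearn.svm import SVR

svr = SVR(kernel='linear', C=1e6, epsilon=epsilon)  # large C approximates a hard margin
svr.fit(X.reshape(-1, 1), y)                        # X is 1-D here, so reshape to a column

w_svr = svr.coef_.ravel()[0]
b_svr = svr.intercept_[0]
print("w =", w_svr, ", b =", b_svr)
print("support vector indices:", svr.support_)

# the epsilon-tube can then be drawn exactly like the hyperplanes above:
# y = w_svr*x + b_svr + epsilon and y = w_svr*x + b_svr - epsilon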

Poor GMM fit in sklearn from 2 gaussian

I want to fit a 2-component mixture model with sklearn and then calculate back the posterior probability. But with the code I have so far, the fit for one of the two distributions is perfect (overfitting?) and the other one is very poor. I made a dummy example by sampling two Gaussians:
import numpy as np
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
def calc_pdf():
    """
    calculate gauss mixture modelling for 2 comp
    return pdfs
    """
    d = np.random.normal(-0.1, 0.07, 5000)
    t = np.random.normal(0.2, 0.13, 10000)
    pool = np.concatenate([d, t]).reshape(-1,1)
    label = ['d']*d.shape[0] + ['t'] * t.shape[0]
    X = pool[pool>0].reshape(-1,1)
    X = np.log(X)
    clf = GaussianMixture(
        n_components=2,
        covariance_type='full',
        tol = 1e-24,
        max_iter = 1000
    )
    logprob = clf.fit(X).score_samples(X)
    responsibilities = clf.predict_proba(X)
    pdf = np.exp(logprob)
    pdf_individual = responsibilities * pdf[:, np.newaxis]
    plot_gauss(np.log(d), np.log(t), pdf_individual, X)
    return pdf_individual[0], pdf_individual[1]
def plot_gauss(d, t, pdf_individual, x):
    fig, ax = plt.subplots(figsize=(12, 9), facecolor='white')
    ax.hist(d, 30, density=True, histtype='stepfilled', alpha=0.4)
    ax.hist(t, 30, density=True, histtype='stepfilled', alpha=0.4)
    ax.plot(x, pdf_individual, '.')
    ax.set_xlabel('$x$')
    ax.set_ylabel('$p(x)$')
    plt.show()
calc_pdf()
which produces this plot here
Is there something obvious that I am missing?
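Regarding the posterior-probability step mentioned at the top (independent of the fitting issue), a minimal sketch, assuming the fitted clf from calc_pdf is kept in scope or returned, and that new values are positive (the model above was fitted on log-transformed data):
import numpy as np

def posterior_for(clf, values):
    """Posterior responsibility of each mixture component for new raw values (hypothetical helper)."""
    x = np.log(np.asarray(values, dtype=float)).reshape(-1, 1)  # same transform as in the fit
    return clf.predict_proba(x)   # shape (n_values, 2); each row sums to 1

# usage sketch with made-up values:
# print(posterior_for(clf, [0.05, 0.1, 0.3]))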

Sklearn BIC criterion : differents optimum values of k for clustering

I want to determine the best value of k (number of clusters) for the KMeans algorithm on a dataset.
I found a resource in the sklearn documentation: Gaussian Mixture Model Selection using the BIC criterion.
I found an example of code on the site that I adapted to my dataset.
But each run of this code gives a different optimal value of k. Why?
Here is the code:
import numpy as np
import pandas as pd
import itertools
from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import mixture
print(__doc__)
# Number of samples per component
n_samples = 440
path = 'C:/Users/Lionel/Downloads'
file = 'Wholesale customers data.csv'
data = pd.read_csv(path + '/'+file)
X = np.array(data.iloc[:,2 :])
lowest_bic = np.infty
bic = []
n_components_range = range(1, 12)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a Gaussian mixture with EM
        gmm = mixture.GaussianMixture(n_components=n_components,
                                      covariance_type=cv_type)
        gmm.fit(X)
        bic.append(gmm.bic(X))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm
bic = np.array(bic)
color_iter = itertools.cycle(['navy', 'turquoise', 'cornflowerblue',
                              'darkorange'])
clf = best_gmm
print(clf)
bars = []
# Plot the BIC scores
spl = plt.subplot(2, 1, 1)
#spl = plt.plot()
for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
    xpos = np.array(n_components_range) + .2 * (i - 2)
    bars.append(plt.bar(xpos, bic[i * len(n_components_range):
                                  (i + 1) * len(n_components_range)],
                        width=.2, color=color))
plt.xticks(n_components_range)
plt.ylim([bic.min() * 1.01 - .01 * bic.max(), bic.max()])
plt.title('BIC score per model')
xpos = np.mod(bic.argmin(), len(n_components_range)) + .65 +\
    .2 * np.floor(bic.argmin() / len(n_components_range))
plt.text(xpos, bic.min() * 0.97 + .03 * bic.max(), '*', fontsize=14)
spl.set_xlabel('Number of components')
spl.legend([b[0] for b in bars], cv_types)
# Plot the winner
splot = plt.subplot(2, 1, 2)
Y_ = clf.predict(X)
for i, (mean, cov, color) in enumerate(zip(clf.means_, clf.covariances_,
                                           color_iter)):
    v, w = linalg.eigh(cov)
    if not np.any(Y_ == i):
        continue
    plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
    # Plot an ellipse to show the Gaussian component
    angle = np.arctan2(w[0][1], w[0][0])
    angle = 180. * angle / np.pi  # convert to degrees
    v = 2. * np.sqrt(2.) * np.sqrt(v)
    ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color)
    ell.set_clip_box(splot.bbox)
    ell.set_alpha(.5)
    splot.add_artist(ell)
plt.xticks(())
plt.yticks(())
plt.title('Selected GMM: full model, 2 components')
plt.subplots_adjust(hspace=.35, bottom=.02)
plt.show()
Here is the link to my dataset:
https://drive.google.com/open?id=1yMw1rMh12ml6Lh3yrL6WDLbEnLM-SmiN
Do you have an explanation for this behaviour?
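(One thing worth checking, sketched below rather than asserted: GaussianMixture is fitted with EM from a random initialisation, so repeated runs can converge to different local optima. Fixing random_state and/or increasing n_init inside the double loop usually makes the BIC values, and hence the selected k, reproducible.)
# sketch: same constructor as in the loop above, but deterministic and multi-start
gmm = mixture.GaussianMixture(n_components=n_components,
                              covariance_type=cv_type,
                              n_init=10,       # keep the best of 10 EM initialisations
                              random_state=0)  # reproducible runs
gmm.fit(X)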
