Why doesn't Nearest Neighbour work on my data?

I am trying to learn a little about nearest-neighbour matching. Below you see two scatter plots. The first shows the real data. I am trying to use scikit-learn's NN classifier to identify the white observations. The second scatter plot shows my attempt, which is entirely useless, as you can see.
I don't get why that is the case. The white observations seem closely related to one another and distinct from the other observations. What is happening here?
Here is what I do:
# import necessary packages
import pandas as pd
import numpy as np
import sklearn as skl
from sklearn.model_selection import train_test_split as tts  # sklearn.cross_validation is deprecated
import matplotlib.pyplot as plt
from sklearn import neighbors
from matplotlib.colors import ListedColormap
# import data and give a little overview
sample = pd.read_stata('real_data_1.dta')
s = sample
print(s.dtypes)
print(s.shape)
# Nearest Neighbour
print(__doc__)
n_neighbors = 1
X = np.array((s.t_ums_ma, s.t_matauf)).reshape(918, 2)
y = np.array(s.matauf_measure)
plt.scatter(s.t_ums_ma,s.t_matauf, c=s.matauf_measure, label='Nordan Scatter', color='b', s=25, marker="o")
plt.xlabel('crisis')
plt.ylabel('current debt')
plt.title('Interesting Graph\nCheck it out')
plt.legend()
plt.gray()
plt.show()
X_train, X_test, y_train, y_test = tts(X, y, test_size = 1)
h = 0.02
# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
for weights in ['uniform', 'distance']:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X_train[:, 0].min() - 0.01, X[:, 0].max() + 0.01
    y_min, y_max = X_train[:, 1].min() - 0.01, X[:, 1].max() + 0.01
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"
              % (n_neighbors, weights))
plt.show()
Any help is greatly appreciated! Best /R
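One likely culprit worth checking: np.array((s.t_ums_ma, s.t_matauf)) has shape (2, 918), and reshape(918, 2) reads that buffer row by row, so each row of X pairs two consecutive values of the same feature rather than one (t_ums_ma, t_matauf) observation; X ends up scrambled relative to y. A minimal sketch of a safer construction:
import numpy as np
# stack the two series as columns so each row is one (t_ums_ma, t_matauf) pair
X = np.column_stack((s.t_ums_ma, s.t_matauf))  # shape (918, 2)
# equivalently: X = np.array((s.t_ums_ma, s.t_matauf)).T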

Related

matplotlib 3d plot issue

I am unable to get the proper plot for this: even though I set index (15, 15) to 10, I still get a plot that looks all zero. Can someone help?
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D  # only needed on older matplotlib versions
# Make data.
X = np.arange(0, 512, 1)
Y = np.arange(0, 512, 1)
X, Y = np.meshgrid(X, Y)
Z = np.zeros((512, 512), dtype=float)
Z[15,15]=10
# Plot the surface.
fig = plt.figure(figsize=(6,6))
ax = fig.add_subplot(111, projection='3d')
# Plot a 3D surface
ax.plot_surface(X, Y, Z)
plt.show()
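A likely cause, worth checking: plot_surface downsamples large grids, drawing at most rcount=ccount=50 rows and columns by default, so a single nonzero cell in a 512x512 grid is usually skipped entirely. A minimal sketch (hedged: rcount/ccount are the documented sampling parameters on newer matplotlib) that keeps the spike in the rendered mesh:
import matplotlib.pyplot as plt
import numpy as np
X, Y = np.meshgrid(np.arange(0, 512), np.arange(0, 512))
Z = np.zeros((512, 512), dtype=float)
Z[15, 15] = 10
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111, projection='3d')
# raise the per-axis sample counts from the default 50 so the lone
# nonzero cell at (15, 15) is not dropped during downsampling
ax.plot_surface(X, Y, Z, rcount=512, ccount=512)
plt.show()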

Unable to run auto-sklearn on JupyterLab as kernel keeps getting killed, despite having enough memory

I'm trying to run the auto-sklearn example on the digits dataset (classification), as in the official documentation at https://automl.github.io/auto-sklearn/master/
The kernel keeps getting killed on running automl.fit(X_train, y_train).
Kernel Restarting:
The kernel for Downloads/examples_jupyter/digits-test.ipynb appears to have died. It
will restart automatically.
Here is the code:
import autosklearn.classification
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics
X, y = sklearn.datasets.load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)
automl = autosklearn.classification.AutoSklearnClassifier()
automl.fit(X_train, y_train) #Killed running this
y_hat = automl.predict(X_test)
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_hat))
I am running this code on JupyterLab and use Anaconda. I have 12 GB RAM free before running the program and none seems to be used.
$ conda -V
conda 4.8.3
$ python -V
Python 3.7.6
JupyterLab - Version 1.2.6
I couldn't pip install auto-sklearn either.
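One hedged suggestion on the auto-sklearn side: the AutoSklearnClassifier constructor accepts resource limits (time_left_for_this_task, per_run_time_limit, and memory_limit in recent versions), and capping them sometimes avoids runs that exhaust the kernel. A sketch under that assumption:
import autosklearn.classification
# cap total search time (s), per-model time (s) and per-job memory (MB);
# parameter names follow recent auto-sklearn docs and may differ by version
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=30,
    memory_limit=3072,
)
automl.fit(X_train, y_train)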
Can you try a different classifier? There are several here.
https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
This works fine for me.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
h = .02  # step size in the mesh
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA(),
    QDA()]
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)
datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable
            ]
figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds in datasets:
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1
    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(name)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1
figure.subplots_adjust(left=.02, right=.98)
plt.show()

Classification of buildings as per the damage data using SVM

I have a university task to perform. It is about classifying several buildings (with 6 parameters) based on a damage classification (1-5). I did the coding following SVM guidance, but I am not sure about the accuracy of the output. Can you please advise how I can improve my result, and what other algorithms I could choose?
# Support Vector Machine (SVM)
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('Ehsan Duzce.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 7].values
# Taking care of missing data
from sklearn.impute import SimpleImputer
# creating object for SimpleImputer class as "imputer"
imputer = SimpleImputer(missing_values = np.nan, strategy = "mean", verbose=0)
imputer = imputer.fit(X[:, 1:7])  # upper bound is not included, but the lower bound is
X[:, 1:7] = imputer.transform(X[:, 1:7])
# Avoiding the dummy Variable Trap
X = X[:, 1:] #To remove the first column from the dataset
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Fitting SVM to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel = 'poly', degree = 3)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
Xpred = np.array([X1.ravel(), X2.ravel()] +
                 [np.repeat(0, X1.ravel().size) for _ in range(4)]).T
# Xpred now has a grid for x1 and x2 and the average (scaled) value 0 for x3 through x6
pred = classifier.predict(Xpred).reshape(X1.shape)  # grid of predicted class labels
plt.contourf(X1, X2, pred, alpha=1.0, cmap=ListedColormap(('green',)))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c=ListedColormap(('red',))(i))
plt.title('SVM (Training set)')
plt.xlabel('Damage Scale')
plt.ylabel('Building Database')
plt.legend()
plt.show()
# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
Xpred = np.array([X1.ravel(), X2.ravel()] +
                 [np.repeat(0, X1.ravel().size) for _ in range(4)]).T
# Xpred now has a grid for x1 and x2 and the average (scaled) value 0 for x3 through x6
pred = classifier.predict(Xpred).reshape(X1.shape)  # grid of predicted class labels
plt.contourf(X1, X2, pred, alpha=1.0, cmap=ListedColormap(('green',)))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c=ListedColormap(('red',))(i))
plt.title('SVM (Test set)')
plt.xlabel('Damage Scale')
plt.ylabel('Building Database')
plt.legend()
plt.show()
First and foremost, you should get acquainted with your training data. From what I understand, you simply feed the data to the model without any kind of preprocessing; you shouldn't do that.
I see you are imputing missing data with the mean. Maybe try removing those data points instead and compare the results, and remove outliers that may "confuse" your model; see the sketch below.
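A minimal sketch of that kind of cleaning, using the CSV from the question (hedged: the 3-standard-deviation cutoff is an arbitrary illustration; this uses pandas dropna plus a z-score filter on the numeric columns):
import numpy as np
import pandas as pd
dataset = pd.read_csv('Ehsan Duzce.csv')
# drop rows with missing values instead of imputing them
dataset = dataset.dropna()
# keep only rows whose numeric features lie within 3 standard deviations
numeric = dataset.select_dtypes(include=[np.number])
z = (numeric - numeric.mean()) / numeric.std()
dataset = dataset[(z.abs() < 3).all(axis=1)]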
Also, your plots are not very friendly: you tell us the data is classified 1-5, but the plots show values in [-2, 2].
But since your question is algorithm-specific, try hyper-parameter tuning.
You can do it like this:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
grid.fit(X_train,y_train)
print(grid.best_estimator_)
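Once the search has finished, the fitted grid object predicts with the refit best estimator; a short follow-up sketch of the standard scikit-learn pattern:
from sklearn.metrics import classification_report
# grid.predict uses the best estimator found (refit=True above)
grid_predictions = grid.predict(X_test)
print(grid.best_params_)
print(classification_report(y_test, grid_predictions))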
I recommend reading this article to understand SVM and tune your parameters:
https://towardsdatascience.com/svm-hyper-parameter-tuning-using-gridsearchcv-49c0bc55ce29

changing xticks and yticks

After fitting the kNN-classifier with the scaled features (age and salary), I would like to plot the resulting diagram with the unscaled feature values.
kNN-plot
I think one way to do this is to change the xticks and yticks of the plot and leave everything else as it is. Hopefully someone has a better idea.
Moreover, it would be great if the diagram showed the correct (age / salary) values in the bottom-left corner when I move the cursor over the diagram.
Unfortunately, I have no idea how to do that, so I am asking for help.
The dataset:
https://www.dropbox.com/sh/2mfr2kajrm7y2qq/AADFmZzYWLEjqYSLPjaQcLwka?dl=0
The code so far:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# `dataset` is assumed to be loaded beforehand, e.g. with pandas from the linked file
X = dataset.iloc[:, [2, 3]].values.astype(float)
y = dataset.iloc[:,-1].values
# splitting into training and test set
from sklearn.model_selection import train_test_split  # sklearn.cross_validation is deprecated
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state = 0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
# no fit, because it is test
X_test = sc_X.transform(X_test)
# fitting kNN classification to the training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)
# Predict the Test set result
y_pred = classifier.predict(X_test)
# Visualising the Test set results
f = plt.figure()
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c=ListedColormap(('red', 'green'))(i), label=j)
plt.title('k-NN (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
f.show()
Alright, I noticed that the answer to my question is pretty simple... I was overcomplicating it.
Anyway, here is the solution: we just have to add the following lines:
wished_xticks = np.array([18, 22, 35])
temp_x = np.c_[ wished_xticks, [0]*len(wished_xticks) ]
transformed_x = sc_X.transform(temp_x)[:,0]
plt.xticks(transformed_x, wished_xticks)
wished_yticks = np.array([17000, 25000, 100000, 150000])
temp_y = np.c_[ [0]*len(wished_yticks), wished_yticks ]
transformed_y = sc_X.transform(temp_y)[:,1]
plt.yticks(transformed_y, wished_yticks)
So we get the desired result:
Diagram
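For the second part of the question (showing the real age / salary under the cursor), matplotlib lets you override Axes.format_coord, so the status bar can report values mapped back through the scaler's inverse_transform. A hedged sketch along those lines:
ax = plt.gca()
def unscaled_coord(x, y):
    # map the scaled plot coordinates back to (age, salary)
    age, salary = sc_X.inverse_transform([[x, y]])[0]
    return 'age=%.0f, salary=%.0f' % (age, salary)
ax.format_coord = unscaled_coord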

How to change the ticks in a confusion matrix?

I am working with a confusion matrix (Figure A)
How can I make the ticks run from 1 to 3 instead of 0 to 2?
I tried adding a +1 in tick_marks. But it does not work (Figure B)
Check my code:
import itertools

def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Oranges):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(iris.target_names)) + 1
    plt.xticks(tick_marks, rotation=45)
    plt.yticks(tick_marks)
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

cm = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()
plot_confusion_matrix(cm)
Figure A:
Figure B
You should get the current axes from plt and change the xticklabels (if that's what you intend to do):
import itertools
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
# import some data to play with
iris = datasets.load_iris()
X = iris.data
y = iris.target
class_names = iris.target_names
# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Run classifier, using a model that is too regularized (C too low) to see
# the impact on the results
classifier = svm.SVC(kernel='linear', C=0.01)
y_pred = classifier.fit(X_train, y_train).predict(X_test)
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Oranges):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(iris.target_names))
    plt.xticks(tick_marks, rotation=45)
    ax = plt.gca()
    ax.set_xticklabels((ax.get_xticks() + 1).astype(str))
    plt.yticks(tick_marks)
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
cm = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
fig, ax = plt.subplots()
plot_confusion_matrix(cm)
plt.show()
result:
I faced a similar problem: when I wanted to use custom labels for my classes, either the squared boxes went out of bounds or the labels were offset, as you show here.
If you have many labels (>7), first explicitly set the tick frequency to one using plticker.MultipleLocator, then set the x and y tick labels without setting the ticks themselves. (It is important not to set the xticks and yticks; if you do, the imshow/matshow part gets chopped off at the top.) Add the following lines inside the plot_confusion_matrix function:
import matplotlib.ticker as plticker

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm, cmap=cmap)
fig.colorbar(cax)
# force one tick per class so labels and boxes stay aligned
loc = plticker.MultipleLocator(base=1.0)
ax.xaxis.set_major_locator(loc)
ax.yaxis.set_major_locator(loc)
# prepend an empty label because the first tick sits outside the matrix;
# list() is needed since target_names is a NumPy array
ax.set_yticklabels([''] + list(iris.target_names))
ax.set_xticklabels([''] + list(iris.target_names))
