Classification of buildings as per the damage data using SVM - python-3.x

I have a university assignment: classifying several buildings (each described by 6 parameters) into damage classes (1-5). I wrote the code following SVM tutorials, but I am not sure about the output accuracy. Can you please advise how I can improve my result, and what other algorithms I could try?
'''
# Support Vector Machine (SVM)
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('Ehsan Duzce.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 7].values
# Taking care of missing data
from sklearn.impute import SimpleImputer
# creating object for SimpleImputer class as "imputer"
imputer = SimpleImputer(missing_values = np.nan, strategy = "mean", verbose=0)
imputer = imputer.fit(X[:, 1:7]) #upper bound is not included, but lower bound
X[:, 1:7] = imputer.transform(X[:, 1:7])
# Avoiding the dummy Variable Trap
X = X[:, 1:] #To remove the first column from the dataset
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Fitting SVM to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel = 'poly', degree = 3)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
Xpred = np.array([X1.ravel(), X2.ravel()] + [np.repeat(0, X1.ravel().size) for _ in range(4)]).T
# Xpred now has a grid for x1 and x2 and the average (scaled) value 0 for x3 through x6
pred = classifier.predict(Xpred).reshape(X1.shape) # grid of predicted damage classes
plt.contourf(X1, X2, pred, alpha = 1.0, cmap = ListedColormap(('green',)))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red',))(i), label = j)
plt.title('SVM (Training set)')
plt.xlabel('Damage Scale')
plt.ylabel('Building Database')
plt.legend()
plt.show()
# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
Xpred = np.array([X1.ravel(), X2.ravel()] + [np.repeat(0, X1.ravel().size) for _ in range(4)]).T
# Xpred now has a grid for x1 and x2 and the average (scaled) value 0 for x3 through x6
pred = classifier.predict(Xpred).reshape(X1.shape) # grid of predicted damage classes
plt.contourf(X1, X2, pred, alpha = 1.0, cmap = ListedColormap(('green',)))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red',))(i), label = j)
plt.title('SVM (Test set)')
plt.xlabel('Damage Scale')
plt.ylabel('Building Database')
plt.legend()
plt.show()
'''

First and foremost, you should get acquainted with your training data. From what I've understood, you simply feed the data to the model without any real preprocessing; you shouldn't do that.
I see you are imputing missing data with the mean; maybe try removing those rows instead and compare the results, and remove outliers that may "confuse" your model.
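For example, here is a minimal sketch of both ideas (the column selection and the z-score threshold of 3 are assumptions, not something taken from your dataset), assuming the same CSV as in your code:
import numpy as np
import pandas as pd

dataset = pd.read_csv('Ehsan Duzce.csv')

# Option 1: drop rows with any missing parameter instead of imputing the mean
dataset_clean = dataset.dropna()

# Option 2: drop gross outliers, e.g. keep only rows within 3 standard deviations
numeric = dataset_clean.select_dtypes(include=[np.number])
z_scores = (numeric - numeric.mean()) / numeric.std()
dataset_clean = dataset_clean[(z_scores.abs() < 3).all(axis=1)]
Compare the model's accuracy with and without these steps before deciding which to keep.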
Also, your plots are not very friendly: you tell us the data is classified 1-5, but the plot axes show roughly [-2, 2] because of the feature scaling.
But since your question is algorithm-specific, try hyper-parameter tuning.
You can do it like this:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
grid.fit(X_train,y_train)
print(grid.best_estimator_)
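Once the search finishes you can, for instance, check how the tuned model does on your held-out test set (a short sketch reusing the X_test/y_test split from your code; since refit=True, best_estimator_ is already trained on X_train):
from sklearn.metrics import accuracy_score, classification_report

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
print(grid.best_params_)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))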
I recommend reading this article to understand SVM and tune your parameters:
https://towardsdatascience.com/svm-hyper-parameter-tuning-using-gridsearchcv-49c0bc55ce29

Related

SVR hyperparameter selection and visualisation

I am just a beginner in data analysis. I want to use the cross-validation grid search method to determine the gamma and C parameters of the Radial Basis Function (RBF) kernel SVM. I don't know where I should put my data in this code, and what data I should use (training or target data).
For SVR
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error,explained_variance_score
from TwoStageTrAdaBoostR2 import TwoStageTrAdaBoostR2 # import the two-stage algorithm
from sklearn import preprocessing
from sklearn import svm
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from matplotlib.colors import Normalize
from sklearn.svm import SVC
# Data import (source)
source= pd.read_csv(sourcedata)
# Data import (target)
data= pd.read_csv(targetdata)
# Sample Size
datatrain = data.sample(n=60, random_state=1)
datatest = data[~data.index.isin(datatrain.index)]
# Merge training set data (source and target)
train = pd.concat([source, datatrain], sort=False)
train.reset_index(inplace=True, drop=True)
datatest.reset_index(inplace=True, drop=True)
# Variable input
X_train, y_train = train[['x1', 'x2']].values, train['y'].values
X_test, y_test = datatest[['x1', 'x2']].values, datatest['y'].values
# Parameter setting
#sample_size = [n_source1+n_source2+n_source3+n_source4+n_source5, n_target_train]
n_estimators = 100
steps = 8
fold = 5
random_state = np.random.RandomState(1)
sample_size = [350, 60]
#1 twostage tradaboost.r2
regr_1 = TwoStageTrAdaBoostR2(SVR(C=50, gamma='auto'),
n_estimators = n_estimators, sample_size = sample_size,
steps = steps, fold = fold,
random_state = random_state)
regr_1.fit(X_train, y_train)
y_pred1 = regr_1.predict(X_test)
print("MSE of regular two stage trAdaboostR2--model1:",sqrt(mean_squared_error(y_test, y_pred1)))
#Plot the results
plt.figure()
plt.scatter(y_test, y_test-y_pred1, c="black", label="TwoStageTrAdaBoostR2_model1", s=10)
plt.xlabel("CAR")
plt.ylabel("Err")
plt.title("Two-stage Transfer Learning Boosted Decision Tree Regression", loc='left', fontsize=12, fontweight=0, color="orange")
plt.legend()
plt.show()
For the cross-validation grid search (best parameters):
# Cross validation grid search (best parameters)
parameter_candidates = [
{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['linear']},
{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
svr = svm.SVC()
clf = GridSearchCV(svr, param_grid=parameter_candidates, cv=5, n_jobs=-1)
clf.fit(X_train, y_train)
print('Best score for data:', clf.best_score_)
print('Best C:',clf.best_estimator_.C)
print('Best Kernel:',clf.best_estimator_.kernel)
print('Best Gamma:',clf.best_estimator_.gamma)
For visualization of parameter effects:
c_range = np.logspace(-2, 2, 4)
gamma_range = np.logspace(-2, 2, 5)
tuned_parameters = [{'kernel': ['rbf'],'C': c_range,'gamma':gamma_range},
{'kernel': ['linear'], 'C': c_range,'gamma':gamma_range}]
svr = svm.SVR()
clf = GridSearchCV(svr,param_grid=tuned_parameters,verbose=2,n_jobs=-1,
scoring='explained_variance')
clf.fit(X_train, y_train)
print('Best score for data:', clf.best_score_)
print('Best C:',clf.best_estimator_.C)
print('Best Kernel:',clf.best_estimator_.kernel)
print('Best Gamma:',clf.best_estimator_.gamma)
# scores for rbf kernel
n = len(gamma_range)*len(c_range)
scores_rbf = clf.cv_results_['mean_test_score'][:n].reshape(len(gamma_range),
len(c_range))
# scores for linear kernel
scores_linear = clf.cv_results_['mean_test_score'][n:].reshape(len(gamma_range),
len(c_range))
class MidpointNormalize(Normalize):
    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)
    def __call__(self, value, clip=None):
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y))
plt.figure(figsize=(8, 6))
plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(scores_rbf, interpolation='nearest', cmap=plt.cm.hot,
norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
plt.yticks(np.arange(len(c_range)), c_range)
plt.title('Validation accuracy')
plt.show()
When I used this code, I found the following output Heatmap plot!
But I am trying to get a Heatmap like this one
The following code with some typical regression data should work all the way through:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV,train_test_split
from matplotlib.colors import Normalize
class MidpointNormalize(Normalize):
    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)
    def __call__(self, value, clip=None):
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y))
X, y = datasets.load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X,y)
# Cross validation grid search (best parameters)
c_range = np.logspace(-0, 4, 8)
gamma_range = np.logspace(-4, 0, 8)
tuned_parameters = [{'kernel': ['rbf'],'C': c_range,'gamma':gamma_range},
{'kernel': ['linear'], 'C': c_range,'gamma':gamma_range}]
svr = svm.SVR()
clf = GridSearchCV(svr,param_grid=tuned_parameters,verbose=20,n_jobs=-4,cv=4,
scoring='explained_variance')
clf.fit(X_train, y_train)
print('Best score for data:', clf.best_score_)
print('Best C:',clf.best_estimator_.C)
print('Best Kernel:',clf.best_estimator_.kernel)
print('Best Gamma:',clf.best_estimator_.gamma)
# scores for rbf kernel
n = len(gamma_range)*len(c_range)
scores_rbf = clf.cv_results_['mean_test_score'][:n].reshape(len(gamma_range),
len(c_range))
# scores for linear kernel
scores_linear = clf.cv_results_['mean_test_score'][n:].reshape(len(gamma_range),
len(c_range))
plt.figure(figsize=(8, 6))
plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(scores_rbf, interpolation='nearest', cmap=plt.cm.hot,
norm=MidpointNormalize(vmin=-.2, midpoint=0.5))
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(gamma_range)),
[np.format_float_scientific(i,1) for i in gamma_range],rotation=45)
plt.yticks(np.arange(len(c_range)),
[np.format_float_scientific(i,) for i in c_range])
plt.title('Validation accuracy')
plt.show()
The granularity of the grid is very low, but otherwise it takes quite some time to run. Also, the limits of the grid will need to be more educated than the ones I chose.
I'm not sure why you get the error you get, but I kept things simple and instantiated the SVR once in my snippet so you can see how it works. I've also used different lengths for the C and gamma arrays; that's just to show how these parameters are carried through. Sometimes I find that if everything has the same length it is difficult to see which parameter is responsible for what.
The final plot looks like this, but the result depends heavily on the range of the grid, its granularity and the dataset you are working with. Also note that I changed the parameters of the MidpointNormalize class you provided.
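Since the snippet also computes scores_linear but never displays it, a similar imshow call (just a sketch mirroring the code above; gamma is ignored by the linear kernel, so that map should be roughly constant along the gamma axis) can visualize the second grid as well:
plt.figure(figsize=(8, 6))
plt.imshow(scores_linear, interpolation='nearest', cmap=plt.cm.hot,
           norm=MidpointNormalize(vmin=-.2, midpoint=0.5))
plt.xlabel('gamma (ignored by the linear kernel)')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(gamma_range)),
           [np.format_float_scientific(i, 1) for i in gamma_range], rotation=45)
plt.yticks(np.arange(len(c_range)),
           [np.format_float_scientific(i) for i in c_range])
plt.title('Validation accuracy (linear kernel)')
plt.show()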

Unable to run auto-sklearn on JupyterLab as kernel keeps getting killed, despite having enough memory

I'm trying to run the auto-sklearn example on the digits dataset (classification), as in the official documentation at https://automl.github.io/auto-sklearn/master/
The kernel keeps getting killed on running automl.fit(X_train, y_train).
Kernel Restarting:
The kernel for Downloads/examples_jupyter/digits-test.ipynb appears to have died. It
will restart automatically.
Here is the code:
import autosklearn.classification
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics
X, y = sklearn.datasets.load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)
automl = autosklearn.classification.AutoSklearnClassifier()
automl.fit(X_train, y_train) #Killed running this
y_hat = automl.predict(X_test)
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_hat))
I am running this code on JupyterLab and use Anaconda. I have 12 GB RAM free before running the program and none seems to be used.
$ conda -V
conda 4.8.3
$ python -V
Python 3.7.6
JupyterLab - Version 1.2.6
I couldn't pip install the autosklearn.
Can you try a different classifier? There are several here.
https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
This works fine for me.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
h = .02 # step size in the mesh
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
"Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="linear", C=0.025),
SVC(gamma=2, C=1),
DecisionTreeClassifier(max_depth=5),
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
AdaBoostClassifier(),
GaussianNB(),
LDA(),
QDA()]
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)
datasets = [make_moons(noise=0.3, random_state=0),
make_circles(noise=0.2, factor=0.5, random_state=1),
linearly_separable
]
figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds in datasets:
# preprocess dataset, split into training and test part
X, y = ds
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
# just plot the dataset first
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
# Plot the training points
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
# and testing points
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())
i += 1
# iterate over classifiers
for name, clf in zip(names, classifiers):
ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, m_max]x[y_min, y_max].
if hasattr(clf, "decision_function"):
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
else:
Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
# Put the result into a color plot
Z = Z.reshape(xx.shape)
ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
# Plot also the training points
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
# and testing points
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
alpha=0.6)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())
ax.set_title(name)
ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
size=15, horizontalalignment='right')
i += 1
figure.subplots_adjust(left=.02, right=.98)
plt.show()
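If you prefer to keep auto-sklearn itself, capping its time and memory budget sometimes prevents the worker process from being killed. This is only a sketch: the parameter names (time_left_for_this_task, per_run_time_limit, memory_limit) are an assumption and have changed between auto-sklearn releases, so check the docs of your installed version:
import autosklearn.classification

automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=300,  # total seconds for the whole search (assumed name)
    per_run_time_limit=30,        # seconds per candidate model (assumed name)
    memory_limit=3072)            # MB per job; older releases call this ml_memory_limit
automl.fit(X_train, y_train)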

Python 3.6: 'c' argument looks like a single numeric RGB or RGBA sequence

While running the code below from the Machine Learning A-Z course, I get the following warning.
# Logistic Regression
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values
X = X.astype(float)
y = y.astype(float)
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
# Fitting Logistic Regression to the Training set
#lbfgs = Limited-memory BFGS It is a popular algorithm for parameter estimation in machine learning.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0, solver='lbfgs')
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Logistic Regression (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
Full Error:
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
The issue is in the code below:
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
However, I am not able to fix it. Any help?
You're just seeing a warning, which should not be a problem. The following code runs without any error on matplotlib 3.2.1.
Check your matplotlib version:
import matplotlib
print(matplotlib.__version__)
# Logistic Regression
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')
# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values
X = X.astype(float)
y = y.astype(float)
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
# Fitting Logistic Regression to the Training set
#lbfgs = Limited-memory BFGS It is a popular algorithm for parameter estimation in machine learning.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0, solver='lbfgs')
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Logistic Regression (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
Alternatively, just don't use ListedColormap for the scatter points; from matplotlib version 3 onwards you're supposed to pass the color as a string for each scatter call.
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values
X = X.astype(float)
y = y.astype(float)
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
# Fitting Logistic Regression to the Training set
#lbfgs = Limited-memory BFGS It is a popular algorithm for parameter estimation in machine learning.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0, solver='lbfgs')
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ['red', 'green'][i], label = j)
plt.title('Logistic Regression (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
This shouldn't give any warnings.
In my case it was the lag_plot() function. I solved it like this:
lag_plot(np.log(data['numeric_column']), c = ['blue'][0])

changing xticks and yticks

After fitting the kNN-classifier with the scaled features (age and salary), I would like to plot the resulting diagram with the unscaled feature values.
kNN-plot
I think one way to do this is to change the xticks and yticks of the plot and leave everything else as it is; hopefully someone has a better idea.
Moreover, it would be great if the diagram could show the correct (age / salary) values in the bottom left corner when I move the cursor over it.
Unfortunately, I have no idea how to do that, so I am asking for help.
The dataset:
https://www.dropbox.com/sh/2mfr2kajrm7y2qq/AADFmZzYWLEjqYSLPjaQcLwka?dl=0
The code so far:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
X = dataset.iloc[:, [2, 3]].values.astype(float)
y = dataset.iloc[:,-1].values
# splitting into training and test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state = 0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
# no fit, because it is test
X_test = sc_X.transform(X_test)
# fitting kNN classification to the training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)
# Predict the Test set result
y_pred = classifier.predict(X_test)
# Visualising the Test set results
f = plt.figure()
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Logistic Regression (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
f.show()
Alright, I noticed that the answer to my question is pretty simple...
I was overcomplicating it.
This is the solution:
We just have to add the following lines:
wished_xticks = np.array([18, 22, 35])
temp_x = np.c_[ wished_xticks, [0]*len(wished_xticks) ]
transformed_x = sc_X.transform(temp_x)[:,0]
plt.xticks(transformed_x, wished_xticks)
wished_yticks = np.array([17000, 25000, 100000, 150000])
temp_y = np.c_[ [0]*len(wished_yticks), wished_yticks ]
transformed_y = sc_X.transform(temp_y)[:,1]
plt.yticks(transformed_y, wished_yticks)
So, we get our wished result:
Diagram
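For the second part of the question (showing the real age and salary under the cursor instead of the scaled values), a small sketch that overrides the axes' format_coord with the scaler's inverse_transform should work (assuming the figure f and the fitted sc_X from the code above):
ax = f.gca()
def unscaled_coords(x, y):
    # map the scaled plot coordinates back to the original units
    age, salary = sc_X.inverse_transform([[x, y]])[0]
    return 'Age=%.0f, Estimated Salary=%.0f' % (age, salary)
ax.format_coord = unscaled_coords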

Why doesn't Nearest Neighbour work on my data?

I am trying to learn a little about nearest neighbour matching. Below you see two scatter plots. The first shows the real data; I am trying to use scikit-learn's NN classifier to identify the white observations. The second scatter plot shows my result, which is entirely useless, as you can see.
I don't understand why that is the case. It seems that the white observations are closely grouped and different from the other observations. What is happening here?
Here is what I do:
# import neccessary packages
import pandas as pd
import numpy as np
import sklearn as skl
from sklearn.cross_validation import train_test_split as tts
import matplotlib.pyplot as plt
from sklearn import neighbors
from matplotlib.colors import ListedColormap
# import data and give a little overview
sample = pd.read_stata('real_data_1.dta')
s = sample
print(s.dtypes)
print(s.shape)
# Nearest Neighboor
print(__doc__)
n_neighbors = 1
X = np.array((s.t_ums_ma, s.t_matauf)).reshape(918, 2)
y = np.array(s.matauf_measure)
plt.scatter(s.t_ums_ma,s.t_matauf, c=s.matauf_measure, label='Nordan Scatter', color='b', s=25, marker="o")
plt.xlabel('crisis')
plt.ylabel('current debt')
plt.title('Interesting Graph\nCheck it out')
plt.legend()
plt.gray()
plt.show()
X_train, X_test, y_train, y_test = tts(X, y, test_size = 1)
h = 0.02
# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
for weights in ['uniform', 'distance']:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X_train[:, 0].min() - 0.01, X[:, 0].max() + 0.01
    y_min, y_max = X_train[:, 1].min() - 0.01, X[:, 1].max() + 0.01
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"
              % (n_neighbors, weights))
    plt.show()
Any help is greatly appreciated! Best /R
