I am trying to visualize SVM classification results using Matplotlib and scikit-learn. How can I handle the MemoryError?
For my example I have a small dataset: a table X of 100 examples and 10 features (a table of integer data). I performed classification with scikit-learn's SVM, and now I want to visualize the results. Since I have 10 features I can't visualize them directly, so I used PCA after classification to reduce the dimensionality of my data. This worked on the IRIS dataset, but on my data it crashes with a MemoryError.
# SVM classification
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.decomposition import PCA

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)
svclassifier = SVC(kernel='linear', gamma='auto', max_iter=1000, decision_function_shape='ovo')
models = svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)
# Plot functions
def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    return xx, yy

def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out
#PCA for D.R
pca = PCA(n_components=2)
pca.fit(X)
X_pca = pca.transform(X)
print("original shape: ", X.shape)
print("transformed shape:", X_pca.shape)
X=X_pca
# Plotting results
fig, sub = plt.subplots()
plt.subplots_adjust(wspace=0.4, hspace=0.4)
X0, X1 = X[:, 0].flatten(), X[:, 1].flatten()
xx, yy = make_meshgrid(X0, X1)
plot_contours(sub, models, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
sub.scatter(X0, X1, c=Y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
sub.set_xlim(xx.min(), xx.max())
sub.set_ylim(yy.min(), yy.max())
sub.set_xlabel('Sepal length')
sub.set_ylabel('Sepal width')
sub.set_xticks(())
sub.set_yticks(())
sub.set_title("TITLE")
plt.show()
original shape: (100, 10)
transformed shape: (100, 2)
MySQL connection is closed
Traceback (most recent call last):
  File "new_data.py", line 123, in <module>
    xx, yy = make_meshgrid(X0, X1)
  File "new_data.py", line 81, in make_meshgrid
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
  File "/home/.local/lib/python3.5/site-packages/numpy/lib/function_base.py", line 4211, in meshgrid
    output = [x.copy() for x in output]
  File "/home/.local/lib/python3.5/site-packages/numpy/lib/function_base.py", line 4211, in <listcomp>
    output = [x.copy() for x in output]
MemoryError
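For reference, a hedged sketch of one way to avoid this crash (my reading of the traceback, not a confirmed fix): with h=0.02, np.arange over the PCA-projected range can produce an enormous grid, and the fitted models object expects all 10 original features anyway, so it could not be evaluated on a 2-column mesh. Fitting a second SVM on the 2-D PCA projection and scaling the mesh step to the data range keeps the grid small; clf_2d and the step heuristic below are my own names and choices, not from the original script.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.decomposition import PCA

# Project to 2-D first, then fit an SVM that can be evaluated on a 2-D grid
X_pca = PCA(n_components=2).fit_transform(X)          # X, Y as in the question
clf_2d = SVC(kernel='linear', decision_function_shape='ovo').fit(X_pca, Y)

x0, x1 = X_pca[:, 0], X_pca[:, 1]
# Choose the step relative to the data spread instead of a fixed 0.02,
# so the mesh stays a few hundred points per axis regardless of scale
h = max((x0.max() - x0.min()) / 200.0, (x1.max() - x1.min()) / 200.0)
xx, yy = np.meshgrid(np.arange(x0.min() - 1, x0.max() + 1, h),
                     np.arange(x1.min() - 1, x1.max() + 1, h))
Z = clf_2d.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
plt.scatter(x0, x1, c=Y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
plt.show()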
I have a dataset with the following shapes: (2400, 2) (2400,) (1600, 2) (1600,)
My task is to perform classification of non-linearly separable data using binary logistic regression, but I get the following error in the visualization part:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-19-2754b9327868> in <module>()
4
5 # Plot different regions and color them
----> 6 output = output.reshape(x_vals.shape)
7 plt.imshow(output, interpolation='nearest',
8 extent=(x_min, x_max, y_min, y_max),
ValueError: cannot reshape array of size 2880000 into shape (1200,1200)
How can I reshape the array into a matrix?
Below is my implementation for the reference:
import numpy as np
import pandas as pd
import tensorflow as tf

num_features = 2
learning_rate = 0.0001
training_steps = 4000
batch_size = 32
display_step = 50
x_train, y_train = map(list, zip(*[(x,y) for x,y in zip(x_train, y_train) if y==0 or y==1]))
x_test, y_test = map(list, zip(*[(x,y) for x,y in zip(x_test, y_test) if y==0 or y==1]))
x_train, x_test = np.array(x_train, np.float32), np.array(x_test, np.float32)
y_train, y_test = np.array(y_train, np.int64), np.array(y_test, np.int64)
x_train, x_test = x_train.reshape([-1, num_features]), x_test.reshape([-1, num_features])
x_train, x_test = x_train/255., x_test/255.
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_data = train_data.repeat().shuffle(5000).batch(batch_size).prefetch(1)
b = tf.Variable(tf.ones((num_features, 2)) * 0.000001, name = "weight")
b0 = tf.Variable(0., name = "bias")
def logistic_regression(x, b, b0):
    return 1. / (1. + tf.exp(-tf.matmul(x, b) - b0))

def loglikelihood(p, y_true):
    return tf.reduce_sum(tf.one_hot(y_true, 2) * tf.math.log(p), axis=-1)

def accuracy(y_pred, y_true):
    correct_prediction = tf.equal(tf.argmax(y_pred, axis=-1), y_true)
    return tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
optimizer = tf.optimizers.Adam()
for step, (batch_x, batch_y) in enumerate(train_data.take(training_steps), 1):
    with tf.GradientTape() as g:
        g.watch([b, b0])
        p = logistic_regression(batch_x, b, b0)
        ll = loglikelihood(p, batch_y)
        ll_sum = tf.reduce_mean(ll)
    grad_b, grad_b0 = g.gradient(ll_sum, [b, b0])
    optimizer.apply_gradients(zip([grad_b, grad_b0], [b, b0]))
    if step % display_step == 0:
        p = logistic_regression(batch_x, b, b0)
        acc = accuracy(p, batch_y)
        p = logistic_regression(x_test, b, b0)
        val_acc = accuracy(p, y_test)
        print("step: %i, acc: %f, val_acc %f" % (step, acc, val_acc))

def predict(x_test):
    return tf.round(logistic_regression(x_test, b, b0))
import numpy as np
x_min, y_min = -12, -12
x_max, y_max = 12, 12
x_vals, y_vals = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
xy_grid = pd.DataFrame(zip(x_vals.ravel(), y_vals.ravel()), dtype=np.float32)
# Predict output labels for all the points on the grid
output = predict(xy_grid.to_numpy()).numpy()
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1)
# Plot different regions and color them
output = output.reshape(x_vals.shape)
plt.imshow(output, interpolation='nearest',
           extent=(x_min, x_max, y_min, y_max),
           cmap=plt.cm.Paired,
           aspect='auto',
           origin='lower')
pd.DataFrame(np.concatenate([x_train, np.expand_dims(y_train, axis=-1)],
                            axis=1)).plot.scatter(0, 1, c=2, colormap='viridis', ax=ax)
The expected result should be like this:
expected image
But I get the following image:
resulting image
The error occurs because x_vals.shape is too small: NumPy cannot reshape the array into it without dropping data. You need to change the shape of x_vals to match the shape of the output you expect to see.
For example, if you want a 1200x2400 image you can do:
x_min, y_min = -12, -12
x_max, y_max = 12, 12
x_vals, y_vals = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.02))
print(x_vals.shape) # shows x_vals.shape = (1200, 2400), which is 2880000 elements
output = np.arange(2880000)
output = output.reshape(x_vals.shape) # this will now work
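A side note drawn from the code in the question (an assumption about the cause, separate from the fix above): predict returns two columns per grid point, because b has shape (num_features, 2), so the 2,880,000 elements are 2 x 1200 x 1200. Collapsing the two columns to a single class index before reshaping also makes the sizes match the original grid:

import numpy as np

output = predict(xy_grid.to_numpy()).numpy()   # shape (1440000, 2): one column per class
labels = np.argmax(output, axis=-1)            # shape (1440000,): pick the class index
labels = labels.reshape(x_vals.shape)          # (1200, 1200), matches the 0.02-step grid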
I'm trying to create a Linear Regression model in Scikit-Learn, but I've encountered a problem: it says that x and y are not the same size. I am using Google's "California housing" dataset. Here's the code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
dataset = pd.read_csv('/content/sample_data/california_housing_train.csv')
x = dataset.iloc[:, :-2].values
y = dataset.iloc[:, :-1].values
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 1/3)
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
plt.scatter(x_train, y_train, color = "red")
plt.plot(x_train, lr.predict(x_train), color = "green")
plt.title("Income vs Home Value (Training set)")
plt.xlabel("Income")
plt.ylabel("Home Value")
plt.show()
plt.scatter(x_test, y_test, color = "red")
plt.plot(x_train, lr.predict(x_train), color = "green")
plt.title("Income vs Home Value (Testing set)")
plt.xlabel("Income")
plt.ylabel("Home value")
plt.show()
Error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-47-95095200e54b> in <module>()
18 y_pred = lr.predict(x_test)
19
---> 20 plt.scatter(x_train[0], y_train[:], color = "red")
21 plt.plot(x_train, lr.predict(x_train), color = "green")
22 plt.title("Income vs Home Value (Training set)")
3 frames
/usr/local/lib/python3.7/dist-packages/matplotlib/axes/_axes.py in scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, plotnonfinite, **kwargs)
4389 y = np.ma.ravel(y)
4390 if x.size != y.size:
-> 4391 raise ValueError("x and y must be the same size")
4392
4393 if s is None:
ValueError: x and y must be the same size
I have no idea why. I've tried everything from other posts. According to what I've found, it's because one of x or y is 2-D and the other is 1-D, but the suggested fixes aren't working.
Look at the dimensions of your x & y variables:
[ins] In [34]: x.shape
Out[34]: (17000, 7)
[ins] In [35]: y.shape
Out[35]: (17000, 8)
The y variable should be the target variable, the home price:
y = dataset.iloc[:,-1].values
Your x-variable definition leaves out the median_income, which is what you are trying to plot, so here is an x matrix that includes the income variable:
x = dataset.iloc[:, :-1].values
With y defined as above it is now 1-dimensional; the x matrix has 8 variables in it, the last of which (index 7) is median_income. To plot it:
plt.scatter(x_train[:,7], y_train, color = "red")
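Putting those pieces together, a minimal sketch (assuming the standard column order of california_housing_train.csv, with median_house_value as the last column); the predictions are drawn as a second scatter rather than plt.plot, since the model uses all eight features and the rows are not sorted by income:

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

dataset = pd.read_csv('/content/sample_data/california_housing_train.csv')
x = dataset.iloc[:, :-1].values          # 8 features, median_income at index 7
y = dataset.iloc[:, -1].values           # target: median_house_value, 1-D

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1/3)
lr = LinearRegression().fit(x_train, y_train)

plt.scatter(x_train[:, 7], y_train, color="red")                     # income vs. home value
plt.scatter(x_train[:, 7], lr.predict(x_train), color="green", s=5)  # fitted values
plt.title("Income vs Home Value (Training set)")
plt.xlabel("Income")
plt.ylabel("Home Value")
plt.show()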
I have two inputs X1, X2 and a corresponding label Y. I want to split the data into training and validation sets using SkLearn's train_test_split. My X1 is of shape (1920, 12) and X2 is of shape (1920, 51, 5). The code I use is:
from sklearn.model_selection import train_test_split
X1 = np.load('x_train.npy')
X2 = np.load('oneHot.npy')
y_train = np.load('y_train.npy')
X = np.array(list(zip(X1, X2))) ### To zip the two inputs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y_train,test_size=0.2)
X1_train, oneHot_train = X_train[:, 0], X_train[:, 1]
However, when I check the shapes of X1_train and oneHot_train, they are (1536,), whereas X1_train should be (1536, 12) and oneHot_train should be (1536, 51, 5). What am I doing wrong here? Insights will be appreciated.
train_test_split can take any number of array-like inputs (of the same length) to split, so you can feed x1 and x2 directly, like below:
x1 = np.random.rand(1920,12)
x2 = np.random.rand(1920,51,5)
y = np.random.choice([0,1], 1920)
x1_train, x1_test, x2_train, x2_test, y_train, y_test = train_test_split(\
x1, x2, y ,test_size=0.2)
x1_train.shape, x1_test.shape
# ((1536, 12), (384, 12))
x2_train.shape, x2_test.shape
# ((1536, 51, 5), (384, 51, 5))
y_train.shape, y_test.shape
# ((1536,), (384,))
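For completeness, a small sketch of why the zip-and-stack approach gave shape (1536,) in the first place (my reading of the question's code): wrapping pairs of differently shaped arrays in np.array() produces a 2-column object array, so slicing a column yields a 1-D array of array objects rather than a stacked (1536, 12) block. Recent NumPy versions also require dtype=object to be passed explicitly here.

import numpy as np

x1 = np.random.rand(1920, 12)
x2 = np.random.rand(1920, 51, 5)
X = np.array(list(zip(x1, x2)), dtype=object)   # object array, not a numeric block
print(X.shape)        # (1920, 2)
print(X[:, 0].shape)  # (1920,) -- each element is itself a (12,) array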
I have a university task regarding the classification of several buildings (with 6 parameters) based on a damage classification (1-5). I did the coding following SVM guidance, but I am not sure about the output accuracy. Can you please advise how I can improve my result, and what other algorithms I could choose?
# Support Vector Machine (SVM)
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('Ehsan Duzce.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 7].values
# Taking care of missing data
from sklearn.impute import SimpleImputer
# creating object for SimpleImputer class as "imputer"
imputer = SimpleImputer(missing_values = np.nan, strategy = "mean", verbose=0)
imputer = imputer.fit(X[:, 1:7]) #upper bound is not included, but lower bound
X[:, 1:7] = imputer.transform(X[:, 1:7])
# Avoiding the dummy Variable Trap
X = X[:, 1:] #To remove the first column from the dataset
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Fitting SVM to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel = 'poly', degree = 3)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
Xpred = np.array([X1.ravel(), X2.ravel()] +
                 [np.repeat(0, X1.ravel().size) for _ in range(4)]).T
# Xpred now has a grid for x1 and x2 and average value (0) for x3 through x6
pred = classifier.predict(Xpred).reshape(X1.shape)  # a matrix of predicted class labels
plt.contourf(X1, X2, pred, alpha = 1.0, cmap = ListedColormap(('green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red'))(i))
plt.title('SVM (Training set)')
plt.xlabel('Damage Scale')
plt.ylabel('Building Database')
plt.legend()
plt.show()
# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
Xpred = np.array([X1.ravel(), X2.ravel()] +
                 [np.repeat(0, X1.ravel().size) for _ in range(4)]).T
# Xpred now has a grid for x1 and x2 and average value (0) for x3 through x6
pred = classifier.predict(Xpred).reshape(X1.shape)  # a matrix of predicted class labels
plt.contourf(X1, X2, pred, alpha = 1.0, cmap = ListedColormap(('green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red'))(i))
plt.title('SVM (Test set)')
plt.xlabel('Damage Scale')
plt.ylabel('Building Database')
plt.legend()
plt.show()
First and foremost, you should get acquainted with your training data. From what I understand, you simply feed the data to the model without any preprocessing; you shouldn't do that.
I see you are imputing missing data with the mean; maybe try removing those data points and see the results, and remove outliers that may "confuse" your model.
Also, your plots are not very readable: you tell us the data is classified 1-5, but the plots show values in [-2, 2].
But since your question is algorithm-specific, try hyper-parameter tuning.
You can do it like this:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
grid.fit(X_train,y_train)
print(grid.best_estimator_)
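Once the search finishes, a quick way to judge accuracy per damage class on the held-out test set (a sketch: classification_report and accuracy_score are standard scikit-learn, and the variable names follow the question's code):

from sklearn.metrics import accuracy_score, classification_report

y_pred = grid.predict(X_test)   # uses the refitted best estimator
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))   # per-class precision/recall for the damage scale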
I recommend reading this article to understand SVM and tune your parameters:
https://towardsdatascience.com/svm-hyper-parameter-tuning-using-gridsearchcv-49c0bc55ce29
I am working on the Indian Spontaneous Expression dataset, which has 428 images, each of shape (1080, 1920, 3). There are 4 classification classes, so the label array has shape (428, 4). While splitting into training, validation and testing data using train_test_split:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)
I am getting the mentioned error. I tried reshaping the data, but I couldn't succeed.
import cv2 as cv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
data=pd.read_excel('/content/drive/My Drive/ISED/details1.xlsx')
count=0
path = data['img_path']
for path in data['img_path']:
    count = count + 1
    temp1 = path.replace("'", "")
    imgpath = "/content/drive/My Drive/ISED/" + temp1
    imgFile = cv.imread(imgpath)
    X = np.asarray(imgFile)
print(X.shape)
print(count)
y = pd.get_dummies(data['emotion']).as_matrix()
# # #storing them using numpy
np.save('fdataXISED', X)
np.save('flabelsISED', y)
# #
print("Preprocessing Done")
print("Number of Features: "+str(len(X[0])))
print("Number of Labels: "+ str(len(y[0])))
print("Number of examples in dataset:"+str(len(X)))
print("X,y stored in fdataXISED.npy and flabelsISED.npy respectively")
num_features = 1920
num_labels = 4
batch_size = 64
epochs = 100
width, height = 1080, 1920
x = np.load('./fdataXISED.npy')
y = np.load('./flabelsISED.npy')
print(x.dtype)
x = x.astype(float)
x -= np.mean(x, axis=0)
x /= np.std(x, axis=0)
print(x.shape," ", y.shape)
#splitting into training, validation and testing data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1,
random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
test_size=0.1, random_state=
I expect proper data split for training.
The problem is here: X = np.asarray(imgFile) inside for path in data['img_path']:. X is overwritten on every iteration, so it ends up holding only the last image. Please change it like this:
X = []
for path in data['img_path']:
    count = count + 1
    temp1 = path.replace("'", "")
    imgpath = "/content/drive/My Drive/ISED/" + temp1
    imgFile = cv.imread(imgpath)
    imgFile = np.asarray(imgFile)
    X.append(imgFile)
X = np.asarray(X)
print(X.shape)
print(count)
At the end, your X will have shape (428, 1080, 1920, 3) and y should have shape (428, 4). The error occurs because X and y had different numbers of samples.
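One practical caveat to add (my own note, not part of the fix): 428 images of shape (1080, 1920, 3) take roughly 21 GB once cast to float64, which rarely fits in memory. Resizing while loading keeps the array manageable; the 192x108 target size below is an arbitrary example, not something from the question.

import cv2 as cv
import numpy as np

width, height = 192, 108               # assumed target size; pick what your model needs
X = []
for path in data['img_path']:
    imgpath = "/content/drive/My Drive/ISED/" + path.replace("'", "")
    img = cv.imread(imgpath)
    X.append(cv.resize(img, (width, height)))
X = np.asarray(X, dtype=np.float32)    # (428, 108, 192, 3), roughly 100 MB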