PyPlot error "X and Y must be same size", everything I've found online isn't working - python-3.x

I'm trying to create a linear regression model in scikit-learn, but I've run into a problem: it says that x and y are not the same size. I am using Google's "California housing" dataset. Here's the code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
dataset = pd.read_csv('/content/sample_data/california_housing_train.csv')
x = dataset.iloc[:, :-2].values
y = dataset.iloc[:, :-1].values
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 1/3)
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
plt.scatter(x_train, y_train, color = "red")
plt.plot(x_train, lr.predict(x_train), color = "green")
plt.title("Income vs Home Value (Training set)")
plt.xlabel("Income")
plt.ylabel("Home Value")
plt.show()
plt.scatter(x_test, y_test, color = "red")
plt.plot(x_train, lr.predict(x_train), color = "green")
plt.title("Income vs Home Value (Testing set)")
plt.xlabel("Income")
plt.ylabel("Home value")
plt.show()
Error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-47-95095200e54b> in <module>()
18 y_pred = lr.predict(x_test)
19
---> 20 plt.scatter(x_train[0], y_train[:], color = "red")
21 plt.plot(x_train, lr.predict(x_train), color = "green")
22 plt.title("Income vs Home Value (Training set)")
3 frames
/usr/local/lib/python3.7/dist-packages/matplotlib/axes/_axes.py in scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, plotnonfinite, **kwargs)
4389 y = np.ma.ravel(y)
4390 if x.size != y.size:
-> 4391 raise ValueError("x and y must be the same size")
4392
4393 if s is None:
ValueError: x and y must be the same size
I have no idea why. I've tried everything suggested in other posts. From what I've found, the error happens when one of x and y is 2-D and the other is 1-D, but those "fixes" aren't working.

Look at the dimensions of your x & y variables:
[ins] In [34]: x.shape
Out[34]: (17000, 7)
[ins] In [35]: y.shape
Out[35]: (17000, 8)
The y variable should be the target variable, the home price:
y = dataset.iloc[:,-1].values
Your x-variable definition leaves out the median_income, which is what you are trying to plot, so here is an x matrix that includes the income variable:
x = dataset.iloc[:, :-1].values
With y defined as above it is now 1-dimensional; the x matrix has 8 variables in it, the last of which (index 7) is median_income. To plot it:
plt.scatter(x_train[:,7], y_train, color = "red")
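Putting the answer together, here is a minimal corrected sketch (not from the original answer; it assumes the same Colab CSV path as in the question and plots the predictions as a second scatter rather than a line):
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

dataset = pd.read_csv('/content/sample_data/california_housing_train.csv')
x = dataset.iloc[:, :-1].values  # all 8 feature columns; median_income is column index 7
y = dataset.iloc[:, -1].values   # 1-D target: median_house_value

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1/3)

lr = LinearRegression()
lr.fit(x_train, y_train)

# x_train[:, 7] (income) and y_train now have the same length, so scatter works
plt.scatter(x_train[:, 7], y_train, color="red")
plt.scatter(x_train[:, 7], lr.predict(x_train), color="green", s=5)
plt.title("Income vs Home Value (Training set)")
plt.xlabel("Income")
plt.ylabel("Home Value")
plt.show()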

Related

KNN Python implementation

This is what shows up when I try running my code:
FutureWarning: Unlike other reduction functions (e.g. skew, kurtosis), the default behavior of mode typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of keepdims will become False, the axis over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set keepdims to True or False to avoid this warning.
lab = mode(labels)
This is my Python code; I'm having difficulty finding a suitable solution:
# Importing the required modules
import numpy as np
from scipy.stats import mode
# Euclidean Distance
def eucledian(p1, p2):
    dist = np.sqrt(np.sum((p1 - p2) ** 2))
    return dist
# Function to calculate KNN
def predict(x_train, y, x_input, k):
    op_labels = []
    # Loop through the Datapoints to be classified
    for item in x_input:
        # Array to store distances
        point_dist = []
        # Loop through each training Data
        for j in range(len(x_train)):
            distances = eucledian(np.array(x_train[j, :]), item)
            # Calculating the distance
            point_dist.append(distances)
        point_dist = np.array(point_dist)
        # Sorting the array while preserving the index
        # Keeping the first K datapoints
        dist = np.argsort(point_dist)[:k]
        # Labels of the K datapoints from above
        labels = y[dist]
        # Majority voting
        lab = mode(labels)
        lab = lab.mode[0]
        op_labels.append(lab)
    return op_labels
# Importing the required modules
# Importing required modules
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from numpy.random import randint
# Loading the Data
iris= load_iris()
# Store features matrix in X
X= iris.data
# Store target vector in
y = iris.target
# Creating the training Data
train_idx = xxx = randint(0, 150, 100)
X_train = X[train_idx]
y_train = y[train_idx]
# Creating the testing Data
test_idx = xxx = randint(0, 150, 50) # taking 50 random samples
X_test = X[test_idx]
y_test = y[test_idx]
# Applying our function
y_pred = predict(X_train, y_train, X_test, 7)
# Checking the accuracy
accuracy_score(y_test, y_pred)
I am expecting a prediction/accuracy score as the output.
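For what it's worth, the FutureWarning itself says to pass keepdims explicitly; a minimal sketch of that adjustment to the majority-voting step (assuming SciPy 1.9+, where the keepdims argument of scipy.stats.mode exists):
from scipy.stats import mode

# Passing keepdims=True keeps the reduced axis, matching the pre-SciPy-1.11
# behaviour that lab.mode[0] relies on, and silences the warning.
lab = mode(labels, keepdims=True)
lab = lab.mode[0]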
KNN can be done like this.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Assign colum names to the dataset
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']
# Read dataset to pandas dataframe
dataset = pd.read_csv(url, names=names)
dataset.head()
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Result:
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        13
Iris-versicolor       1.00      0.89      0.94         9
 Iris-virginica       0.89      1.00      0.94         8

       accuracy                           0.97        30
      macro avg       0.96      0.96      0.96        30
   weighted avg       0.97      0.97      0.97        30
error = []
# Calculating error for K values between 1 and 40
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error.append(np.mean(pred_i != y_test))
plt.figure(figsize=(12, 6))
plt.plot(range(1, 40), error, color='red', linestyle='dashed', marker='o',
         markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')
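As a small follow-up (not in the original answer), the error list above can be used to pick a value of k, for example:
import numpy as np

# k values start at 1, so the best k is the index of the smallest mean error plus one.
best_k = int(np.argmin(error)) + 1
print("Lowest mean error at k =", best_k)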

how to cast an array to a matrix?

I have a dataset with the following shapes: (2400, 2) (2400,) (1600, 2) (1600,)
My task is to perform classification of non-linearly separable data with binary logistic regression.
But I get the following error in visualization part:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-19-2754b9327868> in <module>()
4
5 # Plot different regions and color them
----> 6 output = output.reshape(x_vals.shape)
7 plt.imshow(output, interpolation='nearest',
8 extent=(x_min, x_max, y_min, y_max),
ValueError: cannot reshape array of size 2880000 into shape (1200,1200)
How can I reshape the array into a matrix?
Below is my implementation for the reference:
num_features = 2
learning_rate = 0.0001
training_steps = 4000
batch_size = 32
display_step = 50
x_train, y_train = map(list, zip(*[(x,y) for x,y in zip(x_train, y_train) if y==0 or y==1]))
x_test, y_test = map(list, zip(*[(x,y) for x,y in zip(x_test, y_test) if y==0 or y==1]))
x_train, x_test = np.array(x_train, np.float32), np.array(x_test, np.float32)
y_train, y_test = np.array(y_train, np.int64), np.array(y_test, np.int64)
x_train, x_test = x_train.reshape([-1, num_features]), x_test.reshape([-1, num_features])
x_train, x_test = x_train/255., x_test/255.
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_data = train_data.repeat().shuffle(5000).batch(batch_size).prefetch(1)
b = tf.Variable(tf.ones((num_features, 2)) * 0.000001, name = "weight")
b0 = tf.Variable(0., name = "bias")
def logistic_regression(x, b, b0):
    return 1. / (1. + tf.exp(-tf.matmul(x, b) - b0))
def loglikelihood(p, y_true):
    return tf.reduce_sum(tf.one_hot(y_true, 2) * tf.math.log(p), axis=-1)
def accuracy(y_pred, y_true):
    correct_prediction = tf.equal(tf.argmax(y_pred, axis=-1), y_true)
    return tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
optimizer = tf.optimizers.Adam()
for step, (batch_x, batch_y) in enumerate(train_data.take(training_steps), 1):
    with tf.GradientTape() as g:
        g.watch([b, b0])
        p = logistic_regression(batch_x, b, b0)
        ll = loglikelihood(p, batch_y)
        ll_sum = tf.reduce_mean(ll)
    grad_b, grad_b0 = g.gradient(ll_sum, [b, b0])
    optimizer.apply_gradients(zip([grad_b, grad_b0], [b, b0]))
    if step % display_step == 0:
        p = logistic_regression(batch_x, b, b0)
        acc = accuracy(p, batch_y)
        p = logistic_regression(x_test, b, b0)
        val_acc = accuracy(p, y_test)
        print("step: %i, acc: %f, val_acc %f" % (step, acc, val_acc))

def predict(x_test):
    return tf.round(logistic_regression(x_test, b, b0))
import numpy as np
x_min, y_min = -12, -12
x_max, y_max = 12, 12
x_vals, y_vals = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
xy_grid = pd.DataFrame(zip(x_vals.ravel(), y_vals.ravel()), dtype=np.float32)
# Predict output labels for all the points on the grid
output = predict(xy_grid.to_numpy()).numpy()
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1)
# Plot different regions and color them
output = output.reshape(x_vals.shape)
plt.imshow(output, interpolation='nearest',
extent=(x_min, x_max, y_min, y_max),
cmap=plt.cm.Paired,
aspect='auto',
origin='lower')
pd.DataFrame(np.concatenate([x_train,
np.expand_dims(y_train, axis=-1)], axis=1)).plot.scatter(0, 1, c=2, colormap='viridis', ax=ax)
The expected result should be like this:
expected image
But I get the following image:
resulting image
The error is caused by x_vals.shape being too small: numpy cannot reshape the data into that shape without dropping elements.
You need to change the shape of x_vals to match the shape of output you expect to see.
For example, if you want a 1200x2400 image you can do:
x_min, y_min = -12, -12
x_max, y_max = 12, 12
x_vals, y_vals = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.02))
print(x_vals.shape) # shows x_vals.shape = (1200, 2400), which is 2880000 elements
output = np.arange(2880000)
output = output.reshape(x_vals.shape) # this will now work
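A side observation, not part of the original answer: logistic_regression above has two output columns (b has shape (num_features, 2)), so predict returns two values per grid point, and 1,200 × 1,200 grid points × 2 columns is exactly the 2,880,000 elements in the error. If the aim is to keep the original 0.02 grid, one option is to reduce the prediction to a single value per point before reshaping; a sketch under that assumption:
# Assumes `output` has shape (1440000, 2) as returned by the question's predict().
# Keep only the class-1 column so there is one value per grid point.
output = output[:, 1].reshape(x_vals.shape)  # (1200, 1200), matching the grid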

How to fix "PermissionError: [WinError 5] Access is denied" in virtual environment and Jupyter notebook caused by n_jobs = -1

I am working within a virtual environment that was set up following https://docs.python.org/3/tutorial/venv.html
In addition I am using Jupyter Notebook.
In my code I am using sklearn.model_selection.cross_val_score(...). The n_jobs parameter seems to be the issue: with n_jobs=1 I receive no errors, while n_jobs=-1 gives me the following error:
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
'''
Traceback (most recent call last):
File "c:\users\chang\ml\lib\site-packages\sklearn\externals\joblib\externals\loky\process_executor.py", line 391, in _process_worker
call_item = call_queue.get(block=True, timeout=timeout)
File "C:\Users\chang\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\queues.py", line 99, in get
if not self._rlock.acquire(block, timeout):
PermissionError: [WinError 5] Access is denied
'''
The above exception was the direct cause of the following exception:
BrokenProcessPool Traceback (most recent call last)
<ipython-input-10-56afe11b41fd> in <module>
11 X_poly = poly.fit_transform(X)
12
---> 13 score = cross_val_score(lgr_clf, X_poly, y, cv=loo, scoring='accuracy', n_jobs=-1).mean()
14 scores.append(score)
15
c:\users\chang\ml\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
400 fit_params=fit_params,
401 pre_dispatch=pre_dispatch,
--> 402 error_score=error_score)
403 return cv_results['test_score']
404
c:\users\chang\ml\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
238 return_times=True, return_estimator=return_estimator,
239 error_score=error_score)
--> 240 for train, test in cv.split(X, y, groups))
241
242 zipped_scores = list(zip(*scores))
c:\users\chang\ml\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
928
929 with self._backend.retrieval_context():
--> 930 self.retrieve()
931 # Make sure that we get a last message telling us we are done
932 elapsed_time = time.time() - self._start_time
c:\users\chang\ml\lib\site-packages\sklearn\externals\joblib\parallel.py in retrieve(self)
831 try:
832 if getattr(self._backend, 'supports_timeout', False):
--> 833 self._output.extend(job.get(timeout=self.timeout))
834 else:
835 self._output.extend(job.get())
c:\users\chang\ml\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
519 AsyncResults.get from multiprocessing."""
520 try:
--> 521 return future.result(timeout=timeout)
522 except LokyTimeoutError:
523 raise TimeoutError()
~\AppData\Local\Programs\Python\Python37-32\lib\concurrent\futures\_base.py in result(self, timeout)
430 raise CancelledError()
431 elif self._state == FINISHED:
--> 432 return self.__get_result()
433 else:
434 raise TimeoutError()
~\AppData\Local\Programs\Python\Python37-32\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.
------------------------------------------------------------------------------
I have a second computer where the code is working, but there is no virtual environment setup.
Running the cmd as administrator does not fix my problem.
I do not have my virtual environment as an environment variable, but I do have C:\Users\chang\AppData\Local\Programs\Python\Python37-32 as an environment variable.
I suspect that I am missing a crucial step while setting up my virtual environment that leads to PermissionError: [WinError 5] Access is denied error.
#!/usr/bin/env python
# coding: utf-8
# In[14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn import preprocessing
from sklearn import neighbors
import statsmodels.api as sm
import statsmodels.formula.api as smf
get_ipython().run_line_magic('matplotlib', 'inline')
plt.style.use('seaborn-white')
# In[15]:
df = pd.read_csv('Default.csv', index_col = 0)
df.info()
# In[16]:
##ESTIMATE TEST ERROR. 3 SPLITS
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
cols = ['student', 'balance', 'income']
X=df[cols]
y=df['default']
X=X.replace("Yes",1)
X=X.replace("No",0)
y=y.replace("Yes",1)
y=y.replace("No",0)
t_prop = 0.5
poly_order = np.arange(1,4) #degrees
r_state = np.arange(3) #number of splits
Z = np.zeros((poly_order.size,r_state.size))
X1, Y1 = np.meshgrid(poly_order, r_state, indexing='ij')
for (i,j),v in np.ndenumerate(Z):
    poly = PolynomialFeatures(int(X1[i,j]))
    X_poly = poly.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.3,)# random_state=42)
    y_train_default = (y_train == 1)
    y_test_default = (y_test == 1)
    lgr_clf = LogisticRegression(solver = "lbfgs")
    lgr_clf.fit(X_train, y_train_default)
    y_train_pred = lgr_clf.predict(X_train)
    y_test_pred = lgr_clf.predict(X_test)
    Z[i,j]= metrics.accuracy_score(y_test, y_test_pred)
plt.plot(X1,Z)
plt.title('{} random splits of the data set'.format(max(r_state)+1))
plt.ylabel('Accuracy Score')
plt.ylim(.94,1)
plt.xlabel('Degree of Polynomial')
plt.xlim(1,3)
# In[17]:
##LOOCV
loo = LeaveOneOut()
loo.get_n_splits(df)
scores = list()
X = X[:2500]
y = y[:2500]
for i in poly_order:
    poly = PolynomialFeatures(i)
    X_poly = poly.fit_transform(X)
    score = cross_val_score(lgr_clf, X_poly, y, cv=loo, scoring='accuracy', n_jobs = -1).mean()
    scores.append(score)
# k-fold CV
folds = 3
elements = len(df.index)
X1, Y1 = np.meshgrid(poly_order, r_state, indexing='ij')
Z3 = np.zeros((poly_order.size,r_state.size))
for (i,j),v in np.ndenumerate(Z3):
    poly = PolynomialFeatures(X1[i,j])
    X_poly = poly.fit_transform(X)
    kf_10 = KFold(n_splits=folds, random_state=Y1[i,j])
    Z3[i,j] = cross_val_score(lgr_clf, X_poly, y, cv=kf_10, scoring='accuracy').mean()
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(10,4))
# Note: cross_val_score() method return negative values for the scores.
# https://github.com/scikit-learn/scikit-learn/issues/2439
# Left plot
ax1.plot(poly_order, np.array(scores), '-o')
ax1.set_title('LOOCV')
# Right plot
ax2.plot(X1,Z3,'-o')
ax2.set_title('3-fold CV')
for ax in fig.axes:
    ax.set_ylabel('Mean Squared Error')
    ax.set_xlabel('Degree of Polynomial')
    ax.set_ylim(0.9,1)
    ax.set_xlim(0.5,3.5)
    #ax.set_xticks(range(1,5,2));
# In[18]:
##ESTIMATE TEST ERROR. 4 SPLITS
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
cols = ['student', 'balance', 'income']
X=df[cols]
y=df['default']
X=X.replace("Yes",1)
X=X.replace("No",0)
y=y.replace("Yes",1)
y=y.replace("No",0)
t_prop = 0.5
poly_order = np.arange(1,4) #degrees
r_state = np.arange(4) #number of splits
Z = np.zeros((poly_order.size,r_state.size))
X1, Y1 = np.meshgrid(poly_order, r_state, indexing='ij')
for (i,j),v in np.ndenumerate(Z):
    poly = PolynomialFeatures(int(X1[i,j]))
    X_poly = poly.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.3,)# random_state=42)
    y_train_default = (y_train == 1)
    y_test_default = (y_test == 1)
    lgr_clf = LogisticRegression(solver = "lbfgs")
    lgr_clf.fit(X_train, y_train_default)
    y_train_pred = lgr_clf.predict(X_train)
    y_test_pred = lgr_clf.predict(X_test)
    Z[i,j]= metrics.accuracy_score(y_test, y_test_pred)
plt.plot(X1,Z)
plt.title('{} random splits of the data set'.format(max(r_state)+1))
plt.ylabel('Accuracy Score')
plt.ylim(.94,1)
plt.xlabel('Degree of Polynomial')
plt.xlim(1,3)
# In[19]:
##LOOCV
loo = LeaveOneOut()
loo.get_n_splits(df)
scores = list()
X = X[:2500]
y = y[:2500]
for i in poly_order:
    poly = PolynomialFeatures(i)
    X_poly = poly.fit_transform(X)
    score = cross_val_score(lgr_clf, X_poly, y, cv=loo, scoring='accuracy', n_jobs = -1).mean()
    scores.append(score)
# k-fold CV
folds = 4
elements = len(df.index)
X1, Y1 = np.meshgrid(poly_order, r_state, indexing='ij')
Z4 = np.zeros((poly_order.size,r_state.size))
for (i,j),v in np.ndenumerate(Z4):
    poly = PolynomialFeatures(X1[i,j])
    X_poly = poly.fit_transform(X)
    kf_10 = KFold(n_splits=folds, random_state=Y1[i,j])
    Z4[i,j] = cross_val_score(lgr_clf, X_poly, y, cv=kf_10, scoring='accuracy').mean()
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(10,4))
# Note: cross_val_score() method return negative values for the scores.
# https://github.com/scikit-learn/scikit-learn/issues/2439
# Left plot
ax1.plot(poly_order, np.array(scores), '-o')
ax1.set_title('LOOCV')
# Right plot
ax2.plot(X1,Z4,'-o')
ax2.set_title('4-fold CV')
for ax in fig.axes:
    ax.set_ylabel('Mean Squared Error')
    ax.set_xlabel('Degree of Polynomial')
    ax.set_ylim(0.9,1)
    ax.set_xlim(0.5,3.5)
    #ax.set_xticks(range(1,5,2));
# In[21]:
##ESTIMATE TEST ERROR. 5 SPLITS
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
cols = ['student', 'balance', 'income']
X=df[cols]
y=df['default']
X=X.replace("Yes",1)
X=X.replace("No",0)
y=y.replace("Yes",1)
y=y.replace("No",0)
t_prop = 0.5
poly_order = np.arange(1,4) #degrees
r_state = np.arange(5) #number of splits
Z = np.zeros((poly_order.size,r_state.size))
X1, Y1 = np.meshgrid(poly_order, r_state, indexing='ij')
for (i,j),v in np.ndenumerate(Z):
    poly = PolynomialFeatures(int(X1[i,j]))
    X_poly = poly.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.3,)# random_state=42)
    y_train_default = (y_train == 1)
    y_test_default = (y_test == 1)
    lgr_clf = LogisticRegression(solver = "lbfgs")
    lgr_clf.fit(X_train, y_train_default)
    y_train_pred = lgr_clf.predict(X_train)
    y_test_pred = lgr_clf.predict(X_test)
    Z[i,j]= metrics.accuracy_score(y_test, y_test_pred)
plt.plot(X1,Z)
plt.title('{} random splits of the data set'.format(max(r_state)+1))
plt.ylabel('Accuracy Score')
plt.ylim(.94,1)
plt.xlabel('Degree of Polynomial')
plt.xlim(1,3)
# In[22]:
##LOOCV
loo = LeaveOneOut()
loo.get_n_splits(df)
scores = list()
X = X[:2500]
y = y[:2500]
for i in poly_order:
    poly = PolynomialFeatures(i)
    X_poly = poly.fit_transform(X)
    score = cross_val_score(lgr_clf, X_poly, y, cv=loo, scoring='accuracy', n_jobs = -1).mean()
    scores.append(score)
# k-fold CV
folds = 5
elements = len(df.index)
X1, Y1 = np.meshgrid(poly_order, r_state, indexing='ij')
Z5 = np.zeros((poly_order.size,r_state.size))
for (i,j),v in np.ndenumerate(Z5):
    poly = PolynomialFeatures(X1[i,j])
    X_poly = poly.fit_transform(X)
    kf_10 = KFold(n_splits=folds, random_state=Y1[i,j])
    Z5[i,j] = cross_val_score(lgr_clf, X_poly, y, cv=kf_10, scoring='accuracy').mean()
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(10,4))
# Note: cross_val_score() method return negative values for the scores.
# https://github.com/scikit-learn/scikit-learn/issues/2439
# Left plot
ax1.plot(poly_order, np.array(scores), '-o')
ax1.set_title('LOOCV')
# Right plot
ax2.plot(X1,Z5,'-o')
ax2.set_title('5-fold CV')
for ax in fig.axes:
    ax.set_ylabel('Mean Squared Error')
    ax.set_xlabel('Degree of Polynomial')
    ax.set_ylim(0.9,1)
    ax.set_xlim(0.5,3.5)
    #ax.set_xticks(range(1,5,2));
# In[23]:
#Analysis
#When Comparing the LOOCV to the random split, it can be seen that the
#LOOCV is closest to a linear model with polynomial degree one.
#This is also a true statement when compared to the K-fold CV.
#In addition the number of folds does not cause a huge deviation
#compared to LOOCV. This proves the statement in class that having
#a large or small number of folds does not necessarily make the model better
# In[ ]:
Additional Information/Updates:
2/3/2020
If anybody comes across this, here is a more active thread: Github Thread. Someone there has mentioned a new possible fix, but I'm not yet sure whether it solves the problem described here. It is related to how the data is read in, but I doubt that this is the solution.
Small update: I have yet to revisit this problem, but I recently encountered a similar runtime error in a different program (I'm unsure whether it was exactly the same error, or whether "runtime error" is even the right term here). I realized my Python install was 32-bit for some unknown reason, and upgrading to 64-bit fixed that problem. I have yet to try this on the old code posted here, and I still need to check whether Python was 32-bit on my other machine.
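Not part of the original post: given the observations above (n_jobs=1 works and a 32-bit Python was involved), a defensive sketch that only requests parallelism on a 64-bit interpreter; variable names are taken from the code above:
import platform
from sklearn.model_selection import cross_val_score

# Fall back to a single process on 32-bit Python, which avoided the
# PermissionError described above; use all cores otherwise.
n_jobs = -1 if platform.architecture()[0] == '64bit' else 1
score = cross_val_score(lgr_clf, X_poly, y, cv=loo,
                        scoring='accuracy', n_jobs=n_jobs).mean()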

Error when fitting linear binary classifier with tensorflow ValueError: No gradients provided for any variable, check your graph

I get an error when trying to fit a linear binary classifier using a step function and MSE instead of softmax and cross-entropy loss. I can't get past the error, probably due to shape inconsistencies. I provide a code sample below. Please help.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification as gen_data
from sklearn.model_selection import train_test_split
rng = np.random
# Setting hyperparameters
n_observations = 100
lr = 0.005
n_iter = 100
# Generate input data
xs, ys = gen_data(n_features=2, n_redundant=0, n_informative=2,
random_state=0, n_clusters_per_class=1)
# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(xs, ys, test_size=.4)
X_train = np.float32(X_train)
X_test = np.float32(X_test)
# Graph
X = tf.placeholder(tf.float32)
Y = tf.placeholder(tf.float32)
W = tf.Variable(np.float32(rng.randn(2)), name="weight")
b = tf.Variable(np.float32(rng.randn()), name="bias")
def step(x):
    is_greater = tf.greater(x, 0)
    as_float = tf.to_float(is_greater)
    doubled = tf.multiply(as_float, 2)
    return tf.subtract(doubled, 1)
Y_pred = step(tf.add(tf.multiply(X , W), b))
cost = tf.reduce_mean(tf.squared_difference(Y_pred, Y))
# Using built-in optimization algorithm to train the model:
train_step = tf.train.GradientDescentOptimizer(0.005).minimize(cost)
sess = tf.Session()
sess.run(tf.initialize_all_variables())
for step in range(n_iter):
    sess.run(train_step, feed_dict={X:X_train, Y:y_train})
    print ("iter: {0}; weight: {1}; bias: {2}".format(step,
                                                      sess.run(W),
                                                      sess.run(b)))
This is the error:
ValueErrorTraceback (most recent call last)
<ipython-input-17-5a0c4711802c> in <module>()
26
27 # Using built-in optimization algorithm to train the model:
---> 28 train_step = tf.train.GradientDescentOptimizer(0.005).minimize(cost)
29
30 # Using TF differentiation from scratch to implement a step-by-step optimizer
/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.pyc in minimize(self, loss, global_step, var_list, gate_gradients, aggregation_method, colocate_gradients_with_ops, name, grad_loss)
405 "No gradients provided for any variable, check your graph for ops"
406 " that do not support gradients, between variables %s and loss %s." %
--> 407 ([str(v) for _, v in grads_and_vars], loss))
408
409 return self.apply_gradients(grads_and_vars, global_step=global_step,
ValueError: No gradients provided for any variable, check your graph for ops that do not support gradients, between variables ["<tf.Variable 'weight:0' shape=(2,) dtype=float64_ref>", "<tf.Variable 'bias:0' shape=() dtype=float32_ref>", "<tf.Variable 'weight_1:0' shape=(2,) dtype=float64_ref>", "<tf.Variable 'bias_1:0' shape=() dtype=float32_ref>",
Your training data isn't changing between training steps. That is, each training step feeds the same values for X and Y:
for step in range(n_iter):
    sess.run(train_step, feed_dict={X:X_train, Y:y_train})
If you set different values for X and Y between training steps, the error should go away.
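Not part of the original answer, but to illustrate feeding different values on each step, a minimal mini-batch sketch (assuming X_train and y_train from the question; the batch size is arbitrary):
import numpy as np

batch_size = 16
for step in range(n_iter):
    # Draw a fresh random mini-batch each step so the fed values change.
    idx = np.random.choice(len(X_train), batch_size, replace=False)
    sess.run(train_step, feed_dict={X: X_train[idx], Y: y_train[idx]})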

Why doesn't Nearest Neighbour work on my data?

I am trying to learn a little about nearest neighbour matching. Below you see two scatter plots. The first shows the real data. I am trying to use scikit-learn's NN-classifier to identify the white observations. The second scatter plot shows my result, which is entirely useless, as you can see.
I don't understand why that is the case. It seems that the white observations are closely related to each other and distinct from the other observations. What is happening here?
Here is what I do:
# import neccessary packages
import pandas as pd
import numpy as np
import sklearn as skl
from sklearn.cross_validation import train_test_split as tts
import matplotlib.pyplot as plt
from sklearn import neighbors
from matplotlib.colors import ListedColormap
# import data and give a little overview
sample = pd.read_stata('real_data_1.dta')
s = sample
print(s.dtypes)
print(s.shape)
# Nearest Neighboor
print(__doc__)
n_neighbors = 1
X = np.array((s.t_ums_ma, s.t_matauf)).reshape(918, 2)
y = np.array(s.matauf_measure)
plt.scatter(s.t_ums_ma,s.t_matauf, c=s.matauf_measure, label='Nordan Scatter', color='b', s=25, marker="o")
plt.xlabel('crisis')
plt.ylabel('current debt')
plt.title('Interesting Graph\nCheck it out')
plt.legend()
plt.gray()
plt.show()
X_train, X_test, y_train, y_test = tts(X, y, test_size = 1)
h = 0.02
# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
for weights in ['uniform', 'distance']:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X_train[:, 0].min() - 0.01, X[:, 0].max() + 0.01
    y_min, y_max = X_train[:, 1].min() - 0.01, X[:, 1].max() + 0.01
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"
              % (n_neighbors, weights))
    plt.show()
Any help is greatly appreciated! Best /R
