Linear regression with gradient descent is giving a different result from sklearn on the same dataset.
I want to know why that is. Is it a problem of local minima?
The dataset is as follows:
ht wt
63 127
64 121
66 142
69 157
69 162
71 156
71 169
72 165
73 181
75 208
Sklearn is computing intercept as -266.53439537 and coefficient as 6.13758146
whereas gradient descent is giving intercept as -1.49087014 and coeff as 2.3239637
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
def cost(m, b, data_size):
    """Return the mean squared error of the line ``y = m*x + b``.

    The error is measured against the first ``data_size`` samples of the
    module-level ``IN`` (inputs) and ``OUT`` (targets) sequences.

    Parameters
    ----------
    m : slope of the candidate line.
    b : intercept of the candidate line.
    data_size : number of leading samples of ``IN``/``OUT`` to evaluate.

    Raises
    ------
    ZeroDivisionError
        If ``data_size`` is 0.
    """
    total_error = 0
    for i in range(data_size):
        # Squared residual of sample i.
        total_error += ((m * IN[i] + b) - OUT[i]) ** 2
    return total_error / float(data_size)
def compute_gradient(X, Y, theta_1, theta_0, N, learning_rate):
    """Perform one gradient-descent step for the line ``Y ~ theta_1*X + theta_0``.

    The gradients are those of the mean squared error
    ``(1/N) * sum((Y - Y_pred) ** 2)`` with respect to slope and intercept.

    Parameters
    ----------
    X, Y : NumPy arrays of inputs and targets with matching shapes.
    theta_1 : current slope.
    theta_0 : current intercept.
    N : number of samples used to normalise the gradients.
    learning_rate : step size applied to both gradients.

    Returns
    -------
    tuple
        ``(new_theta_1, new_theta_0)`` -- note that the slope comes first.
    """
    residual = Y - (theta_1 * X + theta_0)
    # Reducing along axis 0 yields the same result shape the builtin ``sum``
    # gave (shape (1,) for (N, 1) column vectors, a scalar for 1-D input)
    # without iterating row by row in Python.
    gradient_theta_1 = (-2 / N) * (X * residual).sum(axis=0)
    gradient_theta_0 = (-2 / N) * residual.sum(axis=0)
    new_theta_0 = theta_0 - (gradient_theta_0 * learning_rate)
    new_theta_1 = theta_1 - (gradient_theta_1 * learning_rate)
    return (new_theta_1, new_theta_0)
# Heights (inputs) and weights (targets) from the table above.
IN = np.array([63, 64, 66, 69, 69, 71, 71, 72, 73, 75])
OUT = np.array([127, 121, 142, 157, 162, 156, 169, 165, 181, 208])
# Column-vector views, used by both compute_gradient and sklearn's fit().
X = IN[:, np.newaxis]
Y = OUT[:, np.newaxis]
# data_size was used below but never defined in the posted snippet.
data_size = len(IN)
# NOTE(review): the output quoted below reports "iteration 99999", so it was
# presumably produced with 100000 iterations rather than this value.
iterations = 10000
initial_theta_0 = 0
initial_theta_1 = 0
learning_rate = 0.00001
theta_0 = initial_theta_0
theta_1 = initial_theta_1
fig, ax = plt.subplots(figsize=(12, 8))
cost_history = []
for i in range(iterations):
    theta_1, theta_0 = compute_gradient(X, Y, theta_1, theta_0,
                                        data_size, learning_rate)
    totalError = cost(theta_1, theta_0, data_size)
    cost_history.append(totalError)
# Report only the final parameters, matching the single line of output shown.
print ("iteration {} m {} b {}".format(i, theta_1, theta_0))
reg_line = [(theta_1 * x) + theta_0 for x in IN]
# Closed-form least-squares fit for comparison.
lm = LinearRegression().fit(X, Y)
# Plot on the axes created above (the snippet referenced an undefined `ax3`).
ax.plot(IN, reg_line, color='red')
print ("SKLEARN coeff {}".format(lm.coef_))
print ("SKLEARN intercept {}".format(lm.intercept_))
iteration 99999 m [2.3239637] b [-1.49087014]
SKLEARN coeff [[6.13758146]]
SKLEARN intercept [-266.53439537]

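You have started from poor initial conditions (0, 0), and gradient descent has stalled close to that point. (Strictly speaking, the least-squares cost is convex, so there is no spurious local minimum; with unscaled inputs and such a small learning rate the intercept simply converges extremely slowly.) More intuitive initial conditions are based on the maxima and minima of ht and wt, i.e.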
initial_theta_0 = np.min(Y)+np.min(X)*(np.min(Y)-np.max(Y))/(np.max(X)-np.min(X)) #-335.75
initial_theta_1 = (np.max(Y)-np.min(Y))/(np.max(X)-np.min(X)) # 7.25
#initial_theta_0 = 121+63*(121-208)/(75-63) # -335.75
#initial_theta_1 = (208-121)/(75-63) # 7.25


How to compute the Jacobian of BertForMaskedLM using jacrev

I tried the approach below to compute the Jacobian of BertForMaskedLM using jacrev:
import numpy as np
from transformers import BertTokenizer,BertForMaskedLM
import torch
import torch.nn as nn
from functorch import make_functional, make_functional_with_buffers, vmap, vjp, jvp, jacrev
device = 'cuda:2'
model_name = 'bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertForMaskedLM.from_pretrained(model_name)
net =
fnet, params, buffers = make_functional_with_buffers(net)
def fnet_single(params, x, y):
    """Run the functional model on a single, unbatched example.

    ``x`` and ``y`` each receive two leading singleton dimensions before being
    passed to ``fnet`` together with ``params`` and the module-level
    ``buffers``. The same two leading dimensions are squeezed off the
    ``'logits'`` entry of the output before it is returned.
    """
    expanded_x = x.unsqueeze(0).unsqueeze(0)
    expanded_y = y.unsqueeze(0).unsqueeze(0)
    logits = fnet(params, buffers, expanded_x, expanded_y)['logits']
    return logits.squeeze(0).squeeze(0)
text = u'大肠杆菌是人和许多动物肠道中最主要的一种细菌'
inputs = tokenizer.encode_plus(text)
segment_ids = inputs['token_type_ids']
token_ids = inputs['input_ids']
length = len(token_ids) - 2
batch_token_ids = torch.tensor([token_ids] * (2 * length - 1),requires_grad=True).to(device)
batch_segment_ids = torch.zeros_like(batch_token_ids).to(device)
for i in range(length):
if i > 0:
batch_token_ids[2 * i - 1, i] = 103
batch_token_ids[2 * i - 1, i + 1] = 103
batch_token_ids[2 * i, i + 1] = 103
threshold = 100
word_token_ids = [[token_ids[1]]]
for i in range(1, length):
x,y = batch_token_ids[2 * i],batch_segment_ids[2*i]
jacobian1 = jacrev(fnet_single,argnums=1)(params,x,y)
x,y = batch_token_ids[2 * i - 1],batch_segment_ids[2*i-1]
jacobian2 = jacrev(fnet_single,argnums=1)(params,x,y)
However, an error appeared:
'Traceback (most recent call last):
File "", line 49, in
batch_token_ids = torch.tensor([token_ids] * (2 * length - 1),requires_grad=True).to(device)
RuntimeError: Only Tensors of floating point and complex dtype can require gradients'
Is there anyone to help me?
This is because you are trying to compute the Jacobian with respect to data for which gradient tracking has not been enabled.
If you want to get the jacobian wrt parameters: jacrev(fnet_single, argnums=0)(params, x, y)
If you want to get the jacobian wrt data: x = (note that converting x dtype to float is mandatory to set the scope on it)

numpy condition function for 2-D data

I have a synthetic dataset consisting of features (X) and labels (y) which is used for KMeans clustering using Python 3.8 and sklearn 0.22.2 and numpy 1.19.
X.shape, y.shape
# ((100, 2), (100,))
kmeans = KMeans(n_clusters = 3, init = 'random', n_init = 10, max_iter = 300)
# Train model on scaled features-
After training KMeans on 'X', I want to replace the unique (continuous) values of 'X' with the (discrete) cluster centers obtained using KMeans.
for i in range(3):
print("cluster number {0} has center = {1}".format(i + 1, kmeans.cluster_centers_[i, :]))
cluster number 1 has center = [-0.7869159 1.14173859]
cluster number 2 has center = [ 1.28010442 -1.04663318]
cluster number 3 has center = [-0.54654735 0.0054752 ]
# {0, 1, 2}
One way I have of doing it is:
X[np.where(clustered_labels == 0)] = val[0,:]
X[np.where(clustered_labels == 1)] = val[1,:]
X[np.where(clustered_labels == 2)] = val[2,:]
Can I do it using np.select?
cond = [clustered_labels == i for i in range(3)]
val = kmeans.cluster_centers_[:,:]
But on executing the code: np.select(cond, val)
I get the following error:
--------------------------------------------------------------------------- ValueError Traceback (most recent call
last) in
----> 1, val)
<array_function internals> in select(*args, **kwargs)
~/.local/lib/python3.8/site-packages/numpy/lib/ in
select(condlist, choicelist, default)
693 result_shape = condlist[0].shape
694 else:
--> 695 result_shape = np.broadcast_arrays(condlist[0], choicelist[0])[0].shape
697 result = np.full(result_shape, choicelist[-1], dtype)
<array_function internals> in broadcast_arrays(*args, **kwargs)
~/.local/lib/python3.8/site-packages/numpy/lib/ in
broadcast_arrays(subok, *args)
256 args = [np.array(_m, copy=False, subok=subok) for _m in args]
--> 258 shape = _broadcast_shape(*args)
260 if all(array.shape == shape for array in args):
~/.local/lib/python3.8/site-packages/numpy/lib/ in
187 # use the old-iterator because np.nditer does not handle size 0 arrays
188 # consistently
--> 189 b = np.broadcast(*args[:32])
190 # unfortunately, it cannot handle 32 or more arguments directly
191 for pos in range(32, len(args), 31):
ValueError: shape mismatch: objects cannot be broadcast to a single shape
A somewhat cleaner way to do it (though very similar to yours) is the following. Here's a simple example:
from sklearn.cluster import KMeans
import numpy as np

# Build three synthetic 2-D Gaussian blobs; the third column of each block
# holds that blob's label (1, 2 or 3).
x1 = np.random.normal(0, 2, 100)
y1 = np.random.normal(0, 1, 100)
label1 = np.ones(100)
d1 = np.column_stack([x1, y1, label1])
x2 = np.random.normal(3, 1, 100)
y2 = np.random.normal(1, 2, 100)
label2 = np.ones(100) * 2
d2 = np.column_stack([x2, y2, label2])
x3 = np.random.normal(-3, 0.5, 100)
y3 = np.random.normal(0.5, 0.25, 100)
label3 = np.ones(100) * 3
d3 = np.column_stack([x3, y3, label3])
# np.vstack replaces np.row_stack, a deprecated alias of the same function.
D = np.vstack([d1, d2, d3])
X = D[:, :2]
y = D[:, 2]
print(f'X.shape = {X.shape}, y.shape = {y.shape}')
# X.shape = (300, 2), y.shape = (300,)
kmeans = KMeans(n_clusters = 3, init = 'random', n_init = 10, max_iter = 300)
# Train model on scaled features-
# The model must be fitted before predict() can be called.
kmeans.fit(X)
preds = kmeans.predict(X)
# Overwrite every sample with the center of the cluster it was assigned to.
X[preds==0] = kmeans.cluster_centers_[0]
X[preds==1] = kmeans.cluster_centers_[1]
X[preds==2] = kmeans.cluster_centers_[2]
Yet another way to accomplish the task is to use the np.put method instead of the assignment as follows:
# np.put addresses the *flattened* array with integer indices, so a boolean
# row mask cannot be passed directly. Expand the mask to one entry per
# element, convert it to flat positions, and let np.put cycle the center's
# values across each selected row.
for k in range(3):
    flat_idx = np.flatnonzero(np.repeat(preds == k, X.shape[1]))
    np.put(X, flat_idx, kmeans.cluster_centers_[k])
Frankly, I don't see a way to accomplish the task by means of the np.select function, and I guess the way you do it is the best way, based on this answer.

Scipy optimization with matrix multiplication

I've tried to use scipy.optimize.minimize to solve a matrix multiplication optimization problem; however, the result gives me a dimension error. Can someone help me with it?
import numpy as np
from scipy.optimize import minimize
# define known variables, mu, sigma, rf
mu = np.matrix([[0.12],
sigma = np.matrix([[0.5, 0.05, 0.03],
[0.05, 0.4, 0.01],
[0.03, 0.01, 0.2]])
rf = 0.02
def objective_fun(x):
    '''
    This is the objective function

    Evaluates sqrt(x.T * sigma * x) / (mu.T * x - rf) using the module-level
    ``mu``, ``sigma`` and ``rf``. With np.matrix operands ``*`` is matrix
    multiplication, so ``x`` must have a shape for which these products are
    defined.
    '''
    s = np.sqrt(x.T * sigma * x)/(mu.T * x - rf)
    return s
def constraint(x):
    """Equality constraint: zero exactly when the entries of ``x`` sum to 1.

    Returns ``1 - x[0] - x[1] - ...`` taken over every entry of ``x``, so the
    same function works for any number of variables instead of exactly three.
    """
    con = 1
    for weight in x:
        con = con - weight
    return con
# set up the boundaries for x
bound_i = (0, np.Inf)
bnds = (bound_i, bound_i, bound_i)
#set up the constraints for x
con = {'type':'eq', 'fun':constraint}
# initial guess for variable x
x = np.matrix([[0.5],
sol = minimize(objective_fun, x, method = 'SLSQP', bounds = bnds, constraints = con)
The error gives me:
ValueError Traceback (most recent call last)
<ipython-input-31-b8901077b164> in <module>
----> 1 sol = minimize(objective_fun, x, method = 'SLSQP', bounds = bnds, constraints = con)
e:\Anaconda3\lib\site-packages\scipy\optimize\ in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
606 elif meth == 'slsqp':
607 return _minimize_slsqp(fun, x0, args, jac, bounds,
--> 608 constraints, callback=callback, **options)
609 elif meth == 'trust-constr':
610 return _minimize_trustregion_constr(fun, x0, args, jac, hess, hessp,
e:\Anaconda3\lib\site-packages\scipy\optimize\ in _minimize_slsqp(func, x0, args, jac, bounds, constraints, maxiter, ftol, iprint, disp, eps, callback, **unknown_options)
398 # Compute objective function
--> 399 fx = func(x)
400 try:
401 fx = float(np.asarray(fx))
e:\Anaconda3\lib\site-packages\scipy\optimize\ in function_wrapper(*wrapper_args)
324 def function_wrapper(*wrapper_args):
325 ncalls[0] += 1
--> 326 return function(*(wrapper_args + args))
328 return ncalls, function_wrapper
<ipython-input-28-b1fb2386a380> in objective_fun(x)
3 This is the objective function
4 '''
----> 5 s = np.sqrt(x.T * sigma * x)/(mu.T * x - rf)
6 return s
e:\Anaconda3\lib\site-packages\numpy\matrixlib\ in __mul__(self, other)
218 if isinstance(other, (N.ndarray, list, tuple)) :
219 # This promotes 1-D vectors to row vectors
--> 220 return, asmatrix(other))
221 if isscalar(other) or not hasattr(other, '__rmul__') :
222 return, other)
ValueError: shapes (1,3) and (1,3) not aligned: 3 (dim 1) != 1 (dim 0)
However, I tested every function I wrote individually and none of them raises an error. For example, if I define the x matrix as shown in the code and simply run objective_fun(x) in the console, I immediately get an answer:
That means that my function can do the matrix multiplication correctly, so what is wrong with the code here?
The docs for minimize() say that x0 should be an (n,) shaped array, but you are trying to treat it like a (3,1) array. I'm not sure about the inner workings of minimize(), but I suspect that when it steps over different values of the fit parameters it converts them to the format it expects. Anyway, the following minor corrections make the code work.
import numpy as np
from scipy.optimize import minimize
# define known variables, mu, sigma, rf
mu = np.matrix([[0.12],
sigma = np.matrix([[0.5, 0.05, 0.03],
[0.05, 0.4, 0.01],
[0.03, 0.01, 0.2]])
rf = 0.02
def objective_fun(x):
    '''
    This is the objective function

    Evaluates sqrt(x.T * sigma * x) / (mu.T * x - rf) using the module-level
    ``mu``, ``sigma`` and ``rf``. ``x`` is the 1-D parameter array that
    minimize() passes in.
    '''
    x = np.expand_dims(x, 1) # convert the (3,) shape to (3,1). Then we can do our normal matrix math on it
    s = np.sqrt(x.T * sigma * x)/(mu.T * x - rf) # Transposes so the shapes are correct
    return s
def constraint(x):
    """Equality constraint: zero exactly when the entries of ``x`` sum to 1.

    Returns ``1 - x[0] - x[1] - ...`` taken over every entry of ``x``, so the
    same function works for any number of variables instead of exactly three.
    """
    con = 1
    for weight in x:
        con = con - weight
    return con
# set up the boundaries for x
bound_i = (0, np.Inf)
bnds = (bound_i, bound_i, bound_i)
#set up the constraints for x
con = {'type':'eq', 'fun':constraint}
# initial guess for variable x
x = np.array([0.5, 0.3, 0.2]) # Defining the initial guess as an (3,) array)
sol = minimize(objective_fun, x, method = 'SLSQP', bounds = bnds, constraints = con)
print(sol) # and the solution looks reasonable
fun: 5.86953830952583
jac: array([-1.70555401, -1.70578796, -1.70573896])
message: 'Optimization terminated successfully.'
nfev: 32
nit: 6
njev: 6
status: 0
success: True
x: array([0.42809911, 0.29522438, 0.27667651])
Take a look at the comments I put in for an explanation on what you need to do.

Sklearn BIC criterion: different optimal values of k for clustering

I want to determine the best value of k (number of clusters) for the KMeans algorithm on a dataset.
I found a resource in the sklearn documentation: Gaussian Mixture Model Selection using the BIC criterion.
I found example code on the site and adapted it to my dataset.
But each run of this code gives a different optimal value of k. Why?
Here is the code:
import numpy as np
import pandas as pd
import itertools
from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import mixture
# Number of samples per component
n_samples = 440
path = 'C:/Users/Lionel/Downloads'
file = 'Wholesale customers data.csv'
data = pd.read_csv(path + '/'+file)
X = np.array(data.iloc[:,2 :])
lowest_bic = np.infty
bic = []
n_components_range = range(1, 12)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
for n_components in n_components_range:
# Fit a Gaussian mixture with EM
gmm = mixture.GaussianMixture(n_components=n_components,
if bic[-1] < lowest_bic:
lowest_bic = bic[-1]
best_gmm = gmm
bic = np.array(bic)
color_iter = itertools.cycle(['navy', 'turquoise', 'cornflowerblue',
clf = best_gmm
bars = []
# Plot the BIC scores
spl = plt.subplot(2, 1, 1)
#spl = plt.plot()
for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
xpos = np.array(n_components_range) + .2 * (i - 2)
bars.append(, bic[i * len(n_components_range):
(i + 1) * len(n_components_range)],
width=.2, color=color))
plt.ylim([bic.min() * 1.01 - .01 * bic.max(), bic.max()])
plt.title('BIC score per model')
xpos = np.mod(bic.argmin(), len(n_components_range)) + .65 +\
.2 * np.floor(bic.argmin() / len(n_components_range))
plt.text(xpos, bic.min() * 0.97 + .03 * bic.max(), '*', fontsize=14)
spl.set_xlabel('Number of components')
spl.legend([b[0] for b in bars], cv_types)
# Plot the winner
splot = plt.subplot(2, 1, 2)
Y_ = clf.predict(X)
for i, (mean, cov, color) in enumerate(zip(clf.means_, clf.covariances_,
v, w = linalg.eigh(cov)
if not np.any(Y_ == i):
plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
# Plot an ellipse to show the Gaussian component
angle = np.arctan2(w[0][1], w[0][0])
angle = 180. * angle / np.pi # convert to degrees
v = 2. * np.sqrt(2.) * np.sqrt(v)
ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color)
plt.title('Selected GMM: full model, 2 components')
plt.subplots_adjust(hspace=.35, bottom=.02)
Here is the link to my dataset:
Do you have an explanation for this behaviour?

Trying to use Caffe classifier causes "sequence argument must have length equal to input rank" error

I am trying to use Caffe.Classifier class and its predict() method on my Imagenet trained caffemodel.
Images were resized to 256x256 and crops of 227x227 were used to train the net.
Everything is simple and straightforward, yet I keep getting weird errors such as the following:
RuntimeError Traceback (most recent call last)
<ipython-input-7-3b440ebf1f6e> in <module>()
17 image_dims=(256, 256))
---> 19 out = net.predict([image_caffe], oversample=True)
20 print(labels[out[0].argmax()].strip(),' (', out[0][out[0].argmax()] , ')')
21 plabel = int(labels[out[0].argmax()].strip())
<ipython-input-5-e6ae1810b820> in predict(self, inputs, oversample)
65 for ix, in_ in enumerate(inputs):
66 print('image dims = ',self.image_dims[0],',',self.image_dims[1] ,'_in = ',in_.shape)
---> 67 input_[ix] =, self.image_dims)
69 if oversample:
C:\Users\Master\Anaconda3\envs\anaconda35\lib\site-packages\caffe\ in resize_image(im, new_dims, interp_order)
335 # ndimage interpolates anything but more slowly.
336 scale = tuple(np.array(new_dims, dtype=float) / np.array(im.shape[:2]))
--> 337 resized_im = zoom(im, scale + (1,), order=interp_order)
338 return resized_im.astype(np.float32)
C:\Users\Master\Anaconda3\envs\anaconda35\lib\site-packages\scipy\ndimage\ in zoom(input, zoom, output, order, mode, cval, prefilter)
588 else:
589 filtered = input
--> 590 zoom = _ni_support._normalize_sequence(zoom, input.ndim)
591 output_shape = tuple(
592 [int(round(ii * jj)) for ii, jj in zip(input.shape, zoom)])
C:\Users\Master\Anaconda3\envs\anaconda35\lib\site-packages\scipy\ndimage\ in _normalize_sequence(input, rank, array_type)
63 if len(normalized) != rank:
64 err = "sequence argument must have length equal to input rank"
---> 65 raise RuntimeError(err)
66 else:
67 normalized = [input] * rank
RuntimeError: sequence argument must have length equal to input rank
And here are the snippets of code I'm using:
import sys
import caffe
import numpy as np
import lmdb
import matplotlib.pyplot as plt
import itertools
def flat_shape(x):
    "Returns x without singleton dimension, eg: (1,28,28) -> (28,28)"
    # The posted `x.reshape(x.shape)` was a no-op; drop every length-1 axis so
    # the function does what its docstring describes. Arrays without
    # singleton axes are returned with their shape unchanged.
    return x.reshape(tuple(dim for dim in x.shape if dim != 1))
def db_reader(fpath, type='lmdb'):
    """Return the record reader for the database at ``fpath``.

    ``type='lmdb'`` selects ``lmdb_reader``; any other value falls back to
    ``leveldb_reader``.
    """
    reader = lmdb_reader if type == 'lmdb' else leveldb_reader
    return reader(fpath)
def lmdb_reader(fpath):
import lmdb
lmdb_env =
lmdb_txn = lmdb_env.begin()
lmdb_cursor = lmdb_txn.cursor()
for key, value in lmdb_cursor:
datum = caffe.proto.caffe_pb2.Datum()
label = int(datum.label)
image =
yield (key, flat_shape(image), label)
def leveldb_reader(fpath):
import leveldb
db = leveldb.LevelDB(fpath)
for key, value in db.RangeIter():
datum = caffe.proto.caffe_pb2.Datum()
label = int(datum.label)
image =
yield (key, flat_shape(image), label)
Classifier class (copied from Caffe's python directory):
import numpy as np
import caffe
class Classifier(caffe.Net):
Classifier extends Net for image class prediction
by scaling, center cropping, or oversampling.
image_dims : dimensions to scale input for cropping/sampling.
Default is to scale to net input size for whole-image crop.
mean, input_scale, raw_scale, channel_swap: params for
preprocessing options.
def __init__(self, model_file, pretrained_file, image_dims=None,
mean=None, input_scale=None, raw_scale=None,
caffe.Net.__init__(self, model_file, pretrained_file, caffe.TEST)
# configure pre-processing
in_ = self.inputs[0]
self.transformer =
{in_: self.blobs[in_].data.shape})
self.transformer.set_transpose(in_, (2, 0, 1))
if mean is not None:
self.transformer.set_mean(in_, mean)
if input_scale is not None:
self.transformer.set_input_scale(in_, input_scale)
if raw_scale is not None:
self.transformer.set_raw_scale(in_, raw_scale)
if channel_swap is not None:
self.transformer.set_channel_swap(in_, channel_swap)
print('crops: ',self.blobs[in_].data.shape[2:])
self.crop_dims = np.array(self.blobs[in_].data.shape[2:])
if not image_dims:
image_dims = self.crop_dims
self.image_dims = image_dims
def predict(self, inputs, oversample=True):
Predict classification probabilities of inputs.
inputs : iterable of (H x W x K) input ndarrays.
oversample : boolean
average predictions across center, corners, and mirrors
when True (default). Center-only prediction when False.
predictions: (N x C) ndarray of class probabilities for N images and C
# Scale to standardize input dimensions.
input_ = np.zeros((len(inputs),
for ix, in_ in enumerate(inputs):
print('image dims = ',self.image_dims[0],',',self.image_dims[1] ,'_in = ',in_.shape)
input_[ix] =, self.image_dims)
if oversample:
# Generate center, corner, and mirrored crops.
input_ =, self.crop_dims)
# Take center crop.
center = np.array(self.image_dims) / 2.0
crop = np.tile(center, (1, 2))[0] + np.concatenate([
-self.crop_dims / 2.0,
self.crop_dims / 2.0
input_ = input_[:, crop[0]:crop[2], crop[1]:crop[3], :]
# Classify
caffe_in = np.zeros(np.array(input_.shape)[[0, 3, 1, 2]],
for ix, in_ in enumerate(input_):
caffe_in[ix] = self.transformer.preprocess(self.inputs[0], in_)
out = self.forward_all(**{self.inputs[0]: caffe_in})
predictions = out[self.outputs[0]]
# For oversampling, average predictions across crops.
if oversample:
predictions = predictions.reshape((len(predictions) / 10, 10, -1))
predictions = predictions.mean(1)
return predictions
Main section :
proto ='deploy.prototxt'
# Extract mean from the mean image file
#mean_blobproto_new = caffe.proto.caffe_pb2.BlobProto()
#f = open(mean, 'rb')
#mean_image =
mu = np.load('mean.npy').mean(1).mean(1)
reader = lmdb_reader(db_path)
i = 0
for i, image, label in reader:
image_caffe = image.reshape(1, *image.shape)
print(image_caffe.shape, mu.shape)
net = Classifier(proto, model,
mean= mu,
image_dims=(256, 256))
out = net.predict([image_caffe], oversample=True)
print(i, labels[out[0].argmax()].strip(),' (', out[0][out[0].argmax()] , ')')
What is wrong here?
I found the cause: I had to feed the image as a 3D tensor, not a 4D one!
so our 4d tensor:
image_caffe = image.reshape(1, *image.shape)
needed to be changed to a 3D one:
image_caffe = image.transpose(2,1,0)
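As a side note, try using Python 2 for running anything Caffe-related. Python 3 might work at first but will definitely cause a lot of headaches. For instance, the predict method with oversample set to True will crash under Python 3 but works just fine under Python 2 (under Python 3, `len(predictions) / 10` is true division and produces a float, which `reshape` rejects; integer division `//` would be needed).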
