Argument must be a string or a number issue, Not 'Type' - Pyspark - python-3.x

Update:
So i have been looking into the issue, the problem is with scikit-multiflow datastream. in last quarter of code stream_clf.partial_fit(X,y, classes=stream.target_values) here the class valuefor stream.target_values should a number or string, but the method is returning (dtype). When i print or loop stream.target_values i get this:
I have tried to do conversion etc. but still of no use. can someone please help here ?
Initial Problem
I am running a code (took inspiration from here). It works perfectly alright when used vanilla python environment.
But if i run this code after certain modification in Apache Spark using Pyspark , i get the following error
TypeError: int() argument must be a string, a bytes-like object or a number, not 'type'
I have tried every possibile way to trace the issue but everything looks alright. The error arises from the last line of the code where hoefding tree is called for prediction. It expects an ndarray and the type of X variable is also ndarray. I am not sure what is trigerring the issue. Can some one please help or direct me to right trace?
complete stack of error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-52-1310132c88db> in <module>
30 D3_win.addInstance(X,y)
31 xx = np.array(X,dtype='float64')
---> 32 y_hat = stream_clf.predict(xx)
33
34
~/conceptDrift/projectTest/lib/python3.5/site-packages/skmultiflow/trees/hoeffding_tree.py in predict(self, X)
1068 r, _ = get_dimensions(X)
1069 predictions = []
-> 1070 y_proba = self.predict_proba(X)
1071 for i in range(r):
1072 index = np.argmax(y_proba[i])
~/conceptDrift/projectTest/lib/python3.5/site-packages/skmultiflow/trees/hoeffding_tree.py in predict_proba(self, X)
1099 votes = normalize_values_in_dict(votes, inplace=False)
1100 if self.classes is not None:
-> 1101 y_proba = np.zeros(int(max(self.classes)) + 1)
1102 else:
1103 y_proba = np.zeros(int(max(votes.keys())) + 1)
TypeError: int() argument must be a string, a bytes-like object or a number, not 'type'
Code
import findspark
findspark.init()
import pyspark as ps
import warnings
from pyspark.sql import functions as fn
import sys
from pyspark import SparkContext,SparkConf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score as AUC
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from skmultiflow.trees.hoeffding_tree import HoeffdingTree
from skmultiflow.data.data_stream import DataStream
import time
def drift_detector(S,T,threshold = 0.75):
T = pd.DataFrame(T)
#print(T)
S = pd.DataFrame(S)
# Give slack variable in_target which is 1 for old and 0 for new
T['in_target'] = 0 # in target set
S['in_target'] = 1 # in source set
# Combine source and target with new slack variable
ST = pd.concat( [T, S], ignore_index=True, axis=0)
labels = ST['in_target'].values
ST = ST.drop('in_target', axis=1).values
# You can use any classifier for this step. We advise it to be a simple one as we want to see whether source
# and target differ not to classify them.
clf = LogisticRegression(solver='liblinear')
predictions = np.zeros(labels.shape)
# Divide ST into two equal chunks
# Train LR on a chunk and classify the other chunk
# Calculate AUC for original labels (in_target) and predicted ones
skf = StratifiedKFold(n_splits=2, shuffle=True)
for train_idx, test_idx in skf.split(ST, labels):
X_train, X_test = ST[train_idx], ST[test_idx]
y_train, y_test = labels[train_idx], labels[test_idx]
clf.fit(X_train, y_train)
probs = clf.predict_proba(X_test)[:, 1]
predictions[test_idx] = probs
auc_score = AUC(labels, predictions)
print(auc_score)
# Signal drift if AUC is larger than the threshold
if auc_score > threshold:
return True
else:
return False
class D3():
def __init__(self, w, rho, dim, auc):
self.size = int(w*(1+rho))
self.win_data = np.zeros((self.size,dim))
self.win_label = np.zeros(self.size)
self.w = w
self.rho = rho
self.dim = dim
self.auc = auc
self.drift_count = 0
self.window_index = 0
def addInstance(self,X,y):
if(self.isEmpty()):
self.win_data[self.window_index] = X
self.win_label[self.window_index] = y
self.window_index = self.window_index + 1
else:
print("Error: Buffer is full!")
def isEmpty(self):
return self.window_index < self.size
def driftCheck(self):
if drift_detector(self.win_data[:self.w], self.win_data[self.w:self.size], auc): #returns true if drift is detected
self.window_index = int(self.w * self.rho)
self.win_data = np.roll(self.win_data, -1*self.w, axis=0)
self.win_label = np.roll(self.win_label, -1*self.w, axis=0)
self.drift_count = self.drift_count + 1
return True
else:
self.window_index = self.w
self.win_data = np.roll(self.win_data, -1*(int(self.w*self.rho)), axis=0)
self.win_label =np.roll(self.win_label, -1*(int(self.w*self.rho)), axis=0)
return False
def getCurrentData(self):
return self.win_data[:self.window_index]
def getCurrentLabels(self):
return self.win_label[:self.window_index]
def select_data(x):
x = "/user/hadoop1/tellus/sea_1.csv"
peopleDF = spark.read.csv(x, header= True)
df = peopleDF.toPandas()
scaler = MinMaxScaler()
df.iloc[:,0:df.shape[1]-1] = scaler.fit_transform(df.iloc[:,0:df.shape[1]-1])
return df
def check_true(y,y_hat):
if(y==y_hat):
return 1
else:
return 0
df = select_data("/user/hadoop1/tellus/sea_1.csv")
stream = DataStream(df)
stream.prepare_for_use()
stream_clf = HoeffdingTree()
w = int(2000)
rho = float(0.4)
auc = float(0.60)
# In[ ]:
D3_win = D3(w,rho,stream.n_features,auc)
stream_acc = []
stream_record = []
stream_true= 0
i=0
start = time.time()
X,y = stream.next_sample(int(w*rho))
stream_clf.partial_fit(X,y, classes=stream.target_values)
while(stream.has_more_samples()):
X,y = stream.next_sample()
if D3_win.isEmpty():
D3_win.addInstance(X,y)
y_hat = stream_clf.predict(X)

Problem was with select_data() function, data type of variables was being changed during the execution. This issue is fixed now.

Related

Predicting classes in MNIST dataset with a Gaussian- the same prediction errors with different paramemters?

I am trying to find the best c parameter following the instructions to a task that asks me to ' Define a function, fit_generative_model, that takes as input a training set (train_data, train_labels) and fits a Gaussian generative model to it. It should return the parameters of this generative model; for each label j = 0,1,...,9, where
pi[j]: the frequency of that label
mu[j]: the 784-dimensional mean vector
sigma[j]: the 784x784 covariance matrix
It is important to regularize these matrices. The standard way of doing this is to add cI to them, where c is some constant and I is the 784-dimensional identity matrix. c is now a parameter, and by setting it appropriately, we can improve the performance of the model.
%matplotlib inline
import sys
import matplotlib.pyplot as plt
import gzip, os
import numpy as np
from scipy.stats import multivariate_normal
if sys.version_info[0] == 2:
from urllib import urlretrieve
else:
from urllib.request import urlretrieve
# Downloads the dataset
def download(filename, source='http://yann.lecun.com/exdb/mnist/'):
print("Downloading %s" % filename)
urlretrieve(source + filename, filename)
# Invokes download() if necessary, then reads in images
def load_mnist_images(filename):
if not os.path.exists(filename):
download(filename)
with gzip.open(filename, 'rb') as f:
data = np.frombuffer(f.read(), np.uint8, offset=16)
data = data.reshape(-1,784)
return data
def load_mnist_labels(filename):
if not os.path.exists(filename):
download(filename)
with gzip.open(filename, 'rb') as f:
data = np.frombuffer(f.read(), np.uint8, offset=8)
return data
## Load the training set
train_data = load_mnist_images('train-images-idx3-ubyte.gz')
train_labels = load_mnist_labels('train-labels-idx1-ubyte.gz')
## Load the testing set
test_data = load_mnist_images('t10k-images-idx3-ubyte.gz')
test_labels = load_mnist_labels('t10k-labels-idx1-ubyte.gz')
train_data.shape, train_labels.shape
So I have written this code for three different C-values. they each give me the same error?
def fit_generative_model(x,y):
lst=[]
for c in [20,200, 4000]:
k = 10 # labels 0,1,...,k-1
d = (x.shape)[1] # number of features
mu = np.zeros((k,d))
sigma = np.zeros((k,d,d))
pi = np.zeros(k)
for label in range(0,k):
indices = (y == label)
mu[label] = np.mean(x[indices,:], axis=0)
sigma[label] = np.cov(x[indices,:], rowvar=0, bias=1) + c*np.identity(784) # I define the identity matrix
predictions = np.argmax(score, axis=1)
errors = np.sum(predictions != y)
lst.append(errors)
print(c,"Model makes " + str(errors) + " errors out of 10000", lst)
Then I fit it to the training data and get these same errors:
mu, sigma, pi = fit_generative_model(train_data, train_labels)
20 Model makes 1 errors out of 10000 [1]
200 Model makes 1 errors out of 10000 [1, 1]
4000 Model makes 1 errors out of 10000 [1, 1, 1]
and to the test data:
mu, sigma, pi = fit_generative_model(test_data, test_labels)
20 Model makes 9020 errors out of 10000 [9020]
200 Model makes 9020 errors out of 10000 [9020, 9020]
4000 Model makes 9020 errors out of 10000 [9020, 9020, 9020]
What is it I'm doing wrong? the correct answer is c=4000 which yields an error of ~4.3%.

I am using Python to implement linear regression on some dataset, but on this step I am continously getting this error

I wrote this linear regression code and now it is giving me an error:
at def iterate_weights function.error = index 200 is out of bounds for
axis 0 with size 200
I don't know what is wrong. Also when I am uploading my weights they are coming the same as above which I chose at random. I am using Jupyter notebook.
Are there any mistakes?
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
#importing dataset
data = pd.read_csv('F:\WOC\linearreg.csv')
print(data.shape)
data.head()
data_arr = np.genfromtxt("F:\WOC\linearreg.csv", delimiter=",", skip_header=1)
print(data_arr)
# In[3]:
#collecting x and y
x_train = data_arr[:,1:4]
y_train = data_arr[:,4:5]
print(x_train)
print(y_train)
# In[4]:
weights_shape = y_train.shape
print(weights_shape)
r,c = x_train.shape
print(r,c)
w = np.random.randn(c,1)
w_num = len(w)
print(w)
# In[5]:
h = np.dot(x_train,w)
def cost_function():
print(h)
j = (1/2*r)*((h-y_train)**2)
print('j',j)
cost_function()
# In[6]:
def iterate_weights():
L=0.01
iterations = 1000
for iterations_proceed in range(1,1001):
for i in range(w_num):
for m in range(1,201):
w[i,0] = w[i,0]-L*((1/r)*(sum(h-y_train)*(x_train[m,i])))
print(w)
iterate_weights()
# In[7]:
h = np.dot(x_train,w)
def cost_function1():
j = np.sum((1/2*r)*((h-y_train)**2))
print(j)

Pad data using tf.data.Dataset

I have to use tf.data.Dataset for creating a input pipeline for an RNN model in tensorflow. I am providing a basic code, by which I need to pad the data in batch with a pad token and use it for further manipulation.
import pandas as pd
import numpy as np
import tensorflow as tf
import functools
total_data_size = 10000
embedding_dimension = 25
max_len = 17
varying_length = np.random.randint(max_len, size=(10000)) # varying length data
X = np.array([np.random.randint(1000, size=(value)).tolist()for index, value in enumerate(varying_length)]) # data of arying length
Y = np.random.randint(2, size=(total_data_size)).astype(np.int32) # target binary
embedding = np.random.uniform(-1,1,(1000, embedding_dimension)) # word embedding
def gen():
for index in range(len(X)):
yield X[index] , Y[index]
dataset = tf.data.Dataset.from_generator(gen,(tf.int32,tf.int32))
dataset = dataset.batch(batch_size=25)
padded_shapes = (tf.TensorShape([None])) # sentence of unknown size
padding_values = (tf.constant(-111)) # the value with which pad index needs to be filled
dataset = (dataset
.padded_batch(25, padded_shapes=padded_shapes, padding_values=padding_values)
)
iter2 = dataset.make_initializable_iterator()
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run(iter2.initializer)
print(sess.run(iter2.get_next()))
I hope the code is self explanatory with comments. But I am getting following error,
InvalidArgumentError (see above for traceback): Cannot batch tensors with different shapes in component 0. First element had shape [11] and element 1 had shape [12].
[[Node: IteratorGetNext = IteratorGetNext[output_shapes=[[?,?], [?]], output_types=[DT_INT32, DT_INT32], _device="/job:localhost/replica:0/task:0/device:CPU:0"](Iterator)]]
I believe that since your generator yields two outputs, your padded_shapes and padded_values tuples must have a length of two. For me, this works:
dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.int32))
dataset = dataset.batch(batch_size=25)
padded_shapes = (tf.TensorShape([None]), tf.TensorShape([None])) # sentence of unknown size
padding_values = (tf.constant(-111), tf.constant(-111)) # the value with which pad index needs to be filled
dataset = (dataset
.padded_batch(25, padded_shapes=padded_shapes, padding_values=padding_values)
)
iter2 = dataset.make_initializable_iterator()
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run(iter2.initializer)
Finally got the answer. The issue was for the second padded shapes instead of Tensorshape([None]), we should provide [], because the second item returned by the generator is a scalar. If using Tensorshape([None]),, make sure we are returning a vector
import pandas as pd
import numpy as np
import tensorflow as tf
import functools
total_data_size = 10000
embedding_dimension = 25
max_len = 17
varying_length = np.random.randint(max_len, size=(10000)) # varying length data
X = np.array([np.random.randint(1000, size=(value)).tolist()for index, value in enumerate(varying_length)]) # data of arying length
Y = np.random.randint(2, size=(total_data_size)).astype(np.int32) # target binary
embedding = np.random.uniform(-1,1,(1000, embedding_dimension)) # word embedding
def gen():
for index in range(len(X)):
yield X[index] , Y[index]
dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.int32), (tf.TensorShape([None]), []))
padded_shapes = (tf.TensorShape([None]), []) # sentence of unknown size
dataset = (dataset
.padded_batch(25, padded_shapes=padded_shapes, padding_values=(-111, 0))
)
iter2 = dataset.make_initializable_iterator()
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run(iter2.initializer)
sess.run(iter2.get_next())

keras:how to get initial loss function value before training

In Keras, I checked the callbacks mechanism.
However it does not provide any information before the start of training.Like the output is always after epoch = 1.I would like to check the value of the loss function for the first time feed forward.How can I achieve this?
Thanks.
This answer does not work.
'setting model.trainable = False and then train the model'.How to perform feed forward propagation in CNN using Keras?
I set model.trainable = False before compiling the model, but the model still output different loss functions.This is weird.It is supposed to output a constant loss which is the loss when a forward-feed is performed.
The code is in the following:
from keras import backend as K
from keras.models import Model
from keras.layers import Dense, Input
from keras.models import Sequential
import numpy as np
import random
from keras.layers import Input, Dense
from keras.models import Model
from keras.layers.core import Dropout,Activation,Flatten,Lambda
from keras.layers.normalization import BatchNormalization
import keras
import time
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from ann_visualizer.visualize import ann_viz;
def gen_x(n,p,rho):
if abs(rho) < 1 :
beta=np.sqrt(rho/(1-rho))
x0=np.random.normal(size=(n,p))
z=np.random.normal(size=(n,1))
x=beta*np.repeat(z,repeats=p,axis=1)+x0
if abs(rho)==1:
x=np.repeat(z,repeats=p,axis=1)
return x
## This function creates true survival times as described in section 3 of the paper. In all simulations we set snr (signal to noise ratio) to 3.
def genecoef(p):
#return list( map(lambda x : np.power(-1,x)*np.exp(-0.1*(x-1)), np.arange(1,p+1,1)) )
return list( np.random.rand(p) )
def gen_times(x,snr):
n,p=x.shape
coef=genecoef(p)
f=np.matmul(np.matrix(x),np.matrix(coef).T)
e=np.random.normal(size=(n,1))
k=np.sqrt(np.var(f)/(snr*np.var(e)))
y=np.exp(f+k*e)
return(y)
## This function creates true survival times as described in section 3 of the paper. In all simulations we set snr (signal to noise ratio) to 3.
def gen_times_censor(x,snr):
n,p=x.shape
coef=genecoef(p)
f=np.matmul(np.matrix(x),np.matrix(coef).T)
e=np.random.normal(size=(n,1))
k=np.sqrt(np.var(f)/(snr*np.var(e)))
y=np.exp(k*e)
return(y)
def nltr(x):
y1 = x[:,0]*x[:,1]
y2 = x[:,2]*x[:,3]
y3 = x[:,4]**2
y4 = x[:,5]* (x[:,6]**2)
y5 = x[:,7]*x[:,8]* x[:,9]
y6 = 0.5 *np.exp(x[:,8]* x[:,9])
newx = np.column_stack((y1,y2,y3,y4,y5,y6))
return newx
def survdata(n,p,snr,rho):
x = gen_x(n,p,rho)
time = gen_times(x,snr)
censortime = gen_times_censor(x,snr)
y = np.apply_along_axis(np.min,1,np.column_stack((time,censortime)))
y = np.array(y)
#b==0 censored b ==1 uncensored
b = np.apply_along_axis(np.argmax,1,np.column_stack((time,censortime)))
b = np.array(b)
a = x
ordery=np.argsort(y)
a=a[ordery]
y=y[ordery]
b=b[ordery]
Rlist=[]
event_index=np.argwhere(b==1).ravel().astype(np.int32)
nsample=len(b)
nevent=sum(b)
Rlist=[]
for j in range(nevent):
Rlist+=[ list(range(np.argwhere(b==1).ravel()[j],nsample) )]
bmask = b.astype(bool)
cumlist=list(reversed(np.append(event_index,n)))
slarr=np.vectorize(lambda x:(len(x)-1))
nctrue = np.sum(slarr(Rlist))
#a:n(#samples)*p(#features) matrix,survival time from short to high
#y:survival time
#b censored(0) or not(1)
#bmask bool(b)
#nevent #uncensored
return a,y,b,bmask,nsample,nevent,event_index,Rlist,cumlist,nctrue
n=50
p=10
snr=1
rho=0.1
a,y,b,bmask,nsample,nevent,event_index,Rlist,cumlist,nctrue= survdata(n,p,snr,rho)
sc=StandardScaler()
a=nltr(a)
a=sc.fit_transform(a)
def ploss(y_true,y_pred):
#y_pred for sample x_i is the value of np.dot(x_i,beta) in the linear cox case
#y_pred is the loss for sample i
z = 0
#for j in event_index:
#z = z + K.sum(y_pred[j,0])
#z = z + K.constant(y_pred[j,0])
#z = K.sum(tf.boolean_mask(y_pred,bmask) )
#iz = K.print_tensor(tf.boolean_mask(y_pred,bmask),'y_pred_mask is')
#gz = K.print_tensor(K.gather(y_pred,event_index),'y_pred_gather is')
z = K.sum(K.gather(y_pred,event_index))
currentsum = 0
for j in range(nevent):
currentsum = currentsum + K.sum(K.exp(K.gather(y_pred,\
np.array(range(cumlist[j+1],cumlist[j])))))
z = z - K.log(currentsum)
#tempz=0
#for i in j:
#tempz = tempz + K.exp(y_pred[i,0])
#z = z - K.log(tempz)
z = -z
return z
def c_index_func(y_true, y_pred):
#y_pred is the loss for sample i
c_hat = 0
for i in range(nevent-1):
c_hat = c_hat + K.sum(K.cast(y_pred[event_index[i]+1:,0]\
<y_pred[event_index[i],0],tf.float32))
#c_hat = c_hat + K.sum(K.cast(y_pred[event_index[i]+1:,0]\
#<y_pred[event_index[i],0],float32))
return c_hat/nctrue
model=Sequential()
model.add(Dense(1,activation='linear',kernel_initializer='one',\
batch_input_shape=(a.shape[0],a.shape[1])))
#model.add(Dropout(0.2))
#model.compile(loss=ploss,optimizer='newton-raphson')
#model.compile(loss=ploss,optimizer=keras.optimizers.Adam(lr=0, beta_1=0.9, beta_2=0.999, \
#epsilon=None, decay=0.0, amsgrad=False),metrics=[c_index_func])
model.trainable=False
model.compile(loss=ploss,optimizer=keras.optimizers.SGD(lr=0.001, momentum=0.0, \
decay=0.0, nesterov=False),metrics=[c_index_func])
model.fit(x=a,y=y,batch_size=len(a),epochs=3,verbose=2)
For this you can just use model.evaluate(x, y) and it will return an array with the loss and metrics. The first element of this array is the loss on the given data. Just do this before training and it will give you the initial loss.
it is very easy just make the learning rate = 0 and train the DNN then all the losses are the initial loss

Trying to to use Caffe classifier causes "sequence argument must have length equal to input rank "error

I am trying to use Caffe.Classifier class and its predict() method on my Imagenet trained caffemodel.
Images were resized to 256x256 and crops of 227x227 were used to train the net.
Everything is simple and straight forward, yet I keep getting weird errors such as the following :
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-7-3b440ebf1f6e> in <module>()
17 image_dims=(256, 256))
18
---> 19 out = net.predict([image_caffe], oversample=True)
20 print(labels[out[0].argmax()].strip(),' (', out[0][out[0].argmax()] , ')')
21 plabel = int(labels[out[0].argmax()].strip())
<ipython-input-5-e6ae1810b820> in predict(self, inputs, oversample)
65 for ix, in_ in enumerate(inputs):
66 print('image dims = ',self.image_dims[0],',',self.image_dims[1] ,'_in = ',in_.shape)
---> 67 input_[ix] = caffe.io.resize_image(in_, self.image_dims)
68
69 if oversample:
C:\Users\Master\Anaconda3\envs\anaconda35\lib\site-packages\caffe\io.py in resize_image(im, new_dims, interp_order)
335 # ndimage interpolates anything but more slowly.
336 scale = tuple(np.array(new_dims, dtype=float) / np.array(im.shape[:2]))
--> 337 resized_im = zoom(im, scale + (1,), order=interp_order)
338 return resized_im.astype(np.float32)
339
C:\Users\Master\Anaconda3\envs\anaconda35\lib\site-packages\scipy\ndimage\interpolation.py in zoom(input, zoom, output, order, mode, cval, prefilter)
588 else:
589 filtered = input
--> 590 zoom = _ni_support._normalize_sequence(zoom, input.ndim)
591 output_shape = tuple(
592 [int(round(ii * jj)) for ii, jj in zip(input.shape, zoom)])
C:\Users\Master\Anaconda3\envs\anaconda35\lib\site-packages\scipy\ndimage\_ni_support.py in _normalize_sequence(input, rank, array_type)
63 if len(normalized) != rank:
64 err = "sequence argument must have length equal to input rank"
---> 65 raise RuntimeError(err)
66 else:
67 normalized = [input] * rank
RuntimeError: sequence argument must have length equal to input rank
And here is the snippets of code I'm using :
import sys
import caffe
import numpy as np
import lmdb
import matplotlib.pyplot as plt
import itertools
def flat_shape(x):
"Returns x without singleton dimension, eg: (1,28,28) -> (28,28)"
return x.reshape(x.shape)
def db_reader(fpath, type='lmdb'):
if type == 'lmdb':
return lmdb_reader(fpath)
else:
return leveldb_reader(fpath)
def lmdb_reader(fpath):
import lmdb
lmdb_env = lmdb.open(fpath)
lmdb_txn = lmdb_env.begin()
lmdb_cursor = lmdb_txn.cursor()
for key, value in lmdb_cursor:
datum = caffe.proto.caffe_pb2.Datum()
datum.ParseFromString(value)
label = int(datum.label)
image = caffe.io.datum_to_array(datum).astype(np.uint8)
yield (key, flat_shape(image), label)
def leveldb_reader(fpath):
import leveldb
db = leveldb.LevelDB(fpath)
for key, value in db.RangeIter():
datum = caffe.proto.caffe_pb2.Datum()
datum.ParseFromString(value)
label = int(datum.label)
image = caffe.io.datum_to_array(datum).astype(np.uint8)
yield (key, flat_shape(image), label)
Classifier class (copied form Caffe's python directory):
import numpy as np
import caffe
class Classifier(caffe.Net):
"""
Classifier extends Net for image class prediction
by scaling, center cropping, or oversampling.
Parameters
----------
image_dims : dimensions to scale input for cropping/sampling.
Default is to scale to net input size for whole-image crop.
mean, input_scale, raw_scale, channel_swap: params for
preprocessing options.
"""
def __init__(self, model_file, pretrained_file, image_dims=None,
mean=None, input_scale=None, raw_scale=None,
channel_swap=None):
caffe.Net.__init__(self, model_file, pretrained_file, caffe.TEST)
# configure pre-processing
in_ = self.inputs[0]
print('inputs[0]',self.inputs[0])
self.transformer = caffe.io.Transformer(
{in_: self.blobs[in_].data.shape})
self.transformer.set_transpose(in_, (2, 0, 1))
if mean is not None:
self.transformer.set_mean(in_, mean)
if input_scale is not None:
self.transformer.set_input_scale(in_, input_scale)
if raw_scale is not None:
self.transformer.set_raw_scale(in_, raw_scale)
if channel_swap is not None:
self.transformer.set_channel_swap(in_, channel_swap)
print('crops: ',self.blobs[in_].data.shape[2:])
self.crop_dims = np.array(self.blobs[in_].data.shape[2:])
if not image_dims:
image_dims = self.crop_dims
self.image_dims = image_dims
def predict(self, inputs, oversample=True):
"""
Predict classification probabilities of inputs.
Parameters
----------
inputs : iterable of (H x W x K) input ndarrays.
oversample : boolean
average predictions across center, corners, and mirrors
when True (default). Center-only prediction when False.
Returns
-------
predictions: (N x C) ndarray of class probabilities for N images and C
classes.
"""
# Scale to standardize input dimensions.
input_ = np.zeros((len(inputs),
self.image_dims[0],
self.image_dims[1],
inputs[0].shape[2]),
dtype=np.float32)
for ix, in_ in enumerate(inputs):
print('image dims = ',self.image_dims[0],',',self.image_dims[1] ,'_in = ',in_.shape)
input_[ix] = caffe.io.resize_image(in_, self.image_dims)
if oversample:
# Generate center, corner, and mirrored crops.
input_ = caffe.io.oversample(input_, self.crop_dims)
else:
# Take center crop.
center = np.array(self.image_dims) / 2.0
crop = np.tile(center, (1, 2))[0] + np.concatenate([
-self.crop_dims / 2.0,
self.crop_dims / 2.0
])
input_ = input_[:, crop[0]:crop[2], crop[1]:crop[3], :]
# Classify
caffe_in = np.zeros(np.array(input_.shape)[[0, 3, 1, 2]],
dtype=np.float32)
for ix, in_ in enumerate(input_):
caffe_in[ix] = self.transformer.preprocess(self.inputs[0], in_)
out = self.forward_all(**{self.inputs[0]: caffe_in})
predictions = out[self.outputs[0]]
# For oversampling, average predictions across crops.
if oversample:
predictions = predictions.reshape((len(predictions) / 10, 10, -1))
predictions = predictions.mean(1)
return predictions
Main section :
proto ='deploy.prototxt'
model='snap1.caffemodel'
mean='imagenet_mean.binaryproto'
db_path='G:/imagenet/ilsvrc12_val_lmdb'
# Extract mean from the mean image file
#mean_blobproto_new = caffe.proto.caffe_pb2.BlobProto()
#f = open(mean, 'rb')
#mean_blobproto_new.ParseFromString(f.read())
#mean_image = caffe.io.blobproto_to_array(mean_blobproto_new)
#f.close()
mu = np.load('mean.npy').mean(1).mean(1)
caffe.set_mode_gpu()
reader = lmdb_reader(db_path)
i = 0
for i, image, label in reader:
image_caffe = image.reshape(1, *image.shape)
print(image_caffe.shape, mu.shape)
net = Classifier(proto, model,
mean= mu,
channel_swap=(2,1,0),
raw_scale=255,
image_dims=(256, 256))
out = net.predict([image_caffe], oversample=True)
print(i, labels[out[0].argmax()].strip(),' (', out[0][out[0].argmax()] , ')')
i+=1
What is wrong here?
I found the cause, I had to feed the image in the form of 3D tensor not a 4D one!
so our 4d tensor:
image_caffe = image.reshape(1, *image.shape)
needed to be changed to a 3D one:
image_caffe = image.transpose(2,1,0)
As a side note, try using python2 for running any caffe related. python3 might work at first but will definitely cause a lot of headaches. for instance, predict method with oversample set to True, will crash under python3 but works just fine under python2!

Resources