Trying to use the Caffe classifier causes "sequence argument must have length equal to input rank" error - python-3.x

I am trying to use the Caffe.Classifier class and its predict() method on my ImageNet-trained caffemodel.
Images were resized to 256x256 and crops of 227x227 were used to train the net.
Everything is simple and straightforward, yet I keep getting weird errors such as the following:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-7-3b440ebf1f6e> in <module>()
17 image_dims=(256, 256))
18
---> 19 out = net.predict([image_caffe], oversample=True)
20 print(labels[out[0].argmax()].strip(),' (', out[0][out[0].argmax()] , ')')
21 plabel = int(labels[out[0].argmax()].strip())
<ipython-input-5-e6ae1810b820> in predict(self, inputs, oversample)
65 for ix, in_ in enumerate(inputs):
66 print('image dims = ',self.image_dims[0],',',self.image_dims[1] ,'_in = ',in_.shape)
---> 67 input_[ix] = caffe.io.resize_image(in_, self.image_dims)
68
69 if oversample:
C:\Users\Master\Anaconda3\envs\anaconda35\lib\site-packages\caffe\io.py in resize_image(im, new_dims, interp_order)
335 # ndimage interpolates anything but more slowly.
336 scale = tuple(np.array(new_dims, dtype=float) / np.array(im.shape[:2]))
--> 337 resized_im = zoom(im, scale + (1,), order=interp_order)
338 return resized_im.astype(np.float32)
339
C:\Users\Master\Anaconda3\envs\anaconda35\lib\site-packages\scipy\ndimage\interpolation.py in zoom(input, zoom, output, order, mode, cval, prefilter)
588 else:
589 filtered = input
--> 590 zoom = _ni_support._normalize_sequence(zoom, input.ndim)
591 output_shape = tuple(
592 [int(round(ii * jj)) for ii, jj in zip(input.shape, zoom)])
C:\Users\Master\Anaconda3\envs\anaconda35\lib\site-packages\scipy\ndimage\_ni_support.py in _normalize_sequence(input, rank, array_type)
63 if len(normalized) != rank:
64 err = "sequence argument must have length equal to input rank"
---> 65 raise RuntimeError(err)
66 else:
67 normalized = [input] * rank
RuntimeError: sequence argument must have length equal to input rank
And here are the snippets of code I'm using:
import sys
import caffe
import numpy as np
import lmdb
import matplotlib.pyplot as plt
import itertools

def flat_shape(x):
    "Returns x without singleton dimension, eg: (1,28,28) -> (28,28)"
    return x.reshape(x.shape)

def db_reader(fpath, type='lmdb'):
    if type == 'lmdb':
        return lmdb_reader(fpath)
    else:
        return leveldb_reader(fpath)

def lmdb_reader(fpath):
    import lmdb
    lmdb_env = lmdb.open(fpath)
    lmdb_txn = lmdb_env.begin()
    lmdb_cursor = lmdb_txn.cursor()
    for key, value in lmdb_cursor:
        datum = caffe.proto.caffe_pb2.Datum()
        datum.ParseFromString(value)
        label = int(datum.label)
        image = caffe.io.datum_to_array(datum).astype(np.uint8)
        yield (key, flat_shape(image), label)

def leveldb_reader(fpath):
    import leveldb
    db = leveldb.LevelDB(fpath)
    for key, value in db.RangeIter():
        datum = caffe.proto.caffe_pb2.Datum()
        datum.ParseFromString(value)
        label = int(datum.label)
        image = caffe.io.datum_to_array(datum).astype(np.uint8)
        yield (key, flat_shape(image), label)
Classifier class (copied from Caffe's python directory):
import numpy as np
import caffe

class Classifier(caffe.Net):
    """
    Classifier extends Net for image class prediction
    by scaling, center cropping, or oversampling.

    Parameters
    ----------
    image_dims : dimensions to scale input for cropping/sampling.
        Default is to scale to net input size for whole-image crop.
    mean, input_scale, raw_scale, channel_swap: params for
        preprocessing options.
    """
    def __init__(self, model_file, pretrained_file, image_dims=None,
                 mean=None, input_scale=None, raw_scale=None,
                 channel_swap=None):
        caffe.Net.__init__(self, model_file, pretrained_file, caffe.TEST)

        # configure pre-processing
        in_ = self.inputs[0]
        print('inputs[0]', self.inputs[0])
        self.transformer = caffe.io.Transformer(
            {in_: self.blobs[in_].data.shape})
        self.transformer.set_transpose(in_, (2, 0, 1))
        if mean is not None:
            self.transformer.set_mean(in_, mean)
        if input_scale is not None:
            self.transformer.set_input_scale(in_, input_scale)
        if raw_scale is not None:
            self.transformer.set_raw_scale(in_, raw_scale)
        if channel_swap is not None:
            self.transformer.set_channel_swap(in_, channel_swap)

        print('crops: ', self.blobs[in_].data.shape[2:])
        self.crop_dims = np.array(self.blobs[in_].data.shape[2:])
        if not image_dims:
            image_dims = self.crop_dims
        self.image_dims = image_dims
    def predict(self, inputs, oversample=True):
        """
        Predict classification probabilities of inputs.

        Parameters
        ----------
        inputs : iterable of (H x W x K) input ndarrays.
        oversample : boolean
            average predictions across center, corners, and mirrors
            when True (default). Center-only prediction when False.

        Returns
        -------
        predictions: (N x C) ndarray of class probabilities for N images and C
            classes.
        """
        # Scale to standardize input dimensions.
        input_ = np.zeros((len(inputs),
                           self.image_dims[0],
                           self.image_dims[1],
                           inputs[0].shape[2]),
                          dtype=np.float32)
        for ix, in_ in enumerate(inputs):
            print('image dims = ', self.image_dims[0], ',', self.image_dims[1], '_in = ', in_.shape)
            input_[ix] = caffe.io.resize_image(in_, self.image_dims)

        if oversample:
            # Generate center, corner, and mirrored crops.
            input_ = caffe.io.oversample(input_, self.crop_dims)
        else:
            # Take center crop.
            center = np.array(self.image_dims) / 2.0
            crop = np.tile(center, (1, 2))[0] + np.concatenate([
                -self.crop_dims / 2.0,
                self.crop_dims / 2.0
            ])
            input_ = input_[:, crop[0]:crop[2], crop[1]:crop[3], :]

        # Classify
        caffe_in = np.zeros(np.array(input_.shape)[[0, 3, 1, 2]],
                            dtype=np.float32)
        for ix, in_ in enumerate(input_):
            caffe_in[ix] = self.transformer.preprocess(self.inputs[0], in_)
        out = self.forward_all(**{self.inputs[0]: caffe_in})
        predictions = out[self.outputs[0]]

        # For oversampling, average predictions across crops.
        if oversample:
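            # Note: under Python 3, len(predictions) / 10 is a float, so the
            # reshape below raises an error; integer division (//) is needed here.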
            predictions = predictions.reshape((len(predictions) / 10, 10, -1))
            predictions = predictions.mean(1)

        return predictions
Main section:
proto = 'deploy.prototxt'
model = 'snap1.caffemodel'
mean = 'imagenet_mean.binaryproto'
db_path = 'G:/imagenet/ilsvrc12_val_lmdb'

# Extract mean from the mean image file
#mean_blobproto_new = caffe.proto.caffe_pb2.BlobProto()
#f = open(mean, 'rb')
#mean_blobproto_new.ParseFromString(f.read())
#mean_image = caffe.io.blobproto_to_array(mean_blobproto_new)
#f.close()
mu = np.load('mean.npy').mean(1).mean(1)

caffe.set_mode_gpu()
reader = lmdb_reader(db_path)

i = 0
for i, image, label in reader:
    image_caffe = image.reshape(1, *image.shape)
    print(image_caffe.shape, mu.shape)
    net = Classifier(proto, model,
                     mean=mu,
                     channel_swap=(2, 1, 0),
                     raw_scale=255,
                     image_dims=(256, 256))
    out = net.predict([image_caffe], oversample=True)
    print(i, labels[out[0].argmax()].strip(), ' (', out[0][out[0].argmax()], ')')
    i += 1
What is wrong here?

I found the cause: I had to feed the image in the form of a 3D tensor, not a 4D one!
So our 4D tensor:
image_caffe = image.reshape(1, *image.shape)
needed to be changed to a 3D one:
image_caffe = image.transpose(2,1,0)
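For illustration, a minimal sketch of what predict() expects (the dummy array and the exact transpose axes are assumptions; the key point is that each element of the inputs list must be a single 3-D H x W x K image rather than a 4-D batch):
import numpy as np

# hypothetical channels-first array, as returned by caffe.io.datum_to_array: (K, H, W)
image = np.random.randint(0, 256, size=(3, 256, 256), dtype=np.uint8)

# predict() takes an iterable of 3-D (H x W x K) arrays, so move the channel
# axis instead of prepending a batch dimension
image_caffe = image.transpose(1, 2, 0)   # shape (256, 256, 3)
print(image_caffe.shape)
# out = net.predict([image_caffe], oversample=True)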
As a side note, try using Python 2 for running anything Caffe related. Python 3 might work at first but will definitely cause a lot of headaches; for instance, the predict method with oversample set to True will crash under Python 3 but works just fine under Python 2!

Related

Predicting classes in MNIST dataset with a Gaussian - the same prediction errors with different parameters?

I am trying to find the best c parameter, following the instructions of a task that asks me to 'Define a function, fit_generative_model, that takes as input a training set (train_data, train_labels) and fits a Gaussian generative model to it. It should return the parameters of this generative model; for each label j = 0,1,...,9:
pi[j]: the frequency of that label
mu[j]: the 784-dimensional mean vector
sigma[j]: the 784x784 covariance matrix
It is important to regularize these matrices. The standard way of doing this is to add cI to them, where c is some constant and I is the 784-dimensional identity matrix. c is now a parameter, and by setting it appropriately, we can improve the performance of the model.'
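(For context only, and not taken from the course material: scoring under such a generative model is typically the per-class Gaussian log-likelihood plus the log prior; a minimal sketch under that assumption:)
from scipy.stats import multivariate_normal
import numpy as np

def score_samples(x, pi, mu, sigma, k=10):
    """Illustrative scoring: log prior + log-likelihood under each class Gaussian."""
    score = np.zeros((len(x), k))
    for label in range(k):
        score[:, label] = np.log(pi[label]) + \
            multivariate_normal.logpdf(x, mean=mu[label], cov=sigma[label])
    return score  # predicted labels would then be np.argmax(score, axis=1)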
%matplotlib inline
import sys
import matplotlib.pyplot as plt
import gzip, os
import numpy as np
from scipy.stats import multivariate_normal
if sys.version_info[0] == 2:
    from urllib import urlretrieve
else:
    from urllib.request import urlretrieve

# Downloads the dataset
def download(filename, source='http://yann.lecun.com/exdb/mnist/'):
    print("Downloading %s" % filename)
    urlretrieve(source + filename, filename)

# Invokes download() if necessary, then reads in images
def load_mnist_images(filename):
    if not os.path.exists(filename):
        download(filename)
    with gzip.open(filename, 'rb') as f:
        data = np.frombuffer(f.read(), np.uint8, offset=16)
    data = data.reshape(-1, 784)
    return data

def load_mnist_labels(filename):
    if not os.path.exists(filename):
        download(filename)
    with gzip.open(filename, 'rb') as f:
        data = np.frombuffer(f.read(), np.uint8, offset=8)
    return data
## Load the training set
train_data = load_mnist_images('train-images-idx3-ubyte.gz')
train_labels = load_mnist_labels('train-labels-idx1-ubyte.gz')
## Load the testing set
test_data = load_mnist_images('t10k-images-idx3-ubyte.gz')
test_labels = load_mnist_labels('t10k-labels-idx1-ubyte.gz')
train_data.shape, train_labels.shape
So I have written this code for three different c-values; why do they each give me the same number of errors?
def fit_generative_model(x, y):
    lst = []
    for c in [20, 200, 4000]:
        k = 10              # labels 0,1,...,k-1
        d = (x.shape)[1]    # number of features
        mu = np.zeros((k, d))
        sigma = np.zeros((k, d, d))
        pi = np.zeros(k)
        for label in range(0, k):
            indices = (y == label)
            mu[label] = np.mean(x[indices, :], axis=0)
            sigma[label] = np.cov(x[indices, :], rowvar=0, bias=1) + c*np.identity(784)  # I define the identity matrix
        predictions = np.argmax(score, axis=1)
        errors = np.sum(predictions != y)
        lst.append(errors)
        print(c, "Model makes " + str(errors) + " errors out of 10000", lst)
Then I fit it to the training data and get these same errors:
mu, sigma, pi = fit_generative_model(train_data, train_labels)
20 Model makes 1 errors out of 10000 [1]
200 Model makes 1 errors out of 10000 [1, 1]
4000 Model makes 1 errors out of 10000 [1, 1, 1]
and to the test data:
mu, sigma, pi = fit_generative_model(test_data, test_labels)
20 Model makes 9020 errors out of 10000 [9020]
200 Model makes 9020 errors out of 10000 [9020, 9020]
4000 Model makes 9020 errors out of 10000 [9020, 9020, 9020]
What am I doing wrong? The correct answer is c=4000, which yields an error of ~4.3%.

How to map a function to a pre-batched ('BatchDataset') tensor in Tensorflow

I am making data windows (input, output pairs of windows) from time series data. I have already converted my time series to a tf dataset, where each batch has a number of time steps equal to the total window size I need.
def make_dataset(data=train_df[0]):
    ds = tf.keras.preprocessing.timeseries_dataset_from_array(
        data=data,
        targets=None,
        sequence_length=total_window_size,
        sequence_stride=1,
        shuffle=True,
        batch_size=32
    )
    return ds
Example of the shape returned:
for example in tensor.take(1):
    print(f'shape: {example.shape}')

shape: (32, 48, 18)
What I need to do now is split the time dimension into my input-output pairs, and I have a function to do this. However, when I try to map this function to my 'ds' in the above function, I get the following error:
'BatchDataset' object is not subscriptable
I am hoping someone can help me understand where I am going wrong. I am pretty new to TensorFlow... My code is below; in this example 'input_slice' and 'labels_slice' are 0 and 24 respectively. So my aim is to split my batches into input-output pairs of length 24 each.
def split_window(features):
    inputs = features[:, input_slice, :]
    labels = features[:, labels_slice, :]
    inputs.set_shape([None, input_width, None])
    labels.set_shape([None, label_width, None])
    return inputs, labels

def make_dataset(data=train_df[0]):
    data = np.array(data, dtype=np.float32)
    ds = tf.keras.preprocessing.timeseries_dataset_from_array(
        data=data,
        targets=None,
        sequence_length=total_window_size,
        sequence_stride=1,
        shuffle=True,
        batch_size=32
    )
    ds = ds.map(split_window(ds))
    return ds
tensor = make_dataset()
tensor
'BatchDataset' object is not subscriptable
Your snippet of code looks similar to the Time Series tutorial in TensorFlow. Based on that, I modified the main class WindowGenerator() (excluding the parts for the train/val/test datasets and output-label selection) into a simpler class suited to your question.
class WindowGenerator():
    def __init__(self, input_width, label_width, shift):
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift

        self.total_window_size = input_width + shift

        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    def split_window(self, features):
        inputs = features[:, self.input_slice, :]
        labels = features[:, self.labels_slice, :]
        inputs.set_shape([None, self.input_width, None])
        labels.set_shape([None, self.label_width, None])
        return inputs, labels

    def make_dataset(self, data):
        data = np.array(data, dtype=np.float32)
        ds = tf.keras.utils.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=1,
            shuffle=True,
            batch_size=batch_size,)
        ds = ds.map(self.split_window)
        return ds
input_width=24
label_width=24
total_width = input_width + label_width
batch_size = 32
window = WindowGenerator(input_width=input_width, label_width=label_width, shift=1)
dataset = window.make_dataset(train_df[0])
I would recommend, though, to use Dataset.window(). It is simpler and more intuitive.
dataset = tf.data.Dataset.from_tensor_slices(train_df[0])
dataset = dataset.window(total_width, shift=1, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(batch_size))
dataset = dataset.map(lambda window: (window[:-label_width], window[-input_width:]))
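Either way, a quick sanity check of the resulting shapes could look like the following sketch (assuming train_df[0] is the 2-D feature array used above):
for inputs, labels in dataset.take(1):
    print('inputs:', inputs.shape)   # e.g. (batch, input_width, features)
    print('labels:', labels.shape)   # e.g. (batch, label_width, features)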

Argument must be a string or a number issue, Not 'Type' - Pyspark

Update:
So I have been looking into the issue; the problem is with the scikit-multiflow DataStream. In the last quarter of the code, stream_clf.partial_fit(X,y, classes=stream.target_values) is called; here the class values for stream.target_values should be numbers or strings, but the method is returning (dtype). When I print or loop over stream.target_values I get this:
I have tried to do conversions etc. but still of no use. Can someone please help here?
Initial Problem
I am running a piece of code (inspiration taken from here). It works perfectly alright in a vanilla Python environment.
But if I run this code, after certain modifications, in Apache Spark using Pyspark, I get the following error:
TypeError: int() argument must be a string, a bytes-like object or a number, not 'type'
I have tried every possible way to trace the issue but everything looks alright. The error arises from the last line of the code, where the Hoeffding tree is called for prediction. It expects an ndarray, and the type of the X variable is also ndarray. I am not sure what is triggering the issue. Can someone please help or direct me to the right trace?
complete stack of error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-52-1310132c88db> in <module>
30 D3_win.addInstance(X,y)
31 xx = np.array(X,dtype='float64')
---> 32 y_hat = stream_clf.predict(xx)
33
34
~/conceptDrift/projectTest/lib/python3.5/site-packages/skmultiflow/trees/hoeffding_tree.py in predict(self, X)
1068 r, _ = get_dimensions(X)
1069 predictions = []
-> 1070 y_proba = self.predict_proba(X)
1071 for i in range(r):
1072 index = np.argmax(y_proba[i])
~/conceptDrift/projectTest/lib/python3.5/site-packages/skmultiflow/trees/hoeffding_tree.py in predict_proba(self, X)
1099 votes = normalize_values_in_dict(votes, inplace=False)
1100 if self.classes is not None:
-> 1101 y_proba = np.zeros(int(max(self.classes)) + 1)
1102 else:
1103 y_proba = np.zeros(int(max(votes.keys())) + 1)
TypeError: int() argument must be a string, a bytes-like object or a number, not 'type'
Code
import findspark
findspark.init()
import pyspark as ps
import warnings
from pyspark.sql import functions as fn
import sys
from pyspark import SparkContext,SparkConf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score as AUC
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from skmultiflow.trees.hoeffding_tree import HoeffdingTree
from skmultiflow.data.data_stream import DataStream
import time
def drift_detector(S, T, threshold=0.75):
    T = pd.DataFrame(T)
    #print(T)
    S = pd.DataFrame(S)
    # Give slack variable in_target which is 1 for old and 0 for new
    T['in_target'] = 0  # in target set
    S['in_target'] = 1  # in source set
    # Combine source and target with new slack variable
    ST = pd.concat([T, S], ignore_index=True, axis=0)
    labels = ST['in_target'].values
    ST = ST.drop('in_target', axis=1).values
    # You can use any classifier for this step. We advise it to be a simple one as we want to see whether source
    # and target differ not to classify them.
    clf = LogisticRegression(solver='liblinear')
    predictions = np.zeros(labels.shape)
    # Divide ST into two equal chunks
    # Train LR on a chunk and classify the other chunk
    # Calculate AUC for original labels (in_target) and predicted ones
    skf = StratifiedKFold(n_splits=2, shuffle=True)
    for train_idx, test_idx in skf.split(ST, labels):
        X_train, X_test = ST[train_idx], ST[test_idx]
        y_train, y_test = labels[train_idx], labels[test_idx]
        clf.fit(X_train, y_train)
        probs = clf.predict_proba(X_test)[:, 1]
        predictions[test_idx] = probs
    auc_score = AUC(labels, predictions)
    print(auc_score)
    # Signal drift if AUC is larger than the threshold
    if auc_score > threshold:
        return True
    else:
        return False
class D3():
    def __init__(self, w, rho, dim, auc):
        self.size = int(w*(1+rho))
        self.win_data = np.zeros((self.size, dim))
        self.win_label = np.zeros(self.size)
        self.w = w
        self.rho = rho
        self.dim = dim
        self.auc = auc
        self.drift_count = 0
        self.window_index = 0

    def addInstance(self, X, y):
        if(self.isEmpty()):
            self.win_data[self.window_index] = X
            self.win_label[self.window_index] = y
            self.window_index = self.window_index + 1
        else:
            print("Error: Buffer is full!")

    def isEmpty(self):
        return self.window_index < self.size

    def driftCheck(self):
        if drift_detector(self.win_data[:self.w], self.win_data[self.w:self.size], auc):  # returns true if drift is detected
            self.window_index = int(self.w * self.rho)
            self.win_data = np.roll(self.win_data, -1*self.w, axis=0)
            self.win_label = np.roll(self.win_label, -1*self.w, axis=0)
            self.drift_count = self.drift_count + 1
            return True
        else:
            self.window_index = self.w
            self.win_data = np.roll(self.win_data, -1*(int(self.w*self.rho)), axis=0)
            self.win_label = np.roll(self.win_label, -1*(int(self.w*self.rho)), axis=0)
            return False

    def getCurrentData(self):
        return self.win_data[:self.window_index]

    def getCurrentLabels(self):
        return self.win_label[:self.window_index]
def select_data(x):
    x = "/user/hadoop1/tellus/sea_1.csv"
    peopleDF = spark.read.csv(x, header=True)
    df = peopleDF.toPandas()
    scaler = MinMaxScaler()
    df.iloc[:, 0:df.shape[1]-1] = scaler.fit_transform(df.iloc[:, 0:df.shape[1]-1])
    return df

def check_true(y, y_hat):
    if(y == y_hat):
        return 1
    else:
        return 0
df = select_data("/user/hadoop1/tellus/sea_1.csv")
stream = DataStream(df)
stream.prepare_for_use()
stream_clf = HoeffdingTree()
w = int(2000)
rho = float(0.4)
auc = float(0.60)
# In[ ]:
D3_win = D3(w,rho,stream.n_features,auc)
stream_acc = []
stream_record = []
stream_true= 0
i=0
start = time.time()
X,y = stream.next_sample(int(w*rho))
stream_clf.partial_fit(X,y, classes=stream.target_values)
while(stream.has_more_samples()):
    X, y = stream.next_sample()
    if D3_win.isEmpty():
        D3_win.addInstance(X, y)
        y_hat = stream_clf.predict(X)
The problem was with the select_data() function; the data type of the variables was being changed during execution. This issue is fixed now.
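For illustration only (the author's actual fix is not shown): spark.read.csv treats every column as a string unless a schema is supplied or inferSchema is enabled, so one hypothetical way to keep the dtypes numeric after toPandas() would be:
def select_data(path):
    peopleDF = spark.read.csv(path, header=True, inferSchema=True)  # infer numeric types
    df = peopleDF.toPandas().astype('float64')                      # force numeric dtype (assumption)
    scaler = MinMaxScaler()
    df.iloc[:, 0:df.shape[1]-1] = scaler.fit_transform(df.iloc[:, 0:df.shape[1]-1])
    return df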

Keras One Hot Encoding Memory Management - best Possible way out

I know this problem has been answered in different ways in the past, but I am not able to figure out how to fit those answers into my code and need help. I am using the Cornell movie corpus as my dataset. Training an LSTM model for a chatbot is the final goal, but I am stuck at the initial one-hot encoding and am running out of memory. Note that the VM I am training on has 86GB of memory but it still has issues. In nmt_special_utils_mod.py the one-hot encoding goes beyond the allocated memory and I am not able to get past this stage. Any alternative way to do these lines, without losing the functionality, would be helpful:
Xoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), X)))
Yoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(machine_vocab)), Y)))
All the code is below to make the question clear:
import_corpus_mod.py -
Change 1: updated less frequent word removal
def data_load():
TrainDataSetPath = 'D:\\Script\\Python\\NLP\\chatbotSeq2SeqWithAtt\\ChatBot\\'
####initializing libraries####
#import numpy as np
#import tensorflow as tf
import re
#import time
########### Data Pre-processing Part 1##########
def clean_text(text):
'''The function will clean known texts and make it more meaningful'''
text = text.lower()
text = re.sub(r"i'm", "i am", text)
text = re.sub(r"he's", "he is", text)
text = re.sub(r"she's", "she is", text)
text = re.sub(r"it's", "it is", text)
text = re.sub(r"let's", "let us", text)
text = re.sub(r"that's", "that is", text)
text = re.sub(r"what's", "what is", text)
text = re.sub(r"where's", "where is", text)
text = re.sub(r"how's", "how is", text)
text = re.sub(r"howz", "how is", text)
text = re.sub(r"\'ll", " will", text)
text = re.sub(r"\'ve", " have", text)
text = re.sub(r"\'re", " are", text)
text = re.sub(r"\'d", " would", text)
text = re.sub(r"don't", "do not", text)
text = re.sub(r"won't", "will not", text)
text = re.sub(r"can't", "cannot", text)
text = re.sub(r"wouldn't", "would not", text)
text = re.sub(r"wasn't", "was not", text)
text = re.sub(r"haven't", "have not", text)
text = re.sub(r"\s+"," ",text)
text = re.sub(r"[-()\"#/#;:<>+=~|{}.?,]", "", text)
#####Add more below this line######
#####Add more above this line######
return text
lines = open(TrainDataSetPath+'movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')
conversations = open(TrainDataSetPath+'movie_conversations_short.txt', encoding='utf-8', errors='ignore').read().split('\n')
#Create dictionary which maps each line with its corresponding ID
id2line = {}
for line in lines:
_line = line.split(' +++$+++ ')
if len(_line) == 5:
id2line[_line[0]] = _line[4]
#Create list of all conversation
conversations_ids = []
for conversation in conversations[:-1]: #the last line in conversation is blank hence -1
#Split then pick last part[-1] which is conversation. Then Removing square bracket by [1:-1] and then replacing quotes and space
_conversation = conversation.split(' +++$+++ ')[-1][1:-1].replace("'","").replace(" ","")
# Append to form a list of list separating by comma
conversations_ids.append(_conversation.split(","))
#Separating the question and answer - assuming the first is the question second is the answer in a conversation
questions = []
answers = []
threshold = 5 #keep only words that occur at least 5 times
for conversation in conversations_ids:
for i in range(len(conversation)-1):
questions.append(id2line[conversation[i]])
answers.append(id2line[conversation[i+1]])
# Cleaning all questions
clean_questions = []
for question in questions:
clean_questions.append(clean_text(question))
# Cleaning all answers
clean_answers = []
for answer in answers:
clean_answers.append(clean_text(answer))
# Creating a dictionary that maps each word to its number of occurrence
word2count = {}
for question in clean_questions:
for word in question.split():
if word not in word2count:
word2count[word] = 1
else:
word2count[word] += 1
for answer in clean_answers:
for word in answer.split():
if word not in word2count:
word2count[word] = 1
else:
word2count[word] += 1
#Create dictionary of words which has more occurrence than threshold
for k in list(word2count):
if word2count[k] < threshold:
del word2count[k]
cleanest_questions, cleanest_answers, keys_list = [], [], list(word2count.keys())
for answers in clean_answers:
ans = []
for word in answers.split():
if word in keys_list:
ans.append(word)
else:
ans.append('<unk>')
cleanest_answers.append(' '.join(ans))
for question in clean_questions:
ques = []
for word in question.split():
if word in keys_list:
ques.append(word)
else:
ques.append('<unk>')
cleanest_questions.append(' '.join(ques))
return cleanest_questions, cleanest_answers
nmt_data_load_asmain_words.py
Change 1 : update less frequent word removal
from tqdm import tqdm
from import_corpus_mod import data_load
def load_dataset(clean_questions, clean_answers):
"""
Loads a dataset with m examples and vocabularies
:m: the number of examples to generate
"""
human_vocab = set()
machine_vocab = set()
dataset = []
lines = len(clean_questions)
for i in tqdm(range(lines)):
hu, mc = clean_questions[i], clean_answers[i]
if hu is not None:
dataset.append((hu, mc))
human_vocab.update(set(hu.split()))
machine_vocab.update(set(mc.split()))
human = dict(zip(sorted(human_vocab) + ['<pad>'],
list(range(len(human_vocab) + 1))))
#human = dict(zip(sorted(human_vocab) + ['<pad>'],
#list(range(len(human_vocab) + 1))))
#human = dict(zip(sorted(human_vocab),
#list(range(len(human_vocab)))))
machine = dict(zip(sorted(machine_vocab) + ['<pad>'],
list(range(len(machine_vocab) + 1))))
#machine = dict(zip(sorted(machine_vocab) + ['<pad>'],
#list(range(len(machine_vocab) + 1))))
inv_machine = {v:k for k,v in machine.items()}
inv_human = {p:q for q,p in human.items()}
return dataset, human, machine, inv_machine, inv_human
clean_questions, clean_answers = data_load()
dataset, human_vocab, machine_vocab, inv_machine_vocab, inv_human_vocab = load_dataset(clean_questions, clean_answers)
nmt_special_utils_mod.py
import numpy as np
from keras.utils import to_categorical
import keras.backend as K
import matplotlib.pyplot as plt
import sys
# Initiate a list to store integer version of sentences
X_into_int = []
Y_into_int = []
def preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty):
X, Y = zip(*dataset)
X = np.asarray([string_to_int(i, Tx, human_vocab) for i in X])
Y = [string_to_int(t, Ty, machine_vocab) for t in Y]
Xoh, Yoh = [], []
Xoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), X)))
Yoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(machine_vocab)), Y)))
return X, np.array(Y), Xoh, Yoh
def string_to_int(line, length, vocab):
#print("hello- inside function")
"""
Converts all strings in the vocabulary into a list of integers representing the positions of the
input string's characters in the "vocab"
Arguments:
string -- input string, e.g. 'Hello how are you'
length -- the number of time steps you'd like, determines if the output will be padded or cut
vocab -- vocabulary, dictionary used to index every character of your "string"
Returns:
rep -- list of integers (or '<unk>') (size = length) representing the position of the string's character in the vocabulary
"""
'''
#make lower to standardize
for string in listofstring:
string = string.lower()
string = string.replace(',','')
if len(string) > length:
string = string[:length]
rep = list(map(lambda x: vocab.get(x, '<unk>'), string))
if len(string) < length:
rep += [vocab['<pad>']] * (length - len(string))
#print (rep)
return rep
'''
newlist = []
if len(line.split()) > length:
line = line.split()
for i in range(length):
newlist.append(line[i])
line = ' '.join(newlist)
else:
line = line + ' <pad>' * (length - len(line.split()))
#print(line)
#print("hello- inside padded")
#words_into_int = []
ints = []
for word in line.split():
if word not in vocab:
ints.append(vocab['<unk>'])
else:
ints.append(vocab[word])
#print("hello- inside append if loop")
#words_into_int.append(ints)
#words_into_int = ",".join(x for x in words_into_int)
return ints
def int_to_string(ints, inv_vocab):
"""
Output a machine readable list of characters based on a list of indexes in the machine's vocabulary
Arguments:
ints -- list of integers representing indexes in the machine's vocabulary
inv_vocab -- dictionary mapping machine readable indexes to machine readable characters
Returns:
l -- list of characters corresponding to the indexes of ints thanks to the inv_vocab mapping
"""
l = [inv_vocab[i] for i in ints]
return l
EXAMPLES = ['3 May 1979', '5 Apr 09', '20th February 2016', 'Wed 10 Jul 2007']
def softmax(x, axis=1):
"""Softmax activation function.
# Arguments
x : Tensor.
axis: Integer, axis along which the softmax normalization is applied.
# Returns
Tensor, output of softmax transformation.
# Raises
ValueError: In case `dim(x) == 1`.
"""
ndim = K.ndim(x)
if ndim == 2:
return K.softmax(x)
elif ndim > 2:
e = K.exp(x - K.max(x, axis=axis, keepdims=True))
s = K.sum(e, axis=axis, keepdims=True)
return e / s
else:
raise ValueError('Cannot apply softmax to a tensor that is 1D')
def plot_attention_map(model, input_vocabulary, inv_output_vocabulary, text, n_s = 128, num = 6, Tx = 30, Ty = 10):
"""
Plot the attention map.
"""
attention_map = np.zeros((10, 30))
Ty, Tx = attention_map.shape
s0 = np.zeros((1, n_s))
c0 = np.zeros((1, n_s))
layer = model.layers[num]
encoded = np.array(string_to_int(text, Tx, input_vocabulary)).reshape((1, 30))
encoded = np.array(list(map(lambda x: to_categorical(x, num_classes=len(input_vocabulary)), encoded)))
f = K.function(model.inputs, [layer.get_output_at(t) for t in range(Ty)])
r = f([encoded, s0, c0])
for t in range(Ty):
for t_prime in range(Tx):
attention_map[t][t_prime] = r[t][0,t_prime,0]
# Normalize attention map
# row_max = attention_map.max(axis=1)
# attention_map = attention_map / row_max[:, None]
prediction = model.predict([encoded, s0, c0])
predicted_text = []
for i in range(len(prediction)):
predicted_text.append(int(np.argmax(prediction[i], axis=1)))
predicted_text = list(predicted_text)
predicted_text = int_to_string(predicted_text, inv_output_vocabulary)
text_ = list(text)
# get the lengths of the string
input_length = len(text)
output_length = Ty
# Plot the attention_map
plt.clf()
f = plt.figure(figsize=(8, 8.5))
ax = f.add_subplot(1, 1, 1)
# add image
i = ax.imshow(attention_map, interpolation='nearest', cmap='Blues')
# add colorbar
cbaxes = f.add_axes([0.2, 0, 0.6, 0.03])
cbar = f.colorbar(i, cax=cbaxes, orientation='horizontal')
cbar.ax.set_xlabel('Alpha value (Probability output of the "softmax")', labelpad=2)
# add labels
ax.set_yticks(range(output_length))
ax.set_yticklabels(predicted_text[:output_length])
ax.set_xticks(range(input_length))
ax.set_xticklabels(text_[:input_length], rotation=45)
ax.set_xlabel('Input Sequence')
ax.set_ylabel('Output Sequence')
# add grid and legend
ax.grid()
#f.show()
return attention_map
nmt_code_mod.py the main code
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 10 16:31:44 2018
#author: Anirban
"""
from keras.layers import Bidirectional, Concatenate, Dot, Input, LSTM
from keras.layers import RepeatVector, Dense, Activation
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.models import Model
import keras.backend as K
import numpy as np
from nmt_data_load_asmain_words import load_dataset
from import_corpus_mod import data_load
from nmt_special_utils_mod import *
epochs = 50
clean_questions, clean_answers = data_load()
dataset, human_vocab, machine_vocab, inv_machine_vocab, inv_human_vocab = load_dataset(clean_questions, clean_answers)
m = len(clean_questions)
Tx = 8
Ty = 8
X, Y, Xoh, Yoh = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty)
print("X.shape:", X.shape)
print("Y.shape:", Y.shape)
print("Xoh.shape:", Xoh.shape)
print("Yoh.shape:", Yoh.shape)
# Defined shared layers as global variables
repeator = RepeatVector(Tx)
concatenator = Concatenate(axis=-1)
densor1 = Dense(20, activation = "tanh")
densor2 = Dense(1, activation = "relu")
activator = Activation(softmax, name='attention_weights') # We are using a custom softmax(axis = 1) loaded from nmt_special_utils
dotor = Dot(axes = 1)
def one_step_attention(a, s_prev):
"""
Performs one step of attention: Outputs a context vector computed as a dot product of the attention weights
"alphas" and the hidden states "a" of the Bi-LSTM.
Arguments:
a -- hidden state output of the Bi-LSTM, numpy-array of shape (m, Tx, 2*n_a)
s_prev -- previous hidden state of the (post-attention) LSTM, numpy-array of shape (m, n_s)
Returns:
context -- context vector, input of the next (post-attetion) LSTM cell
"""
### START CODE HERE ###
# Use repeator to repeat s_prev to be of shape (m, Tx, n_s) so that you can concatenate it with all hidden states "a" (≈ 1 line)
s_prev = repeator(s_prev)
# Use concatenator to concatenate a and s_prev on the last axis (≈ 1 line)
concat = concatenator([a,s_prev])
# Use densor1 to propagate concat through a small fully-connected neural network to compute the "intermediate energies" variable e. (≈1 lines)
e = densor1(concat)
# Use densor2 to propagate e through a small fully-connected neural network to compute the "energies" variable energies. (≈1 lines)
energies = densor2(e)
# Use "activator" on "energies" to compute the attention weights "alphas" (≈ 1 line)
alphas = activator(energies)
# Use dotor together with "alphas" and "a" to compute the context vector to be given to the next (post-attention) LSTM-cell (≈ 1 line)
context = dotor([alphas,a])
### END CODE HERE ###
return context
n_a = 32
n_s = 64
post_activation_LSTM_cell = LSTM(n_s, return_state = True)
output_layer = Dense(len(machine_vocab), activation=softmax)
def model(Tx, Ty, n_a, n_s, human_vocab_size, machine_vocab_size):
"""
Arguments:
Tx -- length of the input sequence
Ty -- length of the output sequence
n_a -- hidden state size of the Bi-LSTM
n_s -- hidden state size of the post-attention LSTM
human_vocab_size -- size of the python dictionary "human_vocab"
machine_vocab_size -- size of the python dictionary "machine_vocab"
Returns:
model -- Keras model instance
"""
# Define the inputs of your model with a shape (Tx,)
# Define s0 and c0, initial hidden state for the decoder LSTM of shape (n_s,)
X = Input(shape=(Tx, human_vocab_size))
s0 = Input(shape=(n_s,), name='s0')
c0 = Input(shape=(n_s,), name='c0')
s = s0
c = c0
# Initialize empty list of outputs
outputs = []
### START CODE HERE ###
# Step 1: Define your pre-attention Bi-LSTM. Remember to use return_sequences=True. (≈ 1 line)
a = Bidirectional(LSTM(n_a, return_sequences=True),input_shape=(m, Tx, n_a*2))(X)
# Step 2: Iterate for Ty steps
for t in range(Ty):
# Step 2.A: Perform one step of the attention mechanism to get back the context vector at step t (≈ 1 line)
context = one_step_attention(a, s)
# Step 2.B: Apply the post-attention LSTM cell to the "context" vector.
# Don't forget to pass: initial_state = [hidden state, cell state] (≈ 1 line)
s, _, c = post_activation_LSTM_cell(context,initial_state = [s, c])
# Step 2.C: Apply Dense layer to the hidden state output of the post-attention LSTM (≈ 1 line)
out = output_layer(s)
# Step 2.D: Append "out" to the "outputs" list (≈ 1 line)
outputs.append(out)
# Step 3: Create model instance taking three inputs and returning the list of outputs. (≈ 1 line)
model = Model(inputs=[X,s0,c0],outputs=outputs)
### END CODE HERE ###
return model
model = model(Tx, Ty, n_a, n_s, len(human_vocab), len(machine_vocab))
opt = Adam(lr=0.05, beta_1=0.9, beta_2=0.999,decay=0.01)
model.compile(loss='categorical_crossentropy', optimizer=opt,metrics=['accuracy'])
s0 = np.zeros((m, n_s))
c0 = np.zeros((m, n_s))
outputs = list(Yoh.swapaxes(0,1))
model.fit([Xoh, s0, c0], outputs, epochs=epochs, batch_size=5)
EXAMPLES = ['can we make this quick roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad again'
,'the thing is cameron i am at the mercy of a particularly hideous breed of loser my sister i cannot date until she does'
,'Hello how are you']
#EXAMPLES = ['13 May 1979', 'Tue 11 Jul 2007','Saturday May 9 2018', 'March 3 2001','March 3rd 2001', '1 March 2001','23 May 2017']
for example in EXAMPLES:
source = np.asarray([string_to_int(example, Tx, human_vocab)])
#need a try block here to prevent errors if vocab is small and example has characters not in the vocab
source = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), source))) #.swapaxes(0,1)
prediction = model.predict([source, s0, c0])
prediction = np.argmax(prediction, axis = -1)
output = [inv_machine_vocab[int(i)] for i in prediction]
pads = output.count('<pad>')
output = output[0:(len(output)-pads)]
print("source:", example)
print("output:", ' '.join(output))
Note: The code is essentially the code from a very famous 2016 research paper that converts any date/time string into a machine-readable date/time. I was trying to re-use it for our chatbot - a Seq2Seq model with attention (bi-directional). The code works - if the movie corpus is loaded with only 1000 conversations it runs fine; when you load the full corpus it fails due to memory overload.
EDIT
Thank you for the collaboration on this problem - I really appreciate the trouble you are taking to go through the code and find the best possible solution. As you instructed, I have updated import_corpus_mod.py to incorporate threshold = 5 and, at the very beginning, convert the least frequent words (fewer than 5 occurrences) to <unk> without spaces. This forced another small change in nmt_data_load_asmain_words.py to remove the addition of <unk> there.
Now, based on the other point and the code you shared, I commented out the lines below in nmt_special_utils_mod.py
#Xoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), X)))
#Yoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(machine_vocab)), Y)))
And straight away changed the input based on your guidance:
Xi = Input(shape=(Tx,))
X = Embedding( human_vocab_size, 100, embeddings_initializer='uniform', input_length=Tx , trainable=True )(Xi)
s0 = Input(shape=(n_s,), name='s0')
c0 = Input(shape=(n_s,), name='c0')
s = s0
c = c0
I got a lot of errors:
runfile('D:/Script/Python/NLP/chatbotSeq2SeqWithAtt/ChatBot/nmt_code_mod.py', wdir='D:/Script/Python/NLP/chatbotSeq2SeqWithAtt/ChatBot')
Reloaded modules: nmt_data_load_asmain_words, import_corpus_mod, nmt_special_utils_mod
100%|██████████| 384/384 [00:00<00:00, 24615.06it/s]
100%|██████████| 384/384 [00:00<?, ?it/s]
X.shape: (384, 8)
Y.shape: (384, 8)
D:\Python\Anaconda3\lib\site-packages\keras\engine\topology.py:1592: UserWarning: Model inputs must come from a Keras Input layer, they cannot be the output of a previous non-Input layer. Here, a tensor specified as input to "model_2" was not an Input tensor, it was generated by layer embedding_1.
Note that input tensors are instantiated via `tensor = Input(shape)`.
The tensor that caused the issue was: embedding_1/Gather:0
str(x.name))
Traceback (most recent call last):
File "<ipython-input-44-addb6f9e6bc1>", line 1, in <module>
runfile('D:/Script/Python/NLP/chatbotSeq2SeqWithAtt/ChatBot/nmt_code_mod.py', wdir='D:/Script/Python/NLP/chatbotSeq2SeqWithAtt/ChatBot')
File "D:\Python\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "D:\Python\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "D:/Script/Python/NLP/chatbotSeq2SeqWithAtt/ChatBot/nmt_code_mod.py", line 138, in <module>
model = model(Tx, Ty, n_a, n_s, len(human_vocab), len(machine_vocab))
File "D:/Script/Python/NLP/chatbotSeq2SeqWithAtt/ChatBot/nmt_code_mod.py", line 132, in model
model = Model(inputs=[X,s0,c0],outputs=outputs)
File "D:\Python\Anaconda3\lib\site-packages\keras\legacy\interfaces.py", line 91, in wrapper
return func(*args, **kwargs)
File "D:\Python\Anaconda3\lib\site-packages\keras\engine\topology.py", line 1652, in __init__
layer.__class__.__name__))
TypeError: Input layers to a `Model` must be `InputLayer` objects. Received inputs: [<tf.Tensor 'embedding_1/Gather:0' shape=(?, 8, 100) dtype=float32>, <tf.Tensor 's0_1:0' shape=(?, 64) dtype=float32>, <tf.Tensor 'c0_1:0' shape=(?, 64) dtype=float32>]. Input 0 (0-based) originates from layer type `Embedding`
So I am reverting the code here for nmt_code_mod.py and nmt_special_utils_mod.py.
The problem is not one-hot encoding but rather storing the entire dataset in memory. The wise choice is to use a generator, or a Sequence, which will allow you to load and encode the data on the fly. This is commonly done for large image datasets, for example.
I would recommend performing all your pre-processing and saving the input/output pairs, without encoding, as a csv file; then you can create a generator that lazily loads and encodes:
from keras.utils import Sequence

class MySequence(Sequence):
    def __init__(self, data, batch_size):
        self.data = data
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.data) / float(self.batch_size)))

    def __getitem__(self, batch_id):
        # Get the corresponding batch of data...
        # ...one-hot encode it here, one batch at a time
        return X, Y
Note that the generator (or Sequence[i]) returns a single batch.
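A usage sketch (train_pairs is a hypothetical container holding the pre-processed, integer-encoded pairs):
train_seq = MySequence(train_pairs, batch_size=64)
model.fit_generator(train_seq, epochs=epochs)   # or model.fit(train_seq, ...) in newer Keras versions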
I would not recommend using one-hot encodings and a dense matrix.
If you have a vocabulary of 100,000 words, a 100,000 x 100,000 matrix consumes more than 70 GB of RAM.
You can try using a sparse matrix, but I guess that changes the rest of your code; you may take a look at this answer.
What you could use instead is word embedding representations, which are compact, memory friendly and used by all the state-of-the-art NLP systems.
In any case, one thing you must do with your model is handle embedding inputs using the proper Embedding layer.
This layer stores the embedding matrix once, and then you can build your training samples by giving only one integer that represents the index of the word in the vocabulary.
If you want one-hot encodings, you can build an embedding layer with an NxN identity matrix using a Keras initializer, where N is the size of the vocabulary. Then you can pass the indexes of the words as integer inputs. This will increase the size of your model, but it will reduce the size of your batches.
If you want word2vec embeddings, you can load an embedding matrix with NxV dimensions, where N is the size of the vocabulary and V is the dimension of the embeddings. You will notice that V is normally set to 100 or 200 dimensions, which is much smaller than N, saving you a lot of memory.
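For instance, a minimal sketch of plugging in a pre-trained matrix (embedding_matrix is a hypothetical NxV numpy array built from word2vec/GloVe vectors for your vocabulary):
from keras.layers import Embedding

embedding_layer = Embedding(input_dim=human_vocab_size,
                            output_dim=100,
                            weights=[embedding_matrix],   # shape (human_vocab_size, 100)
                            input_length=Tx,
                            trainable=False)              # freeze, or True to fine-tune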
EDIT: to clarify the usage of embeddings in your case:
You do:
X = Input(shape=(Tx, human_vocab_size))
s0 = Input(shape=(n_s,), name='s0')
c0 = Input(shape=(n_s,), name='c0')
s = s0
c = c0
Instead you can do one-hot-encoding this way:
Xi = Input(shape=(Tx,))
X = Embedding( human_vocab_size, human_vocab_size, embeddings_initializer=keras.initializers.Identity, input_length=Tx )(Xi)
s0 = Input(shape=(n_s,), name='s0')
c0 = Input(shape=(n_s,), name='c0')
s = s0
c = c0
By doing this, you can build your training samples using only the word indexes and not the one-hot vectors. This will save some space in the training samples, but your model will be larger in size.
If it is still too large, you won't have any choice but to use dense embeddings. To do so, you can do the following:
Xi = Input(shape=(Tx,))
X = Embedding( human_vocab_size, 100, embeddings_initializer='uniform', input_length=Tx , trainable=True )(Xi)
s0 = Input(shape=(n_s,), name='s0')
c0 = Input(shape=(n_s,), name='c0')
s = s0
c = c0
This initializes the embeddings randomly with a compact representation (dimension 100 instead of human_vocab_size). This would save you a lot of memory.
Finally, you could reduce the size of your vocabulary by putting everything in lowercase or by replacing rare words (those that appear only once or twice in the corpus) with a special token "RARE".

Why does this minimization routine work for 2 different samples from the same distribution while the plot only works for one of the 2 samples?

I am trying to plot a fitted curve to a lognormal dataset. I have two sample datasets. The minimization method works successfully for both datasets, but the plot only works successfully for one. I would like to know why this is so that I can fix it. Below is a quick example.
import numpy as np
from scipy.optimize import minimize
import matplotlib.pyplot as plt

def from_lognormal_to_gauss(mu, sigma):
    """
    Convert parameters from a lognormal distribution to the
    parameters of the underlying normal distribution.
    """
    variance = sigma**2
    tmp = variance / mu**2 + 1
    mu_mod = np.log(mu / np.sqrt(tmp))
    sigma_mod = np.sqrt(np.log(tmp))
    return mu_mod, sigma_mod

def initialize_data(sample_success, param_guess='estimate'):
    """
    Retrieve sample data and parameter guess.
    """
    if sample_success is True:
        # this sample works
        mu = 6
        sigma = 0.5
    elif sample_success is False:
        # this sample minimizes successfully but does not output the correct plot
        mu = 63
        sigma = 7
    x = np.random.lognormal(mean=mu, sigma=sigma, size=1000)  # GET LOGNORMAL DATA
    if param_guess == 'estimate':
        param_guess = [np.mean(x), np.std(x)]
        mu_guess, sigma_guess = from_lognormal_to_gauss(*param_guess)
        param_guess = [mu_guess, sigma_guess]
    elif not (isinstance(param_guess, (tuple, list, np.ndarray)) and (len(param_guess) == 2)):
        raise ValueError("as an initial guess, input 2 parameters or input 'estimate'")
    return x, param_guess

## GET PROBABILITY DENSITY FUNCTION (PDF)
pdf_logn = lambda x, mu, sigma: np.exp((-1) * (np.log(x) - mu)**2 / (2 * sigma**2)) / (x * sigma * np.sqrt(2*np.pi))

## GET NEGATIVE LOG-LIKELIHOOD (NLL)
nll_logn = lambda prms, x: -np.sum(np.log(pdf_logn(x, prms[0], prms[1])))

def view_plot(x, mu, sigma):
    """
    Output plot of fitted curve.
    """
    x = np.array(sorted(x))
    y = pdf_logn(x, mu, sigma)
    plt.plot(x, y)
    plt.show()
As an example of a successful sample run, one can use an initial parameter guess of [10, 10], [10000000000000, 10000000000000], or 'estimate'.
sample_success = True
param_guess = 'estimate'
x, param_guess = initialize_data(sample_success, param_guess)
result = minimize(nll_logn, x0=param_guess, args=(x,), method='Nelder-Mead')
# print(result)
opt_mu, opt_sigma = result.x[0], result.x[1]
view_plot(x, opt_mu, opt_sigma)
For an example of the unsuccessful sample run, one can change sample_success from True to False in the code block just above. The minimization specs for a single run look like:
final_simplex: (array([[62.95901978, 6.59276009],
[62.95903676, 6.59281842],
[62.95908379, 6.59278252]]), array([66263.94460285, 66263.94460285, 66263.94460287]))
fun: 66263.94460285055
message: 'Optimization terminated successfully.'
nfev: 102
nit: 52
status: 0
success: True
x: array([62.95901978, 6.59276009])
Since the mean and sigma of the underlying normal distribution for the unsuccessful sample are 63 and 7, and the fit recovers roughly 62.96 and 6.59, the minimization works successfully. Since I am using the same algorithm for both samples, I am not sure why the second sample does not plot successfully.
