Keras one-hot encoding memory management - best possible way out - python-3.x
I know this problem has been answered in different ways in the past, but I am not able to figure out how to fit those solutions into my code, and I need help. I am using the Cornell Movie Corpus as my dataset, and the end goal is to train an LSTM model for a chatbot. But I am stuck at the initial one-hot encoding step, which runs out of memory. Note that the VM I am training on has 86 GB of memory, yet I still have this issue. In nmt_special_utils_mod.py the one-hot encoding goes beyond the allocated memory and I cannot get past this stage. Any alternative way to do these lines without losing the functionality would be helpful:
Xoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), X)))
Yoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(machine_vocab)), Y)))
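(For scale - the sizes here are illustrative assumptions, not figures from the post - the dense one-hot tensor alone can dwarf 86 GB:)

# Back-of-envelope check with hypothetical sizes: m examples of Tx words each,
# one-hot encoded over a vocabulary of V words, stored as float32 (4 bytes).
m, Tx, V = 200_000, 8, 50_000
gib = m * Tx * V * 4 / 1024**3
print(f"dense one-hot tensor: {gib:.0f} GiB")  # ~298 GiB, far beyond 86 GB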
All of the code is below to make the question clear:
import_corpus_mod.py
Change 1: updated less frequent word removal
def data_load():
    TrainDataSetPath = 'D:\\Script\\Python\\NLP\\chatbotSeq2SeqWithAtt\\ChatBot\\'

    #### initializing libraries ####
    #import numpy as np
    #import tensorflow as tf
    import re
    #import time

    ########### Data Pre-processing Part 1 ##########
    def clean_text(text):
        '''The function cleans known patterns in the text to make it more meaningful'''
        text = text.lower()
        text = re.sub(r"i'm", "i am", text)
        text = re.sub(r"he's", "he is", text)
        text = re.sub(r"she's", "she is", text)
        text = re.sub(r"it's", "it is", text)
        text = re.sub(r"let's", "let us", text)
        text = re.sub(r"that's", "that is", text)
        text = re.sub(r"what's", "what is", text)
        text = re.sub(r"where's", "where is", text)
        text = re.sub(r"how's", "how is", text)
        text = re.sub(r"howz", "how is", text)
        text = re.sub(r"\'ll", " will", text)
        text = re.sub(r"\'ve", " have", text)
        text = re.sub(r"\'re", " are", text)
        text = re.sub(r"\'d", " would", text)
        text = re.sub(r"don't", "do not", text)
        text = re.sub(r"won't", "will not", text)
        text = re.sub(r"can't", "cannot", text)
        text = re.sub(r"wouldn't", "would not", text)
        text = re.sub(r"wasn't", "was not", text)
        text = re.sub(r"haven't", "have not", text)
        text = re.sub(r"\s+", " ", text)
        text = re.sub(r"[-()\"#/#;:<>+=~|{}.?,]", "", text)
        #####Add more below this line######
        #####Add more above this line######
        return text

    lines = open(TrainDataSetPath + 'movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')
    conversations = open(TrainDataSetPath + 'movie_conversations_short.txt', encoding='utf-8', errors='ignore').read().split('\n')

    # Create a dictionary which maps each line ID to its text
    id2line = {}
    for line in lines:
        _line = line.split(' +++$+++ ')
        if len(_line) == 5:
            id2line[_line[0]] = _line[4]

    # Create a list of all conversations
    conversations_ids = []
    for conversation in conversations[:-1]:  # the last line in conversations is blank, hence [:-1]
        # Split, then pick the last part [-1], which is the conversation. Strip the square
        # brackets with [1:-1], then remove quotes and spaces
        _conversation = conversation.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")
        # Append to form a list of lists, splitting on commas
        conversations_ids.append(_conversation.split(","))

    # Separate the questions and answers - assuming that in a conversation the first
    # line is the question and the next line is the answer
    questions = []
    answers = []
    threshold = 5  # words with fewer than 5 occurrences are treated as rare
    for conversation in conversations_ids:
        for i in range(len(conversation) - 1):
            questions.append(id2line[conversation[i]])
            answers.append(id2line[conversation[i + 1]])

    # Cleaning all questions
    clean_questions = []
    for question in questions:
        clean_questions.append(clean_text(question))

    # Cleaning all answers
    clean_answers = []
    for answer in answers:
        clean_answers.append(clean_text(answer))

    # Creating a dictionary that maps each word to its number of occurrences
    word2count = {}
    for question in clean_questions:
        for word in question.split():
            if word not in word2count:
                word2count[word] = 1
            else:
                word2count[word] += 1
    for answer in clean_answers:
        for word in answer.split():
            if word not in word2count:
                word2count[word] = 1
            else:
                word2count[word] += 1

    # Keep only the words that occur at least `threshold` times
    for k in list(word2count):
        if word2count[k] < threshold:
            del word2count[k]

    # Use a set for O(1) membership tests while replacing rare words with <unk>
    cleanest_questions, cleanest_answers, keys_list = [], [], set(word2count)
    for answer in clean_answers:
        ans = []
        for word in answer.split():
            if word in keys_list:
                ans.append(word)
            else:
                ans.append('<unk>')
        cleanest_answers.append(' '.join(ans))
    for question in clean_questions:
        ques = []
        for word in question.split():
            if word in keys_list:
                ques.append(word)
            else:
                ques.append('<unk>')
        cleanest_questions.append(' '.join(ques))
    return cleanest_questions, cleanest_answers
nmt_data_load_asmain_words.py
Change 1: updated less frequent word removal
from tqdm import tqdm
from import_corpus_mod import data_load

def load_dataset(clean_questions, clean_answers):
    """
    Loads a dataset of (question, answer) pairs and builds the two vocabularies.
    """
    human_vocab = set()
    machine_vocab = set()
    dataset = []
    lines = len(clean_questions)
    for i in tqdm(range(lines)):
        hu, mc = clean_questions[i], clean_answers[i]
        if hu is not None:
            dataset.append((hu, mc))
            human_vocab.update(set(hu.split()))
            machine_vocab.update(set(mc.split()))
    human = dict(zip(sorted(human_vocab) + ['<pad>'],
                     list(range(len(human_vocab) + 1))))
    #human = dict(zip(sorted(human_vocab) + ['<pad>'],
    #                 list(range(len(human_vocab) + 1))))
    #human = dict(zip(sorted(human_vocab),
    #                 list(range(len(human_vocab)))))
    machine = dict(zip(sorted(machine_vocab) + ['<pad>'],
                       list(range(len(machine_vocab) + 1))))
    #machine = dict(zip(sorted(machine_vocab) + ['<pad>'],
    #                   list(range(len(machine_vocab) + 1))))
    inv_machine = {v: k for k, v in machine.items()}
    inv_human = {p: q for q, p in human.items()}
    return dataset, human, machine, inv_machine, inv_human

clean_questions, clean_answers = data_load()
dataset, human_vocab, machine_vocab, inv_machine_vocab, inv_human_vocab = load_dataset(clean_questions, clean_answers)
nmt_special_utils_mod.py
import numpy as np
from keras.utils import to_categorical
import keras.backend as K
import matplotlib.pyplot as plt
import sys

# Initiate lists to store the integer versions of the sentences
X_into_int = []
Y_into_int = []

def preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty):
    X, Y = zip(*dataset)
    X = np.asarray([string_to_int(i, Tx, human_vocab) for i in X])
    Y = [string_to_int(t, Ty, machine_vocab) for t in Y]
    # These two lines materialize the full one-hot tensors in memory and are
    # where the out-of-memory error occurs
    Xoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), X)))
    Yoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(machine_vocab)), Y)))
    return X, np.array(Y), Xoh, Yoh
def string_to_int(line, length, vocab):
    """
    Converts a sentence into a list of integers representing the positions of its
    words in the "vocab"
    Arguments:
    line -- input string, e.g. 'hello how are you'
    length -- the number of time steps you'd like; determines if the output is padded or cut
    vocab -- vocabulary, the dictionary used to index every word of your "line"
    Returns:
    ints -- list of integers (size = length) representing the positions of the line's words in the vocabulary
    """
    '''
    # earlier character-level version, kept for reference
    #make lower to standardize
    for string in listofstring:
        string = string.lower()
        string = string.replace(',', '')
        if len(string) > length:
            string = string[:length]
        rep = list(map(lambda x: vocab.get(x, '<unk>'), string))
        if len(string) < length:
            rep += [vocab['<pad>']] * (length - len(string))
        #print (rep)
        return rep
    '''
    # cut the line to `length` words, or pad it with '<pad>' up to `length`
    newlist = []
    if len(line.split()) > length:
        line = line.split()
        for i in range(length):
            newlist.append(line[i])
        line = ' '.join(newlist)
    else:
        line = line + ' <pad>' * (length - len(line.split()))
    ints = []
    for word in line.split():
        if word not in vocab:
            ints.append(vocab['<unk>'])
        else:
            ints.append(vocab[word])
    return ints
def int_to_string(ints, inv_vocab):
    """
    Outputs a list of words based on a list of indexes into the vocabulary
    Arguments:
    ints -- list of integers representing indexes into the vocabulary
    inv_vocab -- dictionary mapping indexes back to words
    Returns:
    l -- list of words corresponding to the indexes in ints via the inv_vocab mapping
    """
    l = [inv_vocab[i] for i in ints]
    return l
EXAMPLES = ['3 May 1979', '5 Apr 09', '20th February 2016', 'Wed 10 Jul 2007']
def softmax(x, axis=1):
    """Softmax activation function.
    # Arguments
        x : Tensor.
        axis: Integer, axis along which the softmax normalization is applied.
    # Returns
        Tensor, output of softmax transformation.
    # Raises
        ValueError: In case `dim(x) == 1`.
    """
    ndim = K.ndim(x)
    if ndim == 2:
        return K.softmax(x)
    elif ndim > 2:
        e = K.exp(x - K.max(x, axis=axis, keepdims=True))
        s = K.sum(e, axis=axis, keepdims=True)
        return e / s
    else:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')
def plot_attention_map(model, input_vocabulary, inv_output_vocabulary, text, n_s=128, num=6, Tx=30, Ty=10):
    """
    Plot the attention map.
    """
    attention_map = np.zeros((10, 30))
    Ty, Tx = attention_map.shape
    s0 = np.zeros((1, n_s))
    c0 = np.zeros((1, n_s))
    layer = model.layers[num]
    encoded = np.array(string_to_int(text, Tx, input_vocabulary)).reshape((1, 30))
    encoded = np.array(list(map(lambda x: to_categorical(x, num_classes=len(input_vocabulary)), encoded)))
    f = K.function(model.inputs, [layer.get_output_at(t) for t in range(Ty)])
    r = f([encoded, s0, c0])
    for t in range(Ty):
        for t_prime in range(Tx):
            attention_map[t][t_prime] = r[t][0, t_prime, 0]
    # Normalize attention map
    # row_max = attention_map.max(axis=1)
    # attention_map = attention_map / row_max[:, None]
    prediction = model.predict([encoded, s0, c0])
    predicted_text = []
    for i in range(len(prediction)):
        predicted_text.append(int(np.argmax(prediction[i], axis=1)))
    predicted_text = list(predicted_text)
    predicted_text = int_to_string(predicted_text, inv_output_vocabulary)
    text_ = list(text)
    # get the lengths of the string
    input_length = len(text)
    output_length = Ty
    # Plot the attention_map
    plt.clf()
    f = plt.figure(figsize=(8, 8.5))
    ax = f.add_subplot(1, 1, 1)
    # add image
    i = ax.imshow(attention_map, interpolation='nearest', cmap='Blues')
    # add colorbar
    cbaxes = f.add_axes([0.2, 0, 0.6, 0.03])
    cbar = f.colorbar(i, cax=cbaxes, orientation='horizontal')
    cbar.ax.set_xlabel('Alpha value (Probability output of the "softmax")', labelpad=2)
    # add labels
    ax.set_yticks(range(output_length))
    ax.set_yticklabels(predicted_text[:output_length])
    ax.set_xticks(range(input_length))
    ax.set_xticklabels(text_[:input_length], rotation=45)
    ax.set_xlabel('Input Sequence')
    ax.set_ylabel('Output Sequence')
    # add grid and legend
    ax.grid()
    #f.show()
    return attention_map
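(Aside: one way to avoid materializing Xoh and Yoh at all - a sketch assuming batch-wise training via fit_generator is acceptable; the first answer below develops the same idea - is to one-hot encode per batch in a generator:)

import numpy as np
from keras.utils import to_categorical

def one_hot_batches(X, Y, human_vocab, machine_vocab, s0, c0, batch_size=64):
    """Sketch: yield one-hot batches on the fly instead of building Xoh/Yoh
    for the whole dataset. X and Y are the integer arrays returned by
    preprocess_data; s0 and c0 are the zero initial-state arrays."""
    m = len(X)
    while True:
        for start in range(0, m, batch_size):
            xb, yb = X[start:start + batch_size], Y[start:start + batch_size]
            Xoh = np.array([to_categorical(x, num_classes=len(human_vocab)) for x in xb])
            Yoh = np.array([to_categorical(y, num_classes=len(machine_vocab)) for y in yb])
            n = len(xb)
            # the attention model below takes [Xoh, s0, c0] and a per-timestep target list
            yield [Xoh, s0[:n], c0[:n]], list(Yoh.swapaxes(0, 1))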
nmt_code_mod.py (the main code)
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 10 16:31:44 2018
#author: Anirban
"""
from keras.layers import Bidirectional, Concatenate, Dot, Input, LSTM
from keras.layers import RepeatVector, Dense, Activation
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.models import Model
import keras.backend as K
import numpy as np
from nmt_data_load_asmain_words import load_dataset
from import_corpus_mod import data_load
from nmt_special_utils_mod import *
epochs = 50
clean_questions, clean_answers = data_load()
dataset, human_vocab, machine_vocab, inv_machine_vocab, inv_human_vocab = load_dataset(clean_questions, clean_answers)
m = len(clean_questions)
Tx = 8
Ty = 8
X, Y, Xoh, Yoh = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty)
print("X.shape:", X.shape)
print("Y.shape:", Y.shape)
print("Xoh.shape:", Xoh.shape)
print("Yoh.shape:", Yoh.shape)
# Defined shared layers as global variables
repeator = RepeatVector(Tx)
concatenator = Concatenate(axis=-1)
densor1 = Dense(20, activation = "tanh")
densor2 = Dense(1, activation = "relu")
activator = Activation(softmax, name='attention_weights') # We are using a custom softmax(axis = 1) loaded from nmt_special_utils
dotor = Dot(axes = 1)
def one_step_attention(a, s_prev):
    """
    Performs one step of attention: outputs a context vector computed as a dot product of the attention weights
    "alphas" and the hidden states "a" of the Bi-LSTM.
    Arguments:
    a -- hidden state output of the Bi-LSTM, numpy-array of shape (m, Tx, 2*n_a)
    s_prev -- previous hidden state of the (post-attention) LSTM, numpy-array of shape (m, n_s)
    Returns:
    context -- context vector, input of the next (post-attention) LSTM cell
    """
    ### START CODE HERE ###
    # Use repeator to repeat s_prev to be of shape (m, Tx, n_s) so that you can concatenate it with all hidden states "a" (≈ 1 line)
    s_prev = repeator(s_prev)
    # Use concatenator to concatenate a and s_prev on the last axis (≈ 1 line)
    concat = concatenator([a, s_prev])
    # Use densor1 to propagate concat through a small fully-connected neural network to compute the "intermediate energies" variable e. (≈ 1 line)
    e = densor1(concat)
    # Use densor2 to propagate e through a small fully-connected neural network to compute the "energies" variable energies. (≈ 1 line)
    energies = densor2(e)
    # Use "activator" on "energies" to compute the attention weights "alphas" (≈ 1 line)
    alphas = activator(energies)
    # Use dotor together with "alphas" and "a" to compute the context vector to be given to the next (post-attention) LSTM cell (≈ 1 line)
    context = dotor([alphas, a])
    ### END CODE HERE ###
    return context
n_a = 32
n_s = 64
post_activation_LSTM_cell = LSTM(n_s, return_state = True)
output_layer = Dense(len(machine_vocab), activation=softmax)
def model(Tx, Ty, n_a, n_s, human_vocab_size, machine_vocab_size):
    """
    Arguments:
    Tx -- length of the input sequence
    Ty -- length of the output sequence
    n_a -- hidden state size of the Bi-LSTM
    n_s -- hidden state size of the post-attention LSTM
    human_vocab_size -- size of the python dictionary "human_vocab"
    machine_vocab_size -- size of the python dictionary "machine_vocab"
    Returns:
    model -- Keras model instance
    """
    # Define the inputs of your model with a shape (Tx,)
    # Define s0 and c0, initial hidden state for the decoder LSTM of shape (n_s,)
    X = Input(shape=(Tx, human_vocab_size))
    s0 = Input(shape=(n_s,), name='s0')
    c0 = Input(shape=(n_s,), name='c0')
    s = s0
    c = c0

    # Initialize empty list of outputs
    outputs = []

    ### START CODE HERE ###
    # Step 1: Define your pre-attention Bi-LSTM. Remember to use return_sequences=True. (≈ 1 line)
    a = Bidirectional(LSTM(n_a, return_sequences=True), input_shape=(m, Tx, n_a * 2))(X)
    # Step 2: Iterate for Ty steps
    for t in range(Ty):
        # Step 2.A: Perform one step of the attention mechanism to get back the context vector at step t (≈ 1 line)
        context = one_step_attention(a, s)
        # Step 2.B: Apply the post-attention LSTM cell to the "context" vector.
        # Don't forget to pass: initial_state = [hidden state, cell state] (≈ 1 line)
        s, _, c = post_activation_LSTM_cell(context, initial_state=[s, c])
        # Step 2.C: Apply Dense layer to the hidden state output of the post-attention LSTM (≈ 1 line)
        out = output_layer(s)
        # Step 2.D: Append "out" to the "outputs" list (≈ 1 line)
        outputs.append(out)
    # Step 3: Create model instance taking three inputs and returning the list of outputs. (≈ 1 line)
    model = Model(inputs=[X, s0, c0], outputs=outputs)
    ### END CODE HERE ###
    return model
model = model(Tx, Ty, n_a, n_s, len(human_vocab), len(machine_vocab))
opt = Adam(lr=0.05, beta_1=0.9, beta_2=0.999,decay=0.01)
model.compile(loss='categorical_crossentropy', optimizer=opt,metrics=['accuracy'])
s0 = np.zeros((m, n_s))
c0 = np.zeros((m, n_s))
outputs = list(Yoh.swapaxes(0,1))
model.fit([Xoh, s0, c0], outputs, epochs=epochs, batch_size=5)
EXAMPLES = ['can we make this quick roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad again',
            'the thing is cameron i am at the mercy of a particularly hideous breed of loser my sister i cannot date until she does',
            'Hello how are you']
#EXAMPLES = ['13 May 1979', 'Tue 11 Jul 2007','Saturday May 9 2018', 'March 3 2001','March 3rd 2001', '1 March 2001','23 May 2017']
for example in EXAMPLES:
    source = np.asarray([string_to_int(example, Tx, human_vocab)])
    # need a try block here to prevent errors if the vocab is small and the example has words not in the vocab
    source = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), source)))  #.swapaxes(0,1)
    prediction = model.predict([source, s0, c0])
    prediction = np.argmax(prediction, axis=-1)
    output = [inv_machine_vocab[int(i)] for i in prediction]
    pads = output.count('<pad>')
    output = output[0:(len(output) - pads)]
    print("source:", example)
    print("output:", ' '.join(output))
Note: the code is, as-is, the code of a very famous 2016 research paper that converts any date/time into a computer-understandable date/time. I was trying to reuse it for our chatbot - a Seq2Seq with attention model (bi-directional). The code works: when only 1,000 conversations of the movie corpus are loaded, it runs fine. When the full corpus is loaded, it fails due to memory overload.
EDIT
Thank you for the collaborative effort on this problem - I really appreciate the trouble you are taking to go through the code and find the best possible solution. As you instructed, I have updated import_corpus_mod.py to incorporate threshold = 5 and, at the very beginning, to convert the least frequent words (fewer than 5 occurrences) to <unk>. This change forced another small change in nmt_data_load_asmain_words.py, removing the addition of <unk> there.
Now, based on your other point and the code you shared, I commented out the lines below in nmt_special_utils_mod.py:
#Xoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), X)))
#Yoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(machine_vocab)), Y)))
and changed the input directly, based on your guidance:
Xi = Input(shape=(Tx,))
X = Embedding( human_vocab_size, 100, embeddings_initializer='uniform', input_length=Tx , trainable=True )(Xi)
s0 = Input(shape=(n_s,), name='s0')
c0 = Input(shape=(n_s,), name='c0')
s = s0
c = c0
I got a lot of errors:
runfile('D:/Script/Python/NLP/chatbotSeq2SeqWithAtt/ChatBot/nmt_code_mod.py', wdir='D:/Script/Python/NLP/chatbotSeq2SeqWithAtt/ChatBot')
Reloaded modules: nmt_data_load_asmain_words, import_corpus_mod, nmt_special_utils_mod
100%|██████████| 384/384 [00:00<00:00, 24615.06it/s]
100%|██████████| 384/384 [00:00<?, ?it/s]
X.shape: (384, 8)
Y.shape: (384, 8)
D:\Python\Anaconda3\lib\site-packages\keras\engine\topology.py:1592: UserWarning: Model inputs must come from a Keras Input layer, they cannot be the output of a previous non-Input layer. Here, a tensor specified as input to "model_2" was not an Input tensor, it was generated by layer embedding_1.
Note that input tensors are instantiated via `tensor = Input(shape)`.
The tensor that caused the issue was: embedding_1/Gather:0
str(x.name))
Traceback (most recent call last):
File "<ipython-input-44-addb6f9e6bc1>", line 1, in <module>
runfile('D:/Script/Python/NLP/chatbotSeq2SeqWithAtt/ChatBot/nmt_code_mod.py', wdir='D:/Script/Python/NLP/chatbotSeq2SeqWithAtt/ChatBot')
File "D:\Python\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "D:\Python\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "D:/Script/Python/NLP/chatbotSeq2SeqWithAtt/ChatBot/nmt_code_mod.py", line 138, in <module>
model = model(Tx, Ty, n_a, n_s, len(human_vocab), len(machine_vocab))
File "D:/Script/Python/NLP/chatbotSeq2SeqWithAtt/ChatBot/nmt_code_mod.py", line 132, in model
model = Model(inputs=[X,s0,c0],outputs=outputs)
File "D:\Python\Anaconda3\lib\site-packages\keras\legacy\interfaces.py", line 91, in wrapper
return func(*args, **kwargs)
File "D:\Python\Anaconda3\lib\site-packages\keras\engine\topology.py", line 1652, in __init__
layer.__class__.__name__))
TypeError: Input layers to a `Model` must be `InputLayer` objects. Received inputs: [<tf.Tensor 'embedding_1/Gather:0' shape=(?, 8, 100) dtype=float32>, <tf.Tensor 's0_1:0' shape=(?, 64) dtype=float32>, <tf.Tensor 'c0_1:0' shape=(?, 64) dtype=float32>]. Input 0 (0-based) originates from layer type `Embedding`
So I am reverting the code back here for nmt_code_mod.py and nmt_special_utils_mod.py.
The problem is not the one-hot encoding itself but rather storing the entire dataset in memory. The wise choice is to use a generator, or a Sequence, which will allow you to load and encode the data on the fly. This is commonly done for large image datasets, for example.
I would recommend performing all your pre-processing and saving the input/output pairs, without encoding, as a CSV file; then you can create a generator that lazily loads and encodes:
from keras.utils import Sequence
import numpy as np

class MySequence(Sequence):
    def __init__(self, data, batch_size):
        self.data = data
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.data) / float(self.batch_size)))

    def __getitem__(self, batch_id):
        # Get the corresponding batch of data...
        # ...and one-hot encode it here, one batch at a time
        return X, Y
Note that generators (and each Sequence[i]) return a single batch.
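A hedged usage sketch (the names pairs and epochs are assumptions; for the attention model in the question, __getitem__ would also have to return the s0/c0 initial states and the per-timestep target list):

train_seq = MySequence(pairs, batch_size=64)
# Keras 2.x entry point for Sequences and generators; with a Sequence,
# steps_per_epoch is inferred from len(train_seq)
model.fit_generator(train_seq, epochs=epochs)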
I would not recommend using one-hot encodings with a dense matrix.
If you have a vocabulary of 100,000 words, a 100,000 × 100,000 matrix of 64-bit floats consumes more than 70 GB of RAM.
You can try using a sparse matrix instead, but I guess that changes the rest of your code; you may take a look at this answer.
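For instance, a minimal sketch (my illustration, not code from the answer) of building one-hot rows as a SciPy CSR matrix; most Keras layers of that era expected dense input, so you would still densify one small batch at a time with .toarray():

import numpy as np
from scipy.sparse import csr_matrix

def one_hot_sparse(word_ids, vocab_size):
    """word_ids: 1-D array of vocabulary indices -> (n, vocab_size) CSR matrix."""
    word_ids = np.asarray(word_ids)
    n = len(word_ids)
    data = np.ones(n, dtype=np.float32)
    return csr_matrix((data, (np.arange(n), word_ids)), shape=(n, vocab_size))

# densify only a small slice at a time, e.g.:
# batch = one_hot_sparse(ids[i:i + 64], V).toarray()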
Alternatively, you could use word-embedding representations, which are compact, memory-friendly, and used by all state-of-the-art NLP systems.
In any case, one thing you must do with your model is handle embedding inputs using a proper Embedding layer.
This layer stores the embedding matrix once, and then you can build your training samples by giving only a single integer per word, representing its index in the vocabulary.
If you want one-hot encodings, you can build an Embedding layer with an N × N identity matrix using a Keras initializer, where N is the size of the vocabulary. Then you can pass in the indexes of the words as integers. This will increase the size of your model, but it will reduce the size of your batches.
If you want word2vec embeddings, you can load an embedding matrix with dimensions N × V, where N is the size of the vocabulary and V is the dimension of the embeddings. You will notice that V is normally set to 100 or 200 dimensions, which is much smaller than N, saving you a lot of memory.
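For example, a sketch of loading pre-trained vectors into a Keras Embedding layer (embedding_matrix is an assumed pre-built (N, V) NumPy array filled from word2vec/GloVe lookups):

from keras.layers import Embedding

N = len(human_vocab)  # vocabulary size
V = 100               # embedding dimension
embedding_layer = Embedding(N, V,
                            weights=[embedding_matrix],  # assumed built elsewhere
                            input_length=Tx,
                            trainable=False)  # freeze the pre-trained vectors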
EDIT: to clarify the usage of embeddings in your case:
You do:
X = Input(shape=(Tx, human_vocab_size))
s0 = Input(shape=(n_s,), name='s0')
c0 = Input(shape=(n_s,), name='c0')
s = s0
c = c0
Instead, you can do the one-hot encoding this way:
Xi = Input(shape=(Tx,))
X = Embedding( human_vocab_size, human_vocab_size, embeddings_initializer=keras.initializers.Identity(), input_length=Tx )(Xi)
s0 = Input(shape=(n_s,), name='s0')
c0 = Input(shape=(n_s,), name='c0')
s = s0
c = c0
By doing this, you can build your training samples using only the word indexes rather than the one-hot vectors. This will save space in the training samples, but your model will be larger in size.
If it is still too large, you will have no choice but to use dense embeddings. To do so, you can do the following:
Xi = Input(shape=(Tx,))
X = Embedding( human_vocab_size, 100, embeddings_initializer='uniform', input_length=Tx , trainable=True )(Xi)
s0 = Input(shape=(n_s,), name='s0')
c0 = Input(shape=(n_s,), name='c0')
s = s0
c = c0
This initializes the embeddings randomly, with a compact representation (dimension 100 instead of human_vocab_size). This would save you a lot of memory.
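With either Embedding variant, two further changes follow (a sketch based on the question's code - this is exactly what the TypeError in the question's EDIT complains about): the Model must receive the Input tensor Xi rather than the Embedding output X, and training then feeds the integer matrix X instead of the huge one-hot Xoh. The targets can stay one-hot, or become integers with a sparse_categorical_crossentropy loss.

# inside model(): pass the Input tensor, not the embedded tensor
model = Model(inputs=[Xi, s0, c0], outputs=outputs)

# training: the integer-encoded X replaces the one-hot Xoh
model.fit([X, s0, c0], outputs, epochs=epochs, batch_size=5)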
Finally, you could reduce the size of your vocabulary by putting everything in lowercase and/or replacing rare words (those that appear only once or twice in the corpus) with a special token such as "RARE".
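A small sketch of the rare-word replacement (replace_rare is a hypothetical helper, not code from the answer):

from collections import Counter

def replace_rare(sentences, min_count=3, token='RARE'):
    """Lowercase everything and replace words seen fewer than min_count times."""
    counts = Counter(w for s in sentences for w in s.lower().split())
    return [' '.join(w if counts[w] >= min_count else token
                     for w in s.lower().split())
            for s in sentences]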
Related
What would be the equivalent of keras.layers.Masking in pytorch?
I have time-series sequences whose lengths I needed to fix to a constant by padding the matrices with zeros; using keras.layers.Masking in Keras, I could make further computations neglect those padded zeros. I am wondering how this could be done in PyTorch. Either I need to do the padding in PyTorch and PyTorch can't handle sequences with varying lengths - in which case, what is the equivalent of Keras's Masking layer in PyTorch? - or, if PyTorch does handle sequences with varying lengths, how is that done?
You can use the PackedSequence class as the equivalent of Keras masking. You can find more features at torch.nn.utils.rnn. Here is an example of packing variable-length sequence inputs for an RNN:

import torch
import torch.nn as nn
from torch.autograd import Variable

batch_size = 3
max_length = 3
hidden_size = 2
n_layers = 1

# container
batch_in = torch.zeros((batch_size, 1, max_length))

# data
vec_1 = torch.FloatTensor([[1, 2, 3]])
vec_2 = torch.FloatTensor([[1, 2, 0]])
vec_3 = torch.FloatTensor([[1, 0, 0]])
batch_in[0] = vec_1
batch_in[1] = vec_2
batch_in[2] = vec_3
batch_in = Variable(batch_in)
seq_lengths = [3, 2, 1]  # list of integers holding information about the batch size at each sequence step

# pack it
pack = torch.nn.utils.rnn.pack_padded_sequence(batch_in, seq_lengths, batch_first=True)

>>> pack
PackedSequence(data=Variable containing:
 1  2  3
 1  2  0
 1  0  0
[torch.FloatTensor of size 3x3]
, batch_sizes=[3])

# initialize
rnn = nn.RNN(max_length, hidden_size, n_layers, batch_first=True)
h0 = Variable(torch.randn(n_layers, batch_size, hidden_size))

# forward
out, _ = rnn(pack, h0)

# unpack
unpacked, unpacked_len = torch.nn.utils.rnn.pad_packed_sequence(out)

>>> unpacked
Variable containing:
(0 ,.,.) =
 -0.7883 -0.7972
  0.3367 -0.6102
  0.1502 -0.4654
[torch.FloatTensor of size 1x3x2]

You may also find this article useful [jump to the section "How the PackedSequence object works"] - link
You can use a packed sequence to mask a timestep in the sequence dimension:

batch_mask = ...  # boolean mask, e.g. (seq x batch)
# move `padding` to the right place so it will be cut when packing
compact_seq = torch.zeros_like(x)
for i, seq_len in enumerate(batch_mask.sum(0)):
    compact_seq[:seq_len, i] = x[batch_mask[:, i], i]
# pack in the sequence dimension (the number of agents)
packed_x = pack_padded_sequence(compact_seq, batch_mask.sum(0).cpu().numpy(), enforce_sorted=False)
packed_scores, rnn_hxs = nn.GRU(packed_x, rnn_hxs)
# restore the sequence dimension
scores, _ = pad_packed_sequence(packed_scores)
# restore order, moving padding back into place
scores = torch.zeros((*batch_mask.shape, scores.size(-1))).to(scores.device).masked_scatter(batch_mask.unsqueeze(-1), scores)

or instead use a masked select/scatter to mask in the batch dimension:

batch_mask = torch.any(x, -1).unsqueeze(-1)  # boolean mask (batch, 1)
batch_x = torch.masked_select(x, batch_mask).reshape(-1, x.size(-1))
batch_rnn_hxs = torch.masked_select(rnn_hxs, batch_mask).reshape(-1, rnn_hxs.size(-1))
batch_rnn_hxs = nn.GRUCell(batch_x, batch_rnn_hxs)
rnn_hxs = rnn_hxs.masked_scatter(batch_mask, batch_rnn_hxs)  # restore batch

Note that using the scatter function is safe for gradient backpropagation.
Pad data using tf.data.Dataset
I have to use tf.data.Dataset to create an input pipeline for an RNN model in TensorFlow. I am providing basic code by which I need to pad the data in a batch with a pad token and use it for further manipulation.

import pandas as pd
import numpy as np
import tensorflow as tf
import functools

total_data_size = 10000
embedding_dimension = 25
max_len = 17
varying_length = np.random.randint(max_len, size=(10000))  # varying lengths
X = np.array([np.random.randint(1000, size=(value)).tolist() for index, value in enumerate(varying_length)])  # data of varying length
Y = np.random.randint(2, size=(total_data_size)).astype(np.int32)  # binary target
embedding = np.random.uniform(-1, 1, (1000, embedding_dimension))  # word embedding

def gen():
    for index in range(len(X)):
        yield X[index], Y[index]

dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.int32))
dataset = dataset.batch(batch_size=25)
padded_shapes = (tf.TensorShape([None]))  # sentence of unknown size
padding_values = (tf.constant(-111))      # the value with which the pad index needs to be filled
dataset = (dataset
           .padded_batch(25, padded_shapes=padded_shapes, padding_values=padding_values)
           )
iter2 = dataset.make_initializable_iterator()
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run(iter2.initializer)
print(sess.run(iter2.get_next()))

I hope the code is self-explanatory with comments. But I am getting the following error:

InvalidArgumentError (see above for traceback): Cannot batch tensors with different shapes in component 0. First element had shape [11] and element 1 had shape [12].
[[Node: IteratorGetNext = IteratorGetNext[output_shapes=[[?,?], [?]], output_types=[DT_INT32, DT_INT32], _device="/job:localhost/replica:0/task:0/device:CPU:0"](Iterator)]]
I believe that since your generator yields two outputs, your padded_shapes and padding_values tuples must have a length of two. For me, this works:

dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.int32))
dataset = dataset.batch(batch_size=25)
padded_shapes = (tf.TensorShape([None]), tf.TensorShape([None]))  # sentence of unknown size
padding_values = (tf.constant(-111), tf.constant(-111))  # the value with which the pad index needs to be filled
dataset = (dataset
           .padded_batch(25, padded_shapes=padded_shapes, padding_values=padding_values)
           )
iter2 = dataset.make_initializable_iterator()
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run(iter2.initializer)
Finally got the answer. The issue was that for the second padded shape, instead of TensorShape([None]) we should provide [], because the second item returned by the generator is a scalar. If using TensorShape([None]), make sure the generator returns a vector:

import pandas as pd
import numpy as np
import tensorflow as tf
import functools

total_data_size = 10000
embedding_dimension = 25
max_len = 17
varying_length = np.random.randint(max_len, size=(10000))  # varying lengths
X = np.array([np.random.randint(1000, size=(value)).tolist() for index, value in enumerate(varying_length)])  # data of varying length
Y = np.random.randint(2, size=(total_data_size)).astype(np.int32)  # binary target
embedding = np.random.uniform(-1, 1, (1000, embedding_dimension))  # word embedding

def gen():
    for index in range(len(X)):
        yield X[index], Y[index]

dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.int32), (tf.TensorShape([None]), []))
padded_shapes = (tf.TensorShape([None]), [])  # sentence of unknown size
dataset = (dataset
           .padded_batch(25, padded_shapes=padded_shapes, padding_values=(-111, 0))
           )
iter2 = dataset.make_initializable_iterator()
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run(iter2.initializer)
sess.run(iter2.get_next())
Keras fit_generator() & "Input arrays should have the same number of samples as target arrays"
So far I'm trying to implement fit_generator for sentiment analysis, as I only have a small GPU and a big dataset. But I keep getting this error:

Using Theano backend.
Can not use cuDNN on context None: cannot compile with cuDNN. We got this error:
b'In file included from C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\include/driver_types.h:53:0,\r\n from C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\include/cudnn.h:63,\r\n from C:\\Users\\Def\\AppData\\Local\\Temp\\try_flags_p2iwer2o.c:4:\r\nC:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\include/host_defines.h:84:0: warning: "__cdecl" redefined\r\n #define __cdecl\r\n ^\r\n<built-in>: note: this is the location of the previous definition\r\nd000029.o:(.idata$5+0x0): multiple definition of `__imp___C_specific_handler\'\r\nd000026.o:(.idata$5+0x0): first defined here\r\nC:/Users/Def/Anaconda3/envs/Final/Library/mingw-w64/bin/../lib/gcc/x86_64-w64-mingw32/5.3.0/../../../../x86_64-w64-mingw32/lib/../lib/crt2.o: In function `__tmainCRTStartup\':\r\nC:/repo/mingw-w64-crt-git/src/mingw-w64/mingw-w64-crt/crt/crtexe.c:285: undefined reference to `_set_invalid_parameter_handler\'\r\ncollect2.exe: error: ld returned 1 exit status\r\n'
Mapped name None to device cuda: GeForce GTX 960M (0000:01:00.0)
Epoch 1/10
Traceback (most recent call last):
  File "C:/Users/Def/PycharmProjects/KerasUkExpenditure/TweetParsing.py", line 136, in <module>
    epochs=10)
  File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\legacy\interfaces.py", line 88, in wrapper
    return func(*args, **kwargs)
  File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\models.py", line 1097, in fit_generator
    initial_epoch=initial_epoch)
  File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\legacy\interfaces.py", line 88, in wrapper
    return func(*args, **kwargs)
  File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\engine\training.py", line 1876, in fit_generator
    class_weight=class_weight)
  File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\engine\training.py", line 1614, in train_on_batch
    check_batch_axis=True)
  File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\engine\training.py", line 1307, in _standardize_user_data
    _check_array_lengths(x, y, sample_weights)
  File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\engine\training.py", line 229, in _check_array_lengths
    'and ' + str(list(set_y)[0]) + ' target samples.')
ValueError: Input arrays should have the same number of samples as target arrays. Found 1000 input samples and 1 target samples.

I have a matrix that is 1000 elements long, since I only have a maximum corpus of 1000 words, which is specified in the Tokenizer(). I then have the sentiment, which is either a 0 for negative or a 1 for positive. My question is: why do I receive the error? I have tried to use the transform on both the data and labels and I still receive the same error. Here is my code:

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.preprocessing.text import Tokenizer
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import re

"""
the amount of samples out of the 1 million to use; my 960m 2GB can only handle
about 30,000ish at the moment, depending on the number of neurons in the deep
layer and the number of layers.
"""
maxSamples = 3000

#Load the CSV and get the correct columns
data = pd.read_csv("C:\\Users\\Def\\Desktop\\Sentiment Analysis Dataset1.csv")
dx = pd.DataFrame()
dy = pd.DataFrame()
dy[['Sentiment']] = data[['Sentiment']]
dx[['SentimentText']] = data[['SentimentText']]
dataY = dy.iloc[0:maxSamples]
dataX = dx.iloc[0:maxSamples]
testY = dy.iloc[maxSamples: maxSamples + 1000]
testX = dx.iloc[maxSamples: maxSamples + 1000]

"""
here I filter the data and clean it up by removing # tags, hyperlinks and
also any characters that are not alpha-numeric.
"""
def removeTagsAndLinks(dataframe):
    for x in dataframe.iterrows():
        #Removes Hyperlinks
        x[1].values[0] = re.sub("(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,#?^=%&:/~+#-]*[\w#?^=%&/~+#-])?", "", str(x[1].values[0]))
        #Removes # tags
        x[1].values[0] = re.sub("#\\w+", '', str(x[1].values[0]))
        #keeps only alpha-numeric chars
        x[1].values[0] = re.sub("\W+", ' ', str(x[1].values[0]))
    return dataframe

xData = removeTagsAndLinks(dataX)
xTest = removeTagsAndLinks(testX)

"""
This loop looks for any Tweets with characters shorter than 2 and, once found,
writes the index of that Tweet to an array so I can remove it from the
Dataframe of sentiment and the list of Tweets later
"""
indexOfBlankStrings = []
for index, string in enumerate(xData):
    if len(string) < 2:
        indexOfBlankStrings.append(index)

for row in indexOfBlankStrings:
    dataY.drop(row, axis=0, inplace=True)

"""
This makes a BOW model out of all the tweets, then creates a vector for each
of the tweets containing all the words from the BOW model; each vector is the
same size because the network expects it
"""
def vectorise(tokenizer, list):
    return tokenizer.fit_on_texts(list)

#Make BOW model and vectorise it
t = Tokenizer(lower=False, num_words=1000)
t.fit_on_texts(dataX.iloc[:, 0].tolist())
t.fit_on_texts(dataX.iloc[:, 0].tolist())

"""
Here I'm experimenting with multiple layers of the total amount of words in
the syllabus divided by ^2 - this has given me quite accurate results
compared to random guesses for the number of neurons.
"""
l1 = int(xData.shape[0] / 4)  #Too big for my GPU
l2 = int(xData.shape[0] / 8)  #Too big for my GPU
l3 = int(xData.shape[0] / 16)
l4 = int(xData.shape[0] / 32)
l5 = int(xData.shape[0] / 64)
l6 = int(xData.shape[0] / 128)

#Make the model
model = Sequential()
model.add(Dense(l1, input_dim=xData.shape[1]))
model.add(Dropout(0.15))
model.add(Dense(l2))
model.add(Dropout(0.2))
model.add(Dense(l3))
model.add(Dropout(0.2))
model.add(Dense(l4))
model.add(Dense(1, activation='relu'))

#Compile the model
model.compile(optimizer='RMSProp', loss='binary_crossentropy', metrics=['acc'])

"""
This here will use multiple batches to train the model.
startIndex: the starting index of the array from which you want to start training the network.
dataRange: the number of elements used to train the network in each batch, so since dataRange = 1000 this means it goes from startIndex...dataRange OR 0...1000
amountOfEpochs: this is kinda self-explanatory; the more epochs, the more it is supposed to learn, AKA updates to the optimisation algo's numbers
"""
amountOfEpochs = 1
dataRange = 1000
startIndex = 0

def generator(tokenizer, data, labels, totalSize=maxSamples, startIndex=0):
    l = labels.as_matrix()
    while True:
        for i in range(startIndex, totalSize):
            batch_features = tokenizer.texts_to_matrix(xData.iloc[i])
            batch_labels = l[i]
            yield batch_features, batch_labels

derp = generator(t, data=xData, labels=dataY)

##This runs the model in batches AKA load a little, then process, then load a little more
for amountOfData in range(1000, maxSamples, 1000):
    #(loss, acc) = model.train_on_batch(x=dim[startIndex:amountOfData], y=np.asarray(dataY.iloc[startIndex:amountOfData]))
    history = model.fit_generator(generator=generator(tokenizer=t, data=xData, labels=dataY),
                                  steps_per_epoch=1, epochs=10)

Thanks
The problem you are having is that the number of samples in your input array does not equal the number of samples in your target array; in other words, the numbers of rows in your matrices do not match. The problem stems from your generator function: you index the labels as batch_labels = l[i], which returns only one sample (one row of the matrix), when instead it should be something like batch_labels = l[i:i+1000]. However, there are other problems with your use of fit_generator. You should not be using it within a loop; I don't see how that benefits the program, and calling fit_generator in a loop defeats the purpose of using a generator. The function you would use to train on an individual batch of data is train_on_batch(), as seen in the docs.
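A corrected generator along those lines might look like this (a sketch; the batch size of 1000 and the era-appropriate .as_matrix() call are carried over from the question):

def batch_generator(tokenizer, data, labels, batch_size=1000, total_size=maxSamples):
    l = labels.as_matrix()  # in current pandas this would be labels.values
    while True:
        for i in range(0, total_size, batch_size):
            # encode a whole slice of tweets, and slice the labels to match
            batch_features = tokenizer.texts_to_matrix(data.iloc[i:i + batch_size, 0])
            batch_labels = l[i:i + batch_size]
            yield batch_features, batch_labels

history = model.fit_generator(batch_generator(t, xData, dataY),
                              steps_per_epoch=maxSamples // 1000,
                              epochs=10)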
Trying to use the Caffe classifier causes a "sequence argument must have length equal to input rank" error
I am trying to use the Caffe.Classifier class and its predict() method on my ImageNet-trained caffemodel. Images were resized to 256x256 and crops of 227x227 were used to train the net. Everything is simple and straightforward, yet I keep getting weird errors such as the following:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-7-3b440ebf1f6e> in <module>()
     17                          image_dims=(256, 256))
     18
---> 19 out = net.predict([image_caffe], oversample=True)
     20 print(labels[out[0].argmax()].strip(),' (', out[0][out[0].argmax()] , ')')
     21 plabel = int(labels[out[0].argmax()].strip())

<ipython-input-5-e6ae1810b820> in predict(self, inputs, oversample)
     65         for ix, in_ in enumerate(inputs):
     66             print('image dims = ',self.image_dims[0],',',self.image_dims[1] ,'_in = ',in_.shape)
---> 67             input_[ix] = caffe.io.resize_image(in_, self.image_dims)

C:\Users\Master\Anaconda3\envs\anaconda35\lib\site-packages\caffe\io.py in resize_image(im, new_dims, interp_order)
    335         # ndimage interpolates anything but more slowly.
    336         scale = tuple(np.array(new_dims, dtype=float) / np.array(im.shape[:2]))
--> 337         resized_im = zoom(im, scale + (1,), order=interp_order)
    338     return resized_im.astype(np.float32)
    339

C:\Users\Master\Anaconda3\envs\anaconda35\lib\site-packages\scipy\ndimage\interpolation.py in zoom(input, zoom, output, order, mode, cval, prefilter)
    588     else:
    589         filtered = input
--> 590     zoom = _ni_support._normalize_sequence(zoom, input.ndim)
    591     output_shape = tuple(
    592             [int(round(ii * jj)) for ii, jj in zip(input.shape, zoom)])

C:\Users\Master\Anaconda3\envs\anaconda35\lib\site-packages\scipy\ndimage\_ni_support.py in _normalize_sequence(input, rank, array_type)
     63     if len(normalized) != rank:
     64         err = "sequence argument must have length equal to input rank"
---> 65         raise RuntimeError(err)
     66     else:
     67         normalized = [input] * rank

RuntimeError: sequence argument must have length equal to input rank

And here are the snippets of code I'm using:

import sys
import caffe
import numpy as np
import lmdb
import matplotlib.pyplot as plt
import itertools

def flat_shape(x):
    "Returns x without singleton dimension, eg: (1,28,28) -> (28,28)"
    return x.reshape(x.shape)

def db_reader(fpath, type='lmdb'):
    if type == 'lmdb':
        return lmdb_reader(fpath)
    else:
        return leveldb_reader(fpath)

def lmdb_reader(fpath):
    import lmdb
    lmdb_env = lmdb.open(fpath)
    lmdb_txn = lmdb_env.begin()
    lmdb_cursor = lmdb_txn.cursor()
    for key, value in lmdb_cursor:
        datum = caffe.proto.caffe_pb2.Datum()
        datum.ParseFromString(value)
        label = int(datum.label)
        image = caffe.io.datum_to_array(datum).astype(np.uint8)
        yield (key, flat_shape(image), label)

def leveldb_reader(fpath):
    import leveldb
    db = leveldb.LevelDB(fpath)
    for key, value in db.RangeIter():
        datum = caffe.proto.caffe_pb2.Datum()
        datum.ParseFromString(value)
        label = int(datum.label)
        image = caffe.io.datum_to_array(datum).astype(np.uint8)
        yield (key, flat_shape(image), label)

Classifier class (copied from Caffe's python directory):

import numpy as np
import caffe

class Classifier(caffe.Net):
    """
    Classifier extends Net for image class prediction
    by scaling, center cropping, or oversampling.
    Parameters
    ----------
    image_dims : dimensions to scale input for cropping/sampling.
        Default is to scale to net input size for whole-image crop.
    mean, input_scale, raw_scale, channel_swap: params for preprocessing options.
    """
    def __init__(self, model_file, pretrained_file, image_dims=None,
                 mean=None, input_scale=None, raw_scale=None,
                 channel_swap=None):
        caffe.Net.__init__(self, model_file, pretrained_file, caffe.TEST)
        # configure pre-processing
        in_ = self.inputs[0]
        print('inputs[0]', self.inputs[0])
        self.transformer = caffe.io.Transformer(
            {in_: self.blobs[in_].data.shape})
        self.transformer.set_transpose(in_, (2, 0, 1))
        if mean is not None:
            self.transformer.set_mean(in_, mean)
        if input_scale is not None:
            self.transformer.set_input_scale(in_, input_scale)
        if raw_scale is not None:
            self.transformer.set_raw_scale(in_, raw_scale)
        if channel_swap is not None:
            self.transformer.set_channel_swap(in_, channel_swap)
        print('crops: ', self.blobs[in_].data.shape[2:])
        self.crop_dims = np.array(self.blobs[in_].data.shape[2:])
        if not image_dims:
            image_dims = self.crop_dims
        self.image_dims = image_dims

    def predict(self, inputs, oversample=True):
        """
        Predict classification probabilities of inputs.
        Parameters
        ----------
        inputs : iterable of (H x W x K) input ndarrays.
        oversample : boolean
            average predictions across center, corners, and mirrors
            when True (default). Center-only prediction when False.
        Returns
        -------
        predictions: (N x C) ndarray of class probabilities for N images and C classes.
        """
        # Scale to standardize input dimensions.
        input_ = np.zeros((len(inputs),
                           self.image_dims[0],
                           self.image_dims[1],
                           inputs[0].shape[2]),
                          dtype=np.float32)
        for ix, in_ in enumerate(inputs):
            print('image dims = ', self.image_dims[0], ',', self.image_dims[1], '_in = ', in_.shape)
            input_[ix] = caffe.io.resize_image(in_, self.image_dims)

        if oversample:
            # Generate center, corner, and mirrored crops.
            input_ = caffe.io.oversample(input_, self.crop_dims)
        else:
            # Take center crop.
            center = np.array(self.image_dims) / 2.0
            crop = np.tile(center, (1, 2))[0] + np.concatenate([
                -self.crop_dims / 2.0,
                self.crop_dims / 2.0
            ])
            input_ = input_[:, crop[0]:crop[2], crop[1]:crop[3], :]

        # Classify
        caffe_in = np.zeros(np.array(input_.shape)[[0, 3, 1, 2]], dtype=np.float32)
        for ix, in_ in enumerate(input_):
            caffe_in[ix] = self.transformer.preprocess(self.inputs[0], in_)
        out = self.forward_all(**{self.inputs[0]: caffe_in})
        predictions = out[self.outputs[0]]

        # For oversampling, average predictions across crops.
        if oversample:
            predictions = predictions.reshape((len(predictions) / 10, 10, -1))
            predictions = predictions.mean(1)

        return predictions

Main section:

proto = 'deploy.prototxt'
model = 'snap1.caffemodel'
mean = 'imagenet_mean.binaryproto'
db_path = 'G:/imagenet/ilsvrc12_val_lmdb'

# Extract mean from the mean image file
#mean_blobproto_new = caffe.proto.caffe_pb2.BlobProto()
#f = open(mean, 'rb')
#mean_blobproto_new.ParseFromString(f.read())
#mean_image = caffe.io.blobproto_to_array(mean_blobproto_new)
#f.close()
mu = np.load('mean.npy').mean(1).mean(1)

caffe.set_mode_gpu()
reader = lmdb_reader(db_path)
i = 0
for i, image, label in reader:
    image_caffe = image.reshape(1, *image.shape)
    print(image_caffe.shape, mu.shape)
    net = Classifier(proto, model,
                     mean=mu,
                     channel_swap=(2, 1, 0),
                     raw_scale=255,
                     image_dims=(256, 256))
    out = net.predict([image_caffe], oversample=True)
    print(i, labels[out[0].argmax()].strip(), ' (', out[0][out[0].argmax()], ')')
    i += 1

What is wrong here?
I found the cause: I had to feed the image in the form of a 3D tensor, not a 4D one! So our 4D tensor:

image_caffe = image.reshape(1, *image.shape)

needed to be changed to a 3D one:

image_caffe = image.transpose(2, 1, 0)

As a side note, try using Python 2 for anything Caffe-related. Python 3 might work at first but will definitely cause a lot of headaches; for instance, the predict method with oversample set to True will crash under Python 3 but works just fine under Python 2!
LSTMLayer produces NaN values even before training it
I'm currently trying to construct an LSTM network with Lasagne to predict the next step of noisy sequences. I first trained a stack of 2 LSTM layers for a while, but had to use an abysmally small learning rate (1e-6) because of divergence issues (that ultimately produced NaN values). The results were kind of disappointing, as the network produced smooth, out-of-phase versions of the input. I then came to the conclusion that I should use better parameter initialization than what is given by default. The goal was to start from a network that just mimics identity, since for a strongly auto-correlated signal it should be a good first estimation of the next step (x(t) ~ x(t+1)), and to sprinkle a bit of noise on top of it.

import theano, numpy, lasagne
from theano import tensor as T
from lasagne.layers.recurrent import LSTMLayer, InputLayer, Gate
from lasagne.layers import DropoutLayer
from lasagne.nonlinearities import sigmoid, tanh, leaky_rectify
from lasagne.layers import get_output
from lasagne.init import GlorotNormal, Normal, Constant

floatX = 'float32'

# function to create a lstm that ~ propagates the input from start to finish off the bat
# should be a good start for a predictive lstm with high one-step autocorrelation
def create_identity_lstm(input, shape, orig_inp=None, noiselvl=0.01, G=10., mask_input=None):
    inp, out = shape
    # orig_inp is used to limit the number of units that are actually used to pass the input
    # information from one layer to the other - the rest of the units should produce ~ 0 activation.
    if orig_inp is None:
        orig_inp = inp

    # input gate
    inputgate = Gate(
        W_in=GlorotNormal(noiselvl),
        W_hid=GlorotNormal(noiselvl),
        W_cell=Normal(noiselvl),
        b=Constant(0.),
        nonlinearity=sigmoid
    )
    # forget gate
    forgetgate = Gate(
        W_in=GlorotNormal(noiselvl),
        W_hid=GlorotNormal(noiselvl),
        W_cell=Normal(noiselvl),
        b=Constant(0.),
        nonlinearity=sigmoid
    )
    # cell gate
    cell = Gate(
        W_in=GlorotNormal(noiselvl),
        W_hid=GlorotNormal(noiselvl),
        W_cell=None,
        b=Constant(0.),
        nonlinearity=leaky_rectify
    )
    # output gate
    outputgate = Gate(
        W_in=GlorotNormal(noiselvl),
        W_hid=GlorotNormal(noiselvl),
        W_cell=Normal(noiselvl),
        b=Constant(0.),
        nonlinearity=sigmoid
    )
    lstm = LSTMLayer(input, out, ingate=inputgate, forgetgate=forgetgate, cell=cell,
                     outgate=outputgate, nonlinearity=leaky_rectify, mask_input=mask_input)

    # change matrices and biases
    # ingate - should return ~1 (matrices = 0, big bias)
    b_i = lstm.b_ingate.get_value()
    b_i[:orig_inp] += G
    lstm.b_ingate.set_value(b_i)

    # forgetgate - should return 0 (matrices = 0, big negative bias)
    b_f = lstm.b_forgetgate.get_value()
    b_f[:orig_inp] -= G
    b_f[orig_inp:] += G  # to help learning future features, I preserve a large bias on "unused" units to help it remember stuff
    lstm.b_forgetgate.set_value(b_f)

    # cell - should return x(t) (W_xc = identity, rest is 0)
    W_xc = lstm.W_in_to_cell.get_value()
    for i in xrange(orig_inp):
        W_xc[i, i] += 1.
    lstm.W_in_to_cell.set_value(W_xc)

    # outgate - should return 1 (same as ingate)
    b_o = lstm.b_outgate.get_value()
    b_o[:orig_inp] += G
    lstm.b_outgate.set_value(b_o)

    # done
    return lstm

I then use this lstm generation code to generate the following network:

# layers
#input + dropout
input = InputLayer((None, None, 7), name='input')
mask = InputLayer((None, None), name='mask')
drop1 = DropoutLayer(input, p=0.33)
#lstm1 + dropout
lstm1 = create_identity_lstm(drop1, (7, 1024), mask_input=mask)
drop2 = DropoutLayer(lstm1, p=0.33)
#lstm2 + dropout
lstm2 = create_identity_lstm(drop2, (1024, 128), orig_inp=7, mask_input=mask)
drop3 = DropoutLayer(lstm2, p=0.33)
#lstm3
lstm3 = create_identity_lstm(drop3, (128, 7), orig_inp=7, mask_input=mask)

# symbolic variables and prediction
x = input.input_var
ma = mask.input_var
ma_reshape = ma.dimshuffle((0, 1, 'x'))
yhat = get_output(lstm3, deterministic=False)
yhat_det = get_output(lstm3, deterministic=True)
y = T.ftensor3('y')
predict = theano.function([x, ma], yhat_det)

The problem is, even without any training, this network produces garbage values and sometimes even a bunch of NaNs, right from the very first LSTM layer:

X = numpy.random.random((5, 10000, 7)).astype('float32')
Masks = numpy.ones(X.shape[:2], dtype='float32')
hid1 = get_output(lstm1, deterministic=True)
get_hid1 = theano.function([x, ma], hid1)
h1 = get_hid1(X, Masks)

print numpy.isnan(h1).sum(axis=1).sum(axis=1)
array([6379520, 6367232, 6377472, 6376448, 6378496])

# even the first output value is garbage!
print h1[:, 0, 0] - X[:, 0, 0]
array([-0.03898358, -0.10118812, 0.34877831, -0.02509735, 0.36689138], dtype=float32)

I don't get why. I checked each matrix and their values are fine, just as I wanted them to be. I even tried to recreate each gate activation and the resulting hidden activations using the actual numpy arrays, and they reproduce the input just fine. What did I do wrong there??