I started working with PyTorch recently, so my understanding of it isn't very strong. I previously had a 1-layer CNN and wanted to extend it to 2 layers, but the input and output channels have been throwing errors I can't seem to decipher. Why does it expect 192 channels? Can someone give me a pointer to help me understand this better? I have seen several related problems on here, but I don't understand those solutions either.
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertConfig, BertModel, BertTokenizer
import math
from transformers import AdamW, get_linear_schedule_with_warmup
def pad_sents(sents, pad_token):  # Pad list of sentences according to the longest sentence in the batch.
    sents_padded = []
    max_len = max(len(s) for s in sents)
    for s in sents:
        padded = [pad_token] * max_len
        padded[:len(s)] = s
        sents_padded.append(padded)
    return sents_padded
def sents_to_tensor(tokenizer, sents, device):
    tokens_list = [tokenizer.tokenize(str(sent)) for sent in sents]
    sents_lengths = [len(tokens) for tokens in tokens_list]
    tokens_list_padded = pad_sents(tokens_list, '[PAD]')
    sents_lengths = torch.tensor(sents_lengths, device=device)
    masks = []
    for tokens in tokens_list_padded:
        mask = [0 if token == '[PAD]' else 1 for token in tokens]
        masks.append(mask)
    masks_tensor = torch.tensor(masks, dtype=torch.long, device=device)
    tokens_id_list = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokens_list_padded]
    sents_tensor = torch.tensor(tokens_id_list, dtype=torch.long, device=device)
    return sents_tensor, masks_tensor, sents_lengths
class ConvModel(nn.Module):
    def __init__(self, device, dropout_rate, n_class, out_channel=16):
        super(ConvModel, self).__init__()
        self.bert_config = BertConfig.from_pretrained('bert-base-uncased', output_hidden_states=True)
        self.dropout_rate = dropout_rate
        self.n_class = n_class
        self.out_channel = out_channel
        self.bert = BertModel.from_pretrained('bert-base-uncased', config=self.bert_config)
        self.out_channels = self.bert.config.num_hidden_layers * self.out_channel  # 12 * 16 = 192 for bert-base
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', config=self.bert_config)
        self.conv = nn.Conv2d(in_channels=self.bert.config.num_hidden_layers,
                              out_channels=self.out_channels,
                              kernel_size=(3, self.bert.config.hidden_size),
                              groups=self.bert.config.num_hidden_layers)
        self.conv1 = nn.Conv2d(in_channels=self.out_channels,
                               out_channels=48,
                               kernel_size=(3, self.bert.config.hidden_size),
                               groups=self.bert.config.num_hidden_layers)
        self.hidden_to_softmax = nn.Linear(self.out_channels, self.n_class, bias=True)
        self.dropout = nn.Dropout(p=self.dropout_rate)
        self.device = device

    def forward(self, sents):
        sents_tensor, masks_tensor, sents_lengths = sents_to_tensor(self.tokenizer, sents, self.device)
        encoded_layers = self.bert(input_ids=sents_tensor, attention_mask=masks_tensor)
        hidden_encoded_layer = encoded_layers[2]
        hidden_encoded_layer = hidden_encoded_layer[0]
        hidden_encoded_layer = torch.unsqueeze(hidden_encoded_layer, dim=1)
        hidden_encoded_layer = hidden_encoded_layer.repeat(1, 12, 1, 1)  # (batch, 12, seq_len, 768)
        conv_out = self.conv(hidden_encoded_layer)  # (batch_size, channel_out, some_length, 1)
        conv_out = self.conv1(conv_out)
        conv_out = torch.squeeze(conv_out, dim=3)  # (batch_size, channel_out, some_length)
        conv_out, _ = torch.max(conv_out, dim=2)  # (batch_size, channel_out)
        pre_softmax = self.hidden_to_softmax(conv_out)
        return pre_softmax
def batch_iter(data, batch_size, shuffle=False, bert=None):
    batch_num = math.ceil(data.shape[0] / batch_size)
    index_array = list(range(data.shape[0]))
    if shuffle:
        data = data.sample(frac=1)
    for i in range(batch_num):
        indices = index_array[i * batch_size: (i + 1) * batch_size]
        examples = data.iloc[indices]
        sents = list(examples.train_BERT_tweet)
        targets = list(examples.train_label.values)
        yield sents, targets  # list[list[str]] if not bert else list[str], list[int]
def train():
    label_name = ['Yes', 'Maybe', 'No']
    device = torch.device("cpu")
    df_train = pd.read_csv('trainn.csv')  # , index_col=0)
    train_label = dict(df_train.train_label.value_counts())
    label_max = float(max(train_label.values()))
    train_label_weight = torch.tensor([label_max / train_label[i] for i in range(len(train_label))], device=device)
    model = ConvModel(device=device, dropout_rate=0.2, n_class=len(label_name))
    optimizer = AdamW(model.parameters(), lr=1e-3, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=1000)  # changed the last 2 arguments to old ones
    model = model.to(device)
    model.train()
    cn_loss = torch.nn.CrossEntropyLoss(weight=train_label_weight, reduction='mean')
    train_batch_size = 16
    for epoch in range(1):
        for sents, targets in batch_iter(df_train, batch_size=train_batch_size, shuffle=True):  # for each epoch
            optimizer.zero_grad()
            pre_softmax = model(sents)
            loss = cn_loss(pre_softmax, torch.tensor(targets, dtype=torch.long, device=device))
            loss.backward()
            optimizer.step()
            scheduler.step()

TrainingModel = train()
Here's a snippet of the data: https://github.com/Kosisochi/DataSnippet
It seems that the original version of the code you had in this question behaved differently. The final version of the code you have here gives me a different error from the one you posted, more specifically this:
RuntimeError: Calculated padded input size per channel: (20 x 1). Kernel size: (3 x 768). Kernel size can't be greater than actual input size
I apologize if I misunderstood the situation, but it seems to me that your understanding of what exactly the nn.Conv2d layer does is not 100% clear, and that is the main source of your struggle. I interpret the "detailed explanation on 2 layer CNN in Pytorch" you requested as a request to explain in detail how that layer works, and I hope that after this is done there will be no problem applying it 1 time, 2 times, or more.
You can find all the documentation about the layer here, but let me give you a recap which will hopefully help you understand the errors you're getting.
First of all, nn.Conv2d inputs are 4-d tensors of shape (BatchSize, ChannelsIn, Height, Width) and outputs are 4-d tensors of shape (BatchSize, ChannelsOut, HeightOut, WidthOut). The simplest way to think about nn.Conv2d is as something applied to 2d images with a pixel grid of size Height x Width and ChannelsIn different colors or features per pixel. Even if your inputs have nothing to do with actual images, the behavior of the layer is still the same. The simplest situation is when nn.Conv2d is not using padding (as in your code). In that case the kernel_size=(kernel_height, kernel_width) argument specifies the rectangle which you can imagine sweeping through the Height x Width rectangle of your inputs, producing one pixel for each valid position. Without padding, the coordinate of the rectangle's top-left point can be any pair of indices (x, y) with x between 0 and Height - kernel_height and y between 0 and Width - kernel_width. Thus the output will look like a 2d image of size (Height - kernel_height + 1) x (Width - kernel_width + 1) and will have as many output channels as specified in the nn.Conv2d constructor, so the output tensor will be of shape (BatchSize, ChannelsOut, Height - kernel_height + 1, Width - kernel_width + 1).
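For instance, here is a minimal shape check (the tensor and layer sizes are illustrative, not taken from your model):

import torch
import torch.nn as nn

x = torch.randn(2, 3, 10, 8)  # (BatchSize, ChannelsIn, Height, Width)
conv = nn.Conv2d(in_channels=3, out_channels=5, kernel_size=(3, 4))  # no padding
out = conv(x)
print(out.shape)  # torch.Size([2, 5, 8, 5]) == (2, 5, 10 - 3 + 1, 8 - 4 + 1)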
The groups parameter does not affect how shapes are changed by the layer - it only controls which input channels are used as inputs for the output channels (groups=1 means that every input channel is used as input for every output channel; otherwise, input and output channels are divided into the corresponding number of groups, and only input channels from group i are used as inputs for the output channels from group i).
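For example (toy sizes; note that the output shapes are identical and only the connectivity, and hence the weight shape, differs):

import torch
import torch.nn as nn

x = torch.randn(1, 4, 8, 8)
dense = nn.Conv2d(4, 4, kernel_size=3, groups=1)    # every input channel feeds every output channel
grouped = nn.Conv2d(4, 4, kernel_size=3, groups=2)  # channels {0,1} -> outputs {0,1}, {2,3} -> {2,3}
print(dense(x).shape, grouped(x).shape)             # both torch.Size([1, 4, 6, 6])
print(dense.weight.shape, grouped.weight.shape)     # (4, 4, 3, 3) vs (4, 2, 3, 3)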
Now, in your current version of the code you have BatchSize = 16, and the output of the pre-trained model is (BatchSize, DynamicSize, 768) with DynamicSize depending on the input, e.g. 22. You then introduce an additional dimension as axis 1 with unsqueeze and repeat the values along that dimension, transforming the tensor of shape (16, 22, 768) into (16, 12, 22, 768). Effectively you are using the output of the pre-trained model as 12-channel 2-d images (with each channel holding the same values as the others) of size (22, 768), where 22 is not fixed (it depends on the batch). Then you apply a nn.Conv2d with kernel size (3, 768) - which means there is no "wiggle room" for the width, and the output 2-d images will be of size (20, 1); since your layer has out_channels = num_hidden_layers * out_channel = 12 * 16 = 192 channels (this is where the 192 you asked about comes from), the output of the first convolution layer has shape (16, 192, 20, 1). Then you try to apply a second convolution layer on top of that with kernel size (3, 768) again, but since your 2-d "image" is now just (20 x 1) there is no valid position to fit a (3, 768) kernel rectangle inside a (20 x 1) rectangle, which leads to the error message Kernel size can't be greater than actual input size.
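You can reproduce this in isolation with the shapes from your code, using a random tensor as a stand-in for the repeated BERT hidden states:

import torch
import torch.nn as nn

x = torch.randn(16, 12, 22, 768)  # stand-in for the repeated BERT output
conv = nn.Conv2d(12, 192, kernel_size=(3, 768), groups=12)
out = conv(x)
print(out.shape)  # torch.Size([16, 192, 20, 1])
conv1 = nn.Conv2d(192, 48, kernel_size=(3, 768), groups=12)
conv1(out)  # RuntimeError: ... Kernel size can't be greater than actual input size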
Hope this explanation helps. Now for the choices you have to avoid the issue:
(a) Add padding in such a way that the size of the output does not change compared to the input (I won't go into details here because I don't think this is what you need).
(b) Use a smaller kernel on the first and/or second convolution (e.g., if you don't change the first convolution, the only valid width for the second kernel would be 1).
(c) Looking at what you're trying to do, my guess is that you actually don't want to use a 2d convolution; you want a 1d convolution (on the sequence) with every position described by 768 values. When you use one convolution layer with a 768-wide kernel (and 768-wide input), you're effectively doing exactly the same thing as a 1d convolution with 768 input channels, but then if you try to apply a second one you have a problem. You can specify kernel width as 1 for the next layer(s) and that will work, but a more correct way would be to transpose the pre-trained model's output tensor by switching the last dimensions - getting shape (16, 768, DynamicSize) from (16, DynamicSize, 768) - and then apply an nn.Conv1d layer with 768 input channels, an arbitrary ChannelsOut as output channels, and 1d kernel_size=3 (meaning you look at 3 consecutive elements of the sequence for the convolution). If you do that, then without padding the input shape of (16, 768, DynamicSize) will become (16, ChannelsOut, DynamicSize-2), and after you apply a second Conv1d with, e.g., the same settings as the first one, you'll get a tensor of shape (16, ChannelsOut, DynamicSize-4), etc. (each time the 1d length will shrink by kernel_size - 1). You can always change the number of channels/kernel_size for each subsequent convolution layer too.
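Here is a minimal sketch of option (c); the 64 output channels and the random stand-in tensor are illustrative choices, not values from your code:

import torch
import torch.nn as nn

bert_out = torch.randn(16, 22, 768)        # (batch, DynamicSize, hidden) stand-in
x = bert_out.transpose(1, 2)               # (16, 768, 22)
conv1 = nn.Conv1d(768, 64, kernel_size=3)  # arbitrary ChannelsOut = 64
conv2 = nn.Conv1d(64, 64, kernel_size=3)
h = conv2(torch.relu(conv1(x)))
print(h.shape)  # torch.Size([16, 64, 18]) == (16, 64, 22 - 2 - 2)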
I have tested the speed of separable_conv2d and normal conv2d implemented in TF. It seems that only the depthwise_conv2d part is faster than the normal conv2d, while the performance of the full separable convolution is obviously poor.
The separable_conv2d mentioned in MobileNet has FLOPs that are about 1/9 of a normal conv when kernel_size=3. Considering memory access cost, the separable one cannot be 9 times faster than the normal one, but in my experiment the separable one is far too slow.
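For what it's worth, a back-of-the-envelope version of that FLOPs argument (per output position, stride 1, SAME padding; the channel counts are just illustrative):

k, cin, cout = 3, 128, 128
normal = k * k * cin * cout            # standard conv: k*k*cin multiply-adds per output channel
separable = k * k * cin + cin * cout   # depthwise part + 1x1 pointwise part
print(separable / normal)              # = 1/cout + 1/k**2 ~= 0.119, close to 1/9 for k=3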
I modeled my experiment on this question: separable_conv2d is too slow. In that experiment, separable_conv2d seems faster than the normal one when depth_multiplier=1, but the results differ when I use tf.nn to implement it as follows:
import os
import tensorflow as tf

IMAGE_SIZE = 512
REPEAT = 100
KERNEL_SIZE = 3
data_format = 'NCHW'
# CHANNELS_BATCH_SIZE = 2048  # channels * batch_size
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

def normal_layers(inputs, nfilter, name=''):
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        shape = inputs.shape.as_list()
        in_channels = shape[1]
        filter = tf.get_variable(initializer=tf.initializers.random_normal,
                                 shape=[KERNEL_SIZE, KERNEL_SIZE, in_channels, nfilter],
                                 name='weight')
        conv = tf.nn.conv2d(input=inputs, filter=filter, strides=[1, 1, 1, 1],
                            padding='SAME', data_format=data_format, name='conv')
        return conv

def sep_layers(inputs, nfilter, name=''):
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        shape = inputs.shape.as_list()
        in_channels = shape[1]
        dw_filter = tf.get_variable(initializer=tf.initializers.random_normal,
                                    shape=[KERNEL_SIZE, KERNEL_SIZE, in_channels, 1],
                                    name='dw_weight')
        pw_filter = tf.get_variable(initializer=tf.initializers.random_normal,
                                    shape=[1, 1, in_channels, nfilter],
                                    name='pw_weight')
        conv = tf.nn.depthwise_conv2d_native(input=inputs,
                                             filter=dw_filter,
                                             strides=[1, 1, 1, 1],
                                             padding='SAME',
                                             data_format=data_format)
        conv = tf.nn.conv2d(input=conv,
                            filter=pw_filter,
                            strides=[1, 1, 1, 1],
                            padding='SAME',
                            data_format=data_format)
        return conv
Each layer is run 100 times. Unlike the linked experiment, I set batch_size to a constant 10, the channels are in [32, 64, 128], and the inputs have shape [batch_size, channels, img_size, img_size]. The durations are as follows:
Channels: 32
Normal Conv 0.7769527435302734s, Sep Conv 1.4197885990142822s
Channels: 64
Normal Conv 0.8963277339935303s, Sep Conv 1.5703468322753906s
Channels: 128
Normal Conv 0.9741833209991455s, Sep Conv 1.665834665298462s
It seems that when batch_size is a constant and only the channel count changes, the time cost of both the normal and the separable convolution grows gradually.
And when setting batch_size * channels to a constant, with inputs shaped [CHANNELS_BATCH_SIZE // channels, channels, img_size, img_size]:
Channels: 32
Normal Conv 0.871959924697876s, Sep Conv 1.569300651550293s
Channels: 64
Normal Conv 0.909860372543335s, Sep Conv 1.604109525680542s
Channels: 128
Normal Conv 0.9196009635925293s, Sep Conv 1.6144189834594727s
What confuses me is that this result is different from the result in the link above: the time cost of sep_conv2d does not change noticeably.
My questions are:
What makes my experiment different from the one in the link above?
I am a newbie, so is there something wrong in my code implementing separable_conv2d?
How can separable_conv2d be implemented so that it is faster than the normal one in TF or in PyTorch?
Any help would be appreciated. Thanks in advance.
I have a time-series of data and am running some very basic tests to get a feel for TensorFlow, Keras, Python, etc.
To set up the problem: I have a large number of images, where 7 images of data (with Cartesian dimensions 33 x 33), when accumulated, should yield a single value. Therefore, the amount of 'x' data should be y*7, where y is the 'truth' data being trained against.
All of the training data is in a matrix entitled 'alldatax': [420420 x 33 x 33 x 7 x 1], where the dimensions are the total number of single images, the x-dimension, the y-dimension, the number of images to be accumulated for a single 'truth' value, and a final dimension necessary for 3D convolving.
The 'truth' matrix, alldatay, is a 1D matrix of length 420420 / 7 = 60060.
When running a simple convnet:
from keras import models, layers

model = models.Sequential()
model.add(layers.InputLayer(input_shape=(33,33,7,1)))
model.add(layers.Conv3D(16,(3,3,1), activation = 'relu', input_shape = (33,33,7,1)))
model.add(layers.LeakyReLU(alpha=0.3))
model.add(layers.MaxPooling3D((2,2,1)))
model.add(layers.Conv3D(32,(3,3,1), activation = 'relu'))
model.add(layers.LeakyReLU(alpha=0.3))
model.add(layers.MaxPooling3D((2,2,1)))
model.add(layers.Flatten())
model.add(layers.Dense(512, activation = 'relu'))
model.add(layers.LeakyReLU(alpha=0.3))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(32, activation = 'relu'))
model.add(layers.LeakyReLU(alpha=0.3))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation = 'relu'))
model.compile(optimizer = 'adam', loss = 'mse')
model.fit(x = alldatax, y = alldatay, batch_size = 1000, epochs = 50, verbose = 1, shuffle = False)
I get an error: ValueError: Input arrays should have the same number of samples as target arrays. Found 420420 input samples and 60060 target samples.
What needs to change to get the convnet to realize it needs 7*x for every y value?
Something seems to be wrong in your calculations.
You state that the neural net should take seven 33x33 images as one input example, so you set the input shape of the first layer to (33,33,7,1), which is right. This means that for every 33x33x7x1 input there should be exactly one y value.
Since all of your data comprises 420420 33x33x7x1 images, there should be 420420 y values, not 60060.
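If the intent really was one y value per group of seven images, the x data would have to be regrouped instead. Here is a minimal sketch of that bookkeeping, assuming the 420420 single 33x33 images are stored consecutively in groups of 7 (random stand-ins for the real arrays):

import numpy as np

single_images = np.random.rand(420420, 33, 33)  # stand-in for the raw images
alldatay = np.random.rand(60060)                # one truth value per group of 7

alldatax = single_images.reshape(60060, 7, 33, 33)          # group 7 consecutive images
alldatax = alldatax.transpose(0, 2, 3, 1)[..., np.newaxis]  # (60060, 33, 33, 7, 1)

assert alldatax.shape[0] == alldatay.shape[0]  # sample counts now match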
I have CNN code that was written using the TensorFlow library:
import tensorflow as tf

x_img = tf.placeholder(tf.float32)
y_label = tf.placeholder(tf.float32)

def convnet_3d(x_img, W):
    conv_3d_layer = tf.nn.conv3d(x_img, W, strides=[1,1,1,1,1], padding='VALID')
    return conv_3d_layer

def maxpool_3d(x_img):
    maxpool_3d_layer = tf.nn.max_pool3d(x_img, ksize=[1,2,2,2,1], strides=[1,2,2,2,1], padding='VALID')
    return maxpool_3d_layer

def convolutional_neural_network(x_img):
    weights = {'W_conv1_layer': tf.Variable(tf.random_normal([3,3,3,1,32])),
               'W_conv2_layer': tf.Variable(tf.random_normal([3,3,3,32,64])),
               'W_fc_layer': tf.Variable(tf.random_normal([409600,1024])),
               'W_out_layer': tf.Variable(tf.random_normal([1024, num_classes]))}
    biases = {'b_conv1_layer': tf.Variable(tf.random_normal([32])),
              'b_conv2_layer': tf.Variable(tf.random_normal([64])),
              'b_fc_layer': tf.Variable(tf.random_normal([1024])),
              'b_out_layer': tf.Variable(tf.random_normal([num_classes]))}
    x_img = tf.reshape(x_img, shape=[-1, img_x, img_y, img_z, 1])
    conv1_layer = tf.nn.relu(convnet_3d(x_img, weights['W_conv1_layer']) + biases['b_conv1_layer'])
    conv1_layer = maxpool_3d(conv1_layer)
    conv2_layer = tf.nn.relu(convnet_3d(conv1_layer, weights['W_conv2_layer']) + biases['b_conv2_layer'])
    conv2_layer = maxpool_3d(conv2_layer)
    fc_layer = tf.reshape(conv2_layer, [-1, 409600])
    fc_layer = tf.nn.relu(tf.matmul(fc_layer, weights['W_fc_layer']) + biases['b_fc_layer'])
    fc_layer = tf.nn.dropout(fc_layer, keep_rate)
    output_layer = tf.matmul(fc_layer, weights['W_out_layer']) + biases['b_out_layer']
    return output_layer
My input image x_img is 25x25x25 (a 3D image). I have some questions about the code:
1- Does [3,3,3,1,32] in 'W_conv1_layer' mean [width x height x depth x channels x number of filters]?
2- In 'W_conv2_layer' the weights are [3,3,3,32,64]; why is the output 64? I know that 3x3x3 is the filter size and 32 is the input coming from the first layer.
3- In 'W_fc_layer' the weights are [409600,1024]; 1024 is the number of nodes in the FC layer, but where does the magic number '409600' come from?
4- Before the image gets into the conv layers, why do we need to reshape the image with
x_img = tf.reshape(x_img, shape=[-1, img_x, img_y, img_z, 1])
All the answers can be found in the official documentation of conv3d.
1- The weights should be [filter_depth, filter_height, filter_width, in_channels, out_channels].
2- The numbers 32 and 64 are chosen simply because they work; they are just hyperparameters.
3- 409600 comes from reshaping the output of maxpool3d (it is probably a mistake; the real size should be 4096, see the shape walk-through below).
4- Because TensorFlow expects a certain layout for its input.
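As a quick check of point 3, walking the 25x25x25 input through the layers (all VALID padding) gives:

# conv3d 3x3x3, 32 filters   : 25 -> 23  => (23, 23, 23, 32)
# max_pool3d 2x2x2, stride 2 : 23 -> 11  => (11, 11, 11, 32)
# conv3d 3x3x3, 64 filters   : 11 -> 9   => (9, 9, 9, 64)
# max_pool3d 2x2x2, stride 2 : 9 -> 4    => (4, 4, 4, 64)
print(4 * 4 * 4 * 64)  # 4096 -- the flatten size should be 4096, not 409600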
You should try implementing a simple convnet on images before moving to more complicated stuff.
I am trying to feed a sequence with 20 features to an LSTM network as shown in the code. But I get an error that my Input 0 is incompatible with the LSTM input. I am not sure how to change my layer structure to fit the data.
import numpy as np
from keras.layers import Input, LSTM, Dense, Dropout, concatenate
from keras.models import Model

def build_model(features, aux1=None, aux2=None):
    # create model
    features[0] = np.asarray(features[0])
    main_input = Input(shape=features[0].shape, dtype='float32', name='main_input')
    main_out = LSTM(40, activation='relu')(main_input)
    aux1_input = Input(shape=(len(aux1[0]),), dtype='float32', name='aux1_input')
    aux1_out = Dense(len(aux1[0]))(aux1_input)
    aux2_input = Input(shape=(len(aux2[0]),), dtype='float32', name='aux2_input')
    aux2_out = Dense(len(aux2[0]))(aux2_input)
    x = concatenate([aux1_out, main_out, aux2_out])
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.5)(x)
    output = Dense(1, activation='sigmoid', name='main_output')(x)
    model = Model(inputs=[aux1_input, aux2_input, main_input], outputs=[output])
    return model
The features variable is an array of shape (1456, 20): I have 1456 days, and for each day I have 20 variables.
Your main_input should be of shape (samples, timesteps, features),
and then you should define main_input like this:
main_input = Input(shape=(timesteps, features)) # for a stateless RNN (your case)
or main_input = Input(batch_shape=(batch_size, timesteps, features)) for a stateful RNN (not the one you are using in your example).
If your features[0] is a 1-dimensional array of various features (1 timestep), then you also have to reshape features[0] like this:
features[0] = np.reshape(features[0], (1, features[0].shape[0]))
and then do the same to features[1], features[2], etc.,
or better, reshape all your samples at once:
features = np.reshape(features, (features.shape[0], 1, features.shape[1]))
LSTM layers are designed to work with "sequences".
You say your sequence has 20 features, but how many time steps does it have? Do you mean 20 time steps instead?
An LSTM layer requires input shapes such as (BatchSize, TimeSteps, Features).
If it's the case that you have 1 feature in each of the 20 time steps, you must shape your data as:
inputData = someData.reshape(NumberOfSequences, 20, 1)
And the Input tensor should take this shape:
main_input = Input((20,1), ...) #yes, it ignores the batch size
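Putting it together, a minimal sketch under that second interpretation (1456 sequences of 20 time steps with 1 feature each; the layer sizes and loss are illustrative, not prescriptions):

import numpy as np
from keras.layers import Input, LSTM, Dense
from keras.models import Model

someData = np.random.rand(1456, 20)        # stand-in for the real features
inputData = someData.reshape(1456, 20, 1)  # (samples, timesteps, features)

main_input = Input((20, 1), name='main_input')
x = LSTM(40, activation='relu')(main_input)
output = Dense(1, activation='sigmoid')(x)
model = Model(inputs=main_input, outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()  # model.fit(inputData, targets, ...) now accepts the reshaped data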