Improve CNN model's accuracy for clasifying fruits - Pytorch - pytorch

I need to classify cherry, strawberry, and tomato with 3600 training images and 900 testing images. However, my model performed poorly and overfitted. I tried weight_decay to avoid overfiting but the model gave error around shape not fitting. My training accuracy is 85% and my testing accuracy is 60%.
This is roughly my training data, they are all around 300x300 pixels
Transformation:
train_transform = transforms.Compose([
transforms.RandomRotation(10),
transforms.RandomHorizontalFlip(),
transforms.Resize(224),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406],
[0.229, 0.224, 0.225])
])
test_transform = transforms.Compose([
transforms.Resize(224),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406],
[0.229, 0.224, 0.225])
])
Model:
Batch_size = 100
epoch = 8
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(CNNmodel.parameters(), lr=0.001)
class ConvolutionalNetwork(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(3, 6, 3, 1)
self.conv2 = nn.Conv2d(6, 16, 3, 1)
self.fc1 = nn.Linear(54*54*16, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 3)
def forward(self, X):
X = F.relu(self.conv1(X))
X = F.max_pool2d(X, 2, 2) #kernel size 2, stride 2, padding tbc
X = F.relu(self.conv2(X))
X = F.max_pool2d(X, 2, 2)
X = X.view(-1, 54*54*16)
X = F.relu(self.fc1(X))
X = F.relu(self.fc2(X))
X = self.fc3(X)
return F.log_softmax(X, dim=1)

The most egregious mistake in your approach is that you are training a NN from scratch instead of fine-tuning a model already trained on ImageNet.
Check out https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html

Related

Convolutional Autoencoder CIFAR10 PyTorch - RuntimeError

I am using PyTorch version: 1.9.0+cu102 with Convolutional Autoencoder for CIFAR-10 dataset as follows:
# Define transformations for training and test sets-
transform_train = transforms.Compose(
[
# transforms.RandomCrop(32, padding = 4),
# transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
# transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
]
)
transform_test = transforms.Compose(
[
transforms.ToTensor(),
# transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
]
)
# Load dataset-
train_dataset = torchvision.datasets.CIFAR10(
root = './data', train = True,
download = True, transform = transform_train
)
test_dataset = torchvision.datasets.CIFAR10(
root = './data', train = False,
download = True, transform = transform_test
)
print(f"len(train_dataset) = {len(train_dataset)} & len(test_dataset) = {len(test_dataset)}")
# len(train_dataset) = 50000 & len(test_dataset) = 10000
batch_size = 64
# Create training and testing loaders-
train_loader = torch.utils.data.DataLoader(
train_dataset, batch_size = batch_size,
shuffle = True
)
test_loader = torch.utils.data.DataLoader(
test_dataset, batch_size = batch_size,
shuffle = False
)
print(f"len(train_loader) = {len(train_loader)} & len(test_loader) = {len(test_loader)}")
# len(train_loader) = 782 & len(test_loader) = 157
# Sanity check-
len(train_dataset) / batch_size, len(test_dataset) / batch_size
# (781.25, 156.25)
# Get some random training images-
images, labels = next(iter(train_loader))
print(f"images.shape: {images.shape} & labels.shape: {labels.shape}")
# images.shape: torch.Size([64, 3, 32, 32]) & labels.shape: torch.Size([64])
LEARNING_RATE = 0.001
num_epochs = 20
class Reshape(nn.Module):
def __init__(self, *args):
super().__init__()
self.shape = args
def forward(self, x):
return x.view(self.shape)
class Trim(nn.Module):
def __init__(self, *args):
super().__init__()
def forward(self, x):
return x[:, :, :32, :32]
encoder = nn.Sequential(
nn.Conv2d(
in_channels = 3, out_channels = 32,
kernel_size = 3, padding = 1,
stride = 1, bias = True
),
nn.LeakyReLU(negative_slope = 0.01),
nn.Conv2d(
in_channels = 32, out_channels = 64,
kernel_size = 3, padding = 1,
stride = 2, bias = True
),
nn.LeakyReLU(negative_slope = 0.01),
nn.Conv2d(
in_channels = 64, out_channels = 64,
kernel_size = 3, padding = 1,
stride = 2, bias = True
),
nn.LeakyReLU(negative_slope = 0.01),
nn.Conv2d(
in_channels = 64, out_channels = 64,
kernel_size = 3, padding = 1,
stride = 1, bias = True
),
nn.LeakyReLU(negative_slope = 0.01),
nn.Flatten(),
nn.Linear(
in_features = 4096, out_features = 1500,
bias = True
),
nn.Linear(
in_features = 1500, out_features = 500,
bias = True
),
nn.Linear(
in_features = 500, out_features = 100,
bias = True
)
)
# Sanity check-
x = torch.rand(size = (32, 3, 32, 32))
print(f"x.shape = {x.shape}")
encoder_op = encoder(x)
print(f"encoder_op.shape = {encoder_op.shape}")
# x.shape = torch.Size([32, 3, 32, 32])
# encoder_op.shape = torch.Size([32, 100])
decoder = nn.Sequential(
nn.Linear(
in_features = 100, out_features = 500,
bias = True),
nn.Linear(
in_features = 500, out_features = 1500,
bias = True),
nn.Linear(
in_features = 1500, out_features = 4096,
bias = True),
Reshape(-1, 64, 8, 8),
nn.ConvTranspose2d(
in_channels = 64, out_channels = 64,
kernel_size = 3, stride = 1,
padding = 1, bias = True),
# output: torch.Size([32, 64, 8, 8])
nn.ConvTranspose2d(
in_channels = 64, out_channels = 64,
kernel_size = 3, stride = 2,
padding = 1, bias = True),
# output: torch.Size([32, 64, 15, 15])
nn.ConvTranspose2d(
in_channels = 64, out_channels = 32,
kernel_size = 3, stride = 2,
padding = 0, bias = True),
# torch.Size([32, 32, 31, 31])
nn.ConvTranspose2d(
in_channels = 32, out_channels = 3,
kernel_size = 3, stride = 1,
padding = 0, bias = True),
# output: torch.Size([32, 3, 33, 33])
Trim(),
# (3, 33, 33) -> (3, 32, 32)
nn.Sigmoid()
)
# Sanity check-
decoder(encoder_op).shape
# torch.Size([32, 3, 32, 32])
class AutoEncoder(nn.Module):
def __init__(self):
super().__init__()
self.encoder = encoder
self.decoder = decoder
def forward(self, x):
x = self.encoder(x)
x = self.decoder(x)
return x
# Initialize an autoencoder instance-
model = AutoEncoder()
# Move model to (GPU) device-
model.to(device)
# Specify optimizer and loss function-
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
loss_fn = F.mse_loss
num_epochs = 15
# Python3 lists to hold training metrics-
trainining_loss = []
validation_loss = []
def compute_epoch_loss_autoencoder(model, data_loader, loss_fn, device):
model.eval()
curr_loss, num_examples = 0., 0
with torch.no_grad():
for features, _ in data_loader:
features = features.to(device)
logits = model(features)
loss = loss_fn(logits, features, reduction='sum')
num_examples += features.size(0)
curr_loss += loss
curr_loss = curr_loss / num_examples
return curr_loss
start_time = time.time()
for epoch in range(num_epochs):
running_loss = 0.0
model.train()
for batch_idx, (features, _) in enumerate(train_loader):
features = features.to(device)
# forward and back prop-
logits = model(features) # make predictions using model
loss = loss_fn(logits, features)
optimizer.zero_grad()
# Perform backprop-
loss.backward()
# Update model parameters-
optimizer.step()
# Compute model's performance-
running_loss += loss.item() * features.size(0)
# Compute loss using training dataset-
epoch_loss = running_loss / len(train_dataset)
trainining_loss.append(epoch_loss)
# Compute loss using validation dataset-
val_loss = compute_epoch_loss_autoencoder(
model, test_loader,
loss_fn, device
)
validation_loss.append(val_loss)
print(f"Epoch = {epoch + 1}: Autoencoder train loss = {epoch_loss:.4f} & val loss = {val_loss:.4f}")
end_time = time.time()
# Get some validation images-
for img, label in test_loader:
break
img.shape, label.shape
# (torch.Size([64, 3, 32, 32]), torch.Size([64]))
img.to(device)
# Pass batch size = 64 images through encoder to get latent space representations-
model.encoder(img)
This line gives me the error:
RuntimeError Traceback (most recent call
last)
in ()
----> 1 model.encoder(img)
4 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/conv.py in
_conv_forward(self, input, weight, bias)
438 _pair(0), self.dilation, self.groups)
439 return F.conv2d(input, weight, bias, self.stride,
--> 440 self.padding, self.dilation, self.groups)
441
442 def forward(self, input: Tensor) -> Tensor:
RuntimeError: Input type (torch.FloatTensor) and weight type
(torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor
What's going wrong?
Thanks!

RuntimeError: Expected 4-dimensional input for 4-dimensional weight X, but got 3-dimensional input of size Y instead

I am building a CNN to do image classification on the EMNIST dataset.
To do so, I have the following datasets:
import scipy .io
emnist = scipy.io.loadmat(DIRECTORY + '/emnist-letters.mat')
data = emnist ['dataset']
X_train = data ['train'][0, 0]['images'][0, 0]
X_train = X_train.reshape((-1,28,28), order='F')
y_train = data ['train'][0, 0]['labels'][0, 0]
X_test = data ['test'][0, 0]['images'][0, 0]
X_test = X_test.reshape((-1,28,28), order = 'F')
y_test = data ['test'][0, 0]['labels'][0, 0]
With shape:
X_train = (124800, 28, 28)
y_train = (124800, 1)
X_test = (20800, 28, 28)
y_test = (20800, 1)
Note that the pictures are grayscale, so the colors are represented with only one number.
Which I prepare further as follows:
train_dataset = torch.utils.data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
test_dataset = torch.utils.data.TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
batch_size=batch_size,
shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
batch_size=batch_size,
shuffle=False)
My model looks as follows:
class CNNModel(nn.Module):
def __init__(self):
super(CNNModel, self).__init__()
self.cnn_layers = Sequential(
# Defining a 2D convolution layer
Conv2d(1, 4, kernel_size=3, stride=1, padding=1),
BatchNorm2d(4),
ReLU(inplace=True),
MaxPool2d(kernel_size=2, stride=2),
# Defining another 2D convolution layer
Conv2d(4, 4, kernel_size=3, stride=1, padding=1),
BatchNorm2d(4),
ReLU(inplace=True),
MaxPool2d(kernel_size=2, stride=2),
)
self.linear_layers = Sequential(
Linear(4 * 7 * 7, 10)
)
# Defining the forward pass
def forward(self, x):
x = self.cnn_layers(x)
x = x.view(x.size(0), -1)
x = self.linear_layers(x)
return x
model = CNNModel()
The code below is part of the code that I use to train my model:
for epoch in range(num_epochs):
for i, (images, labels) in enumerate(train_loader):
images = Variable(images)
labels = Variable(labels)
# Forward pass to get output/logits
outputs = model(images)
However, by excuting my code, I get the following error:
RuntimeError: Expected 4-dimensional input for 4-dimensional weight [4, 1, 3, 3], but got 3-dimensional input of size [100, 28, 28] instead
So a 4D input is expected while my input is 3D. What can I do so a 3D model is expected rather than a 4D model?
Here a similar question is asked, however I dont see how I can translate this to my code
The convolution expects the input to have size [batch_size, channels, height, width], but your images have size [batch_size, height, width], the channel dimension is missing. Greyscale is represented with a single channel and you have correctly set the in_channels of the first convolutions to 1, but your images don't have the matching dimension.
You can easily add the singular dimension with torch.unsqueeze.
Also, please don't use Variable, it was deprecated with PyTorch 0.4.0, which was released over 2 years ago, and all of its functionality has been merged into the tensors.
for i, (images, labels) in enumerate(train_loader):
# Add a single channel dimension
# From: [batch_size, height, width]
# To: [batch_size, 1, height, width]
images = images.unsqueeze(1)
# Forward pass to get output/logits
outputs = model(images)

how to solve size mismatch for this model?

data_transforms = {
'train': transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
]),
'val': transforms.Compose([
transforms.Resize(224),
transforms.CenterCrop(256),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
]),
batchsize=4
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(3, 64, kernel_size=5, padding=1)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(64, 128, kernel_size=5, padding=1)
self.fc1 = nn.Linear(256*6*6, 4096)
self.fc2 = nn.Linear(4096, 4096)
self.fc3 = nn.Linear(4096, 2)
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = x.reshape(x.size(0), -1)
print(x.shape)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
RuntimeError: size mismatch, m1: [4 x 373248], m2: [9216 x 4096] at C:/w/1/s/tmp_conda_3.8_075429/conda/conda-bld/pytorch_1579852542185/work/aten/src\THC/generic/THCTensorMathBlas.cu:290
With the provided debug information we can only tell that there is a size mismatch in one of the layers.
Looking at the error, it seems the error is in the first linear layer. You should change its size to 373248 instead of 256*6*6.
This should also be clear from the output of the print statement print(x.shape).

Pytorch tensor, how to switch channel position - Runtime error

I have my training dataset as below, where X_train is 3D with 3 channels
Shape of X_Train: (708, 256, 3)
Shape of Y_Train: (708, 4)
Then I convert them into a tensor and input into the dataloader:
X_train=torch.from_numpy(X_data)
y_train=torch.from_numpy(y_data)
training_dataset = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(training_dataset, batch_size=50, shuffle=False)
However when training the model, I get the following error:
RuntimeError: Given groups=1, weight of size 24 3 5, expected input[708, 256, 3] to have 3 channels, but got 256 channels instead
I suppose this is due to the position of the channel? In Tensorflow, the channel position is at the end, but in PyTorch the format is "Batch Size x Channel x Height x Width"? So how do I swap the positions in the x_train tensor to match the expected format in the dataloader?
class TwoLayerNet(torch.nn.Module):
def __init__(self):
super(TwoLayerNet,self).__init__()
self.conv1 = nn.Sequential(
nn.Conv1d(3, 3*8, kernel_size=5, stride=1),
nn.Sigmoid(),
nn.AvgPool1d(kernel_size=2, stride=0))
self.conv2 = nn.Sequential(
nn.Conv1d(3*8, 12, kernel_size=5, stride=1),
nn.Sigmoid(),
nn.AvgPool1d(kernel_size=2, stride = 0))
#self.drop_out = nn.Dropout()
self.fc1 = nn.Linear(708, 732)
self.fc2 = nn.Linear(732, 4)
def forward(self, x):
out = self.conv1(x)
out = self.conv2(out)
out = out.reshape(out.size(0), -1)
out = self.drop_out(out)
out = self.fc1(out)
out = self.fc2(out)
return out
Use permute.
X_train = torch.rand(708, 256, 3)
X_train = X_train.permute(2, 0, 1)
X_train.shape
# => torch.Size([3, 708, 256])

RuntimeError: Given groups=1, weight of size 16 1 5 5, expected input[100, 3, 256, 256] to have 1 channels, but got 3 channels instead

I try to run the following programe for images classification problem in Pytorch:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torch.utils.data as data
# Device configuration
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Hyper parameters
num_epochs = 5
num_classes = 10
batch_size = 100
learning_rate = 0.001
TRAIN_DATA_PATH = "train/"
TEST_DATA_PATH = "test/"
TRANSFORM_IMG = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(256),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225] )
])
train_dataset = torchvision.datasets.ImageFolder(root=TRAIN_DATA_PATH, transform=TRANSFORM_IMG)
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
test_dataset = torchvision.datasets.ImageFolder(root=TEST_DATA_PATH, transform=TRANSFORM_IMG)
test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
# Convolutional neural network (two convolutional layers)
class ConvNet(nn.Module):
def __init__(self, num_classes=10):
super(ConvNet, self).__init__()
self.layer1 = nn.Sequential(
nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
nn.BatchNorm2d(16),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2, stride=2))
self.layer2 = nn.Sequential(
nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
nn.BatchNorm2d(32),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2, stride=2))
self.fc = nn.Linear(7 * 7 * 32, num_classes)
def forward(self, x):
out = self.layer1(x)
out = self.layer2(out)
out = out.reshape(out.size(0), -1)
out = self.fc(out)
return out
model = ConvNet(num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
for i, (images, labels) in enumerate(train_loader):
images = images.to(device)
labels = labels.to(device)
# Forward pass
outputs = model(images)
loss = criterion(outputs, labels)
# Backward and optimize
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (i + 1) % 100 == 0:
print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
.format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))
# Test the model
model.eval() # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance)
with torch.no_grad():
correct = 0
total = 0
for images, labels in test_loader:
images = images.to(device)
labels = labels.to(device)
outputs = model(images)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))
# Save the model checkpoint
torch.save(model.state_dict(), 'model/model.ckpt')
But I get a RuntimeError:
Traceback (most recent call last):
RuntimeError: Given groups=1, weight of size 16 1 5 5, expected input[100, 3, 256, 256] to have 1 channels, but got 3 channels instead
Someone could help to fix the bug? Thanks a lot.
Reference related:
https://discuss.pytorch.org/t/given-groups-1-weight-16-1-5-5-so-expected-input-100-3-64-64-to-have-1-channels-but-got-3-channels-instead/28831/17
RuntimeError: Given groups=1, weight of size [64, 3, 7, 7], expected input[3, 1, 224, 224] to have 3 channels, but got 1 channels instead
Your input layer self.layer1 starts with a 2d convolution nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2). This conv layer expects an input with two spatial dimensions and one channel, and outputs a tesnor with the same spatial dimensions and 16 channels.
However, your input has three channels and not one (RGB image instead of gray level image).
Make sure your net and data are in synch.

Resources