While following an example about classifying cats and dogs using AlexNet on some post I got stuck on this import error:
Traceback (most recent call last):
File "C:\Users\Gsum\Desktop\Asirra 개 고양이\asirra-dogs-cats-classification-master\learning\optimizers.py", line 5, in <module>
from learning.utils import plot_learning_curve
ImportError: No module named 'learning'
I've been looking for a module with a name similar to learning or learn that includes a 'plot_learning_curve' function, but I could not find one.
If anyone knows which library provides the plot_learning_curve function, I would appreciate some help.
Here is my code:
import os
import time
from abc import abstractmethod
import tensorflow as tf
from learning.utils import plot_learning_curve
class Optimizer(object):
    """Base class for gradient-based optimization algorithms."""

    def __init__(self, model, train_set, evaluator, val_set=None, **kwargs):
        """
        Optimizer initializer.
        :param model: ConvNet, the model to be learned.
        :param train_set: DataSet, training set to be used.
        :param evaluator: Evaluator, for computing performance scores during training.
        :param val_set: DataSet, validation set to be used, which can be None if not used.
        :param kwargs: dict, extra arguments containing training hyperparameters.
            - batch_size: int, batch size for each iteration.
            - num_epochs: int, total number of epochs for training.
            - init_learning_rate: float, initial learning rate.
        """
        self.model = model
        self.train_set = train_set
        self.evaluator = evaluator
        self.val_set = val_set

        # Training hyperparameters
        self.batch_size = kwargs.pop('batch_size', 256)
        self.num_epochs = kwargs.pop('num_epochs', 320)
        self.init_learning_rate = kwargs.pop('init_learning_rate', 0.01)

        self.learning_rate_placeholder = tf.placeholder(tf.float32)    # Placeholder for current learning rate
        self.optimize = self._optimize_op()

        # train() reads curr_epoch / best_score / curr_learning_rate, so they
        # must exist as soon as the optimizer is constructed.
        self._reset()

    def _reset(self):
        """Reset some variables."""
        self.curr_epoch = 1
        self.num_bad_epochs = 0    # number of bad epochs, where the model is updated without improvement.
        self.best_score = self.evaluator.worst_score    # initialize best score with the worst one
        self.curr_learning_rate = self.init_learning_rate    # current learning rate

    @abstractmethod
    def _optimize_op(self, **kwargs):
        """
        tf.train.Optimizer.minimize Op for a gradient update.
        This should be implemented, and should not be called manually.
        """
        pass

    @abstractmethod
    def _update_learning_rate(self, **kwargs):
        """
        Update current learning rate (if needed) on every epoch, by its own schedule.
        This should be implemented, and should not be called manually.
        """
        pass

    def _step(self, sess, **kwargs):
        """
        Make a single gradient update and return its results.
        This should not be called manually.
        :param sess: tf.Session.
        :param kwargs: dict, extra arguments containing training hyperparameters.
            - augment_train: bool, whether to perform augmentation for training.
        :return loss: float, loss value for the single iteration step.
            y_true: np.ndarray, true label from the training set.
            y_pred: np.ndarray, predicted label from the model.
        """
        augment_train = kwargs.pop('augment_train', True)

        # Sample a single batch
        X, y_true = self.train_set.next_batch(self.batch_size, shuffle=True,
                                              augment=augment_train, is_train=True)

        # Compute the loss and make update
        _, loss, y_pred = \
            sess.run([self.optimize, self.model.loss, self.model.pred],
                     feed_dict={self.model.X: X, self.model.y: y_true,
                                self.model.is_train: True,
                                self.learning_rate_placeholder: self.curr_learning_rate})

        return loss, y_true, y_pred

    def train(self, sess, save_dir='/tmp', details=False, verbose=True, **kwargs):
        """
        Run optimizer to train the model.
        :param sess: tf.Session.
        :param save_dir: str, the directory to save the learned weights of the model.
        :param details: bool, whether to return detailed results.
        :param verbose: bool, whether to print details during training.
        :param kwargs: dict, extra arguments containing training hyperparameters.
        :return train_results: dict, containing detailed results of training.
        """
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())    # initialize all weights

        # The weights were just re-initialized, so scores and the learning rate
        # left over from a previous run must not influence this one.
        self._reset()

        train_results = dict()    # dictionary to contain training(, evaluation) results and details
        train_size = self.train_set.num_examples
        num_steps_per_epoch = train_size // self.batch_size
        num_steps = self.num_epochs * num_steps_per_epoch
        use_val_set = self.val_set is not None
        if verbose:
            print('Running training loop...')
            print('Number of training iterations: {}'.format(num_steps))

        step_losses, step_scores, eval_scores = [], [], []
        start_time = time.time()

        # Start training loop
        for i in range(num_steps):
            # Perform a gradient update from a single minibatch
            step_loss, step_y_true, step_y_pred = self._step(sess, **kwargs)
            step_losses.append(step_loss)

            # Perform evaluation in the end of each epoch
            if (i+1) % num_steps_per_epoch == 0:
                # Evaluate model with current minibatch, from training set
                step_score = self.evaluator.score(step_y_true, step_y_pred)
                step_scores.append(step_score)

                # If validation set is initially given, use it for evaluation
                if use_val_set:
                    # Evaluate model with the validation set
                    eval_y_pred = self.model.predict(sess, self.val_set, verbose=False, **kwargs)
                    eval_score = self.evaluator.score(self.val_set.labels, eval_y_pred)
                    eval_scores.append(eval_score)

                    if verbose:
                        # Print intermediate results
                        print('[epoch {}]\tloss: {:.6f} |Train score: {:.6f} |Eval score: {:.6f} |lr: {:.6f}'\
                              .format(self.curr_epoch, step_loss, step_score, eval_score, self.curr_learning_rate))
                        # Plot intermediate results
                        plot_learning_curve(-1, step_losses, step_scores, eval_scores=eval_scores,
                                            mode=self.evaluator.mode, img_dir=save_dir)
                    curr_score = eval_score

                # else, just use results from current minibatch for evaluation
                else:
                    if verbose:
                        # Print intermediate results
                        print('[epoch {}]\tloss: {} |Train score: {:.6f} |lr: {:.6f}'\
                              .format(self.curr_epoch, step_loss, step_score, self.curr_learning_rate))
                        # Plot intermediate results
                        plot_learning_curve(-1, step_losses, step_scores, eval_scores=None,
                                            mode=self.evaluator.mode, img_dir=save_dir)
                    curr_score = step_score

                # Keep track of the current best model,
                # by comparing current score and the best score
                if self.evaluator.is_better(curr_score, self.best_score, **kwargs):
                    self.best_score = curr_score
                    self.num_bad_epochs = 0
                    saver.save(sess, os.path.join(save_dir, 'model.ckpt'))    # save current weights
                else:
                    # Only epochs without improvement count as "bad".
                    self.num_bad_epochs += 1

                self._update_learning_rate(**kwargs)
                self.curr_epoch += 1

        if verbose:
            print('Total training time(sec): {}'.format(time.time() - start_time))
            # Decide the label from the validation set, not from the builtin
            # `eval`, which is always truthy.
            print('Best {} score: {}'.format('evaluation' if use_val_set else 'training',
                                             self.best_score))

        if details:
            # Store training results in a dictionary
            train_results['step_losses'] = step_losses    # (num_iterations)
            train_results['step_scores'] = step_scores    # (num_epochs)
            if use_val_set:
                train_results['eval_scores'] = eval_scores    # (num_epochs)

        return train_results
class MomentumOptimizer(Optimizer):
    """Gradient descent optimizer, with Momentum algorithm."""

    def _optimize_op(self, **kwargs):
        """
        tf.train.MomentumOptimizer.minimize Op for a gradient update.
        :param kwargs: dict, extra arguments for optimizer.
            - momentum: float, the momentum coefficient.
        :return tf.Operation.
        """
        momentum = kwargs.pop('momentum', 0.9)

        update_vars = tf.trainable_variables()
        return tf.train.MomentumOptimizer(self.learning_rate_placeholder, momentum, use_nesterov=False)\
                .minimize(self.model.loss, var_list=update_vars)

    def _update_learning_rate(self, **kwargs):
        """
        Update current learning rate, when evaluation score plateaus.
        :param kwargs: dict, extra arguments for learning rate scheduling.
            - learning_rate_patience: int, number of epochs with no improvement
                                      after which learning rate will be reduced.
            - learning_rate_decay: float, factor by which the learning rate will be updated.
            - eps: float, if the difference between new and old learning rate is smaller than eps,
                   the update is ignored.
        """
        learning_rate_patience = kwargs.pop('learning_rate_patience', 10)
        learning_rate_decay = kwargs.pop('learning_rate_decay', 0.1)
        eps = kwargs.pop('eps', 1e-8)

        if self.num_bad_epochs > learning_rate_patience:
            new_learning_rate = self.curr_learning_rate * learning_rate_decay
            # Decay learning rate only when the difference is higher than epsilon.
            if self.curr_learning_rate - new_learning_rate > eps:
                self.curr_learning_rate = new_learning_rate
            # The bad-epoch counter restarts even when the decay is skipped.
            self.num_bad_epochs = 0


SequenceClassifierOutput has generator as loss instead of a tensor

I'm doing Distillation from a Roberta with an Adapter, I'm following this tutorial
and in the function distill_roberta_weights() I just change teacher_model.config.to_dict()
to student.load_state_dict(teacher.state_dict(), strict=False), so the student model has the adapter too.
But when I train the distilled student model using the DistillationTrainer class
(taken from here),
I get the following error
Do you have any idea of what is the problem?
The student output has a generator as its loss instead of a tensor. The cross-entropy part does not have any problem, because it uses the logits from the outputs.
I am adding more information
def distill_weights(teacher, student):
    """
    Recursively copies the weights of the (teacher) to the (student).
    This function is meant to be first called on a RobertaFor... model, but is then called on every children of that model recursively.
    The only part that's not fully copied is the encoder, of which only half is copied.
    """
    # If the part is an entire RoBERTa model or a RobertaFor..., unpack and iterate
    if isinstance(teacher, RobertaModel) or type(teacher).__name__.startswith('RobertaFor'):
        for teacher_part, student_part in zip(teacher.children(), student.children()):
            distill_weights(teacher_part, student_part)
    # Else if the part is an encoder, copy one out of every two layers
    elif isinstance(teacher, RobertaEncoder):
        teacher_encoding_layers = list(next(teacher.children()))
        student_encoding_layers = list(next(student.children()))
        for i, student_layer in enumerate(student_encoding_layers):
            # Student layer i takes teacher layer 2*i (layers 0, 2, 4, ...).
            # strict=False mirrors the head copy below so that extra or missing
            # keys do not abort the copy.
            student_layer.load_state_dict(teacher_encoding_layers[2 * i].state_dict(), strict=False)
    # Else the part is a head or something else, copy the state_dict
    else:
        student.load_state_dict(teacher.state_dict(), strict=False)
def distill_roberta_based(teacher_model):
    """
    Distills a RoBERTa (teacher_model) the way DistilBERT does for a BERT model.
    The student model has the same configuration, except for the number of hidden layers, which is // by 2.
    The student layers are initialized by copying one out of two layers of the teacher, starting with layer 0.
    The head of the teacher is also copied.

    :param teacher_model: the model to distill; its class is reused to build the student.
    :return: the freshly created student model with weights copied from the teacher.
    """
    # Set student configuration
    configuration = teacher_model.config.to_dict()
    configuration['num_hidden_layers'] //= 2
    configuration = RobertaConfig.from_dict(configuration)
    # create student model with the same class as the teacher
    student_model = type(teacher_model)(configuration)
    distill_weights(teacher=teacher_model, student=student_model)
    return student_model
#function for train the Distillated model
class DistillationTrainer(Trainer):
    """Trainer that adds a teacher-student distillation term to the student loss."""

    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        if self.teacher is not None:
            # place teacher on same device as student
            self.teacher.to(self.model.device)
            # The teacher only provides targets; keep dropout etc. disabled.
            self.teacher.eval()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        The distillation loss for distilling a BERT-like model.
        The loss combines the student's own loss with a cross-entropy between
        the student logits and the temperature-softened teacher distribution.
        The temperature and alpha are read from self.args.

        :param model: the student model being trained.
        :param inputs: dict of keyword arguments passed to both student and teacher.
        :param return_outputs: bool, whether to also return the student outputs.
        :param num_items_in_batch: unused; accepted so the override stays callable
            if the base Trainer passes it (assumption about newer versions - TODO confirm).
        :return: loss, or (loss, outputs_student) when return_outputs is True.
        """
        outputs_student = model(**inputs)
        student_loss = outputs_student.loss
        # compute teacher output
        with torch.no_grad():
            outputs_teacher = self.teacher(**inputs)
        # assert size
        assert outputs_student.logits.size() == outputs_teacher.logits.size()
        # Classification loss (problem-specific loss)
        loss_function = CrossEntropyLoss()
        temperature = self.args.temperature
        # CrossEntropyLoss applies log-softmax to its input itself, so the
        # student side must be raw (temperature-scaled) logits; only the
        # teacher side is converted to probabilities.
        student_logits = outputs_student.logits / temperature
        teacher_probs = F.softmax(outputs_teacher.logits / temperature, dim=-1)
        loss_logits = loss_function(student_logits, teacher_probs)
        # Return weighted student loss
        loss = self.args.alpha * student_loss + (1. - self.args.alpha) * loss_logits
        return (loss, outputs_student) if return_outputs else loss
#create the student
student_model_adapter = distill_roberta_based(teacher_model)
#activate adapter
trainer = DistillationTrainer(
trainer.args._n_gpu = 4
So, the desired output of outputs_student should be like
SequenceClassifierOutput(loss=tensor([0.6899, 0.6902, 0.6926, 0.6913, 0.6906, 0.6904, 0.6922, 0.6917],
device='cuda:0', grad_fn=<GatherBackward>), logits=tensor([[-1.2512e-03, -9.7885e-03],
[ 6.2714e-03, -5.7755e-03],.....])
But instead the output is
SequenceClassifierOutput(loss=<generator object gather.<locals>.gather_map.<locals>.<genexpr> at 0x7f5bb4fbe9d0>, logits=tensor([[-0.0150, 0.0075],
[-0.0122, 0.0181],...

Knowledge Distillation - distillation_loss: 0.0000e+00

I am implementing the Knowledge Distillation code as in keras documentation https://keras.io/examples/vision/knowledge_distillation/.
However, the distillation loss never rises above 0.
Tried to change the function in student from softmax to
tf.nn.log_softmax(student_predictions / self.temperature, axis=1)
and the result is way higher but still static.
This is the code:
class Distiller(keras.Model):
    """Keras model that trains a student network against a fixed teacher network."""

    def __init__(self, student, teacher):
        super(Distiller, self).__init__()
        self.teacher = teacher
        self.student = student

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
        """Configure the distiller.

        :param optimizer: Keras optimizer for the student weights.
        :param metrics: Keras metrics for evaluation.
        :param student_loss_fn: loss between student predictions and ground truth.
        :param distillation_loss_fn: loss between softened teacher and student predictions.
        :param alpha: weight of student_loss_fn; (1 - alpha) goes to distillation_loss_fn.
        :param temperature: temperature for softening the distributions.
        """
        super(Distiller, self).compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def train_step(self, data):
        """Run one optimization step on the student and return the metric dict."""
        # Unpack data
        x, y = data

        # Forward pass of teacher
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            student_predictions = self.student(x, training=True)

            # Compute losses
            student_loss = self.student_loss_fn(y, student_predictions)

            # Compute scaled distillation loss from https://arxiv.org/abs/1503.02531
            # The magnitudes of the gradients produced by the soft targets scale
            # as 1/T^2, multiply them by T^2 when using both hard and soft targets.
            # NOTE(review): both arguments are passed as probabilities, as in the
            # referenced Keras example; this assumes distillation_loss_fn (e.g.
            # keras.losses.KLDivergence) expects probabilities - TODO confirm.
            distillation_loss = (
                self.distillation_loss_fn(
                    tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                    tf.nn.softmax(student_predictions / self.temperature, axis=1),
                )
                * self.temperature**2
            )

            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss

        # Compute gradients
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y, student_predictions)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
        return results

    def test_step(self, data):
        """Evaluate the student on one batch and return the metric dict."""
        # Unpack the data
        x, y = data

        # Compute predictions
        y_prediction = self.student(x, training=False)

        # Calculate the loss
        student_loss = self.student_loss_fn(y, y_prediction)

        # Update the metrics.
        self.compiled_metrics.update_state(y, y_prediction)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results
...define both teacher and student models
# Initialize and compile distiller
distiller = Distiller(student=student, teacher=teacher)
# Distill teacher to student
distiller.fit(train_ds, validation_data = val_ds, epochs=1)

Pytorch freezes when checking dataloader

I am running this block of code for PyTorch and it seems to run forever/freeze in my notebook. I suspect it has something to do with my dataloader, but I can't figure out what is wrong. I am running this in a GPU environment; I previously ran a TensorFlow v2 Keras CNN model in the same environment and it worked.
In addition, I have also tried model.train(), and it also got stuck at the first epoch.
Code I am running
import time
start_time = time.time()
for data, label in train_dataloader:
print("Time taken: ", time.time() - start_time)
The dataloader is implemented with these lines of code
train_dataset = ChestXrayDataset("dataset/CheXpert-v1.0-small/train/train", train_data, IMAGE_SIZE, True)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
These are the parameters
IMAGE_SIZE = 224 # Image size (224x224)
IMAGENET_MEAN = [0.485, 0.456, 0.406] # Mean of ImageNet dataset (used for normalization)
IMAGENET_STD = [0.229, 0.224, 0.225] # Std of ImageNet dataset (used for normalization)
LEARNING_RATE_SCHEDULE_FACTOR = 0.1 # Parameter used for reducing learning rate
LEARNING_RATE_SCHEDULE_PATIENCE = 5 # Parameter used for reducing learning rate
MAX_EPOCHS = 100 # Maximum number of training epochs
I have checked the dataloader and this is what I got
<torch.utils.data.dataloader.DataLoader at 0x1f96cd5f6a0>
The class for ChestXrayDataset is shown here
class ChestXrayDataset(Dataset):
    """Dataset of chest X-ray images with a multi-hot label list per image."""

    def __init__(self, folder_dir, dataframe, image_size, normalization):
        """
        Init Dataset

        Parameters
        ----------
        folder_dir: str
            folder contains all images
        dataframe: pandas.DataFrame
            dataframe contains all information of images
        image_size: int
            image size to rescale
        normalization: bool
            whether applying normalization with mean and std from ImageNet or not
        """
        self.image_paths = []  # List of image paths
        self.image_labels = []  # List of image labels

        # Define list of image transformations
        image_transformation = [
            transforms.Resize((image_size, image_size)),
            # Normalize and __getitem__ both need a tensor, not a PIL image.
            transforms.ToTensor(),
        ]
        if normalization:
            # Normalization with mean and std from ImageNet
            image_transformation.append(transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD))
        self.image_transformation = transforms.Compose(image_transformation)

        # Get all image paths and image labels from dataframe
        for _, row in dataframe.iterrows():
            self.image_paths.append(os.path.join(folder_dir, row.Path))
            self.image_labels.append(self._labels_from_row(row))

    @staticmethod
    def _labels_from_row(row):
        """Return the label list for one dataframe row.

        Rows with fewer than 14 fields get 14 zeros; otherwise every field from
        position 5 onwards maps to 1 when it equals 1 and to 0 for anything else.
        """
        if len(row) < 14:
            return [0] * 14
        return [1 if value == 1 else 0 for value in row.iloc[5:]]

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, index):
        """
        Read image at index and convert to torch Tensor
        """
        # Read image
        image_path = self.image_paths[index]
        # The context manager closes the file handle as soon as the pixels are
        # converted, instead of leaving it open until garbage collection.
        with Image.open(image_path) as raw_image:
            image_data = raw_image.convert("RGB")  # Convert image to RGB channels

        # TODO: Image augmentation code would be placed here

        # Resize and convert image to torch tensor
        image_data = self.image_transformation(image_data)

        return image_data, torch.FloatTensor(self.image_labels[index])
Checking the length of dataframe.iterrows() and row[5:] would help.

Keras : using generators to output trainingset batches and targets but also auxiliary data not used for training

I need to use generators (because of too large datasets) to yield training data and targets to a CNN for training. However, each data sample is normalized (/maxVal) and I need to un-normalize/de-normalize it just before the loss function. I don't know how to output this auxiliary data at the same time as a batch of (X,Y) from the generator?
It is something very similar to https://towardsdatascience.com/keras-data-generators-and-how-to-use-them-b69129ed779c :
import numpy as np
import cv2
from tensorflow.keras.utils import Sequence
class DataGenerator(Sequence):
    """Generates data for Keras
    Sequence based data generator. Suitable for building data generator for training and prediction.
    """

    def __init__(self, list_IDs, labels, image_path, mask_path,
                 to_fit=True, batch_size=32, dim=(256, 256),
                 n_channels=1, n_classes=10, shuffle=True):
        """Initialization
        :param list_IDs: list of all 'label' ids to use in the generator
        :param labels: list of image labels (file names)
        :param image_path: path to images location
        :param mask_path: path to masks location
        :param to_fit: True to return X and y, False to return X only
        :param batch_size: batch size at each iteration
        :param dim: tuple indicating image dimension
        :param n_channels: number of image channels
        :param n_classes: number of output masks
        :param shuffle: True to shuffle label indexes after every epoch
        """
        super().__init__()
        self.list_IDs = list_IDs
        self.labels = labels
        self.image_path = image_path
        self.mask_path = mask_path
        self.to_fit = to_fit
        self.batch_size = batch_size
        self.dim = dim
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        # __getitem__ reads self.indexes, so build it before the first batch.
        self.on_epoch_end()

    def __len__(self):
        """Denotes the number of batches per epoch
        :return: number of batches per epoch
        """
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Generate one batch of data
        :param index: index of the batch
        :return: X and y when fitting. X only when predicting
        """
        # Generate indexes of the batch
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X = self._generate_X(list_IDs_temp)

        if self.to_fit:
            y = self._generate_y(list_IDs_temp)
            return self._scale_by_max(X), self._scale_by_max(y)
        return X

    @staticmethod
    def _scale_by_max(batch):
        """Divide a batch by its maximum; an all-zero batch is returned as floats unchanged."""
        peak = np.max(batch)
        if peak == 0:
            # Dividing by zero would fill the batch with NaN.
            return batch.astype(float)
        return batch / peak

    def on_epoch_end(self):
        """Updates indexes after each epoch
        """
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def _generate_X(self, list_IDs_temp):
        """Generates data containing batch_size images
        :param list_IDs_temp: list of label ids to load
        :return: batch of images
        """
        # Initialization
        X = np.empty((self.batch_size, *self.dim, self.n_channels))

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            X[i,] = self._load_grayscale_image(self.image_path + self.labels[ID])

        return X

    def _generate_y(self, list_IDs_temp):
        """Generates data containing batch_size masks
        :param list_IDs_temp: list of label ids to load
        :return: batch of masks
        """
        # NOTE(review): the integer dtype truncates the [0, 1] floats returned
        # by _load_grayscale_image - presumably masks are pure 0/255; confirm.
        y = np.empty((self.batch_size, *self.dim), dtype=int)

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            y[i,] = self._load_grayscale_image(self.mask_path + self.labels[ID])

        return y

    def _load_grayscale_image(self, image_path):
        """Load grayscale image
        :param image_path: path to image to load
        :return: loaded image, scaled by 1/255
        :raises FileNotFoundError: if the image cannot be read
        """
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError('Could not read image: {}'.format(image_path))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        img = img / 255
        return img
So, if I have understood your need correctly, what you need to do:
Fit a MinMaxScaler on your whole target (y) dataset (if possible)
For each batch
Scale your batch's targets
Yield your batch's targets
Create a custom loss function that takes your scaler as an argument
Call your scaler's inverse_transform on your y_true and y_pred in your custom loss
Call your favorite loss function on your de-normalized y_true and y_pred and return its value

Difference between Parameter vs. Tensor in PyTorch

I would like to know the difference between PyTorch Parameter and Tensor?
The existing answer is for old PyTorch, where Variables were still being used.
This is the whole idea of the Parameter class (attached) in a single image.
Since it is sub-classed from Tensor it is a Tensor.
But there is a trick. Parameters that are inside of a module are added to the list of Module parameters. If m is your module m.parameters() will hold your parameter.
Here is the example:
class M(nn.Module):
    """Tiny module whose weights and bias are registered as parameters."""

    def __init__(self):
        # nn.Module must be initialized before any nn.Parameter is assigned,
        # otherwise the assignment raises an AttributeError.
        super().__init__()
        self.weights = nn.Parameter(torch.randn(2, 2))
        self.bias = nn.Parameter(torch.zeros(2))

    def forward(self, x):
        """Apply the affine map x @ weights + bias."""
        return x @ self.weights + self.bias
[Parameter containing:
tensor([[ 0.5527, 0.7096],
[-0.2345, -1.2346]], requires_grad=True), Parameter containing:
tensor([0., 0.], requires_grad=True)]
You see how the parameters will show what we defined.
And if we just add a tensor inside a class, like self.t = Tensor, it will not show in the parameters list. That is literally it. Nothing fancy.
Adding to @prosti's answer: an nn.Module class doesn't always explicitly know which Tensor objects it should optimize. Going through this simple, commented piece of code should clarify it further.
import torch
from torch import nn
# Simple Objective : Learn a function that maps [1,1] -> [0,0]
x = torch.ones(2) # input tensor
y = torch.zeros(2) # expected output
# Model 1
class M1(nn.Module):
    """Model whose weights and bias are nn.Parameter objects, so they are registered."""

    def __init__(self):
        # nn.Module must be initialized before any nn.Parameter is assigned.
        super().__init__()
        self.weights = nn.Parameter(torch.randn(2, 2))
        self.bias = nn.Parameter(torch.zeros(2))

    def forward(self, x):
        """Apply the affine map x @ weights + bias."""
        return x @ self.weights + self.bias
# Model 2
class M2(nn.Module):
    """Model whose weights and bias are plain tensors, so they are NOT registered."""

    def __init__(self):
        # Without this call the module's internal state is missing, and both
        # calling the model and .parameters() fail.
        super().__init__()
        # though the Tensor Objects below can undergo backprop and minimize some loss
        # our model class doesn't know, it should use these tensors during optimization
        self.weights = torch.randn(2,2).requires_grad_(True)
        self.bias = torch.zeros(2).requires_grad_(True)

    def forward(self, x):
        """Apply the affine map x @ weights + bias."""
        return x @ self.weights + self.bias
# Instantiate both models; m1 is needed by the training calls further down.
m1 = M1()
m2 = M2()
# Bunch of parameters get printed
print('Model 1 params : ')
print(list(m1.parameters()))
# This is empty, meaning, there is no parameter for model to optimize
# In the forward pass, model just knows to use these
# `weight` and `bias` tensor to do some operations over the input.
# But model doesn't know, it should optimize over those `weight` and `bias` tensors objects
print('Model 2 params : ')
print(list(m2.parameters()))
# Initialize the loss function
loss_fn = nn.MSELoss(reduction='mean')
## ===== Training ===== ##
# Trainer
def train_loop(model, loss_fn=loss_fn):
    """Run five SGD steps of `model` on the module-level (x, y) pair, printing each loss.

    :param model: nn.Module to train; its registered parameters are optimized.
    :param loss_fn: callable taking (prediction, target) and returning a scalar loss.
    """
    # Simple optimizer
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    for _ in range(5):
        # Compute prediction and loss
        pred = model(x)
        loss = loss_fn(pred, y)
        # Backpropagation
        optimizer.zero_grad()   # drop gradients left over from the previous step
        loss.backward()
        optimizer.step()
        print(f"loss > {loss.item()}")
# ====== Train Model 1 ====== #
# The loss is expected to keep decreasing, as model 1 finds better weights
# and bias for mapping the input x to the expected output y.
train_loop( m1 )
# ====== Trying to Train Model 2 ====== #
# Code breaks at this line inside train_loop:
#     optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# The reason is that model 2 has no parameters to optimize.
train_loop( m2 )
For further clarification, check out this short blog implementing pytorch's nn.Linear module.
