How to preprocess and feed data to keras model? - python-3.x

I have a dataset with two columns, path and class. I'd like to fine tune VGGface with it.
dataset.head(5):
              path   class
0  /f3_224x224.jpg     red
1  /bc_224x224.jpg  orange
2  /1c_224x224.jpg   brown
3  /4b_224x224.jpg     red
4  /0c_224x224.jpg  yellow
I'd like to use these paths to preprocess the images and feed them to Keras. My preprocessing function is below:
import numpy as np
from keras.preprocessing.image import img_to_array, load_img
from keras_vggface import utils  # provides the VGGFace-specific preprocess_input

def prep_image(photo):
    # 'path' here is the images' base directory, defined elsewhere
    img = load_img(path + photo, target_size=(224, 224))
    x = img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = utils.preprocess_input(x, version=1)  # version=1 for the VGG16-based VGGFace
    return x
I prepare my datasets with the following code:
from sklearn.model_selection import train_test_split

cols = list(dataset.columns.values)  # renamed from 'path', which would otherwise shadow the base directory used in prep_image
cols.remove('class')
X = dataset[cols]
y = dataset['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
I train my model with the following code:
nb_class = 4
hidden_dim = 512
vgg_model = VGGFace(include_top=False, input_shape=(224, 224, 3))
last_layer = vgg_model.get_layer('pool5').output
x = Flatten(name='flatten')(last_layer)
x = Dense(hidden_dim, activation='relu', name='fc6')(x)
x = Dense(hidden_dim, activation='relu', name='fc7')(x)
out = Dense(nb_class, activation='softmax', name='fc8')(x)
custom_vgg_model = Model(vgg_model.input, out)
custom_vgg_model.compile(
optimizer="adam",
loss="categorical_crossentropy"
)
custom_vgg_model.fit(X_train, y_train, epochs=50, batch_size=16)
test_loss, test_acc = model.evaluate(X_test, y_test)
However, I get a ValueError because I can't figure out how to preprocess the images and feed the resulting arrays to the model. How can I transform the paths in the X_train/X_test dataframes into the output of the prep_image function?
ValueError: Error when checking input: expected input_2 to have 4 dimensions, but got array with shape (50297, 1)
So the shape should be (50297, 224, 224, 3).

X_train and X_test are basically just path names, it seems. In your data-preparation step you just need to extend your code as below, converting the paths to arrays in the last two lines:
import numpy as np
from sklearn.model_selection import train_test_split

cols = list(dataset.columns.values)
cols.remove('class')
X = dataset[cols]
y = dataset['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# iterate over the 'path' column (iterating over the DataFrame itself yields column names)
X_train = np.array([prep_image(p)[0] for p in X_train['path']])
X_test = np.array([prep_image(p)[0] for p in X_test['path']])
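
Two caveats on top of that. First, with loss="categorical_crossentropy" the string labels also have to be one-hot encoded before calling fit; a minimal sketch using pandas, which would replace the y = dataset['class'] line (encoding before the split keeps the column order consistent between train and test):

import pandas as pd

# one-hot encode the four class names into a (n_samples, 4) float array
y = pd.get_dummies(dataset['class']).astype('float32').values

Second, materializing everything at once is memory-hungry: 50297 samples of 224 x 224 x 3 float32 values is roughly 30 GB, so on most machines the arrays would have to be built in batches (e.g. with a generator) instead.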

Related

log loss computed manually diverging from cross_val_score in scikit-learn

I have a question about how cross_val_score() from scikit-learn works. I tried to divide the dataset into 10 folds with KFold() and compute the log loss of both the training and validation sets for each fold. However, I got different answers using cross_val_score with the parameter scoring = 'neg_log_loss'.
X and y are arrays of shape (1800, 12) and (1800, 1), respectively.
kfold = KFold(n_splits=10)
train_loss = []
val_loss = []
for train_index, val_index in kfold.split(X, y):
    clf_logreg = LogisticRegression()
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    clf_logreg.fit(X_train, y_train)
    y_train_pred = clf_logreg.predict(X_train)
    y_val_pred = clf_logreg.predict(X_val)
    train_loss.append(log_loss(y_train, y_train_pred))
    val_loss.append(log_loss(y_val, y_val_pred))
clf_logreg.fit(X,y)
y_error = cross_val_score(clf_logreg, X, y, cv=kfold, scoring='neg_log_loss')
print("cross_val log_loss: ", -y_error)
print("\ntrain_loss: ", train_loss)
print("\nval_loss: ", val_loss)
The answers I got:
cross_val log_loss: [0.18546779 0.18002459 0.21591202 0.15872213 0.22852112 0.18766844
0.28641203 0.14923009 0.21446935 0.20373971]
train_loss: [2.79298449379999, 2.7290223160363962, 2.558457002245472, 2.835624958485065, 2.5797806896386337, 2.622420660745048, 2.5797797024813125, 2.6224201671663874, 2.5797782217453302, 2.6863818513513213]
val_loss: [1.9188431218680995, 2.1107385395747733, 3.645826363693089, 2.110734097366828, 3.2620355282797417, 2.686367043991502, 3.453913177154633, 2.4944849529086657, 2.8782624616981765, 2.4944938373245567]
As Ben Reiniger noted in the comments, log_loss expects probabilities in y_train_pred and y_val_pred. So you need to change
clf_logreg.predict
to:
clf_logreg.predict_proba
Example:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, cross_val_score
X, y = load_iris(return_X_y=True)
y = y == 1
kfold = KFold(n_splits=10, random_state=1, shuffle=True)
train_loss = []
val_loss = []
for train_index, val_index in kfold.split(X, y):
    clf_logreg = LogisticRegression()
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    clf_logreg.fit(X_train, y_train)
    y_train_pred = clf_logreg.predict_proba(X_train)
    y_val_pred = clf_logreg.predict_proba(X_val)
    train_loss.append(log_loss(y_train, y_train_pred))
    val_loss.append(log_loss(y_val, y_val_pred))
clf_logreg.fit(X, y)
y_error = cross_val_score(clf_logreg, X, y, cv=kfold, scoring="neg_log_loss")
print("cross_val log_loss: ", -y_error)
print("\nval_loss: ", val_loss)
Results:
cross_val log_loss: [0.53548503 0.54200945 0.60324094 0.64781483 0.43323992 0.37625601
0.55101127 0.46172226 0.50216316 0.64359642]
val_loss: [0.5354850268015129, 0.5420094471965571, 0.6032409439788419, 0.647814828089315, 0.43323991804482626, 0.3762560144867495, 0.5510112741331039, 0.46172225526408, 0.5021631570133954, 0.6435964210060579]
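
The inflated values in the original loop come from feeding hard 0/1 predictions to log_loss: it clips the predicted probabilities away from 0 and 1 (with the historical default eps=1e-15; newer scikit-learn versions changed this handling), so every misclassified sample contributes about -log(1e-15), roughly 34.5, to the average. A minimal sketch of the difference:

from sklearn.metrics import log_loss

y_true = [1, 0, 0, 0]
hard = [1, 0, 0, 1]          # one wrong hard prediction
soft = [0.9, 0.1, 0.2, 0.4]  # predicted probabilities of the positive class

print(log_loss(y_true, hard))  # ~8.6 with eps=1e-15 (= ~34.5 / 4), dominated by the single error
print(log_loss(y_true, soft))  # ~0.24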

Training a 1D conv model in Keras

I am trying to train a simple 1D conv model in Keras. I have 1D data in an Excel file. I first read the data in Python, then build the training dataset, then split it into training and testing sets. But when I train the simple 1D conv model, I get the error shown below. Please guide me on how to solve this issue.
training_data = []
def create_training_data():
    for label in labels:
        dir_path = os.path.join(path_dir, label)
        class_num = labels.index(label)
        file_list = os.listdir(dir_path)
        for file_name in file_list:
            img_path = os.path.join(dir_path, file_name)
            dir_split = dir_path.split('\\')
            training_data.append([img_path, class_num])
create_training_data()
print(len(training_data))
import random
random.shuffle(training_data)
for sample in training_data[:10]:
    print(sample[1])
X = []
Y = []
for features, classes in training_data:
    X.append(features)
    Y.append(classes)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
len(y_test)
import numpy as np
#inp = np.array(X_train)
#print(inp.shape)
kernel_size = 3
x = Conv1D(filters=32, kernel_size=kernel_size, activation="relu")(inp)
x = MaxPooling1D(2)(x)
x = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(x)
x = MaxPooling1D(2)(x)
x = Conv1D(filters=64, kernel_size=kernel_size, activation='relu')(x)
x = MaxPooling1D(2)(x)
x = Conv1D(filters=64, kernel_size=kernel_size, activation='relu')(x)
x = MaxPooling1D(2)(x)
x = Flatten()(x)
x = Dense(4, activation="softmax")(x)
return Model(inputs=inp, outputs=x)
I am getting the following error:
All inputs to the layer should be tensors.
How can I solve this problem?
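
For what it's worth, this error usually means inp is a NumPy array (or left undefined, as in the commented-out lines) rather than a Keras Input tensor, and X as built above still holds file paths, not numeric data. A minimal sketch of the wiring, assuming each sample is a 1D sequence of hypothetical length seq_len with one channel:

import numpy as np
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense
from keras.models import Model

seq_len = 128  # hypothetical: use the real length of your 1D samples

def build_model():
    inp = Input(shape=(seq_len, 1))  # a Keras tensor, not a NumPy array
    x = Conv1D(filters=32, kernel_size=3, activation='relu')(inp)
    x = MaxPooling1D(2)(x)
    x = Flatten()(x)
    out = Dense(4, activation='softmax')(x)
    return Model(inputs=inp, outputs=out)

model = build_model()
# before fitting, each path in X_train must be loaded into a numeric
# array so the batch has shape (n_samples, seq_len, 1)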

Model fit after each "fold" in LOOCV? Multilabel LOOCV by hand

I am wondering whether there is an issue (possible data leakage?) when implementing leave-one-out cross-validation by hand, given that the same model is fit again on each iteration after testing on each fold. If the model is trained on all data except "X" and tested on "X", and then trained on all data other than "Y" and tested on "Y", it has already seen "Y" during the first iteration. Is this actually a problem, and does my implementation of LOOCV by hand look correct?
Thanks for your time!
i = 0
j = 0
for i in range(0, 41):
    X_copy = X_orig[i:(i + 1)]  # slice the ith element from the numpy array
    y_copy = y_orig[i:(i + 1)]
    X_model = np.delete(X_orig, i, axis=0)
    y_model = np.delete(y_orig, i, axis=0)
    model.fit(X_model, y_model, epochs=115, batch_size=28, verbose=0)  # verbose=0 removes learning info
    prediction = model.predict(X_copy)
    prediction[prediction >= 0.5] = 1
    prediction[prediction < 0.5] = 0
    print(prediction, y_copy)
    if np.array_equal(y_copy, prediction):
        j = j + 1
        #print(y_copy, prediction)
    if np.not_equal:  # bug: this tests the function object itself, which is always truthy
        #print(y_copy, prediction)
        pass
print(j/41) #For 41 samples in dataset
Why don't you use this?
from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()
model = ...
test_fold_predictions = []
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(X_train, y_train)
    test_fold_predictions.append(model.predict(X_test))
EDIT
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD

model = Sequential()
model.add(Dense(5000, activation='relu', input_dim=X_train.shape[1]))
model.add(Dropout(0.1))
model.add(Dense(600, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(y_train.shape[1], activation='sigmoid'))
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='binary_crossentropy', optimizer=sgd)

from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()
test_fold_predictions = []
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(X_train, y_train, epochs=5, batch_size=2000)
    test_fold_predictions.append(model.predict(X_test))
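
One caveat, and it is exactly the leakage the question asks about: a Keras model keeps its weights between fit calls, so building the model once outside the loop means every fold after the first starts from weights already trained on data that includes its test sample. A safer sketch rebuilds the model inside each fold:

def build_model(n_features, n_outputs):
    model = Sequential()
    model.add(Dense(5000, activation='relu', input_dim=n_features))
    model.add(Dropout(0.1))
    model.add(Dense(600, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(n_outputs, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer=SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True))
    return model

test_fold_predictions = []
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = build_model(X.shape[1], y.shape[1])  # fresh weights for every fold
    model.fit(X_train, y_train, epochs=5, batch_size=2000)
    test_fold_predictions.append(model.predict(X_test))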

dimension-related problem in training LightGBM for multiclass multilabel classification?

I would like to classify with the LightGBM algorithm for multiclass multilabel classification, but I encounter a problem during training because the label input is not in the expected format. The dataset has 10000 rows.
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:,np.r_[0:6, 7:27]].values
y = dataset.iloc[:,np.r_[6]].values
x_train, x_test, y_train, y_test = train_test_split(X, y,test_size = 0.25, random_state = 0)
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
import lightgbm as lgb
d_train = lgb.Dataset(x_train, label=y_train)
params = {}
params['learning_rate'] = 0.003
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'
params['sub_feature'] = 0.5
params['num_leaves'] = 10
params['min_data'] = 50
params['max_depth'] = 10
clf = lgb.train(params, d_train, 100)
y_pred=clf.predict(x_test)
for i in range(0, 99):
    if y_pred[i] >= .5:
        y_pred[i] = 1
    else:
        y_pred[i] = 0
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
I encounter this problem:
clf = lgb.train(params, d_train, 100)
File "..\lightgbm\engine.py", line 228, in train
...
File "..\lightgbm\basic.py", line 1336, in set_label
label = list_to_1d_numpy(_label_from_pandas(label), name='label')
File "..\lightgbm\basic.py", line 86, in list_to_1d_numpy
"It should be list, numpy 1-D array or pandas Series".format(type(data).__name__, name))
This error is raised in basic.py, in a function whose docstring reads "Convert data to numpy 1-D array." But when I changed my data to 1-D with
y_train = np.reshape(y_train, [1,trainsize])
x_train = np.reshape(x_train, [1,trainsize*26])
The problem is not solved!
Then I used ravel to make x_train and y_train 1-D:
x_train = np.ravel(x_train)
y_train = np.ravel(y_train)
but a new error is shown:
\lib\site-packages\lightgbm\basic.py", line 872, in __init_from_np2d
raise ValueError('Input numpy.ndarray must be 2 dimensional')
ValueError: Input numpy.ndarray must be 2 dimensional
What is wrong? How can I solve this?
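
For what it's worth, the two errors point in opposite directions: lgb.Dataset expects the feature matrix to stay 2-dimensional and only the label to be 1-dimensional, so only y_train should be flattened, never x_train. A minimal sketch of a true multiclass setup, with a hypothetical num_class (the 'binary' objective above only handles two classes):

import numpy as np
import lightgbm as lgb

# keep x_train 2-D, shape (n_samples, n_features); flatten only the label
d_train = lgb.Dataset(x_train, label=np.ravel(y_train))

params = {
    'learning_rate': 0.003,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',   # 'binary' fits exactly two classes
    'num_class': 4,              # hypothetical: set to the real number of classes
    'metric': 'multi_logloss',
    'num_leaves': 10,
    'max_depth': 10,
}
clf = lgb.train(params, d_train, 100)

y_pred = clf.predict(x_test)               # shape (n_samples, num_class) of probabilities
y_pred_labels = np.argmax(y_pred, axis=1)  # pick the most probable class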

Memory used up for loading data alone in Keras program

My code trains VGG16 on custom data with two classes, diseased and not diseased. I have around 3400 images, and the problem occurs while loading the dataset into memory: the process uses 99% of RAM and gets stuck, eventually leading to the blue screen of death. I am using Spyder; when I followed another example with a smaller dataset it worked fine. Can anyone suggest an efficient way to run this without loading all the images into memory?
P.S.: my system is capable of running deep-learning code.
import numpy as np
import os
import time
from vgg16 import VGG16
from keras.preprocessing import image
from imagenet_utils import preprocess_input, decode_predictions
from keras.layers import Dense, Activation, Flatten
from keras.layers import merge, Input
from keras.models import Model
from keras.utils import np_utils
from sklearn.utils import shuffle
from sklearn.cross_validation import train_test_split
# Loading the training data
PATH = os.getcwd()
# Define data path
data_path = PATH + '/data'
data_dir_list = os.listdir(data_path)
img_data_list=[]
for dataset in data_dir_list:
    img_list = os.listdir(data_path + '/' + dataset)
    print('Loaded the images of dataset-' + '{}\n'.format(dataset))
    for img in img_list:
        img_path = data_path + '/' + dataset + '/' + img
        img = image.load_img(img_path, target_size=(224, 224))
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)
        # x = x/255
        print('Input image shape:', x.shape)
        img_data_list.append(x)
img_data = np.array(img_data_list)
#img_data = img_data.astype('float32')
print (img_data.shape)
img_data=np.rollaxis(img_data,1,0)
print (img_data.shape)
img_data=img_data[0]
print (img_data.shape)
# Define the number of classes
num_classes = 2
num_of_samples = img_data.shape[0]
labels = np.ones((num_of_samples,),dtype='int64')
labels[0:2345]=0
labels[2245:3567]=1  # note: this range overlaps the one above (2245 < 2345), which looks like a typo
names = ['YES','NO']
# convert class labels to one-hot encoding
Y = np_utils.to_categorical(labels, num_classes)
#Shuffle the dataset
x,y = shuffle(img_data,Y, random_state=2)
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)
#########################################################################################
# Custom_vgg_model_1
#Training the classifier alone
image_input = Input(shape=(224, 224, 3))
model = VGG16(input_tensor=image_input, include_top=True,weights='imagenet')
model.summary()
last_layer = model.get_layer('fc2').output
#x= Flatten(name='flatten')(last_layer)
out = Dense(num_classes, activation='softmax', name='output')(last_layer)
custom_vgg_model = Model(image_input, out)
custom_vgg_model.summary()
for layer in custom_vgg_model.layers[:-1]:
    layer.trainable = False
custom_vgg_model.layers[3].trainable
custom_vgg_model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['accuracy'])
t=time.time()
# t = now()
hist = custom_vgg_model.fit(X_train, y_train, batch_size=32, epochs=12, verbose=1, validation_data=(X_test, y_test))
print('Training time: %s' % (t - time.time()))
(loss, accuracy) = custom_vgg_model.evaluate(X_test, y_test, batch_size=10, verbose=1)
print("[INFO] loss={:.4f}, accuracy: {:.4f}%".format(loss,accuracy * 100))
####################################################################################################################
#Training the feature extraction also
image_input = Input(shape=(224, 224, 3))
model = VGG16(input_tensor=image_input, include_top=True,weights='imagenet')
model.summary()
last_layer = model.get_layer('block5_pool').output
x= Flatten(name='flatten')(last_layer)
x = Dense(128, activation='relu', name='fc1')(x)
x = Dense(128, activation='relu', name='fc2')(x)
out = Dense(num_classes, activation='softmax', name='output')(x)
custom_vgg_model2 = Model(image_input, out)
custom_vgg_model2.summary()
# freeze all the layers except the dense layers
for layer in custom_vgg_model2.layers[:-3]:
    layer.trainable = False
custom_vgg_model2.summary()
custom_vgg_model2.compile(loss='categorical_crossentropy',optimizer='adadelta',metrics=['accuracy'])
t=time.time()
# t = now()
hist = custom_vgg_model2.fit(X_train, y_train, batch_size=32, epochs=12, verbose=1, validation_data=(X_test, y_test))
print('Training time: %s' % (t - time.time()))
(loss, accuracy) = custom_vgg_model2.evaluate(X_test, y_test, batch_size=10, verbose=1)
print("[INFO] loss={:.4f}, accuracy: {:.4f}%".format(loss,accuracy * 100))
#%%
import matplotlib.pyplot as plt
# visualizing losses and accuracy
train_loss=hist.history['loss']
val_loss=hist.history['val_loss']
train_acc=hist.history['acc']
val_acc=hist.history['val_acc']
xc=range(12)
plt.figure(1,figsize=(7,5))
plt.plot(xc,train_loss)
plt.plot(xc,val_loss)
plt.xlabel('num of Epochs')
plt.ylabel('loss')
plt.title('train_loss vs val_loss')
plt.grid(True)
plt.legend(['train','val'])
#print plt.style.available # use bmh, classic,ggplot for big pictures
plt.style.use(['classic'])
plt.figure(2,figsize=(7,5))
plt.plot(xc,train_acc)
plt.plot(xc,val_acc)
plt.xlabel('num of Epochs')
plt.ylabel('accuracy')
plt.title('train_acc vs val_acc')
plt.grid(True)
plt.legend(['train','val'],loc=4)
#print plt.style.available # use bmh, classic,ggplot for big pictures
plt.style.use(['classic'])
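
Since the whole dataset never needs to be in RAM at once, the usual fix is to stream batches from disk instead of building img_data up front. A minimal sketch with Keras' ImageDataGenerator, assuming the two class folders sit directly under data_path and that the preprocess_input imported above accepts a single rank-3 image array (validation_split/subset require Keras >= 2.1.5):

from keras.preprocessing.image import ImageDataGenerator

datagen = ImageDataGenerator(preprocessing_function=preprocess_input,
                             validation_split=0.2)

train_gen = datagen.flow_from_directory(data_path, target_size=(224, 224),
                                        batch_size=32, class_mode='categorical',
                                        subset='training')
val_gen = datagen.flow_from_directory(data_path, target_size=(224, 224),
                                      batch_size=32, class_mode='categorical',
                                      subset='validation')

# only one batch of images is decoded and held in memory at a time
hist = custom_vgg_model.fit_generator(train_gen, epochs=12,
                                      validation_data=val_gen)

With this, the labels come from the directory names, so the manual labels array and the train/test split above are no longer needed.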
