I am trying to train a T5 model for an NER problem, but whenever I try to load the data it fails.
Here is how I load the tokenizer:
tokenizer = AutoTokenizer.from_pretrained("t5-base")
training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)
Then I created the DataLoaders as follows:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': True,
               'num_workers': 0
               }
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
I was facing issues training the model, so I tried to debug the data loading like this:
for idx, batch in enumerate(training_loader):
    print(idx)
    break
But I am getting this error:
<ipython-input-8-fa9e34b10c06> in __getitem__(self, index)
34 if mapping[0] == 0 and mapping[1] != 0:
35 # overwrite label
---> 36 encoded_labels[idx] = labels[i]
37 i += 1
38
IndexError: list index out of range
I checked the dataset; it is not empty and contains data, so I am not sure what is wrong.
I tried the same code with BERT by changing this line:
tokenizer = AutoTokenizer.from_pretrained("t5-base")
To:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
and it worked perfectly, so the issue is specific to T5.
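As a quick sanity check (an illustrative snippet; any short pre-split sentence will do), printing the offset mappings the two fast tokenizers produce for the same words shows that they need not be laid out identically, which is why the "first offset is 0" heuristic can fire a different number of times than there are word labels:

from transformers import AutoTokenizer

words = ["New", "York", "City"]  # any short pre-split sentence
for name in ("bert-base-uncased", "t5-base"):
    tok = AutoTokenizer.from_pretrained(name)
    enc = tok(words, is_split_into_words=True, return_offsets_mapping=True)
    # compare how many tokens satisfy offset[0] == 0 and offset[1] != 0
    print(name, enc.tokens(), enc["offset_mapping"])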
Here is the Dataset class:
class dataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: get the sentence and word labels
        sentence = self.data.sentence[index].strip().split()
        word_labels = self.data.word_labels[index].split(",")

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  truncation=True,
                                  padding='max_length',
                                  max_length=self.max_len)

        # step 3: create token labels only for the first word piece of each tokenized word
        labels = [labels_to_ids[label] for label in word_labels]
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an array of -100 (ignored by the loss) of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100

        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
            if mapping[0] == 0 and mapping[1] != 0:
                # overwrite label
                encoded_labels[idx] = labels[i]
                i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

    def __len__(self):
        return self.len
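A tokenizer-agnostic way to do the alignment (a minimal sketch, with a hypothetical helper name) is to use the fast tokenizer's word_ids() instead of the offset heuristic: word_ids() maps every token back to the index of its source word, so it behaves the same way for BERT's WordPiece and T5's SentencePiece vocabularies:

def align_labels(encoding, labels):
    # encoding must come from a *fast* tokenizer called with is_split_into_words=True
    encoded_labels = []
    previous_word_id = None
    for word_id in encoding.word_ids():
        if word_id is None:                 # special tokens / padding
            encoded_labels.append(-100)
        elif word_id != previous_word_id:   # first sub-token of a word
            encoded_labels.append(labels[word_id])
        else:                               # remaining sub-tokens of the same word
            encoded_labels.append(-100)
        previous_word_id = word_id
    return encoded_labels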
I have code for a classification decision tree, and I'm trying to convert it into a regression tree.
I understand that I need to change the impurity measure.
In classification I have Gini and entropy.
In regression I need to use the sum of squared residuals (SSR).
If I understand correctly, I need to change the information_gain function so that it calculates the SSR.
Can someone help me understand how I should change it?
class DecisionTreeClassifier():
    def __init__(self, min_samples_split=2, max_depth=2):
        ''' constructor '''
        # the root of the tree
        self.root = None
        # stopping conditions:
        # if the number of samples drops below min_samples_split we stop and make a leaf,
        # same for the maximum depth
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def build_tree(self, dataset, curr_depth=0):
        ''' recursive function to build the tree '''
        # split the features and the target
        X, Y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_features = np.shape(X)
        # split until the stopping conditions are met
        if num_samples >= self.min_samples_split and curr_depth <= self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # check that the information gain is positive; if it equals 0, the node is pure
            if best_split["info_gain"] > 0:
                # recurse left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth + 1)
                # recurse right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth + 1)
                # return decision node
                return Node(best_split["feature_index"], best_split["threshold"],
                            left_subtree, right_subtree, best_split["info_gain"])
        # calculate leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)

    def get_best_split(self, dataset, num_samples, num_features):
        ''' function to find the best split '''
        # dictionary to store the best split
        best_split = {}
        # we want to maximize the gain, so start from a value lower than any possible gain
        max_info_gain = -float("inf")
        # loop over all the features
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            # the unique values of this particular feature
            possible_thresholds = np.unique(feature_values)
            # loop over all the feature values present in the data
            for threshold in possible_thresholds:
                # get the current split
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # check that neither child is empty
                if len(dataset_left) > 0 and len(dataset_right) > 0:
                    # get the target values
                    y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                    # compute the information gain
                    curr_info_gain = self.information_gain(y, left_y, right_y, "gini")
                    # if the current information gain is larger than the best seen so far,
                    # update the best split
                    if curr_info_gain > max_info_gain:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["info_gain"] = curr_info_gain
                        max_info_gain = curr_info_gain
        # return the best split
        return best_split

    def split(self, dataset, feature_index, threshold):
        ''' function to split the data '''
        # take the dataset, a feature index and a threshold value and split the data into
        # two parts (left and right children) using <= / > the threshold
        dataset_left = np.array([row for row in dataset if row[feature_index] <= threshold])
        dataset_right = np.array([row for row in dataset if row[feature_index] > threshold])
        return dataset_left, dataset_right

    def information_gain(self, parent, l_child, r_child, mode="gini"):
        ''' function to compute information gain '''
        # calculate the weights: child size / parent size
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        # calculate the gain with Gini (or entropy)
        if mode == "gini":
            gain = self.gini_index(parent) - (weight_l * self.gini_index(l_child) + weight_r * self.gini_index(r_child))
        else:
            gain = self.entropy(parent) - (weight_l * self.entropy(l_child) + weight_r * self.entropy(r_child))
        return gain

    # for this homework we do not need entropy, but it is nice to have
    '''def entropy(self, y):
        # function to compute entropy
        class_labels = np.unique(y)
        entropy = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            entropy += -p_cls * np.log2(p_cls)
        return entropy'''

    def gini_index(self, y):
        ''' function to compute the Gini index '''
        class_labels = np.unique(y)
        gini = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            gini += p_cls**2
        return 1 - gini

    def calculate_leaf_value(self, Y):
        ''' function to compute the leaf node value '''
        # find the most frequent element in Y
        Y = list(Y)
        return max(Y, key=Y.count)

    def print_tree(self, tree=None, indent=" "):
        ''' recursive function to print the tree '''
        if not tree:
            tree = self.root
        if tree.value is not None:
            print(tree.value)
        else:
            print("X_" + str(tree.feature_index), "<=", tree.threshold, "?", tree.info_gain)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)

    def fit(self, X, Y):
        ''' function to train the tree '''
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)

    def predict(self, X):
        ''' function to predict a new dataset '''
        predictions = [self.make_prediction(x, self.root) for x in X]
        return predictions

    def make_prediction(self, x, tree):
        ''' function to predict a single data point '''
        if tree.value is not None:
            return tree.value
        feature_val = x[tree.feature_index]
        if feature_val <= tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)
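A minimal sketch of the change being asked about, assuming the rest of the class stays the same (the method name variance_reduction below is illustrative): since the variance of the targets in a node is just its SSR divided by the number of samples, replacing the Gini/entropy gain with a weighted variance reduction gives an SSR-based split criterion, and the leaf value becomes the mean target instead of the majority class.

def variance_reduction(self, parent, l_child, r_child):
    # regression "information gain": drop in weighted target variance after the split
    # (np.var(y) == SSR(y) / len(y), so this is an SSR-based criterion)
    weight_l = len(l_child) / len(parent)
    weight_r = len(r_child) / len(parent)
    return np.var(parent) - (weight_l * np.var(l_child) + weight_r * np.var(r_child))

def calculate_leaf_value(self, Y):
    # a regression leaf predicts the mean of the targets that reach it
    return np.mean(Y)

# and inside get_best_split, compute the gain with:
# curr_info_gain = self.variance_reduction(y, left_y, right_y)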
I'm new to machine learning and I'm doing my "hello world" with sklearn and nltk, but I have a problem with the prediction result: it always gives me a single value.
I am following a tutorial I found, which has errors, and I have been modifying it little by little until it finally produced a result, but it is not the expected one.
Here is the tutorial link: https://towardsdatascience.com/text-classification-using-k-nearest-neighbors-46fa8a77acc5
Here is my current code (it always shows "Conditions" as the final result):
import re
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import genesis
nltk.download('genesis')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
genesis_ic = wn.ic(genesis, False, 0.0)
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import roc_auc_score
from nltk.stem.wordnet import WordNetLemmatizer
class KNN_NLC_Classifer():
    def __init__(self, k=1, distance_type='path'):
        self.k = k
        self.distance_type = distance_type

    # This function is used for training
    def fit(self, x_train, y_train):
        self.x_train = x_train
        self.y_train = y_train

    # This function runs the K(1) nearest neighbour algorithm and
    # returns the label with the closest match.
    def predict(self, x_test):
        self.x_test = x_test
        y_predict = []
        for i in range(len(x_test)):
            max_sim = 0
            max_index = 0
            for j in range(self.x_train.shape[0]):
                temp = self.document_similarity(x_test[i], self.x_train[j])
                if temp > max_sim:
                    max_sim = temp
                    max_index = j
            y_predict.append(self.y_train[max_index])
        return y_predict

    def convert_tag(self, tag):
        """Convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets"""
        tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
        try:
            return tag_dict[tag[0]]
        except KeyError:
            return None

    def doc_to_synsets(self, doc):
        """
        Returns a list of synsets in document.
        Tokenizes and tags the words in the document doc.
        Then finds the first synset for each word/tag combination.
        If a synset is not found for that combination it is skipped.

        Args:
            doc: string to be converted

        Returns:
            list of synsets
        """
        tokens = word_tokenize(str(doc) + ' ')
        l = []
        tags = nltk.pos_tag([tokens[0] + ' ']) if len(tokens) == 1 else nltk.pos_tag(tokens)
        for token, tag in zip(tokens, tags):
            syntag = self.convert_tag(tag[1])
            syns = wn.synsets(token, syntag)
            if len(syns) > 0:
                l.append(syns[0])
        return l

    def similarity_score(self, s1, s2, distance_type='path'):
        """
        Calculate the normalized similarity score of s1 onto s2
        For each synset in s1, finds the synset in s2 with the largest similarity value.
        Sum all of the largest similarity values and normalize this value by dividing it by the
        number of largest similarity values found.

        Args:
            s1, s2: list of synsets from doc_to_synsets

        Returns:
            normalized similarity score of s1 onto s2
        """
        s1_largest_scores = []
        for i, s1_synset in enumerate(s1, 0):
            max_score = 0
            for s2_synset in s2:
                if distance_type == 'path':
                    score = s1_synset.path_similarity(s2_synset, simulate_root=False)
                else:
                    score = s1_synset.wup_similarity(s2_synset)
                if score is not None:
                    if score > max_score:
                        max_score = score
            if max_score != 0:
                s1_largest_scores.append(max_score)
        mean_score = np.mean(s1_largest_scores)
        return mean_score

    def document_similarity(self, doc1, doc2):
        """Finds the symmetrical similarity between doc1 and doc2"""
        synsets1 = self.doc_to_synsets(doc1)
        synsets2 = self.doc_to_synsets(doc2)
        return (self.similarity_score(synsets1, synsets2) + self.similarity_score(synsets2, synsets1)) / 2
#doc1 = 'I like rains'
#doc2 = 'I like showers'
#x = KNN_NLC_Classifer()
#print("Test Similarity Score: ", x.document_similarity(doc1, doc2))
# 1. Importing the dataset
#we'll use the demo dataset available at Watson NLC Classifier Demo.
FILENAME = "https://raw.githubusercontent.com/watson-developer-cloud/natural-language-classifier-nodejs/master/training/weather_data_train.csv"
dataset = pd.read_csv(FILENAME, header = None)
dataset.rename(columns = {0:'text', 1:'answer'}, inplace = True)
dataset['output'] = np.where(dataset['answer'] == 'temperature', 1,0)
Num_Words = dataset.shape[0]
print(dataset.head())
print("\nSize of input file is ", dataset.shape)
array = dataset.values
X = array[:,2]
Y = array[:,0]
validation_size = 0.20
seed = 7
# 4. Train the Classifier
classifier = KNN_NLC_Classifer(k=1, distance_type='path')
x_train,y_train = X,Y
classifier.fit(x_train, y_train)
final_test_list = ['will it rain', 'Is it hot outside?' , 'What is the expected high for today?' ,
'Will it be foggy tomorrow?', 'Should I prepare for sleet?',
'Will there be a storm today?', 'do we need to take umbrella today',
'will it be wet tomorrow', 'is it humid tomorrow', 'what is the precipitation today',
'is it freezing outside', 'is it cool outside', "are there strong winds outside",]
test_corpus = []
lmtzr = WordNetLemmatizer()
#ps = PorterStemmer()
for i in range(len(final_test_list)):
    review = re.sub('[^a-zA-Z]', ' ', final_test_list[i])
    review = review.lower()
    review = review.split()
    review = [lmtzr.lemmatize(word) for word in review]  # if not word in s
    review = ' '.join(review)
    test_corpus.append(review)
y_pred_final = classifier.predict(test_corpus)
output_df = pd.DataFrame(data = {'text': final_test_list, 'code': y_pred_final})
output_df['answer'] = np.where(output_df['code']==1, 'Temperature','Conditions')
print(output_df)
In the tutorial, the method similarity_score() tries to find the highest similarity for each synset in s1 and average them. However, it does not take into account the words in s1 for which no matching synset is found in s2. It makes more sense to me to add zeros to s1_largest_scores in those cases.
Take the two sentences "Will it be uncomfortably hot?" and "will it rain" as an example. The method in the tutorial gives a similarity of 1, while the method I proposed gives a similarity of 0.53. The sentences are in different categories, so we would like the similarity to be low.
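Concretely, the only functional change is inside similarity_score(), where the zero scores are no longer filtered out before averaging:

# tutorial version: synsets with no match in s2 are silently dropped
# if max_score != 0:
#     s1_largest_scores.append(max_score)

# modified version: keep the zero so unmatched synsets pull the average down
s1_largest_scores.append(max_score)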
Here is my code:
import re
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import genesis
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('genesis')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
genesis_ic = wn.ic(genesis, False, 0.0)
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import roc_auc_score
from nltk.stem.wordnet import WordNetLemmatizer
class KNN_NLC_Classifer():
    def __init__(self, k=1, distance_type='path'):
        self.k = k
        self.distance_type = distance_type

    # This function is used for training
    def fit(self, x_train, y_train):
        self.x_train = x_train
        self.y_train = y_train

    # This function runs the K(1) nearest neighbour algorithm and
    # returns the label with the closest match.
    def predict(self, x_test):
        self.x_test = x_test
        y_predict = []
        for i in range(len(x_test)):
            max_sim = 0
            max_index = 0
            for j in range(self.x_train.shape[0]):
                temp = self.document_similarity(x_test[i], self.x_train[j])
                if temp > max_sim:
                    max_sim = temp
                    max_index = j
            y_predict.append(self.y_train[max_index])
        return y_predict

    def convert_tag(self, tag):
        """Convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets"""
        tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
        try:
            return tag_dict[tag[0]]
        except KeyError:
            return None

    def doc_to_synsets(self, doc):
        """
        Returns a list of synsets in document.
        Tokenizes and tags the words in the document doc.
        Then finds the first synset for each word/tag combination.
        If a synset is not found for that combination it is skipped.

        Args:
            doc: string to be converted

        Returns:
            list of synsets
        """
        tokens = word_tokenize(str(doc) + ' ')
        l = []
        tags = nltk.pos_tag([tokens[0] + ' ']) if len(tokens) == 1 else nltk.pos_tag(tokens)
        for token, tag in zip(tokens, tags):
            syntag = self.convert_tag(tag[1])
            syns = wn.synsets(token, syntag)
            if len(syns) > 0:
                l.append(syns[0])
        return l

    def similarity_score(self, s1, s2, distance_type='path'):
        """
        Calculate the normalized similarity score of s1 onto s2
        For each synset in s1, finds the synset in s2 with the largest similarity value.
        Sum all of the largest similarity values and normalize this value by dividing it by the
        number of largest similarity values found.

        Args:
            s1, s2: list of synsets from doc_to_synsets

        Returns:
            normalized similarity score of s1 onto s2
        """
        s1_largest_scores = []
        for i, s1_synset in enumerate(s1):
            max_score = 0
            for s2_synset in s2:
                if distance_type == 'path':
                    score = s1_synset.path_similarity(s2_synset, simulate_root=False)
                else:
                    score = s1_synset.wup_similarity(s2_synset)
                if score is not None and score > max_score:
                    max_score = score
            # if max_score != 0:
            s1_largest_scores.append(max_score)
        mean_score = np.mean(s1_largest_scores)
        return mean_score

    def document_similarity(self, doc1, doc2):
        """Finds the symmetrical similarity between doc1 and doc2"""
        synsets1 = self.doc_to_synsets(doc1)
        synsets2 = self.doc_to_synsets(doc2)
        return (self.similarity_score(synsets1, synsets2) + self.similarity_score(synsets2, synsets1)) / 2
# 1. Importing the dataset
#we'll use the demo dataset available at Watson NLC Classifier Demo.
FILENAME = "https://raw.githubusercontent.com/watson-developer-cloud/natural-language-classifier-nodejs/master/training/weather_data_train.csv"
dataset = pd.read_csv(FILENAME, header = None)
dataset.rename(columns = {0:'text', 1:'answer'}, inplace = True)
dataset['output'] = np.where(dataset['answer'] == 'temperature', 1,0)
Num_Words = dataset.shape[0]
print(dataset)
print("\nSize of input file is ", dataset.shape)
array = dataset.values
X = array[:,0]
Y = array[:,2]
validation_size = 0.20
seed = 7
# 4. Train the Classifier
classifier = KNN_NLC_Classifer(k=1, distance_type='path')
x_train,y_train = X, Y
classifier.fit(x_train, y_train)
final_test_list = [
'will it rain',
'Is it hot outside?',
'What is the expected high for today?',
'Will it be foggy tomorrow?',
'Should I prepare for sleet?',
'Will there be a storm today?',
'do we need to take umbrella today',
'will it be wet tomorrow',
'is it humid tomorrow',
'what is the precipitation today',
'is it freezing outside',
'is it cool outside',
'are there strong winds outside',
]
test_corpus = []
lmtzr = WordNetLemmatizer()
#ps = PorterStemmer()
for i in range(len(final_test_list)):
    review = re.sub('[^a-zA-Z]', ' ', final_test_list[i])
    review = review.lower()
    review = review.split()
    review = [lmtzr.lemmatize(word) for word in review]  # if not word in s
    review = ' '.join(review)
    test_corpus.append(review)
y_pred_final = classifier.predict(test_corpus)
output_df = pd.DataFrame(data = {'text': final_test_list, 'code': y_pred_final})
output_df['answer'] = np.where(output_df['code']==1, 'Temperature', 'Conditions')
print(output_df)
And here is the result which I consider more reasonable:
    text                                    code  answer
0   will it rain                               0  Conditions
1   Is it hot outside?                         1  Temperature
2   What is the expected high for today?       1  Temperature
3   Will it be foggy tomorrow?                 1  Temperature
4   Should I prepare for sleet?                0  Conditions
5   Will there be a storm today?               1  Temperature
6   do we need to take umbrella today          0  Conditions
7   will it be wet tomorrow                    1  Temperature
8   is it humid tomorrow                       1  Temperature
9   what is the precipitation today            1  Temperature
10  is it freezing outside                     1  Temperature
11  is it cool outside                         1  Temperature
12  are there strong winds outside             0  Conditions
After printing out x_train and y_train, you'll spot the bug.
For some reason, your Y holds the features while your X holds the labels. If you change the line x_train, y_train = X, Y to x_train, y_train = Y, X, it will work.
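In other words, with the column assignment from the question (X = array[:,2], Y = array[:,0]), the fix is either to swap the variables when training or to index the columns the other way around:

# option 1: keep the column indexing and swap at training time
x_train, y_train = Y, X

# option 2 (equivalent): index the columns so that X is the text and Y the label
# X = array[:, 0]   # text
# Y = array[:, 2]   # numeric label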
I am creating a deep neural network with Keras, using images from OpenAI's Gym library.
I tried to reshape the images using the following code:
def reshape_dimensions(observation):
    processed = np.mean(observation, 2, keepdims=False)
    cropped = processed[35:195]
    result = cropped[::2, ::2]
    return result
This gives me an image of shape (80, 80), but every time I try to use that shape as the input shape of the first layer of the Keras network it doesn't work.
What shape should I use so I can develop the network further?
The whole code is attached below:
PART I retrieves the training data
import gym
import random
import numpy as np
from statistics import mean, median
from collections import Counter

### GAME VARIABLE SETTINGS ###
env = gym.make('MsPacman-v0')
env.reset()
goal_steps = 2000
score_requirement = 250
initial_games = 200

print('Options to play: ', env.unwrapped.get_action_meanings())

### DEFINE FUNCTIONS ####
def reshape_dimensions(observation):
    processed = np.mean(observation, 2, keepdims=False)
    cropped = processed[35:195]
    result = cropped[::2, ::2]
    return result

def initial_population():
    training_data = []
    scores = []
    accepted_scores = []
    for _ in range(initial_games):
        score = 0
        game_memory = []
        prev_obvservation = []
        for _ in range(goal_steps):
            #env.render()
            action = env.action_space.sample()  # take a random action in the env
            observation, reward, done, info = env.step(action)
            reshape_observation = reshape_dimensions(observation)
            if len(prev_obvservation) > 0:
                game_memory.append([prev_obvservation, action])
            prev_obvservation = reshape_observation
            score = score + reward
            if done:
                break
        if score >= score_requirement:
            accepted_scores.append(score)
            for data in game_memory:
                if data[1] == 0:
                    output = [1,0,0,0,0,0,0,0,0]
                elif data[1] == 1:
                    output = [0,1,0,0,0,0,0,0,0]
                elif data[1] == 2:
                    output = [0,0,1,0,0,0,0,0,0]
                elif data[1] == 3:
                    output = [0,0,0,1,0,0,0,0,0]
                elif data[1] == 4:
                    output = [0,0,0,0,1,0,0,0,0]
                elif data[1] == 5:
                    output = [0,0,0,0,0,1,0,0,0]
                elif data[1] == 6:
                    output = [0,0,0,0,0,0,1,0,0]
                elif data[1] == 7:
                    output = [0,0,0,0,0,0,0,1,0]
                elif data[1] == 8:
                    output = [0,0,0,0,0,0,0,0,1]
                training_data.append([data[0], output])
        env.reset()
        scores.append(score)
    print('Average accepted scores:', mean(accepted_scores))
    print('Median accepted scores:', median(accepted_scores))
    print(Counter(accepted_scores))
    return training_data

### RUN CODE ###
training_data = initial_population()
np.save('data_for_training_200.npy', training_data)
PART II trains the model
import gym
import random
import numpy as np
import keras
from statistics import mean, median
from collections import Counter
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

### LOAD DATA ###
raw_training_data = np.load("data_for_training_200.npy")
training_data = [i[0:2] for i in raw_training_data]
print(np.shape(training_data))

### DEFINE FUNCTIONS ###
def neural_network_model():
    network = Sequential()
    network.add(Dense(100, activation='relu', input_shape=(80, 80)))
    network.add(Dense(9, activation='softmax'))
    optimizer = Adam(lr=0.001)
    network.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return network

def train_model(training_data):
    X = [i[0] for i in training_data]
    y = [i[1] for i in training_data]
    #X = np.array([i[0] for i in training_data])
    #y = np.array([i[1] for i in training_data])
    print('shape of X: ', np.shape(X))
    print('shape of y: ', np.shape(y))
    early_stopping_monitor = EarlyStopping(patience=3)
    model = neural_network_model()
    model.fit(X, y, epochs=20, callbacks=[early_stopping_monitor])
    return model

train_model(training_data=training_data)
It seems like you are pre-processing the individual images correctly but putting them inside a list instead of an input tensor. From the error message, you have a list of 36859 arrays of shape (80, 80), while you want a single array of shape (36859, 80, 80). You already have the code that does this commented out, X = np.array([i[0] for i in training_data]); you just have to make sure every i[0] has the same shape (80, 80) for it to work.
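A minimal sketch of that change inside train_model (assuming every pre-processed frame really is (80, 80)); note that with plain Dense layers the 2-D frames most likely also need to be flattened so the network's output matches the 9-dimensional one-hot labels:

X = np.array([i[0] for i in training_data])   # shape: (num_samples, 80, 80)
y = np.array([i[1] for i in training_data])   # shape: (num_samples, 9)

# flatten each 80x80 frame into a 6400-long vector for the Dense input layer,
# and declare the matching input_shape in the model:
X = X.reshape(len(X), -1)                     # shape: (num_samples, 6400)
# network.add(Dense(100, activation='relu', input_shape=(6400,)))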
I want to merge two different models and use fit_generator to train the merged model. The generators are written by me.
This is one of the generators:
def image_generator(self, batch_size, train_test, data_type, concat=False):
    train, test = self.split_train_test()
    data = train if train_test == 'train' else test
    print("Creating %s generator with %d samples." % (train_test, len(data)))
    print("image_generator")
    while 1:
        X, y = [], []
        # Generate batch_size samples.
        for _ in range(batch_size):
            # Reset to be safe.
            sequence = None
            # Get a random sample.
            sample = random.choice(data)
            # Check to see if we've already saved this sequence.
            if data_type is "images":
                # Get and resample frames.
                frames = self.get_frames_for_sample(sample)
                frames = self.rescale_list(frames, self.seq_length)
                # Build the image sequence
                sequence = self.build_image_sequence(frames)
            else:
                # Get the sequence from disk.
                sequence = self.get_image_sequence(data_type, sample, train_test)
                if sequence is None:
                    print("Can't find sequence. Did you generate them?")
                    sys.exit()  # TODO this should raise
            if concat:
                # We want to pass the sequence back as a single array. This
                # is used to pass into an MLP rather than an RNN.
                sequence = np.concatenate(sequence).ravel()
            X.append(sequence)
            y.append(self.get_class_one_hot(sample[1]))
        yield np.array(X), np.array(y)
This is get_image_sequence:
def get_image_sequence(self, data_type, sample, train_test):
    """get the images shaped with array."""
    # train,ApplyEyeMakeup,v_ApplyEyeMakeup_g10_c02,99
    num = random.randint(1, int(sample[3]))
    path = glob.glob('./data/' + train_test + '/' + sample[1] + '/' + sample[2] + '-' + '*' + num + '.jpg')
    if os.path.isfile(path):
        img = Image.open(path)
        if img.size != target_size:
            img = img.resize(target_size)
        img = img_to_array(img)
        img = np.expand_dims(img, axis=0)
        img /= 255
        return img
    else:
        print("path is error" + path)
        return None
Now, merge and fit it:
modeltmp = merge([model1.output, model2.output], mode='concat', concat_axis=1)
modeltmp = BatchNormalization()(modeltmp)
modeltmp = Dense(1024, activation='relu')(modeltmp)
modeltmp = Dense(len(classes), activation='softmax')(modeltmp)
model = Model(input=[model1.input, model2.input], outputs=modeltmp)
# model1 --- generator
train_gen_1 = data.image_generator(batch_size, 'train', cnn_lstm_datatype, concat)
test_gen_1 = data.image_generator(batch_size, 'test', cnn_lstm_datatype, concat)
# model2 ---- generator
train_gen_2 = data.frame_generator(batch_size=batch_size, train_test='train', data_type=cnn_lstm_datatype, concat=concat)
test_gen_2 = data.frame_generator(batch_size=batch_size, train_test='test', data_type=cnn_lstm_datatype, concat=concat)
model.fit_generator([train_gen_1, train_gen_2],
                    verbose=1,
                    steps_per_epoch=batch_size,
                    validation_steps=10,
                    epochs=10000,
                    callbacks=[checkpointer, tb, early_stopper, csv_logger],
                    validation_data=[test_gen_1, test_gen_2]
                    )
However, I get the error:
TypeError: Error when checking model input: data should be a Numpy array, or list/dict of Numpy arrays. Found: generator object image_generator at 0x12205df00 ...
How can I solve it? Thanks!
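For reference, a common pattern (a sketch only, with a hypothetical combine_generators helper) is to wrap the two generators in a single generator that yields a list of inputs plus one set of labels, because fit_generator expects one generator rather than a list of them. This assumes both generators walk the same samples in the same order; otherwise the two inputs and the labels will not correspond:

def combine_generators(gen1, gen2):
    # fit_generator wants a single generator yielding ([input_1, input_2], labels)
    while True:
        X1, y1 = next(gen1)
        X2, y2 = next(gen2)
        yield [X1, X2], y1

model.fit_generator(combine_generators(train_gen_1, train_gen_2),
                    verbose=1,
                    steps_per_epoch=batch_size,
                    validation_steps=10,
                    epochs=10000,
                    callbacks=[checkpointer, tb, early_stopper, csv_logger],
                    validation_data=combine_generators(test_gen_1, test_gen_2))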