recursionerror: maximum recursion depth exceeded in comparison in tensorflow with skopt - python-3.x

I want to compute a Bayesian Search with [skopt] (https://scikit-optimize.github.io/stable/auto_examples/bayesian-optimization.html).
My dataset is a Time series, and t is my time step.
But i have a error:
recursionerror: maximum recursion depth exceeded in comparison
This is my code :
def Grid_search_class(X_train=X_train[:,0:t+1,:]
,y_train=y_train
,X_test=X_test[:,0:t+1,:],
y_test=y_test
,n_calls=20,
print_score=False,t=t):
""" INPUTS : Train Test data
n_calls Number of calls to func"""
import tensorflow as tf
Adam = tf.keras.optimizers.Adam(learning_rate=0.007)
Adagrad = tf.keras.optimizers.Adagrad(learning_rate=0.007)
dim_num_input_text = Categorical([16,32,64,128,256,512,1024,2048], name='num_dense_layers_text')
dim_num_dense_text = Integer(low=0, high=5, name='num_HLD_nodes_text')
dim_drop_text = Categorical([0.01,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4], name='drop_text')
dim_num_input_temp = Categorical([16,32,64,128,256,512,1024,2048], name='num_dense_layers_temp')
dim_num_dense_temp = Integer(low=0, high=5, name='num_HLD_nodes_temp')
dim_drop_temp = Categorical([0.01,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4], name='drop_temp')
dim_num_input_fixe = Categorical([16,32,64,128,256,512,1024,2048], name='num_dense_layers_fixe')
dim_num_dense_fixe = Integer(low=0, high=5, name='num_HLD_nodes_fixe')
dim_drop_fixe = Categorical([0.01,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4], name='drop_fixe')
dim_num_input_merge = Categorical([16,32,64,128,256,512,1024,2048], name='num_dense_layers_merge')
dim_num_dense_merge = Integer(low=0, high=5, name='num_HLD_nodes_merge')
dim_drop_merge = Categorical([0.01,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4], name='drop_merge')
dim_optim=Categorical([Adam,Adagrad], name='optim')
dimensions = [dim_num_input_text,
dim_num_dense_text,
dim_drop_text,
dim_num_input_temp,
dim_num_dense_temp,
dim_drop_temp,
dim_num_input_fixe,
dim_num_dense_fixe,
dim_drop_fixe,
dim_num_input_merge,
dim_num_dense_merge,
dim_drop_merge,
dim_optim
]
default_parameters = [512,0,0.1,512,0,0.1,512,0,0.1,512,0,0.1,Adam]
def create_model(num_dense_layers_text,num_HLD_nodes_text,drop_text,
num_dense_layers_temp,num_HLD_nodes_temp,drop_temp,
num_dense_layers_fixe,num_HLD_nodes_fixe,drop_fixe,
num_dense_layers_merge,num_HLD_nodes_merge,drop_merge,optim,t=t):
x_text = model_text.layers[ind_list[-1]-1].output
if num_dense_layers_text>0:
for i in range(num_dense_layers_text):
x_text =tf.keras.layers.Dense(num_HLD_nodes_text,activation='relu')(x_text)
x_text=tf.keras.layers.Dropout(drop_text)(x_text)
x_temp = model_temp[t].layers[ind_list[t]].output
if num_dense_layers_temp>0:
for i in range(num_dense_layers_temp):
x_temp =tf.keras.layers.Dense(num_HLD_nodes_temp,activation='relu')(x_temp)
x_temp=tf.keras.layers.Dropout(drop_temp)(x_temp)
x_fixe= model_fixe.layers[1].output
if num_dense_layers_fixe>0:
for i in range(num_dense_layers_fixe):
x_fixe =tf.keras.layers.Dense(num_HLD_nodes_fixe,activation='relu')(x_fixe)
x_fixe=tf.keras.layers.Dropout(drop_fixe)(x_fixe)
merge = tf.keras.layers.concatenate([x_text,x_temp,x_fixe])
if num_dense_layers_merge>0:
for i in range(num_dense_layers_merge):
merge =tf.keras.layers.Dense(num_HLD_nodes_merge,activation='relu')(merge)
merge=tf.keras.layers.Dropout(drop_merge)(merge)
#add our classification layer.
predictions = tf.keras.layers.Dense(3,activation='softmax')(merge)
model = tf.keras.Model(inputs = [model_text.input,model_temp[t].input,model_fixe.input], outputs = predictions)
#setup our optimizer and compile
model.compile(optimizer=optim, loss=ncce,
metrics=[ tf.keras.metrics.Precision(name='precision'),
tf.keras.metrics.Recall(name='recall'),F1Score(num_classes=3,name='F1',average='macro')])
return model
score='val_F1'
#use_named_args(dimensions=dimensions)
def fitness(num_dense_layers_text,num_HLD_nodes_text,drop_text,
num_dense_layers_temp,num_HLD_nodes_temp,drop_temp,
num_dense_layers_fixe,num_HLD_nodes_fixe,drop_fixe,
num_dense_layers_merge,num_HLD_nodes_merge,drop_merge,optim):
print(num_dense_layers_text,num_HLD_nodes_text,drop_text,
num_dense_layers_temp,num_HLD_nodes_temp,drop_temp,
num_dense_layers_fixe,num_HLD_nodes_fixe,drop_fixe,
num_dense_layers_merge,num_HLD_nodes_merge,drop_merge,optim)
model = create_model(num_dense_layers_text=num_dense_layers_text,
num_HLD_nodes_text=num_HLD_nodes_text,drop_text=drop_text,
num_dense_layers_temp=num_dense_layers_temp,
num_HLD_nodes_temp=num_HLD_nodes_temp,drop_temp=drop_temp,
num_dense_layers_fixe=num_dense_layers_fixe,
num_HLD_nodes_fixe=num_HLD_nodes_fixe,drop_fixe=drop_fixe,
num_dense_layers_merge=num_dense_layers_merge,drop_merge=drop_merge,
num_HLD_nodes_merge=num_HLD_nodes_merge,optim=optim,t=t)
callback=tf.keras.callbacks.EarlyStopping(
monitor=score, min_delta=0.01, patience=1, verbose=0, mode='auto',
baseline=0, restore_best_weights=False
)
#named blackbox becuase it represents the structure
blackbox = model.fit(x=X_train,
y=y_train,verbose=1,
epochs=2,
batch_size=32,
validation_data=(X_test,y_test)
)
#return the validation accuracy for the last epoch.
val_loss = blackbox.history[score][-1]
if score=='val_F1':
val_loss=-val_loss
# Print the classification accuracy.
if print_score :
print()
print("val_score: {}".format(val_loss))
print()
# Delete the Keras model with these hyper-parameters from memory.
del model
# Clear the Keras session, otherwise it will keep adding new
# models to the same TensorFlow graph each time we create
# a model with a different set of hyper-parameters.
tf.keras.backend.clear_session()
tf.compat.v1.reset_default_graph()
# the optimizer aims for the lowest score, so we return our negative accuracy
return -val_loss
gp_result = gp_minimize(fitness,
dimensions=dimensions,
n_calls=n_calls,n_random_starts=7,
noise= 0.01,
x0=default_parameters)
a=pd.concat([pd.DataFrame(gp_result.x_iters, columns = ["dense layers text","HLD nodes text","drop text",
"dense layers temp","HLD nodes temp","drop temp",
"dense layers fixe","HLD nodes fixe","drop fixe",
"dense layers merge","HLD nodes merge","drop merge",
"optim","batch size"]),
(pd.Series(gp_result.func_vals*-1, name="val_loss"))], axis=1)
a.sort_values(by=['val_loss'], inplace=True,ascending=False)
print(a.iloc[:10])
return a
This step is looking for the best parameters a step t
def Run_Grid_search_temp(j=0,n_calls=25):
while j<X_train.shape[1] :
temp=Grid_search_class(t=j,n_calls=n_calls)
print(temp)
j+=1
return
And this one is a loop on the step.

sys.setrecursionlimit(10000)
seems to resolve my problem.

Related

Detectron2: No instances in prediction

I'm trying to train Detectron2 on a custom dataset that I annotated with coco-annotator. After training I wanted to predict Instances of my Image, but I dont get any shown.
Training:
from detectron2.engine import DefaultTrainer
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.DATASETS.TRAIN = ("TrashTron_train",)
cfg.DATASETS.TEST = ("TrashTron_val",)
# cfg.DATASETS.TEST = ()
cfg.DATALOADER.NUM_WORKERS = 2
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml") # Let training initialize from model zoo
cfg.SOLVER.IMS_PER_BATCH = 2
cfg.SOLVER.BASE_LR = 0.00025 # pick a good LR
cfg.SOLVER.MAX_ITER = 300 # 300 iterations seems good enough for this toy dataset; you will need to train longer for a practical dataset
cfg.SOLVER.STEPS = [] # do not decay learning rate
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512 # faster, and good enough for this toy dataset (default: 512)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 24 # only has one class (ballon). (see https://detectron2.readthedocs.io/tutorials/datasets.html#update-the-config-for-new-datasets)
# NOTE: this config means the number of classes, but a few popular unofficial tutorials incorrect uses num_classes+1 here.
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()
Prediction:
test_data = [{'1191.jpg': '/content/datasets/val/1191.jpg',
'image_id': 1308}]
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7 # set a custom testing threshold
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth") # path to the model we just trained
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 24
predictor = DefaultPredictor(cfg)
outputs = predictor(im)
# print(outputs["instances"].pred_densepose)
im = cv2.imread(test_data[0]["1191.jpg"])
v = Visualizer(im[:, :, ::-1],
metadata=MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
scale=0.5,
instance_mode=ColorMode.IMAGE_BW)
out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
img = cv2.cvtColor(out.get_image()[:, :, ::-1], cv2.COLOR_RGBA2RGB)
plt.imshow(img)
The corresponding image is shown, but no instances.
Any suggestions? The overall evaluation scores aren't that great, but I picked the best class and there I also dont get any predictions...
I would try to lower the threshold, since you have said that overall training scores were not great.
In this answer in official repo, following code is suggested to change the threshold:
cfg.MODEL.TENSOR_MASK.SCORE_THRESH_TEST = 0.5
at another answer at the same thread, other thresholds are modified as well.
cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold
cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = args.confidence_threshold

How to map a function to a pre-batched ('BatchDataset') tensor in Tensorflow

I am making data windows (input, output pairs of windows) from time series data. I have already converted my time series to a tf dataset, where each batch has the number of time steps equal to the total window sizes I need.
def make_dataset(data=train_df[0]):
ds = tf.keras.preprocessing.timeseries_dataset_from_array(
data=data,
targets=None,
sequence_length=total_window_size,
sequence_stride=1,
shuffle=True,
batch_size=32
)
return ds
Example of the shape returned:
for example in tensor.take(1):
print(f'shape: {example.shape}')
shape: (32, 48, 18)
What I need to do now is split the time dimension into my input-output pairs, and I have a function to do this, however, when I try to map this function to my 'ds' in the above function I get the following error:
'BatchDataset' object is not subscriptable
I am hoping someone can help me understand where I am going wrong? I am pretty new to tensorflow... My code is below, in this example 'input slice' and 'label_slice' are 0 and 24 respectively. So my aim is to split my batches into input-output pairs of length 24 each.
def split_window(features):
inputs = features[:, input_slice, :]
labels = features[:, labels_slice, :]
inputs.set_shape([None, input_width, None])
labels.set_shape([None, label_width, None])
return inputs, labels
def make_dataset(data=train_df[0]):
data = np.array(data, dtype=np.float32)
ds = tf.keras.preprocessing.timeseries_dataset_from_array(
data=data,
targets=None,
sequence_length=total_window_size,
sequence_stride=1,
shuffle=True,
batch_size=32
)
ds = ds.map(split_window(ds))
return ds
tensor = make_dataset()
tensor
'BatchDataset' object is not subscriptable
Your snippet of code looks similar to the tutorial of Time Series in Tensorflow. Based on that, I modified the main class WindowGenerator() (excluded the parts of train/val/test datasets and output-labels selection) to a simpler class suitable to your question.
class WindowGenerator():
def __init__(self, input_width, label_width, shift):
self.input_width = input_width
self.label_width = label_width
self.shift = shift
self.total_window_size = input_width + shift
self.input_slice = slice(0, input_width)
self.input_indices = np.arange(self.total_window_size[self.input_slice]
self.label_start = self.total_window_size - self.label_width
self.labels_slice = slice(self.label_start, None)
self.label_indices = np.arange(self.total_window_size [self.labels_slice]
def split_window(self, features):
inputs = features[:, self.input_slice, :]
labels = features[:, self.labels_slice, :]
inputs.set_shape([None, self.input_width, None])
labels.set_shape([None, self.label_width, None])
return inputs, labels
def make_dataset(self, data):
data = np.array(data, dtype=np.float32)
ds = tf.keras.utils.timeseries_dataset_from_array(
data=data,
targets=None,
sequence_length=self.total_window_size,
sequence_stride=1,
shuffle=True,
batch_size=batch_size,)
ds = ds.map(self.split_window)
return ds
input_width=24
label_width=24
total_windth = input_width + label_width
batch_size = 32
window = WindowGenerator(input_width=input_width, label_width=label_width, shift=1)
dataset = window.make_dataset(train_df[0])
I would recommend, though, to use Dataset.window(). It is simpler and more intuitive.
dataset = tf.data.Dataset.from_tensor_slices(train_df[0])
dataset = dataset.window(total_windth, shift=1, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(batch_size))
dataset = dataset.map(lambda window: (window[:-label_width], window[-input_width:]))

for-loop issue (how to use i-parameter on the obj name created within the for-loop)

i'd like to create few Random forest model within a for
loop in which i move the number of estimators. Train each of them on the same data sample and measure the accuracy of each.
This is my beginning code:
r = range(0, 100)
for i in r:
RF_model_%i = RandomForestClassifier(criterion="entropy", n_estimators=i, oob_score=True)
RF_model_%i.fit(X_train, y_train)
y_predict = RF_model_%i.predict(X_test)
accuracy_%i = accuracy_score(y_test, y_predict)
what i like to understand is:
how can i put the i-parameter on the name of each model (in
order to recognise them)?
after tained and calculated the
accuracy score each of the i-models how can i put the result on
a list within the for-loop?
You can do something like this:
results = [] # init
r = range(0, 100)
for i in r:
RF_model_%i = RandomForestClassifier(criterion="entropy", n_estimators=i, oob_score=True)
RF_model_%i.id = i # dynamically add fields to objects
RF_model_%i.fit(X_train, y_train)
y_predict = RF_model_%i.predict(X_test)
accuracy_%i = accuracy_score(y_test, y_predict)
results.append(accuracy_%i) # put the result on a list within the for-loop

Find wrongly categorized samples from validation step

I am using a keras neural net for identifying category in which the data belongs.
self.model.compile(loss='categorical_crossentropy',
optimizer=keras.optimizers.Adam(lr=0.001, decay=0.0001),
metrics=[categorical_accuracy])
Fit function
history = self.model.fit(self.X,
{'output': self.Y},
validation_split=0.3,
epochs=400,
batch_size=32
)
I am interested in finding out which labels are getting categorized wrongly in the validation step. Seems like a good way to understand what is happening under the hood.
You can use model.predict_classes(validation_data) to get the predicted classes for your validation data, and compare these predictions with the actual labels to find out where the model was wrong. Something like this:
predictions = model.predict_classes(validation_data)
wrong = np.where(predictions != Y_validation)
If you are interested in looking 'under the hood', I'd suggest to use
model.predict(validation_data_x)
to see the scores for each class, for each observation of the validation set.
This should shed some light on which categories the model is not so good at classifying. The way to predict the final class is
scores = model.predict(validation_data_x)
preds = np.argmax(scores, axis=1)
be sure to use the proper axis for np.argmax (I'm assuming your observation axis is 1). Use preds to then compare with the real class.
Also, as another exploration you want to see the overall accuracy on this dataset, use
model.evaluate(x=validation_data_x, y=validation_data_y)
I ended up creating a metric which prints the "worst performing category id + score" on each iteration. Ideas from link
import tensorflow as tf
import numpy as np
class MaxIoU(object):
def __init__(self, num_classes):
super().__init__()
self.num_classes = num_classes
def max_iou(self, y_true, y_pred):
# Wraps np_max_iou method and uses it as a TensorFlow op.
# Takes numpy arrays as its arguments and returns numpy arrays as
# its outputs.
return tf.py_func(self.np_max_iou, [y_true, y_pred], tf.float32)
def np_max_iou(self, y_true, y_pred):
# Compute the confusion matrix to get the number of true positives,
# false positives, and false negatives
# Convert predictions and target from categorical to integer format
target = np.argmax(y_true, axis=-1).ravel()
predicted = np.argmax(y_pred, axis=-1).ravel()
# Trick from torchnet for bincounting 2 arrays together
# https://github.com/pytorch/tnt/blob/master/torchnet/meter/confusionmeter.py
x = predicted + self.num_classes * target
bincount_2d = np.bincount(x.astype(np.int32), minlength=self.num_classes**2)
assert bincount_2d.size == self.num_classes**2
conf = bincount_2d.reshape((self.num_classes, self.num_classes))
# Compute the IoU and mean IoU from the confusion matrix
true_positive = np.diag(conf)
false_positive = np.sum(conf, 0) - true_positive
false_negative = np.sum(conf, 1) - true_positive
# Just in case we get a division by 0, ignore/hide the error and set the value to 0
with np.errstate(divide='ignore', invalid='ignore'):
iou = false_positive / (true_positive + false_positive + false_negative)
iou[np.isnan(iou)] = 0
return np.max(iou).astype(np.float32) + np.argmax(iou).astype(np.float32)
~
usage:
custom_metric = MaxIoU(len(catagories))
self.model.compile(loss='categorical_crossentropy',
optimizer=keras.optimizers.Adam(lr=0.001, decay=0.0001),
metrics=[categorical_accuracy, custom_metric.max_iou])

How to predict Label of an email using a trained NB Classifier in sklearn?

I have created a Gaussian Naive Bayes classifier on a email (spam/not spam) dataset and was able to run it successfully. I vectorized the data, divided in it train and test sets and then calculated the accuracy, all the features that are present in the sklearn-Gaussian Naive Bayes classifier.
Now I want to be able to use this classifier to predict "labels" for new emails - whether they are by spam or not.
For example say I have an email. I want to feed it to my classifier and get the prediction as to whether it is a spam or not. How can I achieve this? Please Help.
Code for classifier file.
#!/usr/bin/python
import sys
from time import time
import logging
# Display progress logs on stdout
logging.basicConfig(level = logging.DEBUG, format = '%(asctime)s %(message)s')
sys.path.append("../DatasetProcessing/")
from vectorize_split_dataset import preprocess
### features_train and features_test are the features
for the training and testing datasets, respectively### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()
#########################################################
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
t0 = time()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("training time:", round(time() - t0, 3), "s")
print(clf.score(features_test, labels_test))
## Printing Metrics
for Training and Testing
print("No. of Testing Features:" + str(len(features_test)))
print("No. of Testing Features Label:" + str(len(labels_test)))
print("No. of Training Features:" + str(len(features_train)))
print("No. of Training Features Label:" + str(len(labels_train)))
print("No. of Predicted Features:" + str(len(pred)))
## Calculating Classifier Performance
from sklearn.metrics import classification_report
y_true = labels_test
y_pred = pred
labels = ['0', '1']
target_names = ['class 0', 'class 1']
print(classification_report(y_true, y_pred, target_names = target_names, labels = labels))
# How to predict label of a new text
new_text = "You won a lottery at UK lottery commission. Reply to claim it"
Code for Vectorization
#!/usr/bin/python
import os
import pickle
import numpy
numpy.random.seed(42)
path = os.path.dirname(os.path.abspath(__file__))
### The words(features) and label_data(labels), already largely processed.###These files should have been created beforehand
feature_data_file = path + "./createdDataset/dataSet.pkl"
label_data_file = path + "./createdDataset/dataLabel.pkl"
feature_data = pickle.load(open(feature_data_file, "rb"))
label_data = pickle.load(open(label_data_file, "rb"))
### test_size is the percentage of events assigned to the test set(the### remainder go into training)### feature matrices changed to dense representations
for compatibility with### classifier functions in versions 0.15.2 and earlier
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(feature_data, label_data, test_size = 0.1, random_state = 42)
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf = True, max_df = 0.5, stop_words = 'english')
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test)#.toarray()
## feature selection to reduce dimensionality
from sklearn.feature_selection import SelectPercentile, f_classif
selector = SelectPercentile(f_classif, percentile = 5)
selector.fit(features_train, labels_train)
features_train_transformed_reduced = selector.transform(features_train).toarray()
features_test_transformed_reduced = selector.transform(features_test).toarray()
features_train = features_train_transformed_reduced
features_test = features_test_transformed_reduced
def preprocess():
return features_train, features_test, labels_train, labels_test
Code for dataset generation
#!/usr/bin/python
import os
import pickle
import re
import sys
# sys.path.append("../tools/")
""
"
Starter code to process the texts of accuate and inaccurate category to extract
the features and get the documents ready for classification.
The list of all the texts from accurate category are in the accurate_files list
likewise for texts of inaccurate category are in (inaccurate_files)
The data is stored in lists and packed away in pickle files at the end.
"
""
accurate_files = open("./rawDatasetLocation/accurateFiles.txt", "r")
inaccurate_files = open("./rawDatasetLocation/inaccurateFiles.txt", "r")
label_data = []
feature_data = []
### temp_counter is a way to speed up the development--there are### thousands of lines of accurate and inaccurate text, so running over all of them### can take a long time### temp_counter helps you only look at the first 200 lines in the list so you### can iterate your modifications quicker
temp_counter = 0
for name, from_text in [("accurate", accurate_files), ("inaccurate", inaccurate_files)]:
for path in from_text: ###only look at first 200 texts when developing### once everything is working, remove this line to run over full dataset
temp_counter = 1
if temp_counter < 200:
path = os.path.join('..', path[: -1])
print(path)
text = open(path, "r")
line = text.readline()
while line: ###use a
function parseOutText to extract the text from the opened text# stem_text = parseOutText(text)
stem_text = text.readline().strip()
print(stem_text)### use str.replace() to remove any instances of the words# stem_text = stem_text.replace("germani", "")### append the text to feature_data
feature_data.append(stem_text)### append a 0 to label_data
if text is from Sara, and 1
if text is from Chris
if (name == "accurate"):
label_data.append("0")
elif(name == "inaccurate"):
label_data.append("1")
line = text.readline()
text.close()
print("texts processed")
accurate_files.close()
inaccurate_files.close()
pickle.dump(feature_data, open("./createdDataset/dataSet.pkl", "wb"))
pickle.dump(label_data, open("./createdDataset/dataLabel.pkl", "wb"))
Also I want to know whether i can incrementally train the classifier meaning thereby that retrain a created model with newer data for refining the model over time?
I would be really glad if someone can help me out with this. I am really stuck at this point.
You are already using your model to predict labels of emails in your test set. This is what pred = clf.predict(features_test) does. If you want to see these labels, do print pred.
But perhaps you what to know how you can predict labels for emails that you discover in the future and that are not currently in your test set? If so, you can think of your new email(s) as a new test set. As with your previous test set, you will need to run several key processing steps on the data:
1) The first thing you need to do is to generate features for your new email data. The feature generation step is not included in your code above, but will need to occur.
2) You are using a Tfidf vectorizer, which converts a collection of documents to a matrix of Tfidf features based upon term frequency and inverse document frequency. You need to put your new email test feature data through the vectorizer that you fit on your training data.
3) Then your new email test feature data will need to go through dimensionality reduction using the same selector that you fit on your training data.
4) Finally, run predict on your new test data. Use print pred if you want to view the new label(s).
To respond to your final question about iteratively re-training your model, yes you definitely can do this. It's just a matter of selecting a frequency, producing a script that expands your data set with incoming data, then re-running all steps from there, from pre-processing to Tfidf vectorization, to dimensionality reduction, to fitting, and prediction.

Resources