Getting error on running the trained Machine Learning model - python-3.x

I have a dataset containing columns 'studentDetails' and 'studentId'. I trained my model on this dataset and saved it. When I am training the model and saving the trained model, then loading the trained model to predict, it successfully giving me the output. But when I am loading the saved model standalone and predicting using that, it is giving me an error "CountVectorizer - Vocabulary wasn't fitted"
Here is the code I am using:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pickle
from sklearn.svm import LinearSVC
X_train, X_test, y_train, y_test = train_test_split(df['studentDetails'], df['studentId'], random_state = 0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
classificationModel = LinearSVC().fit(X_train_tfidf, y_train)
filename = 'finalized_model.sav'
pickle.dump(classificationModel, open(filename, 'wb'))
Now loading the model and predicting:
from sklearn.feature_extraction.text import CountVectorizer
data_to_be_predicted="Alicia Scott is from United States"
filename = 'finalized_model.sav'
loaded_model = pickle.load(open(filename, 'rb'))
count_vect = CountVectorizer()
result = loaded_model.predict(count_vect.transform([data_to_be_predicted]))
print(result)
output:
94120
When I am running just the second code snippet standalone, it is giving me an error
error:
CountVectorizer - Vocabulary wasn't fitted
I am just wondering, how come I am getting an error in the second case because I am not redefining the count_vect = CountVectorizer() anywhere in the first case when I am getting the correct result.

The problem with the second snippet is that you are not using the fitted CounVectorizer, its a new one so it is not fitted.
I will suggets you use fit instead of fit_transform, this will return you a CountVectorizer already fitted and then you can save it as you do with your model.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pickle
from sklearn.svm import LinearSVC
X_train, X_test, y_train, y_test = train_test_split(df['studentDetails'], df['studentId'], random_state = 0)
count_vect = CountVectorizer().fit(X_train)
X_train_counts = count_vect.transform(X_train)
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
classificationModel = LinearSVC().fit(X_train_tfidf, y_train)
filename = 'finalized_model.sav'
pickle.dump(classificationModel, open(filename, 'wb'))
pickle.dump(count_vect, open('count_vect, 'wb'))
pickle.dump(tfidf_transformer, open('tfidf_transformer, 'wb'))
And now you can load the 3 of them when you want to do predictions:
from sklearn.feature_extraction.text import CountVectorizer
data_to_be_predicted="Alicia Scott is from United States"
filename = 'finalized_model.sav'
loaded_model = pickle.load(open(filename, 'rb'))
count_vect = pickle.load(open('count_vect', 'rb'))
result = loaded_model.predict(count_vect.transform([data_to_be_predicted]))
print(result)

Related

AttributeError: 'numpy.ndarray' object has no attribute 'lower' - how to fix it?

The full error is this. I am not sure how to fix it. I'm trying to predict the link between gender and aggresiveness in tweets.
(https://i.stack.imgur.com/T4Ual.png)
This is the whole script
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#De specifikke, vi ved vi kommer til at bruge
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB #Gør at man kan have mere end 2 classes
data = pd.read_csv('/work/90301/Individual project/TheClimateChangeTwitterDataset.csv')
#corpus=data['text']
#corpus=text.loc[:,['aggressiveness', 'gender']]
cv=CountVectorizer() #Take some text and turn it into a matrix
X = cv.fit_transform(data.values).toarray()
#x = X['aggressiveness'].values
#y = X['gender'].values
y=data['gender'].values
print(X.shape)
print(y.shape)
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
#Instantiate and train Naive Bayes
classifier = MultinomialNB(fit_prior=True)
classifier.fit(X_train, y_train)
#test model
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f'Relative accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Accuracy in instances: {accuracy_score(y_test, y_pred, normalize=False)}')
#Infer the label (spam/ham) of a message
aggressiveness=[corpus]
#print(email)
aggressiveness_array = cv.transform(aggressiveness).toarray()
print(classifier.predict(aggressiveness_array))

how to train one class svm multipul times

I have a multi-class dataset and am trying to use OneClassSVM() to classify each class.
from sklearn.svm import OneClassSVM
clf = OneClassSVM(gamma='auto').fit(df)
x_train,x_test,y_train,y_test = train_test_split(df,target,test_size=0.30, random_state=25)
inliers=df[clf.predict(df)==1]
outliers=df[clf.predict(df)==-1]
so I would like to know how can I train OneClassSVM() on each class?
One way to do this is to separate the dataset by class and each class be trained separately in OCSVM. Here is a code that returns different evaluation metrics for inliers (1) and outliers (-1).
from sklearn.model_selection import train_test_split
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report
def evaluation_one_class(preds_interest, preds_outliers):
y_true = [1]*len(preds_interest) + [-1]*len(preds_outliers)
y_pred = list(preds_interest)+list(preds_outliers)
return classification_report(y_true, y_pred, output_dict=False)
def evaluate_model(X_train, X_test, X_outlier, model):
one_class_classifier = model.fit(X_train)
Y_pred_interest = one_class_classifier.predict(X_test)
Y_pred_ruido = one_class_classifier.predict(X_outlier)
print(evaluation_one_class(Y_pred_interest, Y_pred_ruido))
class_of_interest = ''
df_interest = df[df['target'] == class_of_interest]
df_outlier = df[df['target'] != class_of_interest]
df_train_int, df_test_int = train_test_split(df_interest,test_size=0.30, random_state=25)
clf = OneClassSVM(gamma='auto')
evaluate_model(df_train_int, df_test_int, df_outlier, clf)

How To Do Model Predict Using Distributed Dask With a Pre-Trained Keras Model?

I am loading my pre-trained keras model and then trying to parallelize a large number of input data using dask? Unfortunately, I'm running into some issues with this relating to how I'm creating my dask array. Any guidance would be greatly appreciated!
Setup:
First I cloned from this repo https://github.com/sanchit2843/dlworkshop.git
Reproducible Code Example:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from keras.models import load_model
import keras
from keras.models import Sequential
from keras.layers import Dense
from dask.distributed import Client
import warnings
import dask.array as DaskArray
warnings.filterwarnings('ignore')
dataset = pd.read_csv('data/train.csv')
X = dataset.drop(['price_range'], axis=1).values
y = dataset[['price_range']].values
# scale data
sc = StandardScaler()
X = sc.fit_transform(X)
ohe = OneHotEncoder()
y = ohe.fit_transform(y).toarray()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)
# Neural network
model = Sequential()
model.add(Dense(16, input_dim=20, activation="relu"))
model.add(Dense(12, activation="relu"))
model.add(Dense(4, activation="softmax"))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=100, batch_size=64)
# Use dask
client = Client()
def load_and_predict(input_data_chunk):
def contrastive_loss(y_true, y_pred):
margin = 1
square_pred = K.square(y_pred)
margin_square = K.square(K.maximum(margin - y_pred, 0))
return K.mean(y_true * square_pred + (1 - y_true) * margin_square)
mlflow.set_tracking_uri('<uri>')
mlflow.set_experiment('clean_parties_ml')
runs = mlflow.search_runs()
artifact_uri = runs.loc[runs['start_time'].idxmax()]['artifact_uri']
model = mlflow.keras.load_model(artifact_uri + '/model', custom_objects={'contrastive_loss': contrastive_loss})
y_pred = model.predict(input_data_chunk)
return y_pred
da_input_data = da.from_array(X_test, chunks=(100, None))
prediction_results = da_input_data.map_blocks(load_and_predict, dtype=X_test.dtype).compute()
The Error I'm receiving:
AttributeError: '_thread._local' object has no attribute 'value'
Keras/Tensorflow don't play nicely with other threaded systems. There is an ongoing issue on this topic here: https://github.com/dask/dask-examples/issues/35

How to Insert new data to make a prediction? Sklearn

I'm doing the "Hello world" in machine learning, using the Iris dataset. I already have an acceptable result for the entry of this model, I am using 80% of the information to train it and the remaining 20% ​​to do the validation. I am using 6 prediction algorithms, which work well.
but I have a problem, how can I insert new information so that it is analyzed? How do I insert the characteristics of a flower and tell me the type of iris it is? Either: Iris-setosa, Iris-versicolor or Iris-virginica?
# Load libraries
import pandas
from pandas.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Load dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names)
#######Evaluate Some Algorithms########
#Create a Validation Dataset
# Split-out validation dataset
array = dataset.values
X = array[:,0:4]
Y = array[:,4]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
########Build Models########
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
# evaluate each model in turn
results = []
names = []
for name, model in models:
kfold = model_selection.KFold(n_splits=10, random_state=seed)
cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(msg)
########Make Predictions########
print('######## Make Predictions ########')
# Make predictions on validation dataset
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
I think you can follow this other post to save your model, and after you can load him and pass new data and make some predictions.
Remember to set the data to same input shape as used during training.
import cPickle
# save the classifier
with open('my_dumped_classifier.pkl', 'wb') as fid:
cPickle.dump(gnb, fid)
# load it again
with open('my_dumped_classifier.pkl', 'rb') as fid:
gnb_loaded = cPickle.load(fid)
# make predictions

FastText: Can't get cross_validation

I am struggling to implement FastText (FTTransformer) into a Pipeline that iterates over different vectorizers. More particular, I can't get cross-validation scores. Following code is used:
%%time
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from gensim.utils import simple_preprocess
from gensim.sklearn_api.ftmodel import FTTransformer
np.random.seed(0)
data = pd.read_csv('https://pastebin.com/raw/dqKFZ12m')
X_train, X_test, y_train, y_test = train_test_split(data.text, data.label, random_state=0)
w2v_texts = [simple_preprocess(doc) for doc in X_train]
models = [FTTransformer(size=10, min_count=0, seed=42)]
classifiers = [LogisticRegression(random_state=0)]
for model in models:
for classifier in classifiers:
model.fit(w2v_texts)
classifier.fit(model.transform(X_train), y_train)
pipeline = Pipeline([
('vec', model),
('clf', classifier)
])
print(pipeline.score(X_train, y_train))
#print(model.gensim_model.wv.most_similar('kirk'))
cross_val_score(pipeline, X_train, y_train, scoring='accuracy', cv=5)
KeyError: 'all ngrams for word "Machine learning can be useful
branding sometimes" absent from model'
How can the problem be solved?
Sidenote: My other pipelines with D2VTransformer or TfIdfVectorizer work just fine. Here, I can simply apply pipeline.fit(X_train, y_train) after defining the pipeline, instead of the two fits as shown above. It seems like FTTransformer doesn't integrate so well with other given vectorizers?
Yes, to be used in a pipeline, FTTransformer needs to be modified to split documents to words inside its fit method. One can do it as follows:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from gensim.utils import simple_preprocess
from gensim.sklearn_api.ftmodel import FTTransformer
np.random.seed(0)
class FTTransformer2(FTTransformer):
def fit(self, x, y):
super().fit([simple_preprocess(doc) for doc in x])
return self
data = pd.read_csv('https://pastebin.com/raw/dqKFZ12m')
X_train, X_test, y_train, y_test = train_test_split(data.text, data.label, random_state=0)
classifiers = [LogisticRegression(random_state=0)]
for classifier in classifiers:
pipeline = Pipeline([
('ftt', FTTransformer2(size=10, min_count=0, seed=0)),
('clf', classifier)
])
score = cross_val_score(pipeline, X_train, y_train, scoring='accuracy', cv=5)
print(score)

Resources