Confusion Matrix in SkLearn showing error - python-3.x

I am trying to plot a confusion matrix for my classification model on the iris dataset. However, I keep getting an error. I hope someone can guide me. Thanks.
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics
from sklearn.metrics import confusion_matrix
def train_and_predict(train_input_features, train_outputs, prediction_features):
    """Fit a decision tree on the training data and predict labels for new samples.

    Args:
        train_input_features: Feature matrix used to fit the classifier.
        train_outputs: Target labels aligned with ``train_input_features``.
        prediction_features: Feature matrix to predict labels for.

    Returns:
        The labels predicted by the fitted classifier for
        ``prediction_features``.
    """
    classifier = tree.DecisionTreeClassifier()
    classifier.fit(train_input_features, train_outputs)
    predictions = classifier.predict(prediction_features)
    # Hand the result back to the caller: `predictions` is local to this
    # function, so without the return the caller receives None.
    return predictions
# Load the iris data and hold out 30% of it as a test split (fixed random_state).
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,test_size=0.3, random_state=0)
# Whatever train_and_predict returns is bound to y_pred.
y_pred = train_and_predict(X_train, y_train, X_test)
# NOTE(review): `predictions` is only ever assigned inside train_and_predict,
# so the name does not exist at module scope; this is the line that raises
# the NameError reported below.
print(confusion_matrix(y_test, predictions))
OUT: NameError: name 'predictions' is not defined

I found out that I needed to put the code inside the function, i.e.:
def train_and_predict(train_input_features, train_outputs, prediction_features):
    """Fit a decision tree, print its predictions and a confusion matrix.

    Args:
        train_input_features: Feature matrix used to fit the classifier.
        train_outputs: Target labels aligned with ``train_input_features``.
        prediction_features: Feature matrix to predict labels for.

    Returns:
        The labels predicted for ``prediction_features``.
    """
    classifier = tree.DecisionTreeClassifier()
    classifier.fit(train_input_features, train_outputs)
    predictions = classifier.predict(prediction_features)
    print(predictions)
    # The confusion matrix is computed against the module-level X_test / y_test
    # of the calling script, not against the function's own arguments.
    print('Confusion matrix\n', confusion_matrix(y_test, classifier.predict(X_test)))
    # Return the predictions so that `y_pred = train_and_predict(...)` in the
    # calling script receives them instead of None.
    return predictions

Related

AttributeError: 'numpy.ndarray' object has no attribute 'lower' - how to fix it?

The full error is shown below. I am not sure how to fix it. I'm trying to predict the link between gender and aggressiveness in tweets.
(https://i.stack.imgur.com/T4Ual.png)
This is the whole script
# Script: vectorize the climate-change tweet dataset with a bag-of-words model,
# train a multinomial Naive Bayes classifier on the 'gender' column, and print
# evaluation metrics.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# The specific ones we know we are going to use
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB # Makes it possible to have more than 2 classes
data = pd.read_csv('/work/90301/Individual project/TheClimateChangeTwitterDataset.csv')
#corpus=data['text']
#corpus=text.loc[:,['aggressiveness', 'gender']]
cv=CountVectorizer() #Take some text and turn it into a matrix
# NOTE(review): data.values is the whole DataFrame as a 2-D array, so the
# vectorizer is handed one array per row rather than one string per document.
# This looks like the source of the reported "'numpy.ndarray' object has no
# attribute 'lower'" error; presumably a single text column was intended
# (cf. the commented-out data['text'] above) - confirm against the CSV schema.
X = cv.fit_transform(data.values).toarray()
#x = X['aggressiveness'].values
#y = X['gender'].values
y=data['gender'].values
print(X.shape)
print(y.shape)
# Hold out 20% of the rows for testing (no random_state, so the split varies per run).
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
#Instantiate and train Naive Bayes
classifier = MultinomialNB(fit_prior=True)
classifier.fit(X_train, y_train)
#test model
y_pred = classifier.predict(X_test)
# NOTE(review): confusion_matrix and accuracy_score are not imported anywhere
# in this snippet (only r2_score is imported from sklearn.metrics).
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f'Relative accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Accuracy in instances: {accuracy_score(y_test, y_pred, normalize=False)}')
#Infer the label (spam/ham) of a message
# NOTE(review): `corpus` is only assigned in commented-out lines above, so it
# is undefined at this point in the snippet.
aggressiveness=[corpus]
#print(email)
aggressiveness_array = cv.transform(aggressiveness).toarray()
print(classifier.predict(aggressiveness_array))

LogisticRegression classifier

I need to use a Logistic Regression classifier. I have a dataset in which each column has 2000 entries. This is all my code:
from statistics import mode
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
# Importing the datasets
###Social_Network_Ads
# The CSV has no header row; columns at positions 0-4 become X and the column
# at position 5 becomes Y.
datasets = pd.read_csv('C:/Users/n3.csv',header=None)
X = datasets.iloc[:, 0:5].values
Y = datasets.iloc[:, 5].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.25, random_state = 0)
# instantiate the model (using the default parameters)
model = LogisticRegression()
# fit the model with data
model.fit(X_Train, Y_Train)
# NOTE(review): `mode` here is the function imported from `statistics` at the
# top of the snippet, not the `model` estimator created above; this is the
# call that produces the "Cannot clone object" TypeError quoted below.
predicted = cross_val_predict(mode, X_Train, Y_Train, cv=5)
# Accuracy of the fitted model on the same rows it was trained on.
train_acc = model.score(X_Train, Y_Train)
print("The Accuracy for Training Set is {}".format(train_acc*100))
But I got this error:
TypeError: Cannot clone object '<function mode at 0x000000FD6579B9D0>'
(type <class 'function'>): it does not seem to be a scikit-learn
estimator as it does not implement a 'get_params' method.
How can I solve this?
Change this line
predicted = cross_val_predict(mode, X_Train, Y_Train, cv=5)
to
predicted = cross_val_predict(model, X_Train, Y_Train, cv=5)
You have a simple typo. You want to pass your estimator to the function, but instead you passed `mode`, which is imported from `statistics`. That's why the error tells you that it cannot clone an object of type function: you are passing a function, but it expects an estimator.

cut-off point into a logistic regression with the Scikit learn library

I'm trying to change the cut-off point (decision threshold) of a logistic regression with the scikit-learn library, but I can't see how to do it, even after reading the documentation. SPSS gives you the option to change that parameter, but here I can't find it. My code is below. Any help? Thank you.
# Script: fit a logistic regression, estimate its accuracy with k-fold
# cross-validation on a training split, and report metrics on a held-out
# validation split.

# `dataS` and `target` are defined outside this snippet.
X = np.array(dataS)
y = np.array(target)
X.shape
from sklearn import linear_model
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sb
import warnings
warnings.filterwarnings("ignore")

# Baseline: fit on the full dataset and score on the same data
# (resubstitution accuracy, an optimistic estimate).
model = linear_model.LogisticRegression()
model.fit(X, y)
predictions = model.predict(X)
model.score(X, y)

# Hold out 20% of the rows for validation, with a fixed seed.
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, y, test_size=validation_size, random_state=seed)

name = 'Logistic Regression'
# random_state only has an effect when shuffle=True, and recent scikit-learn
# versions raise an error if it is set while shuffle is False.
kfold = model_selection.KFold(n_splits=161, shuffle=True, random_state=seed)
cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(msg)

# cross_val_score works on clones, so `model` is still the one fitted on ALL
# rows above. Refit on the training split only, otherwise the validation rows
# were seen during training and the metrics below are inflated.
model.fit(X_train, Y_train)
predictions = model.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

Calculating precision, recall, and F-measure for Logistic Regression classifier

I have a labeled and clean dataset for sentiment analysis, and I used logistic regression for classification. Here is my code.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
# Read the labelled reviews from worksheet "Sheet1" of the workbook.
xl = pd.ExcelFile('d:/data.xlsx')
df3 = xl.parse("Sheet1")

# Review text (missing entries replaced by a single space) and its labels.
cl_data = df3['Clean-Reviews'].fillna(' ')
sent = df3['Sentiment']

# 75/25 train/test split with a fixed seed.
sent_train, sent_test, y_train, y_test = train_test_split(
    cl_data,
    sent,
    random_state=1000,
    test_size=0.25,
)

# Learn the vocabulary on the training text only, then encode both splits.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(sent_train)
X_test = vectorizer.transform(sent_test)

# Train the classifier on the bag-of-words features.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
when I try to calculate precision, recall, and F-measure:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
# NOTE(review): X_test is the vectorized feature matrix produced by
# vectorizer.transform above, not a vector of predicted labels; these metric
# functions compare true labels with predicted labels, which is why the calls
# below fail.
print(f1_score(X_test, y_test, average="macro"))
print(precision_score(X_test, y_test, average="macro"))
print(recall_score(X_test, y_test, average="macro"))
I got an error:
TypeError: len() of unsized object
Can anyone tell me what the problem is here? Thanks in advance.
Metrics such as accuracy and F1 are measured between the predicted and the true values, and in your code X_test is the feature matrix, not a predicted value. It should be:
# Predict labels for the held-out features first, then score those
# predictions against the true labels (argument order: y_true, y_pred).
y_pred = classifier.predict(X_test)
print(f1_score(y_test, y_pred, average="macro"))
print(precision_score(y_test, y_pred, average="macro"))
print(recall_score(y_test, y_pred, average="macro"))

FastText: Can't get cross_validation

I am struggling to integrate FastText (FTTransformer) into a Pipeline that iterates over different vectorizers. More specifically, I can't get cross-validation scores. The following code is used:
%%time
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from gensim.utils import simple_preprocess
from gensim.sklearn_api.ftmodel import FTTransformer
# Script: fit a FastText transformer and a logistic regression, wrap both in a
# Pipeline, and try to cross-validate the pipeline.
# NOTE(review): indentation was lost in this snippet, so which statements
# belong inside the two `for` loops below cannot be determined from here.
np.random.seed(0)
data = pd.read_csv('https://pastebin.com/raw/dqKFZ12m')
X_train, X_test, y_train, y_test = train_test_split(data.text, data.label, random_state=0)
# Apply gensim's simple_preprocess to every training document.
w2v_texts = [simple_preprocess(doc) for doc in X_train]
models = [FTTransformer(size=10, min_count=0, seed=42)]
classifiers = [LogisticRegression(random_state=0)]
for model in models:
for classifier in classifiers:
# The transformer is fitted on the preprocessed documents ...
model.fit(w2v_texts)
# ... but transform() is called on the raw X_train documents.
classifier.fit(model.transform(X_train), y_train)
pipeline = Pipeline([
('vec', model),
('clf', classifier)
])
print(pipeline.score(X_train, y_train))
#print(model.gensim_model.wv.most_similar('kirk'))
# NOTE(review): cross-validation refits the pipeline on raw X_train, so the
# transformer never sees the preprocessed w2v_texts. The KeyError quoted below
# names an entire sentence as a single "word", which is consistent with
# that - confirm.
cross_val_score(pipeline, X_train, y_train, scoring='accuracy', cv=5)
KeyError: 'all ngrams for word "Machine learning can be useful
branding sometimes" absent from model'
How can the problem be solved?
Sidenote: My other pipelines with D2VTransformer or TfIdfVectorizer work just fine. Here, I can simply apply pipeline.fit(X_train, y_train) after defining the pipeline, instead of the two fits as shown above. It seems like FTTransformer doesn't integrate so well with other given vectorizers?
Yes, to be used in a pipeline, FTTransformer needs to be modified to split documents into words inside its fit method. One can do it as follows:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from gensim.utils import simple_preprocess
from gensim.sklearn_api.ftmodel import FTTransformer
np.random.seed(0)
class FTTransformer2(FTTransformer):
    """FTTransformer variant that tokenizes raw documents inside ``fit``.

    This lets the transformer be used as a Pipeline step that receives raw
    document strings instead of pre-tokenized word lists.
    """

    def fit(self, x, y=None):
        """Tokenize each document with ``simple_preprocess`` and fit on the result.

        Args:
            x: Iterable of raw document strings.
            y: Ignored. Accepted (and optional) so the method can be called
                both as ``fit(X)`` and as ``fit(X, y)``.

        Returns:
            This transformer instance.
        """
        super().fit([simple_preprocess(doc) for doc in x])
        return self
# Load the labelled texts and split them into train and test sets (fixed seed).
data = pd.read_csv('https://pastebin.com/raw/dqKFZ12m')
X_train, X_test, y_train, y_test = train_test_split(data.text, data.label, random_state=0)

classifiers = [LogisticRegression(random_state=0)]
for classifier in classifiers:
    # A fresh transformer per classifier; the pipeline passes raw documents to
    # FTTransformer2, which tokenizes them in its own fit().
    pipeline = Pipeline([
        ('ftt', FTTransformer2(size=10, min_count=0, seed=0)),
        ('clf', classifier),
    ])
    # 5-fold cross-validated accuracy on the training split.
    score = cross_val_score(pipeline, X_train, y_train, scoring='accuracy', cv=5)
    print(score)

Resources