Tensorflow classifier.evaluate running indefinitely? - python-3.x

I've started trying out some of the TensorFlow APIs. I am using the iris data set to experiment with TensorFlow's Estimators. I'm loosely following this tutorial, except that I load my data a little differently: https://www.tensorflow.org/guide/premade_estimators#top_of_page
My problem is that when the code below executes and I get to the section with:
# Evaluate the model.
eval_result = classifier.evaluate(
My computer just runs seemingly without end. I've been waiting for my Jupyter notebook to complete this step for an hour and a half now, with no end in sight. The last output of the notebook is:
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Problem statement: How can I adjust my code to make this evaluation more efficient? I'm obviously making it do much more work than I anticipated.
So far I have tried adjusting the batch size and the number of neurons in the layers, but with no luck.
#First we want to import what we need. Typically this will be some combination of:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

#Extract the data from the iris dataset.
df = pd.read_csv('IRIS.csv')
le = LabelEncoder()
df['species'] = le.fit_transform(df['species'])

#Extract both into features and labels.
#features should be a dictionary.
#label can just be an array
def extract_features_and_labels(dataframe):
    #features and label for training
    x = dataframe.copy()
    y = dataframe.pop('species')
    return dict(x), y

#break the data up into train and test.
#split the overall df into training and testing data
train, test = train_test_split(df, test_size=0.2)
train_x, train_y = extract_features_and_labels(train)
test_x, test_y = extract_features_and_labels(test)
print(len(train_x), 'training examples')
print(len(train_y), 'testing examples')

my_feature_columns = []
for key in train_x.keys():
    my_feature_columns.append(tf.feature_column.numeric_column(key=key))

def train_input_fn(features, labels, batch_size):
    """An input function for training"""
    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    # Shuffle, repeat, and batch the examples.
    return dataset.shuffle(1000).repeat().batch(batch_size)

#Build the classifier!!!!
classifier = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns,
    # Two hidden layers of 10 nodes each.
    hidden_units=[4, 4],
    # The model must choose between 3 classes.
    n_classes=3)

# Train the Model.
classifier.train(
    input_fn=lambda: train_input_fn(train_x, train_y, 10), steps=1000)

# Evaluate the model.
eval_result = classifier.evaluate(
    input_fn=lambda: train_input_fn(test_x, test_y, 100))

print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))

Had to switch a lot of things up, but I finally got the estimator working on the IRIS data set. Here is the code for anyone who may find it useful in the future. Cheers.
#First we want to import what we need. Typically this will be some combination of:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
%matplotlib inline

#Extract the data from the iris dataset.
df = pd.read_csv('IRIS.csv')

#Grab only our categorical data
#categories = df.select_dtypes(include=[object])
le = LabelEncoder()
df['species'] = le.fit_transform(df['species'])

# use df.apply() to apply le.fit_transform to all columns
#X_2 = X.apply(le.fit_transform)
#Reshape as the one hot encoder doesn't like one row/column
#X_2 = X.reshape(-1, 1)

#features is everything but our label, so it makes sense to simply...
features = df.drop('species', axis=1)
print(features.head())
target = df['species']
print(target.head())

#train and test split
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.33, random_state=42)

#Introduce TensorFlow feature columns (numeric columns)
numeric_column = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
numeric_features = [tf.feature_column.numeric_column(key=column)
                    for column in numeric_column]
print(numeric_features[0])

#Build the input function for training
training_input_fn = tf.estimator.inputs.pandas_input_fn(x=X_train,
                                                        y=y_train,
                                                        batch_size=10,
                                                        shuffle=True,
                                                        num_epochs=None)

#Build the input function for testing input
eval_input_fn = tf.estimator.inputs.pandas_input_fn(x=X_test,
                                                    y=y_test,
                                                    batch_size=10,
                                                    shuffle=False,
                                                    num_epochs=1)

#Instantiate the model
dnn_classifier = tf.estimator.DNNClassifier(feature_columns=numeric_features,
                                            hidden_units=[3, 3],
                                            optimizer=tf.train.AdamOptimizer(1e-4),
                                            n_classes=3,
                                            dropout=0.1,
                                            model_dir="dnn_classifier")
dnn_classifier.train(input_fn=training_input_fn, steps=2000)

#Evaluate the trained model
dnn_classifier.evaluate(input_fn=eval_input_fn)
# print("Loss is " + str(loss))

pred = list(dnn_classifier.predict(input_fn=eval_input_fn))
for e in pred:
    print(e)
print("\n")
#pred = [p['species'][0] for p in pred]
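For anyone who hits the same hang: the decisive change above is the evaluation input function. In the original code, classifier.evaluate() reused train_input_fn, whose dataset calls .repeat() with no count, so the evaluation stream never ends and evaluate() runs until it is given an explicit steps limit. A minimal alternative that keeps the original tf.data pipeline is a separate single-pass input function for evaluation (the name eval_input_fn below is mine, not from the original code):

def eval_input_fn(features, labels, batch_size):
    """An input function for evaluation: one pass, no shuffle, no repeat."""
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    # Without .repeat(), the dataset is exhausted after one epoch,
    # so classifier.evaluate() terminates on its own.
    return dataset.batch(batch_size)

eval_result = classifier.evaluate(
    input_fn=lambda: eval_input_fn(test_x, test_y, 100))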

Related

ValueError: Expected 2D array, got scalar array instead: array=6.5

While practicing the Support Vector Regression model I got this error:
ValueError: Expected 2D array, got scalar array instead:
array=6.5.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
This is my code (Python 3.7)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 20 14:39:06 2021
#author: lulu
"""
# SVR
# simple learning regression
# Data preprocessing
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd # import data sets and manage data sets
# Importing dataset
dataset = pd.read_csv('/home/lulu/machineLearning/Position_Salaries.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values
# Splitting the dataset into the training set and test set
"""from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 1/3,random_state = 0)"""
# Feature scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_X.fit_transform(y)
# Fitting SVR to the training set
# Create your regressor here
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X, y)
# Producing a new result
y_pred = regressor.predict(sc_X.transform(6.5))
# Visualizing the test VR results
plt.scatter(X, y, color='red')
plt.plot(X, regressor.predict(X),color = 'blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
I want to predict a new result, and since 6.5 is not yet transformed we actually need to transform it with the following function, but I don't know how to apply it correctly:
sc_X.transform
I also want to know why I don't get an ideal result compared to, say, multiple linear regression. The result is senseless and I can't draw a conclusion from it.
Try this:
y_pred = regressor.predict(sc_X.transform([[6.5]]))
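That wraps 6.5 in a nested list so transform() receives the 2-D array it expects. Worth noting as well: in the question's code y is scaled with sc_X, and the prediction therefore comes back in the scaled space. The commonly used variant scales y with its own scaler and inverse-transforms the prediction to get an actual salary; a sketch under those assumptions:

sc_X = StandardScaler()
sc_y = StandardScaler()
X_scaled = sc_X.fit_transform(X)
y_scaled = sc_y.fit_transform(y.reshape(-1, 1)).ravel()  # the scaler needs y as a 2-D array

regressor = SVR(kernel='rbf')
regressor.fit(X_scaled, y_scaled)

# Transform the query into the scaled feature space, predict,
# then map the prediction back to the original salary units.
y_pred_scaled = regressor.predict(sc_X.transform([[6.5]]))
y_pred = sc_y.inverse_transform(y_pred_scaled.reshape(-1, 1))
print(y_pred)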

Unable to calculate Model performance for Decision Tree Regressor

Although my code runs fine on repl.it and gives me results, it fails miserably on the Katacoda testing environment.
I am attaching the repl file here for your review as well; it also contains the question, which is commented just above the code I have written.
Kindly review and let me know what mistakes I am making here.
Repl Link
https://repl.it/repls/WarmRobustOolanguage
Also sharing the code below.
The question instructions are included as comments.
#Import two modules sklearn.datasets, and #sklearn.model_selection.
#Import numpy and set random seed to 100.
#Load popular Boston dataset from sklearn.datasets module #and assign it to variable boston.
#Split boston.data into two sets names X_train and X_test. #Also, split boston.target into two sets Y_train and Y_test.
#Hint: Use train_test_split method from #sklearn.model_selection; set random_state to 30.
#Print the shape of X_train dataset.
#Print the shape of X_test dataset.
import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
import numpy as np
np.random.seed(100)
max_depth = range(2, 6)
boston = datasets.load_boston()
X_train, X_test, Y_train, Y_test = train_test_split(boston.data, boston.target, random_state=30)
print(X_train.shape)
print(X_test.shape)
#Import required module from sklearn.tree.
#Build a Decision tree Regressor model from X_train set and #Y_train labels, with default parameters. Name the model as #dt_reg.
#Evaluate the model accuracy on training data set and print #it's score.
#Evaluate the model accuracy on testing data set and print it's score.
#Predict the housing price for first two samples of X_test #set and print them.(Hint : Use predict() function)
dt_reg = DecisionTreeRegressor(random_state=1)
dt_reg = dt_reg.fit(X_train, Y_train)
print('Accuracy of Train Data :', cross_val_score(dt_reg, X_train,Y_train, cv=10 ))
print('Accuracy of Test Data :', cross_val_score(dt_reg, X_test,Y_test, cv=10 ))
predicted = dt_reg.predict(X_test[:2])
print(predicted)
#Fit multiple Decision tree regressors on X_train data and #Y_train labels with max_depth parameter value changing from #2 to 5.
#Evaluate each model accuracy on testing data set.
#Hint: Make use of for loop
#Print the max_depth value of the model with highest accuracy.
dt_reg = DecisionTreeRegressor()
random_grid = {'max_depth': max_depth}
dt_random = RandomizedSearchCV(estimator=dt_reg, param_distributions=random_grid,
                               n_iter=90, cv=3, verbose=2, random_state=42, n_jobs=-1)
dt_random.fit(X_train, Y_train)
dt_random.best_params_
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    return accuracy
best_random = dt_random.best_estimator_
random_accuracy = evaluate(best_random, X_test,Y_test)
print("Accuracy Scores of the Model ",random_accuracy)
best_parameters = (dt_random.best_params_['max_depth']);
print(best_parameters)
The question is asking for default values. Try removing random_state=1.
Current line:
dt_reg = DecisionTreeRegressor(random_state=1)
Updated line:
dt_reg = DecisionTreeRegressor()
I think it should work!
# ===========================================================================
# Machine Learning Using Scikit-Learn | 3 | Decision Trees
# ===========================================================================
import sklearn.datasets as datasets
import sklearn.model_selection as model_selection
import numpy as np
from sklearn.tree import DecisionTreeRegressor
np.random.seed(100)
# Load popular Boston dataset from sklearn.datasets module and assign it to variable boston.
boston = datasets.load_boston()
# print(boston)
# Split boston.data into two sets names X_train and X_test. Also, split boston.target into two sets Y_train and Y_test
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(boston.data, boston.target, random_state=30)
# Print the shape of X_train dataset
print(X_train.shape)
# Print the shape of X_test dataset.
print(X_test.shape)
# Build a Decision tree Regressor model from X_train set and Y_train labels, with default parameters. Name the model as dt_reg
dt_Regressor = DecisionTreeRegressor()
dt_reg = dt_Regressor.fit(X_train, Y_train)
print(dt_reg.score(X_train,Y_train))
print(dt_reg.score(X_test,Y_test))
predicted = dt_reg.predict(X_test[:2])
print(predicted)
# Get the max depth
maxdepth = 2
maxscore = 0
for x in range(2, 6):
    dt_Regressor = DecisionTreeRegressor(max_depth=x)
    dt_reg = dt_Regressor.fit(X_train, Y_train)
    score = dt_reg.score(X_test, Y_test)
    if maxscore < score:
        maxdepth = x
        maxscore = score
print(maxdepth)

How to Insert new data to make a prediction? Sklearn

I'm doing the "Hello world" of machine learning, using the Iris dataset. I already have an acceptable result for this model; I am using 80% of the data to train it and the remaining 20% to do the validation. I am using 6 prediction algorithms, which work well.
But I have a problem: how can I insert new information so that it is analyzed? How do I enter the characteristics of a flower and have it tell me the type of iris it is? Either Iris-setosa, Iris-versicolor, or Iris-virginica.
# Load libraries
import pandas
from pandas.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Load dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names)
#######Evaluate Some Algorithms########
#Create a Validation Dataset
# Split-out validation dataset
array = dataset.values
X = array[:,0:4]
Y = array[:,4]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
########Build Models########
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
########Make Predictions########
print('######## Make Predictions ########')
# Make predictions on validation dataset
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
I think you can follow this other post to save your model; afterwards you can load it, pass in new data, and make predictions.
Remember to shape the new data exactly like the input used during training.
import pickle  # cPickle is the Python 2 name; on Python 3 use the built-in pickle module

# save the classifier
with open('my_dumped_classifier.pkl', 'wb') as fid:
    pickle.dump(gnb, fid)

# load it again
with open('my_dumped_classifier.pkl', 'rb') as fid:
    gnb_loaded = pickle.load(fid)

# make predictions
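To answer the original question more directly, feeding in one new flower could look like the following minimal sketch. It reuses the knn model fitted above; the measurement values are made up for illustration.

import numpy as np

# One new flower: sepal-length, sepal-width, petal-length, petal-width,
# in the same order and units as the training columns.
new_flower = np.array([[5.1, 3.5, 1.4, 0.2]])

# predict() returns the class label, e.g. 'Iris-setosa'
print(knn.predict(new_flower))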

ValueError: Can only tuple-index with a MultiIndex

For a multilabel classification problem I am trying to plot a precision and recall curve.
The sample code is taken from "https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html#sphx-glr-auto-examples-model-selection-plot-precision-recall-py" under the section "Create multi-label data, fit, and predict".
I am trying to fit it into my code, but I get the error "ValueError: Can only tuple-index with a MultiIndex" when I try the code below.
train_df.columns.values
array(['DefId', 'DefectCount', 'SprintNo', 'ReqName', 'AreaChange',
'CodeChange', 'TestSuite'], dtype=object)
TestSuite is the value to be predicted.
X_train = train_df.drop("TestSuite", axis=1)
Y_train = train_df["TestSuite"]
X_test = test_df.drop("DefId", axis=1).copy()
classes --> I have hardcoded these with the TestSuite values
from sklearn.preprocessing import label_binarize
# Use label_binarize to be multi-label like settings
Y = label_binarize(Y_train, classes=np.array([0, 1, 2, 3, 4]))
n_classes = Y.shape[1]
# We use OneVsRestClassifier for multi-label prediction
from sklearn.multiclass import OneVsRestClassifier
# Run classifier
classifier = OneVsRestClassifier(svm.LinearSVC(random_state=3))
classifier.fit(X_train, Y_train)
y_score = classifier.decision_function(X_test)
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
import pandas as pd
# For each class
precision = dict()
recall = dict()
average_precision = dict()
#n_classes = Y.shape[1]
for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(Y_train[:, i], y_score[:, i])
    average_precision[i] = average_precision_score(Y_train[:, i], y_score[:, i])
Input data -> values have been categorised
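No answer is recorded here, but the traceback points at Y_train[:, i]: Y_train is a pandas Series, and the [:, i] tuple index only works on a MultiIndex. In the scikit-learn example that per-class slicing is done on the binarized label matrix instead. A sketch of that adjustment, reusing the Y produced by label_binarize above and assuming y_score has one column per class:

# Slice the 2-D binarized label matrix Y, not the original Series Y_train.
for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(Y[:, i], y_score[:, i])
    average_precision[i] = average_precision_score(Y[:, i], y_score[:, i])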

Why does this Tensorflow program work?

I'm running on Mac OS 10.12.4, Anaconda Python 3.5 and Tensorflow 1.1.
I have cobbled together the reproducible code shown below.
I have defined "my_model" with arguments "features" and "labels".
I did not define them. The "my_model" function is called without any arguments.
My Spyder "variables" window does not show them after the program runs.
My question is: where are these variables defined?
Charles
from sklearn import metrics, cross_validation
from tensorflow.contrib import layers
from tensorflow.contrib import learn
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# shut up the warnings
import warnings
warnings.filterwarnings('ignore')
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)

def my_model(features, labels):
    labels = tf.one_hot(labels, 3, 1, 0)
    features = layers.stack(features, layers.fully_connected, [10, 20, 10])
    prediction, loss = (learn.models.logistic_regression(features, labels))
    train_op = tf.contrib.layers.optimize_loss(
        loss,
        tf.contrib.framework.get_global_step(),
        optimizer='Adagrad',
        learning_rate=0.1)
    return {'class': tf.argmax(prediction, 1), 'prob': prediction}, loss, train_op

df = pd.read_csv("iris.csv")
df = df.sample(frac=1)  # shuffle all rows
print(df.head())

column_names = list(df.columns[:4])
X = df[column_names].as_matrix()
y = df['Species']
le = LabelEncoder()
le.fit(df["Species"])
y = le.transform(df["Species"])

x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2, random_state=35)

classifier = tf.contrib.learn.Estimator(model_fn=my_model)
classifier.fit(x_train, y_train, steps=1000)

y_predicted = [p['class'] for p in classifier.predict(x_test, as_iterable=True)]
score = metrics.accuracy_score(y_test, y_predicted)
print('Accuracy: {0:f}'.format(score))
my_model is not called in your code. It is a callback function called by Estimator with 2 arguments: features and labels.
They are actually x_train and y_train for the fit() function.
As the doc says, "Model function, takes features and targets tensors or dicts of tensors and returns predictions and loss tensors. E.g. "(features, targets) -> (predictions, loss)"
And you can see model_fn is called in line 1125 in the source code of Estimator:
model_fn_results = self._model_fn(features, labels, **kwargs)
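If it helps to see the mechanism in miniature, here is a toy sketch of the callback pattern the answer describes. It is not TensorFlow's actual implementation, just an analogy of where features and labels get their values:

# ToyEstimator is a made-up class illustrating the callback pattern.
class ToyEstimator:
    def __init__(self, model_fn):
        self._model_fn = model_fn  # stored, not called yet

    def fit(self, x, y, steps=1000):
        # The real Estimator builds `features` and `labels` tensors from x and y,
        # then invokes the stored callback -- this is where the arguments get their values.
        features, labels = x, y
        predictions, loss, train_op = self._model_fn(features, labels)
        # ...the training loop would then run train_op for `steps` iterations...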
