KNN Python implementation - python-3.x

This is the warning I get when I run my code:
FutureWarning: Unlike other reduction functions (e.g. skew, kurtosis), the default behavior of mode typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of keepdims will become False, the axis over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set keepdims to True or False to avoid this warning.
lab = mode(labels)
This is my Python code; I'm having difficulty finding a suitable solution:
# Importing the required modules
import numpy as np
from scipy.stats import mode
# Euclidean Distance
def eucledian(p1, p2):
    dist = np.sqrt(np.sum((p1 - p2) ** 2))
    return dist

# Function to calculate KNN
def predict(x_train, y, x_input, k):
    op_labels = []
    # Loop through the datapoints to be classified
    for item in x_input:
        # Array to store distances
        point_dist = []
        # Loop through each training datapoint
        for j in range(len(x_train)):
            # Calculating the distance
            distances = eucledian(np.array(x_train[j, :]), item)
            point_dist.append(distances)
        point_dist = np.array(point_dist)
        # Sorting the array while preserving the index
        # Keeping the first K datapoints
        dist = np.argsort(point_dist)[:k]
        # Labels of the K datapoints from above
        labels = y[dist]
        # Majority voting
        lab = mode(labels)
        lab = lab.mode[0]
        op_labels.append(lab)
    return op_labels
# Importing required modules
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from numpy.random import randint
# Loading the data
iris = load_iris()
# Store features matrix in X
X = iris.data
# Store target vector in y
y = iris.target
# Creating the training data
train_idx = randint(0, 150, 100)
X_train = X[train_idx]
y_train = y[train_idx]
# Creating the testing data
test_idx = randint(0, 150, 50)  # taking 50 random samples
X_test = X[test_idx]
y_test = y[test_idx]
# Applying our function
y_pred = predict(X_train, y_train, X_test, 7)
# Checking the accuracy
accuracy_score(y_test, y_pred)
I am expecting a prediction/accuracy score as the output.
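The warning message itself spells out the fix: pass keepdims to mode() explicitly. A minimal sketch of the change inside predict(), assuming SciPy 1.9 or newer (where mode() accepts the keepdims argument):

        # Majority voting; setting keepdims explicitly silences the FutureWarning
        lab = mode(labels, keepdims=True)
        lab = lab.mode[0]
        op_labels.append(lab)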

KNN can be done like this.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Assign colum names to the dataset
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']
# Read dataset to pandas dataframe
dataset = pd.read_csv(url, names=names)
dataset.head()
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Result:
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        13
Iris-versicolor       1.00      0.89      0.94         9
 Iris-virginica       0.89      1.00      0.94         8

       accuracy                           0.97        30
      macro avg       0.96      0.96      0.96        30
   weighted avg       0.97      0.97      0.97        30
error = []
# Calculating error for K values between 1 and 40
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error.append(np.mean(pred_i != y_test))

plt.figure(figsize=(12, 6))
plt.plot(range(1, 40), error, color='red', linestyle='dashed', marker='o',
         markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')
plt.show()
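To read the best k off the error list rather than eyeballing the plot, a short follow-up sketch (note the +1 offset, since k starts at 1 while list indices start at 0):

best_k = int(np.argmin(error)) + 1  # k values start at 1, list indices at 0
print("Best k:", best_k)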

Related

Can't resolve IndexError: index 1 is out of bounds for axis 1 with size 1

import matplotlib.pyplot as plt
import chart_studio.plotly as py
import plotly.graph_objects as go
import plotly.offline as pyoff
import pandas as pd
# Load excel data
df = pd.read_excel(r'C:\Users\....\Desktop\SVM.xlsx')
# Names of the columns
properties = list(df.columns)
# Column names of numeric data
num_properties = list(set(properties))
df
     no.    gap  dec
0      1  3.566    1
1      2  3.633    1
2      3  3.500    1
3      4  1.333    0
4      5  1.300    0
..   ...    ...  ...
125  126  1.700    0
126  127  3.300    1
127  128  3.500    1
128  129  2.200    0
129  130  3.300    1

[130 rows x 3 columns]
# Split the data into inputs and outputs
X = df.iloc[:, [1]].values
y = df.iloc[:, [2]].values

target_class = go.Bar(
    name='ga',
    x=['r ', 'a '],
    y=target_balance['dec']
)
fig = go.Figure(target_class)
pyoff.iplot(fig)
# Training and testing data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
# Importing StandardScaler
from sklearn.preprocessing import StandardScaler
# Scaling the input data
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)
# Importing SVM module
from sklearn.svm import SVC
# Kernel set to linear as it is a binary class problem
classifier = SVC(kernel='linear')
# Training the model
classifier.fit(X_train, y_train)
# Testing the model
y_pred = classifier.predict(X_test)
# Importing accuracy score
from sklearn.metrics import accuracy_score
# Printing the accuracy of the model
print(accuracy_score(y_test, y_pred))
# Importing the modules
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Plotting the figure
plt.figure(figsize=(7, 7))
# Assigning the input values
X_set, y_set = X_train, y_train
# Plotting the decision regions
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('black', 'white')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
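For what it's worth, the IndexError in the title most likely comes from this plotting block: X was built from a single column (df.iloc[:, [1]]), so X_set has only one feature column, and X_set[:, 1] asks for a second column that does not exist (hence "axis 1 with size 1"). A minimal sketch of a guard that makes the failure explicit, assuming the decision-region plot is meant for two features:

# The 2-D decision-region plot indexes both X_set[:, 0] and X_set[:, 1],
# so X must carry at least two feature columns.
if X_set.shape[1] < 2:
    raise ValueError("Decision-region plot needs 2 features, got shape %s" % (X_set.shape,))

With only the single gap feature, either add a second feature column to X or drop the meshgrid plot.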

Confidence Interval from RandomForestRegressor in scikit-learn

scikit-learn has a quantile-regression-based confidence interval implementation for GBM (example from the docs).
Is there a reason why it doesn't provide a similar quantile-based loss implementation for RandomForestRegressor?
There is a scikit-learn-compatible/compliant Quantile Regression Forest implementation that can be used to generate confidence intervals here: https://github.com/zillow/quantile-forest
Setup should be as easy as:
pip install quantile-forest
Then, as an example, to generate CIs on a full dataset:
import matplotlib.pyplot as plt
import numpy as np
from quantile_forest import RandomForestQuantileRegressor
from sklearn import datasets
from sklearn.model_selection import KFold
X, y = datasets.fetch_california_housing(return_X_y=True)
qrf = RandomForestQuantileRegressor(n_estimators=100, random_state=0)
kf = KFold(n_splits=5)
kf.get_n_splits(X)
y_true = []
y_pred = []
y_pred_lower = []
y_pred_upper = []
for train_index, test_index in kf.split(X):
    X_train, X_test, y_train, y_test = (
        X[train_index], X[test_index], y[train_index], y[test_index]
    )
    qrf.set_params(max_features=X_train.shape[1] // 3)
    qrf.fit(X_train, y_train)
    # Get predictions at 95% prediction intervals and median.
    y_pred_i = qrf.predict(X_test, quantiles=[0.025, 0.5, 0.975])
    y_true = np.concatenate((y_true, y_test))
    y_pred = np.concatenate((y_pred, y_pred_i[:, 1]))
    y_pred_lower = np.concatenate((y_pred_lower, y_pred_i[:, 0]))
    y_pred_upper = np.concatenate((y_pred_upper, y_pred_i[:, 2]))
fig = plt.figure(figsize=(10, 4))
y_pred_interval = y_pred_upper - y_pred_lower
sort_idx = np.argsort(y_pred_interval)
y_true = y_true[sort_idx]
y_pred_lower = y_pred_lower[sort_idx]
y_pred_upper = y_pred_upper[sort_idx]
# Center data, with the mean of the prediction interval at 0.
mean = (y_pred_lower + y_pred_upper) / 2
y_true -= mean
y_pred_lower -= mean
y_pred_upper -= mean
plt.plot(y_true, marker=".", ms=5, c="r", lw=0)
plt.fill_between(
    np.arange(len(y_pred_upper)),
    y_pred_lower,
    y_pred_upper,
    alpha=0.2,
    color="gray",
)
plt.plot(np.arange(len(y)), y_pred_lower, marker="_", c="0.2", lw=0)
plt.plot(np.arange(len(y)), y_pred_upper, marker="_", c="0.2", lw=0)
plt.xlim([0, len(y)])
plt.xlabel("Ordered Samples")
plt.ylabel("Observed Values and Prediction Intervals (Centered)")
plt.show()
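As a quick sanity check on the intervals above, one can also measure empirical coverage (a sketch; it works before or after the centering step, since the same mean is subtracted from y_true and both bounds):

# Fraction of observations falling inside their 95% prediction interval
coverage = np.mean((y_true >= y_pred_lower) & (y_true <= y_pred_upper))
print("Empirical coverage: {:.3f}".format(coverage))  # ideally close to 0.95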
There is also a contributed scikit-learn package, forest-confidence-interval, that provides confidence intervals for RandomForestRegressor.
I had to install the development version in order to get a build compatible with current scikit-learn:
pip install git+https://github.com/scikit-learn-contrib/forest-confidence-interval.git
https://github.com/scikit-learn-contrib/forest-confidence-interval
Example (copy pasted from the link above):
# Regression Forest Example
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import sklearn.model_selection as xval
from sklearn.datasets import fetch_openml
import forestci as fci
# Retrieve mpg data from the machine learning library
mpg_data = fetch_openml('autompg')
# separate mpg data into predictors and outcome variable
mpg_X = mpg_data["data"]
mpg_y = mpg_data["target"]
# remove rows where the data is nan
not_null_sel = np.invert(
    np.sum(np.isnan(mpg_data["data"]), axis=1).astype(bool))
mpg_X = mpg_X[not_null_sel]
mpg_y = mpg_y[not_null_sel]
# split mpg data into training and test set
mpg_X_train, mpg_X_test, mpg_y_train, mpg_y_test = xval.train_test_split(mpg_X, mpg_y,
                                                                         test_size=0.25,
                                                                         random_state=42)
# Create RandomForestRegressor
n_trees = 2000
mpg_forest = RandomForestRegressor(n_estimators=n_trees, random_state=42)
mpg_forest.fit(mpg_X_train, mpg_y_train)
mpg_y_hat = mpg_forest.predict(mpg_X_test)
# Plot predicted MPG without error bars
plt.scatter(mpg_y_test, mpg_y_hat)
plt.plot([5, 45], [5, 45], 'k--')
plt.xlabel('Reported MPG')
plt.ylabel('Predicted MPG')
plt.show()
# Calculate the variance
mpg_V_IJ_unbiased = fci.random_forest_error(mpg_forest, mpg_X_train,
mpg_X_test)
# Plot error bars for predicted MPG using unbiased variance
plt.errorbar(mpg_y_test, mpg_y_hat, yerr=np.sqrt(mpg_V_IJ_unbiased), fmt='o')
plt.plot([5, 45], [5, 45], 'k--')
plt.xlabel('Reported MPG')
plt.ylabel('Predicted MPG')
plt.show()
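One caveat in case the example errors out on a recent install: newer scikit-learn versions may return a pandas DataFrame from fetch_openml, which breaks the np.isnan row filter above. Requesting plain NumPy arrays should keep it working (an assumption about the installed version; adjust if your version predates the as_frame parameter):

# Request NumPy arrays instead of a DataFrame so np.isnan() works on mpg_data["data"]
mpg_data = fetch_openml('autompg', as_frame=False)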

Train_test_split set with 50-50 returns high accuracy but low when separated in 2 files

I have one dataset (called train_plus_test.csv) with 1275 rows, with corresponding feature columns and labels for classifying two activities, namely Walking and Lying. This is a balanced dataset with the same number of rows per class.
I implement Random Forest in two scenarios:
Scenario 1: Training on train_plus_test.csv with a train-test split of 0.75 - 0.25 gives 91.8% accuracy.
Scenario 2: Divide train_plus_test.csv into two files, train.csv and test.csv, at a 75% - 25% split, then train the model on train.csv and predict on test.csv; the accuracy is 52%. I am now wondering where exactly I am wrong.
Thank you for reading!
The Python code (below) and the 3 csv files are included here:
[GoogleDrive] https://drive.google.com/drive/folders/1AAOOFhR1QpoPPtSNTofBnouBaYHfFbir?usp=sharing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
# Scenario 1 ==================>
dataset = pd.read_csv('train_plus_test.csv')
feature_cols = list(dataset.columns.values)
feature_cols.remove('label')
X = dataset[feature_cols] # Features
y = dataset['label'] # Target
clf_RF = RandomForestClassifier(n_estimators=100, random_state=0, max_features=8, min_samples_leaf=3)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)
clf_RF.fit(X_train, y_train)
y_pred_RF = clf_RF.predict(X_test)
print('Accuracy of training')
print(metrics.accuracy_score(y_test, y_pred_RF))
# Scenario 2 ======= comment out Scenario 1 before running Scenario 2 ===========>
train_dataset = pd.read_csv('train.csv')
test_dataset = pd.read_csv('test.csv')
feature_cols = list(train_dataset.columns.values)
feature_cols.remove('label')
clf_RF = RandomForestClassifier(n_estimators=100, random_state=0, max_features=8, min_samples_leaf=3 )
X = train_dataset[feature_cols] # Features
y = train_dataset['label'] # Target
clf_RF.fit(X, y)
X_test_data = test_dataset[feature_cols]
y_test_data = test_dataset['label']
y_test_pred = clf_RF.predict(X_test_data)
print('Accuracy of testing')
print(metrics.accuracy_score(y_test_data, y_test_pred))
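A likely cause of the gap: train_test_split shuffles the rows before splitting (shuffle=True by default), while cutting the CSV at the 75% mark keeps the original row order, so train.csv and test.csv can end up with very different class or recording-session distributions. A minimal sketch of writing a shuffled, stratified split to disk, using the file names from the question:

import pandas as pd
from sklearn.model_selection import train_test_split

dataset = pd.read_csv('train_plus_test.csv')
# Shuffle and keep the Walking/Lying balance identical in both files
train_part, test_part = train_test_split(
    dataset, test_size=0.25, random_state=42, stratify=dataset['label'])
train_part.to_csv('train.csv', index=False)
test_part.to_csv('test.csv', index=False)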

ValueError: Can only tuple-index with a MultiIndex

For a multilabel classification problem I am trying to plot a precision and recall curve.
The sample code is taken from https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html#sphx-glr-auto-examples-model-selection-plot-precision-recall-py, under the section "Create multi-label data, fit, and predict".
I am trying to fit it to my code, but I get the error "ValueError: Can only tuple-index with a MultiIndex" when I run the code below.
train_df.columns.values
array(['DefId', 'DefectCount', 'SprintNo', 'ReqName', 'AreaChange',
       'CodeChange', 'TestSuite'], dtype=object)
TestSuite is the value to be predicted.
X_train = train_df.drop("TestSuite", axis=1)
Y_train = train_df["TestSuite"]
X_test = test_df.drop("DefId", axis=1).copy()
The classes I have hardcoded with the TestSuite values:
from sklearn.preprocessing import label_binarize
# Use label_binarize to be multi-label like settings
Y = label_binarize(Y_train, classes=np.array([0, 1, 2, 3, 4]))
n_classes = Y.shape[1]
# We use OneVsRestClassifier for multi-label prediction
from sklearn.multiclass import OneVsRestClassifier
# Run classifier
classifier = OneVsRestClassifier(svm.LinearSVC(random_state=3))
classifier.fit(X_train, Y_train)
y_score = classifier.decision_function(X_test)
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
import pandas as pd
# For each class
precision = dict()
recall = dict()
average_precision = dict()
#n_classes = Y.shape[1]
for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(Y_train[:, i], y_score[:, i])
    average_precision[i] = average_precision_score(Y_train[:, i], y_score[:, i])
Input data: the values have been categorised.
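The traceback almost certainly points at Y_train[:, i]: Y_train is a pandas Series, and indexing a Series with a (row, column) tuple is only valid for a MultiIndex. The binarized NumPy array Y is what the per-class loop should slice, and for per-class decision scores the classifier should also be fitted on it. A minimal sketch of the corrected lines, reusing the variables defined above:

classifier.fit(X_train, Y)  # fit on the binarized (n_samples, n_classes) label matrix
y_score = classifier.decision_function(X_test)
for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(Y[:, i], y_score[:, i])
    average_precision[i] = average_precision_score(Y[:, i], y_score[:, i])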

Input size (depth of inputs) must be accessible via shape inference, but saw value None error when trying to set tf.expand_dims axis to 0

I am trying to use the 20 newsgroups dataset available in sklearn to train an LSTM to do incremental learning (classification). I used sklearn's TfidfVectorizer to pre-process the data, then turned the resulting sparse matrix into a numpy array before feeding it. After that, the line below:
outputs, final_state = tf.nn.dynamic_rnn(cell, inputs_, initial_state=initial_state)
gave an error saying that 'inputs_' should have 3 dimensions, so I used:
inputs_ = tf.expand_dims(inputs_, 0)
to expand the dimensions. But when I do that I get the error:
ValueError: Input size (depth of inputs) must be accessible via shape inference, but saw value None.
The shape of 'inputs_' is:
(1, 134410)
I already went through this post, but it did not help.
I cannot seem to understand how to solve this issue. Any help is much appreciated. Thank you in advance!
Shown below is my complete code:
import os
from collections import Counter
import tensorflow as tf
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
import matplotlib as mplt
from matplotlib import cm
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.metrics import f1_score, recall_score, precision_score
from string import punctuation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
def pre_process():
    newsgroups_data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
    vectorizer = TfidfVectorizer()
    features = vectorizer.fit_transform(newsgroups_data.data)
    lb = LabelBinarizer()
    labels = np.reshape(newsgroups_data.target, [-1])
    labels = lb.fit_transform(labels)
    return features, labels

def get_batches(x, y, batch_size=1):
    for ii in range(0, len(y), batch_size):
        yield x[ii:ii + batch_size], y[ii:ii + batch_size]

def plot_error(errorplot, datapoint, numberOfWrongPreds):
    errorplot.set_xdata(np.append(errorplot.get_xdata(), datapoint))
    errorplot.set_ydata(np.append(errorplot.get_ydata(), numberOfWrongPreds))
    errorplot.autoscale(enable=True, axis='both', tight=None)
    plt.draw()

def train_test():
    features, labels = pre_process()

    # Defining hyperparameters
    epochs = 1
    lstm_layers = 1
    batch_size = 1
    lstm_size = 30
    learning_rate = 0.003

    print(lstm_size)
    print(batch_size)
    print(epochs)

    # --------------placeholders-------------------------------------
    # Create the graph object
    graph = tf.Graph()
    # Add nodes to the graph
    with graph.as_default():
        tf.set_random_seed(1)
        inputs_ = tf.placeholder(tf.float32, [None, None], name="inputs")
        # labels_ = tf.placeholder(dtype=tf.int32)
        labels_ = tf.placeholder(tf.int32, [None, None], name="labels")
        # Getting dynamic batch size according to the input tensor size
        # dynamic_batch_size = tf.shape(inputs_)[0]
        # output_keep_prob is the dropout added to the RNN's outputs; the dropout
        # will have no effect on the calculation of the subsequent states.
        keep_prob = tf.placeholder(tf.float32, name="keep_prob")
        # Your basic LSTM cell
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        # Add dropout to the cell
        drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
        # Stack up multiple LSTM layers, for deep learning
        cell = tf.contrib.rnn.MultiRNNCell([drop] * lstm_layers)
        # Getting an initial state of all zeros
        initial_state = cell.zero_state(batch_size, tf.float32)
        inputs_ = tf.expand_dims(inputs_, 0)
        outputs, final_state = tf.nn.dynamic_rnn(cell, inputs_, initial_state=initial_state)
        # Hidden layer
        hidden = tf.layers.dense(outputs[:, -1], units=25, activation=tf.nn.relu)
        logit = tf.contrib.layers.fully_connected(hidden, 1, activation_fn=None)
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logit, labels=labels_))
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
        saver = tf.train.Saver()

    # ----------------------------online training-----------------------------------------
    with tf.Session(graph=graph) as sess:
        tf.set_random_seed(1)
        sess.run(tf.global_variables_initializer())
        iteration = 1
        state = sess.run(initial_state)
        wrongPred = 0
        errorplot, = plt.plot([], [])

        for ii, (x, y) in enumerate(get_batches(features, labels, batch_size), 1):
            feed = {inputs_: x.toarray(),
                    labels_: y,
                    keep_prob: 0.5,
                    initial_state: state}

            predictions = tf.round(tf.nn.softmax(logit)).eval(feed_dict=feed)

            print("----------------------------------------------------------")
            print("Iteration: {}".format(iteration))
            print("Prediction: ", predictions)
            print("Actual: ", y)

            pred = np.array(predictions)
            print(pred)
            print(y)

            if not ((pred == y).all()):
                wrongPred += 1

            if ii % 27 == 0:
                plot_error(errorplot, ii, wrongPred)

            loss, states, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)

            print("Train loss: {:.3f}".format(loss))
            iteration += 1

        saver.save(sess, "checkpoints/sentiment.ckpt")
        errorRate = wrongPred / len(labels)
        print("ERROR RATE: ", errorRate)

if __name__ == '__main__':
    train_test()
ValueError: Input size (depth of inputs) must be accessible via shape inference, but saw value None.
This error occurs because neither the input size nor the number of inputs is specified: tf.nn.dynamic_rnn needs the depth (last) dimension of its inputs to be statically known, and expanding a [None, None] placeholder on axis 0 produces shape (1, None, None), leaving the depth as None.
I got the script working like this:
inputs_ = tf.placeholder(tf.float32, [1,None], name = "inputs")
inputs_withextradim = tf.expand_dims(inputs_, 2)
outputs, final_state = tf.nn.dynamic_rnn(cell, inputs_withextradim, initial_state=initial_state)
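With a [1, None] placeholder expanded on axis 2, the static shape becomes (1, ?, 1): the time dimension stays unknown, but the depth is now a known 1, which is what dynamic_rnn's shape inference needs. A quick way to confirm, assuming the TF 1.x graph above:

print(inputs_withextradim.get_shape())  # (1, ?, 1): the depth dimension is 1, not None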
