In sklearn 0.17.1 there was-->> grid_scores_ : list of named tuples (https://scikit-learn.org/0.17/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV)
Now in sklearn 0.21.2 it is replaced with-->> cv_results_ : dict of numpy (masked) ndarrays (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
Previously with sklearn 0.17.1, I was able to plot all grid parameters on a single plot using grid_scores_ but now I am unable to aggregate the values obtained from cv_results_ as there is no "mean_validation_score" in newer version.
I have an existing code which plotted all the parameters score in sklearn 0.17.1 (https://scikit-learn.org/0.17/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV) where grid_scores_ was used and it perfectly plotted all the values on one plot.
In newer version of slearn cv_results_ has been replaced with grid_scores_. I have tried to append all the values in want to plot all the parameters on one plot, currently I am unable to add the correct values to plot on the graph.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics.ranking import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from sklearn.metrics import accuracy_score
import sklearn
import itertools
from pandas.tools.plotting import scatter_matrix
import os
import datetime as dt
from operator import itemgetter
from itertools import chain
import graphviz
from sklearn.metrics import precision_recall_fscore_support
import scikitplot as skplt
X_train = np.random.randint(0,1, size=[500,5000])
y_train = np.random.randint(0,1, size=500)
print(X_train.shape, y_train.shape)
# (500, 5000) (500,)
#grid_search = GridSearchCV(clf, param_grid, cv=3) # 10 fold cross validation
### hyperparameter estimator
param_grid = {"criterion": ["gini", "entropy"],
"splitter": ["best", "random"],
"max_depth": np.arange(1,9,7),
"min_samples_split": np.arange(2,150,90),
"min_samples_leaf": np.arange(1,60,45),
"min_weight_fraction_leaf": np.arange(0.1,0.4, 0.3),
"max_features": [1000, 500, 5000],
"max_leaf_nodes": np.arange(2,60,45),
"min_impurity_decrease": [0.0, 0.5],
}
def evaluate_param(parameter, param_range, index):
grid_search = GridSearchCV(clf, param_grid = {parameter: param_range}, cv=3) # 3 fold cross validation
grid_search.fit(X_train, y_train) ### grid_search.fit(X_train[features], y_train)
df = {}
#for i, score in enumerate(grid_search.grid_scores_): # previously used methods
for i, score in enumerate(grid_search.cv_results_["params"]):
## How do we save the correct values here for plotting
df[parameter] = grid_search.cv_results_["params"][i][parameter]
#df[parameter].update(grid_search.cv_results_["params"][i][parameter])
#print("df : ", df)
#df[parameter].append(grid_search.cv_results_["params"][i][parameter])
#print("df : ", df) # the values are not appended to the keys
df = pd.DataFrame.from_dict(df, orient='index')
df.reset_index(level=0, inplace=True)
df = df.sort_values(by='index')
plt.subplot(5,2,index) # Change here according to the number of parameters
plt.xlabel(parameter, color = "red")
plt.ylabel("GridSearchCV Score", color= "blue")
plot = plt.plot(df['index'], df[0])
plt.title(parameter.capitalize(), color = "red")
plt.savefig('DT_GridSearchCV_Score_Hyperparameter.png')
return plot, df
clf = tree.DecisionTreeClassifier(random_state=99) # verbose=True, n_jobs=-1 :: Dt does not support it
### hyperparameter estimator
index = 1
plt.figure(figsize=(30,30))
for parameter, param_range in dict.items(param_grid):
evaluate_param(parameter, param_range, index) ## 120 features
index += 1
This image is not filled as there is no "mean_validation_score" which can be filled for each subplot now:
https://ibb.co/Z6jwnMr
## Keys() gives the list of keys that gridsearchcv has:
grid_search.cv_results_.keys()
# output
# dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_criterion', 'param_max_depth', 'param_max_features', 'param_max_leaf_nodes', 'param_min_impurity_decrease', 'param_min_samples_leaf', 'param_min_samples_split', 'param_min_weight_fraction_leaf', 'param_splitter', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'mean_train_score', 'std_train_score'])
grid_search.best_estimator_
# output
# DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
#max_features=1000, max_leaf_nodes=2, min_impurity_decrease=0.0,
#min_impurity_split=None, min_samples_leaf=1,
#min_samples_split=2, min_weight_fraction_leaf=0.1,
#presort=False, random_state=99, splitter='best')
Expected Result (should be filled): https://ibb.co/Z6jwnMr
However each subplot on the plot should have a curve depicting best value for the parameter. The keys do not have a "mean_validation_score" to plot the actual test score which was there in sklearn 0.17.1 but not in sklearn 0.20.2
Kindly let me know if there is still a way to plot all test scores on subplots of a single plot. Thanks in advance!!
Related
How is feature importance calculated in RandomForestClassifier in scikit-learn?
Here's a reproducible code. I run the classifier once with criterion set to gini and once to entropy. For each of them, I print the feature importance and plot the tree.
In neither of the instances, the root tree is the same as the most important feature. Why is that?
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from IPython.display import Image, display
from subprocess import call
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_wine
from sklearn.datasets import load_iris
wines = load_wine()
iris = load_iris()
def create_and_fit(clf,model_name):
print(clf)
# define dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=3, n_redundant=5, random_state=seed)
# X,y = iris.data; iris.target
# X,y = wines.data, wines.target
# fit the mode
clf.fit(X, y)
# get importance
importance = clf.feature_importances_
indices = np.argsort(importance)[::-1]
for f in range(X.shape[1]):
print("feature {}: ({})".format(indices[f], importance[indices[f]]))
filename = model_name+model.criterion
if model_name == 'forest_':
print('forest')
export_graphviz(clf.estimators_[0], out_file=filename+'.dot')
else:
export_graphviz(clf, out_file=filename+'.dot')
f = 'tree_'+model.criterion+'.png'
call(['dot', '-Tpng', filename+'.dot', '-o', filename+'.png', '-Gdpi=600'])
seed=0
models = [
RandomForestClassifier(criterion='gini',max_depth=5, random_state=seed),
RandomForestClassifier(criterion='entropy',max_depth=5, random_state=seed),
]
names =['forest_', 'forest_']
for name, model in zip(names, models):
create_and_fit(model,name)
Here's the snippet to load the image:
Image(filename = 'forest_gini'+'.png')
and for the entropy
Image(filename = 'forest_entropy'+'.png')
This behaviour seems to only happen with ensembles not trees (I'm generalizing as I only tried on Random forest and Decision Tree).
Here's the snippet for decision trees
models = [
DecisionTreeClassifier(criterion='gini',max_depth=5, random_state=seed),
DecisionTreeClassifier(criterion='entropy',max_depth=5, random_state=seed)
]
names =['tree_', 'tree_']
for name, model in zip(names, models):
create_and_fit(model,name)
Here's the snippet to load the image:
Image(filename = 'tree_gini'+'.png')
and for the entropy
Image(filename = 'tree_entropy'+'.png')
I think I found the answer, which is related to max_features parameter in RandomForestClassifier. Here's scikit-learn documentation:
max_features{“sqrt”, “log2”, None}, int or float,
default=”sqrt”
The number of features to consider when looking for
the best split:
If int, then consider max_features features at each split.
If float, then max_features is a fraction and round(max_features *
n_features) features are considered at each split.
If “auto”, then max_features=sqrt(n_features).
If “sqrt”, then max_features=sqrt(n_features).
If “log2”, then max_features=log2(n_features).
If None, then max_features=n_features.
Edited from a tutorial in Kaggle, I try to run the code below and data (available to download from here):
Code:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for plotting facilities
from datetime import datetime, date
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math
from sklearn.preprocessing import StandardScaler
df = pd.read_csv("./data/Aquifer_Petrignano.csv")
df['Date'] = pd.to_datetime(df.Date, format = '%d/%m/%Y')
df = df[df.Rainfall_Bastia_Umbra.notna()].reset_index(drop=True)
df = df.interpolate(method ='ffill')
df = df[['Date', 'Rainfall_Bastia_Umbra', 'Depth_to_Groundwater_P24', 'Depth_to_Groundwater_P25', 'Temperature_Bastia_Umbra', 'Temperature_Petrignano', 'Volume_C10_Petrignano', 'Hydrometry_Fiume_Chiascio_Petrignano']].resample('7D', on='Date').mean().reset_index(drop=False)
X = df.drop(['Depth_to_Groundwater_P24','Depth_to_Groundwater_P25','Date'], axis=1)
y1 = df.Depth_to_Groundwater_P24
y2 = df.Depth_to_Groundwater_P25
scaler = StandardScaler()
X = scaler.fit_transform(X)
model = xgb.XGBRegressor()
param_search = {'max_depth': range(1, 2, 2),
'min_child_weight': range(1, 2, 2),
'n_estimators' : [1000],
'learning_rate' : [0.1]}
tscv = TimeSeriesSplit(n_splits=2)
gsearch = GridSearchCV(estimator=model, cv=tscv,
param_grid=param_search)
gsearch.fit(X, y1)
xgb_grid = xgb.XGBRegressor(**gsearch.best_params_)
xgb_grid.fit(X, y1)
ax = xgb.plot_importance(xgb_grid)
ax.figure.tight_layout()
ax.figure.savefig('test.png')
y_val = y1[-80:]
X_val = X[-80:]
y_pred = xgb_grid.predict(X_val)
print(mean_absolute_error(y_val, y_pred))
print(math.sqrt(mean_squared_error(y_val, y_pred)))
I plotted a features importance figure whose original features names are hidden:
If I comment out these two lines:
scaler = StandardScaler()
X = scaler.fit_transform(X)
I get the output:
How could I use scaler.fit_transform() for X and get a feature importance plot with the original feature names?
The reason behind this is that StandardScaler returns a numpy.ndarray of your feature values (same shape as pandas.DataFrame.values, but not normalized) and you need to convert it back to pandas.DataFrame with the same column names.
Here's the part of your code that needs changing.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
The code below is working. I have just a routine to run a cross validation scheme using a linear model previous defined in sklearn. I do not have a problem with this. My problem is that: if I replace the code model=linear_model.LinearRegression() by the model=RBF('multiquadric') (please see line 14 and 15 in the __main__, it does not work anymore. So my problem is actually in the class RBF where I try to mimic a sklearn model.
If I replace the code described above, I get the following error:
FitFailedWarning)
/home/daniel/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details:
ValueError: All arrays must be equal length.
FitFailedWarning)
1) Should I define a score function in the Class RBF?
2) How to do that? I am lost. Since I am inherit BaseEstimator and RegressorMixin, I expected that this was internally solved.
3) Is there something else missing?
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.interpolate import Rbf
np.random.seed(0)
from sklearn.base import BaseEstimator, RegressorMixin
class RBF(BaseEstimator, RegressorMixin):
def __init__(self,function):
self.function=function
def fit(self,x,y):
self.rbf = Rbf(x, y,function=self.function)
def predict(self,x):
return self.rbf(x)
if __name__ == "__main__":
# Load Data
targetName='HousePrice'
data=datasets.load_boston()
featuresNames=list(data.feature_names)
featuresData=data.data
targetData = data.target
df=pd.DataFrame(featuresData,columns=featuresNames)
df[targetName]=targetData
independent_variable_list=featuresNames
dependent_variable=targetName
X=df[independent_variable_list].values
y=np.squeeze(df[[dependent_variable]].values)
# Model Definition
model=linear_model.LinearRegression()
#model=RBF('multiquadric')
# Cross validation routine
number_splits=5
score_list=['neg_mean_squared_error','neg_mean_absolute_error','r2']
kfold = model_selection.KFold(n_splits=number_splits,shuffle=True, random_state=0)
scalar = StandardScaler()
pipeline = Pipeline([('transformer', scalar), ('estimator', model)])
results = model_selection.cross_validate(pipeline, X, y, cv=kfold, scoring=score_list,return_train_score=True)
for score in score_list:
print(score+':')
print('Train: '+'Mean',np.mean(results['train_'+score]),'Standard Error',np.std(results['train_'+score]))
print('Test: '+'Mean',np.mean(results['test_'+score]),'Standard Error',np.std(results['test_'+score]))
Lets look at the documentation here
*args : arrays
x, y, z, …, d, where x, y, z, … are the coordinates of the nodes and d is the array of values at the nodes
So it takes variable length argument with the last argument being the value which is y in your case. Argument k is the kth coordinates of all the data point (same for all other argument z, y, z, ….
Following the documentation, your code should be
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.interpolate import Rbf
np.random.seed(0)
from sklearn.base import BaseEstimator, RegressorMixin
class RBF(BaseEstimator, RegressorMixin):
def __init__(self,function):
self.function=function
def fit(self,X,y):
self.rbf = Rbf(*X.T, y,function=self.function)
def predict(self,X):
return self.rbf(*X.T)
# Load Data
data=datasets.load_boston()
X = data.data
y = data.target
number_splits=5
score_list=['neg_mean_squared_error','neg_mean_absolute_error','r2']
kfold = model_selection.KFold(n_splits=number_splits,shuffle=True, random_state=0)
scalar = StandardScaler()
model = RBF(function='multiquadric')
pipeline = Pipeline([('transformer', scalar), ('estimator', model)])
results = model_selection.cross_validate(pipeline, X, y, cv=kfold, scoring=score_list,return_train_score=True)
for score in score_list:
print(score+':')
print('Train: '+'Mean',np.mean(results['train_'+score]),'Standard Error',np.std(results['train_'+score]))
print('Test: '+'Mean',np.mean(results['test_'+score]),'Standard Error',np.std(results['test_'+score]))
Output
neg_mean_squared_error:
Train: Mean -1.552450953914355e-20 Standard Error 7.932530906290208e-21
Test: Mean -23.007377210596463 Standard Error 4.254629143836107
neg_mean_absolute_error:
Train: Mean -9.398502208736061e-11 Standard Error 2.4673749061941226e-11
Test: Mean -3.1319779583728673 Standard Error 0.2162343985534446
r2:
Train: Mean 1.0 Standard Error 0.0
Test: Mean 0.7144217179633185 Standard Error 0.08526294242760363
Why *X.T : As we saw, each argument correspond to an axis of all the data points, so we transpose them and then use * operator to expand and pass each of the sub array as an argument to the variable length function.
Looks like the latest implementation has a mode parameter where we can pass the N-D array directly.
File "D:\Users\Watson Rockstar\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 205, in check_consistent_length
" samples: %r" % [int(l) for l in lengths])
ValueError:
Found input variables with inconsistent numbers of samples: [2883, 1236]
This dataset totally has 4119 data, and the Xtrain volum= (2883,18), Xtest volum = (1236,18)
I have tried to use LabelEncoder and OneHotEncoder to sovle the problems, but it is not helpful:
# Ignore the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import missingno as msno
#configure
# sets matplotlib to inline and displays graphs below the corressponding cell.
#import the necessary modelling algos.
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
#preprocessing
from sklearn.preprocessing import LabelEncoder
telebanking = pd.read_csv('bank-additional.csv')
telebank = telebanking.drop(['duration','default'],axis =1)
def transform(feature):
le = LabelEncoder()
telebank[feature] = le.fit_transform(telebank[feature])
print(le.classes_)
cat_telebank=telebank.select_dtypes(include='object')
cat_telebank.columns
for col in cat_telebank.columns:
transform(col)
scaler=StandardScaler()
scaled_telebank=scaler.fit_transform(telebank.drop('y',axis=1))
X=scaled_telebank
Y=telebank['y'].as_matrix()
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,Y,test_size=0.3)
def compare(model):
clf = model
clf.fit(Xtrain,Ytrain)
pred = clf.predict(Xtrain)
acc.append(accuracy_score(pred,Ytest))
prec.append(precision_score(pred,Ytest))
rec.append(recall_score(pred,Ytest))
auroc.append(roc_auc_score(pred,Ytest))
acc=[]
prec=[]
rec=[]
auroc=[]
models=[RandomForestClassifier(),DecisionTreeClassifier()]
model_names=['RandomForestClassifier','DecisionTreeClassifier']
for model in range(len(models)):
compare(models[model])
d={'Modelling Algo':model_names,'Accuracy':acc,'Precision':prec,'Recall':rec,'Area Under ROC Curve':auroc}
met_telebank=pd.DataFrame(d)
met_telebank
It is the first warning's detail.
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,Y,test_size=0.3)
should be
Xtrain,Ytrain,Xtest,Ytest = train_test_split(X,Y,test_size=0.3)
This is causing the error, because it wants to use Xtest as the Ytrain values.
I am trying to show a simple thing (allegedly). I want to show that the more examples I have in a range the better the performance is the test should be. I manipulate the number of data points in dataset [10,20,40,200,2000]. I use the val_loss, but it doesn't seem that the more data points there are the lower the loss should be. My question is why is the loss not getting lower the more points I add to the dataset (is keras do what I expect?).
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD,Adam
from keras import regularizers
import numpy as np
#np.random.seed(seed=1)
import matplotlib.pyplot as plt
%matplotlib inline
import random
import math
from numpy.random import seed
import random
import sklearn
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
def xrange(start_point,end_point,N,base):
temp = np.logspace(0.1, 1, N,base=base,endpoint=False)
temp=temp-temp.min()
temp=(0.0+temp)/(0.0+temp.max()) #this is between 0 and 1
return (end_point-start_point)*temp +start_point #this is the range
def func(x):
function=np.sin(3*x)/2 + np.cos(5*x)/2 + np.sin(20*x)
return function
def train_model(x_train,y_train,x_test):
#seed(5)
model=Sequential()
num_units=100
act='relu'
model.add(Dense(num_units,input_shape=(1,),activation=act,kernel_initializer='random_uniform'))
model.add(Dense(num_units,activation=act,kernel_initializer='random_uniform'))
model.add(Dense(num_units,activation=act,kernel_initializer='random_uniform'))
model.add(Dense(num_units,activation=act,kernel_initializer='random_uniform'))
model.add(Dense(1,activation='tanh')) #output layer 1 unit ; activation='tanh'
model.compile(Adam(),'mean_squared_error',metrics=['mse'])
history=model.fit(x_train,y_train,batch_size=32,epochs=500,verbose=1,validation_split = 0.2 ) #train on the noise (not moshe)
fit=model.predict(x_test)
loss = history.history['loss']
val_loss = history.history['val_loss']
return loss,val_loss,fit
Ns=[10,20,40,200,2000]
start_point=-5.25 #-1
end_point=5.25 #1
#base=550#[0.001,1.5,5,40,400]#545 np.arange(0.05,1,0.05).tolist()#[450,0.75]#[0.5,2,5,10,100,300]#Final:[0.001,500
test_step=0.0007
x_test=np.arange(start_point,end_point,test_step)
y_test=func(x_test)
loss=[]
val_loss=[]
fit=[]
list_y_train=[]
list_x_train=[]
for N in Ns:
x_train=np.linspace(start_point, end_point, num=N, endpoint=True)#xrange(start_point,end_point,N,b)
func_train=func(x_train)#np.sin(3*x_train)/2 + np.cos(5*x_train)/2 + np.sin(7*x_train) #### write a functino
noise=np.random.uniform(-0.2,0.2,len(x_train))
y_train=func_train+noise
l,v,f=train_model(x_train,y_train,x_test)
loss.append(y_train)
val_loss.append(v)
fit.append(f)
list_x_train.append(x_train)
list_y_train.append(y_train)
y_ideal=func(x_test)#func np.sin(3*x_train)/2
k=1
for f in fit:
p=plt.subplot(len(fit), 1, k)
#plt.plot(x_train,y_ideal,'k')
plt.scatter(list_x_train[k-1], list_y_train[k-1], facecolors='none', edgecolors='g') #plt.plot(x_value,sample,'bo')
plt.scatter(x_test, f, facecolors='none', edgecolors='b') #plt.plot(x_value,sample,'bo')
plt.plot(x_test,y_ideal,'k')
k=k+1
plt.plot(np.asarray(val_loss[0]),label=str(Ns[0]))
plt.plot(np.asarray(val_loss[1]),label=str(Ns[1]))
plt.plot(np.asarray(val_loss[2]),label=str(Ns[2]))
plt.plot(np.asarray(val_loss[3]),label=str(Ns[3]))
plt.plot(np.asarray(val_loss[4]),label=str(Ns[4]))
plt.legend()