'GridSearchCV' object has no attribute 'estimators_' using dtreeviz - python-3.x

After carrying out a GridSearchCV on a Randomforest classifer, I am attempting to display a tree plot. I tried the code below, but I get this error:
AttributeError: 'GridSearchCV' object has no attribute 'estimators_'
Can you tell me how to fix this error and get a view of a tree?
Here is my code from the classifier:
model = RandomForestClassifier()
parameter_space = {
'n_estimators': [10,50,100],
'criterion': ['gini', 'entropy'],
'max_depth': np.linspace(10,50,11),
}
clf = GridSearchCV(model, parameter_space, cv = 5, scoring = "accuracy", verbose = True) # model
clf.fit(X_train,y_train)
train_pred = clf.predict(X_train) # Train predict
test_pred = clf.predict(X_test) # Test predict
# Load packages
import pandas as pd
from sklearn import tree
from dtreeviz.trees import dtreeviz # will be used for tree visualization
from matplotlib import pyplot as plt
plt.rcParams.update({'figure.figsize': (12.0, 8.0)})
plt.rcParams.update({'font.size': 14})
plt.figure(figsize=(20,20))
_ = tree.plot_tree(clf.n_estimators_[0], feature_names=X_train.columns, filled=True)

You need to select the best Random Forest model from the grid search. You need to change your last line of code :
_ = tree.plot_tree(clf.best_estimator_.estimators_[0], feature_names=X_train.columns, filled=True)

Related

Iterate GridSearchCV Over Multiple Datasets and Classifiers (Python)

I have multiple datasets that I want to estimate parameters for using different classifiers (logistic and randomforest).
I want to run each data for both classifiers using gridsearchcv, and then get the best parameters for each classifier per dataset. I am just a bit stumped on how to go about that. My code is below.
# modules
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.base import BaseEstimator, TransformerMixin
# import preprocessing and pipeline modules
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# grid search module
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# features
X = {'df1': np.random.normal(0, 1, (200, 5)),
'df2': np.random.normal(0, 1, (200, 5))}
# labels
y = {'df1': np.random.choice([0, 1], 200),
'df2': np.random.choice([0, 1], 200)}
num_columns = list(subset_features[1:])
num_transformer = Pipeline([('imputer', IterativeImputer()),
('scaler', StandardScaler())])
# column transformer
ct = ColumnTransformer([('numeric_pipeline', num_transformer, num_columns)])
# the classifiers
clf1 = LogisticRegression(solver='liblinear', random_state=None)
clf2 = RandomForestClassifier(random_state=None)
# pipeline
pipe = Pipeline([('ct', ct), ('classifier', clf1)])
params1 = {'classifier__penalty': ['l1', 'l2'],
'classifier__C': [0.1, 1, 10],
'classifier': [clf1]}
params2 = {'classifier__n_estimators': [100, 150, 200],
'classifier__min_samples_leaf': [1, 2],
'classifier' = [clf2]
params = [params1, params2]
gs = GridSearchCV(pipe, params)
gs.fit(X, y)
gs.best_params_
How about this?
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np
# Modeling
import lightgbm as lgb
# Evaluation of the model
from sklearn.model_selection import KFold
MAX_EVALS = 500
N_FOLDS = 10
# Read in data and separate into training and testing sets
data = pd.read_csv('C:\\caravan-insurance-challenge.csv')
train = data[data['ORIGIN'] == 'train']
test = data[data['ORIGIN'] == 'test']
# Extract the labels and format properly
train_labels = np.array(train['CARAVAN'].astype(np.int32)).reshape((-1,))
test_labels = np.array(test['CARAVAN'].astype(np.int32)).reshape((-1,))
# Drop the unneeded columns
train = train.drop(columns = ['ORIGIN', 'CARAVAN'])
test = test.drop(columns = ['ORIGIN', 'CARAVAN'])
# Convert to numpy array for splitting in cross validation
features = np.array(train)
test_features = np.array(test)
labels = train_labels[:]
print('Train shape: ', train.shape)
print('Test shape: ', test.shape)
train.head()
import matplotlib.pyplot as plt
import seaborn as sns
plt.hist(labels, edgecolor = 'k');
plt.xlabel('Label'); plt.ylabel('Count'); plt.title('Counts of Labels')
# Model with default hyperparameters
model = lgb.LGBMClassifier()
model
from sklearn.metrics import roc_auc_score
from timeit import default_timer as timer
start = timer()
model.fit(features, labels)
train_time = timer() - start
predictions = model.predict_proba(test_features)[:, 1]
auc = roc_auc_score(test_labels, predictions)
print('The baseline score on the test set is {:.4f}.'.format(auc))
print('The baseline training time is {:.4f} seconds'.format(train_time))
import random
lgb.LGBMClassifier()
# Hyperparameter grid
param_grid = {
'class_weight': [None, 'balanced'],
'boosting_type': ['gbdt', 'goss', 'dart'],
'num_leaves': list(range(30, 150)),
'learning_rate': list(np.logspace(np.log(0.005), np.log(0.2), base = np.exp(1), num = 1000)),
'subsample_for_bin': list(range(20000, 300000, 20000)),
'min_child_samples': list(range(20, 500, 5)),
'reg_alpha': list(np.linspace(0, 1)),
'reg_lambda': list(np.linspace(0, 1)),
'colsample_bytree': list(np.linspace(0.6, 1, 10))
}
# Subsampling (only applicable with 'goss')
subsample_dist = list(np.linspace(0.5, 1, 100))
plt.hist(param_grid['learning_rate'], color = 'r', edgecolor = 'k');
plt.xlabel('Learning Rate', size = 14); plt.ylabel('Count', size = 14); plt.title('Learning Rate Distribution', size = 18)
plt.hist(param_grid['num_leaves'], color = 'm', edgecolor = 'k')
plt.xlabel('Learning Number of Leaves', size = 14); plt.ylabel('Count', size = 14); plt.title('Number of Leaves Distribution', size = 18)
# Randomly sample parameters for gbm
params = {key: random.sample(value, 1)[0] for key, value in param_grid.items()}
params
params['subsample'] = random.sample(subsample_dist, 1)[0] if params['boosting_type'] != 'goss' else 1.0
params
Result:
{'class_weight': 'balanced',
'boosting_type': 'goss',
'num_leaves': 58,
'learning_rate': 0.010197109660117238,
'subsample_for_bin': 40000,
'min_child_samples': 230,
'reg_alpha': 0.7755102040816326,
'reg_lambda': 0.7755102040816326,
'colsample_bytree': 0.8666666666666667,
'subsample': 1.0}
Data:
https://www.kaggle.com/datasets/uciml/caravan-insurance-challenge
Source Code:
https://github.com/WillKoehrsen/hyperparameter-optimization/blob/master/Bayesian%20Hyperparameter%20Optimization%20of%20Gradient%20Boosting%20Machine.ipynb
Using different classifiers/estimators, I was able to do what I posted the question for. I am sure, the code can be optimized.
Some of the ideas I used came from this stackoverflow link.
Below is my attempt at answering the question I asked using anomaly detection estimators, instead of logistic regression and randomforest.
# modules
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.base import BaseEstimator, TransformerMixin
# import preprocessing and pipeline modules
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# grid search module
from sklearn.model_selection import GridSearchCV
# the anomaly detection estimators
from sklearn.ensemble import IsolationForest
from inne import IsolationNNE
from scorers import scorer_decision # user defined scoring
# define numeric columns
num_columns = list(df1.columns)
class GSP:
def __init__(self):
pass
def mods(self, x):
num_columns # indicates list of numeric columns in dfs
num_transformer = Pipeline([('imputer', IterativeImputer()),
('scaler', StandardScaler())])
# column transformer
ct = ColumnTransformer([('numeric_pipeline', num_transformer, num_columns)])
# classifiers
clf1 = IsolationForest(n_jobs=-1, random_state=None, bootstrap=False)
clf2 = IsolationNNE(random_state=None)
# pipeline
pipe = Pipeline([('ct', ct), ('classifier', clf1)])
# grid search parameters
num_estimators = list(np.linspace(100, 200, num=5, endpoint=True).astype(int))
max_samples = list(np.linspace(0.70, 1.00, num=5))
contamination = list(np.linspace(0.05, 0.10, num=5, endpoint=True))
max_features = [0.25, 0.50, 0.75, 0.80, 0.90, 1.00]
params1 = {# set isolation forest grid parameters
'classifier__n_estimators': num_estimators,
'classifier__max_samples': max_samples,
'classifier__contamination': contamination,
'classifier__max_features': max_features,
'classifier': [clf1]}
params2 = {# set inne grid parameters
'classifier__n_estimators': num_estimators,
'classifier__max_samples': max_samples,
'classifier__contamination': contamination,
'classifier': [clf2]}
params = [params1, params2]
gsresults = pd.DataFrame()
for key in x.keys():
print('running key:', key)
gs = GridSearchCV(estimator=pipe,
param_grid=params,
cv=2,
n_jobs=4,
verbose=1,
scoring=scorer_decision,
error_score='raise',
refit=True)
# fit the model
gs.fit(x[key])
dftemp = pd.DataFrame(gs.cv_results_)
dftemp['dataset'] = key
gsresults = pd.concat([gsresults, dftemp], ignore_index=True)
gsresults = (gsresults.set_index(gsresults['params']
.apply(lambda x: ' '.join(str(val) for val in x.values()))).rename_axis('kernel'))
selected_columns = ['dataset', 'mean_test_score', 'rank_test_score',
'param_classifier', 'param_classifier__contamination',
'param_classifier__max_features', 'param_classifier__max_samples',
'param_classifier__n_estimators']
gsresults2 = (gsresults.loc[:, selected_columns]
.rename(columns={'mean_test_score': 'mean_score',
'rank_test_score': 'rank_score',
'param_classifier': 'classifier',
'param_classifier__contamination': 'contamination',
'param_classifier__max_features': 'max_features',
'param_classifier__max_samples': 'max_samples',
'param_classifier__n_estimators': 'n_estimators'}))
gsresults3 = (gsresults2.sort_values(['rank_score', 'mean_score'], ascending=True)
.groupby(['dataset']))
# check output by dataframes
dfs = {}
for key, df in gsresults3:
dfs[key] = df
return dfs
# running the mods method below returns a dictionary of dataframes
best_params = GSP().mods(X)
Note: the models are fitted on a dictionary of dataframes, X.

Why is sklearn RandomForestClassifier root node different from the most important feature?

How is feature importance calculated in RandomForestClassifier in scikit-learn?
Here's a reproducible code. I run the classifier once with criterion set to gini and once to entropy. For each of them, I print the feature importance and plot the tree.
In neither of the instances, the root tree is the same as the most important feature. Why is that?
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from IPython.display import Image, display
from subprocess import call
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_wine
from sklearn.datasets import load_iris
wines = load_wine()
iris = load_iris()
def create_and_fit(clf,model_name):
print(clf)
# define dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=3, n_redundant=5, random_state=seed)
# X,y = iris.data; iris.target
# X,y = wines.data, wines.target
# fit the mode
clf.fit(X, y)
# get importance
importance = clf.feature_importances_
indices = np.argsort(importance)[::-1]
for f in range(X.shape[1]):
print("feature {}: ({})".format(indices[f], importance[indices[f]]))
filename = model_name+model.criterion
if model_name == 'forest_':
print('forest')
export_graphviz(clf.estimators_[0], out_file=filename+'.dot')
else:
export_graphviz(clf, out_file=filename+'.dot')
f = 'tree_'+model.criterion+'.png'
call(['dot', '-Tpng', filename+'.dot', '-o', filename+'.png', '-Gdpi=600'])
seed=0
models = [
RandomForestClassifier(criterion='gini',max_depth=5, random_state=seed),
RandomForestClassifier(criterion='entropy',max_depth=5, random_state=seed),
]
names =['forest_', 'forest_']
for name, model in zip(names, models):
create_and_fit(model,name)
Here's the snippet to load the image:
Image(filename = 'forest_gini'+'.png')
and for the entropy
Image(filename = 'forest_entropy'+'.png')
This behaviour seems to only happen with ensembles not trees (I'm generalizing as I only tried on Random forest and Decision Tree).
Here's the snippet for decision trees
models = [
DecisionTreeClassifier(criterion='gini',max_depth=5, random_state=seed),
DecisionTreeClassifier(criterion='entropy',max_depth=5, random_state=seed)
]
names =['tree_', 'tree_']
for name, model in zip(names, models):
create_and_fit(model,name)
Here's the snippet to load the image:
Image(filename = 'tree_gini'+'.png')
and for the entropy
Image(filename = 'tree_entropy'+'.png')
I think I found the answer, which is related to max_features parameter in RandomForestClassifier. Here's scikit-learn documentation:
max_features{“sqrt”, “log2”, None}, int or float,
default=”sqrt”
The number of features to consider when looking for
the best split:
If int, then consider max_features features at each split.
If float, then max_features is a fraction and round(max_features *
n_features) features are considered at each split.
If “auto”, then max_features=sqrt(n_features).
If “sqrt”, then max_features=sqrt(n_features).
If “log2”, then max_features=log2(n_features).
If None, then max_features=n_features.

How to make a GridSearchCV with a proper FunctionTransformer in a pipeline?

I'm trying to make a Pipeline with GridSearchCV to filter data (with iforest) and perform a regression with StandarSclaler+MLPRegressor.
I made a FunctionTransformer to include my iForest filter in the pipeline. I also define a parameters grid for the iForest filter (using kw_args methods).
All seems OK but when un mahe the fit, nothing happens ... No error message. Nothing.
After, when I want to make a predict, I have the message : "This RandomizedSearchCV instance is not fitted yet"
from sklearn.preprocessing import FunctionTransformer
#Definition of the function auto_filter using the iForest algo
def auto_filter(DF, conta=0.1):
#iForest made on the DF dataframe
iforest = IsolationForest(behaviour='new', n_estimators=300, max_samples='auto', contamination=conta)
iforest = iforest.fit(DF)
# The DF (dataframe in input) is filtered taking into account only the inlier observations
data_filtered = DF[iforest.predict(DF) == 1]
# Only few variables are kept for the next step (regression by MLPRegressor)
# this function delivers X_filtered and y
X_filtered = data_filtered[['SessionTotalTime','AverageHR','MaxHR','MinHR','EETotal','EECH','EEFat','TRIMP','BeatByBeatRMSSD','BeatByBeatSD','HFAverage','LFAverage','LFHFRatio','Weight']]
y = data_filtered['MaxVO2']
return (X_filtered, y)
#Pipeline definition ('auto_filter' --> 'scaler' --> 'MLPRegressor')
pipeline_steps = [('auto_filter', FunctionTransformer(auto_filter)), ('scaler', StandardScaler()), ('MLPR', MLPRegressor(solver='lbfgs', activation='relu', early_stopping=True, n_iter_no_change=20, validation_fraction=0.2, max_iter=10000))]
#Gridsearch Definition with differents values of 'conta' for the first stage of the pipeline ('auto_filter)
parameters = {'auto_filter__kw_args': [{'conta': 0.1}, {'conta': 0.2}, {'conta': 0.3}], 'MLPR__hidden_layer_sizes':[(sp_randint.rvs(1, nb_features, 1),), (sp_randint.rvs(1, nb_features, 1), sp_randint.rvs(1, nb_features, 1))], 'MLPR__alpha':sp_rand.rvs(0, 1, 1)}
pipeline = Pipeline(pipeline_steps)
estimator = RandomizedSearchCV(pipeline, parameters, cv=5, n_iter=10)
estimator.fit(X_train, y_train)
You can try to run step by step manually to find a problem:
auto_filter_transformer = FunctionTransformer(auto_filter)
X_train = auto_filter_transformer.fit_transform(X_train)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
MLPR = MLPRegressor(solver='lbfgs', activation='relu', early_stopping=True, n_iter_no_change=20, validation_fraction=0.2, max_iter=10000)
MLPR.fit(X_train, y_train)
If each of the steps works fine, build a pipeline. Check the pipeline. If it works fine, try to use RandomizedSearchCV.
The func parameter of FunctionTransformer should be a callable that accepts the
same arguments as transform method (array-like X of shape
(n_samples, n_features) and kwargs for func) and returns a transformed X of
the same shape. Your function auto_filter doesn't fit these requirements.
Additionally, anomaly/outlier detection techniques from scikit-learn cannot be
used as intermediate steps in scikit-learn pipelines since a pipeline assembles
one or more transformers and an optional final estimator. IsolationForest or,
say, OneClassSVM is not a transformer: it implements fit and predict.
Thus, a possible solution is to cut off possible outliers separately and build
a pipeline composing of transformers and a regressor:
>>> import warnings
>>> from sklearn.exceptions import ConvergenceWarning
>>> warnings.filterwarnings(category=ConvergenceWarning, action='ignore')
>>> import numpy as np
>>> from scipy import stats
>>> from sklearn.datasets import make_regression
>>> from sklearn.ensemble import IsolationForest
>>> from sklearn.model_selection import RandomizedSearchCV
>>> from sklearn.neural_network import MLPRegressor
>>> from sklearn.pipeline import Pipeline
>>> from sklearn.preprocessing import StandardScaler
>>> X, y = make_regression(n_samples=50, n_features=2, n_informative=2)
>>> detect = IsolationForest(contamination=0.1, behaviour='new')
>>> inliers_mask = detect.fit_predict(X) == 1
>>> pipe = Pipeline([('scale', StandardScaler()),
... ('estimate', MLPRegressor(max_iter=500, tol=1e-5))])
>>> param_distributions = dict(estimate__alpha=stats.uniform(0, 0.1))
>>> search = RandomizedSearchCV(pipe, param_distributions,
... n_iter=2, cv=3, iid=True)
>>> search = search.fit(X[inliers_mask], y[inliers_mask])
The problem is that you won't be able to optimize the hyperparameters of
IsolationForest. One way to handle it is to define hyperparameter space
for the forest, sample hyperparameters with ParameterSampler or
ParameterGrid, predict inliers and fit randomized search:
>>> from sklearn.model_selection import ParameterGrid
>>> forest_param_dict = dict(contamination=[0.1, 0.15, 0.2])
>>> forest_param_grid = ParameterGrid(forest_param_dict)
>>> for sample in forest_param_grid:
... detect = detect.set_params(contamination=sample['contamination'])
... inliers_mask = detect.fit_predict(X) == 1
... search.fit(X[inliers_mask], y[inliers_mask])

Plotting Grid Search CV grid parameter results on one graph

In sklearn 0.17.1 there was-->> grid_scores_ : list of named tuples (https://scikit-learn.org/0.17/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV)
Now in sklearn 0.21.2 it is replaced with-->> cv_results_ : dict of numpy (masked) ndarrays (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
Previously with sklearn 0.17.1, I was able to plot all grid parameters on a single plot using grid_scores_ but now I am unable to aggregate the values obtained from cv_results_ as there is no "mean_validation_score" in newer version.
I have an existing code which plotted all the parameters score in sklearn 0.17.1 (https://scikit-learn.org/0.17/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV) where grid_scores_ was used and it perfectly plotted all the values on one plot.
In newer version of slearn cv_results_ has been replaced with grid_scores_. I have tried to append all the values in want to plot all the parameters on one plot, currently I am unable to add the correct values to plot on the graph.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics.ranking import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from sklearn.metrics import accuracy_score
import sklearn
import itertools
from pandas.tools.plotting import scatter_matrix
import os
import datetime as dt
from operator import itemgetter
from itertools import chain
import graphviz
from sklearn.metrics import precision_recall_fscore_support
import scikitplot as skplt
X_train = np.random.randint(0,1, size=[500,5000])
y_train = np.random.randint(0,1, size=500)
print(X_train.shape, y_train.shape)
# (500, 5000) (500,)
#grid_search = GridSearchCV(clf, param_grid, cv=3) # 10 fold cross validation
### hyperparameter estimator
param_grid = {"criterion": ["gini", "entropy"],
"splitter": ["best", "random"],
"max_depth": np.arange(1,9,7),
"min_samples_split": np.arange(2,150,90),
"min_samples_leaf": np.arange(1,60,45),
"min_weight_fraction_leaf": np.arange(0.1,0.4, 0.3),
"max_features": [1000, 500, 5000],
"max_leaf_nodes": np.arange(2,60,45),
"min_impurity_decrease": [0.0, 0.5],
}
def evaluate_param(parameter, param_range, index):
grid_search = GridSearchCV(clf, param_grid = {parameter: param_range}, cv=3) # 3 fold cross validation
grid_search.fit(X_train, y_train) ### grid_search.fit(X_train[features], y_train)
df = {}
#for i, score in enumerate(grid_search.grid_scores_): # previously used methods
for i, score in enumerate(grid_search.cv_results_["params"]):
## How do we save the correct values here for plotting
df[parameter] = grid_search.cv_results_["params"][i][parameter]
#df[parameter].update(grid_search.cv_results_["params"][i][parameter])
#print("df : ", df)
#df[parameter].append(grid_search.cv_results_["params"][i][parameter])
#print("df : ", df) # the values are not appended to the keys
df = pd.DataFrame.from_dict(df, orient='index')
df.reset_index(level=0, inplace=True)
df = df.sort_values(by='index')
plt.subplot(5,2,index) # Change here according to the number of parameters
plt.xlabel(parameter, color = "red")
plt.ylabel("GridSearchCV Score", color= "blue")
plot = plt.plot(df['index'], df[0])
plt.title(parameter.capitalize(), color = "red")
plt.savefig('DT_GridSearchCV_Score_Hyperparameter.png')
return plot, df
clf = tree.DecisionTreeClassifier(random_state=99) # verbose=True, n_jobs=-1 :: Dt does not support it
### hyperparameter estimator
index = 1
plt.figure(figsize=(30,30))
for parameter, param_range in dict.items(param_grid):
evaluate_param(parameter, param_range, index) ## 120 features
index += 1
This image is not filled as there is no "mean_validation_score" which can be filled for each subplot now:
https://ibb.co/Z6jwnMr
## Keys() gives the list of keys that gridsearchcv has:
grid_search.cv_results_.keys()
# output
# dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_criterion', 'param_max_depth', 'param_max_features', 'param_max_leaf_nodes', 'param_min_impurity_decrease', 'param_min_samples_leaf', 'param_min_samples_split', 'param_min_weight_fraction_leaf', 'param_splitter', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'mean_train_score', 'std_train_score'])
grid_search.best_estimator_
# output
# DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
#max_features=1000, max_leaf_nodes=2, min_impurity_decrease=0.0,
#min_impurity_split=None, min_samples_leaf=1,
#min_samples_split=2, min_weight_fraction_leaf=0.1,
#presort=False, random_state=99, splitter='best')
Expected Result (should be filled): https://ibb.co/Z6jwnMr
However each subplot on the plot should have a curve depicting best value for the parameter. The keys do not have a "mean_validation_score" to plot the actual test score which was there in sklearn 0.17.1 but not in sklearn 0.20.2
Kindly let me know if there is still a way to plot all test scores on subplots of a single plot. Thanks in advance!!

GridSearchCV of spark_sklearn package fails when I try to call score method

I am trying to use GridSearchCV from spark_sklearn package instead of sklearn to leverage spark.
But it fails when I invoke score method of the estimator.
I have taken the example code from http://scikit-learn.org/stable/auto_examples/plot_digits_pipe.html.
The code looks like this:
def example_ppl():
import numpy as np
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
# from sklearn.model_selection import GridSearchCV
from spark_sklearn import GridSearchCV
logistic = linear_model.LogisticRegression()
pca = decomposition.PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target
n_components = [20, 40, 64]
Cs = np.logspace(-4, 4, 3)
# Create spark context
spark_session = SparkSession.builder.appName('test').getOrCreate()
sc = spark_session.sparkContext
estimator = GridSearchCV(sc,
estimator=pipe,
param_grid=dict(pca__n_components=n_components,
logistic__C=Cs))
print(type(estimator))
estimator.fit(X_digits, y_digits)
# print(estimator.cv_results_)
estimator.score(X_digits,y_digits)
It throws an error as follows:
File "D:/Python_Project/test/sklearn_pyspark.py", line 72, in example_ppl
estimator.score(X_digits,y_digits)
File "D:\PyEnvs\test\lib\site-packages\sklearn\model_selection\_search.py", line 436, in score
score = self.scorer_[self.refit] if self.multimetric_ else self.scorer_
AttributeError: 'GridSearchCV' object has no attribute 'multimetric_'
Is it an issue with spark_sklearn or am I missing something in my code?

Resources