Iterate GridSearchCV Over Multiple Datasets and Classifiers (Python) - python-3.x

I have multiple datasets that I want to estimate parameters for using different classifiers (logistic and randomforest).
I want to run each data for both classifiers using gridsearchcv, and then get the best parameters for each classifier per dataset. I am just a bit stumped on how to go about that. My code is below.
# modules
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.base import BaseEstimator, TransformerMixin
# import preprocessing and pipeline modules
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# grid search module
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# features
X = {'df1': np.random.normal(0, 1, (200, 5)),
'df2': np.random.normal(0, 1, (200, 5))}
# labels
y = {'df1': np.random.choice([0, 1], 200),
'df2': np.random.choice([0, 1], 200)}
num_columns = list(subset_features[1:])
num_transformer = Pipeline([('imputer', IterativeImputer()),
('scaler', StandardScaler())])
# column transformer
ct = ColumnTransformer([('numeric_pipeline', num_transformer, num_columns)])
# the classifiers
clf1 = LogisticRegression(solver='liblinear', random_state=None)
clf2 = RandomForestClassifier(random_state=None)
# pipeline
pipe = Pipeline([('ct', ct), ('classifier', clf1)])
params1 = {'classifier__penalty': ['l1', 'l2'],
'classifier__C': [0.1, 1, 10],
'classifier': [clf1]}
params2 = {'classifier__n_estimators': [100, 150, 200],
'classifier__min_samples_leaf': [1, 2],
'classifier' = [clf2]
params = [params1, params2]
gs = GridSearchCV(pipe, params)
gs.fit(X, y)
gs.best_params_

How about this?
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np
# Modeling
import lightgbm as lgb
# Evaluation of the model
from sklearn.model_selection import KFold
MAX_EVALS = 500
N_FOLDS = 10
# Read in data and separate into training and testing sets
data = pd.read_csv('C:\\caravan-insurance-challenge.csv')
train = data[data['ORIGIN'] == 'train']
test = data[data['ORIGIN'] == 'test']
# Extract the labels and format properly
train_labels = np.array(train['CARAVAN'].astype(np.int32)).reshape((-1,))
test_labels = np.array(test['CARAVAN'].astype(np.int32)).reshape((-1,))
# Drop the unneeded columns
train = train.drop(columns = ['ORIGIN', 'CARAVAN'])
test = test.drop(columns = ['ORIGIN', 'CARAVAN'])
# Convert to numpy array for splitting in cross validation
features = np.array(train)
test_features = np.array(test)
labels = train_labels[:]
print('Train shape: ', train.shape)
print('Test shape: ', test.shape)
train.head()
import matplotlib.pyplot as plt
import seaborn as sns
plt.hist(labels, edgecolor = 'k');
plt.xlabel('Label'); plt.ylabel('Count'); plt.title('Counts of Labels')
# Model with default hyperparameters
model = lgb.LGBMClassifier()
model
from sklearn.metrics import roc_auc_score
from timeit import default_timer as timer
start = timer()
model.fit(features, labels)
train_time = timer() - start
predictions = model.predict_proba(test_features)[:, 1]
auc = roc_auc_score(test_labels, predictions)
print('The baseline score on the test set is {:.4f}.'.format(auc))
print('The baseline training time is {:.4f} seconds'.format(train_time))
import random
lgb.LGBMClassifier()
# Hyperparameter grid
param_grid = {
'class_weight': [None, 'balanced'],
'boosting_type': ['gbdt', 'goss', 'dart'],
'num_leaves': list(range(30, 150)),
'learning_rate': list(np.logspace(np.log(0.005), np.log(0.2), base = np.exp(1), num = 1000)),
'subsample_for_bin': list(range(20000, 300000, 20000)),
'min_child_samples': list(range(20, 500, 5)),
'reg_alpha': list(np.linspace(0, 1)),
'reg_lambda': list(np.linspace(0, 1)),
'colsample_bytree': list(np.linspace(0.6, 1, 10))
}
# Subsampling (only applicable with 'goss')
subsample_dist = list(np.linspace(0.5, 1, 100))
plt.hist(param_grid['learning_rate'], color = 'r', edgecolor = 'k');
plt.xlabel('Learning Rate', size = 14); plt.ylabel('Count', size = 14); plt.title('Learning Rate Distribution', size = 18)
plt.hist(param_grid['num_leaves'], color = 'm', edgecolor = 'k')
plt.xlabel('Learning Number of Leaves', size = 14); plt.ylabel('Count', size = 14); plt.title('Number of Leaves Distribution', size = 18)
# Randomly sample parameters for gbm
params = {key: random.sample(value, 1)[0] for key, value in param_grid.items()}
params
params['subsample'] = random.sample(subsample_dist, 1)[0] if params['boosting_type'] != 'goss' else 1.0
params
Result:
{'class_weight': 'balanced',
'boosting_type': 'goss',
'num_leaves': 58,
'learning_rate': 0.010197109660117238,
'subsample_for_bin': 40000,
'min_child_samples': 230,
'reg_alpha': 0.7755102040816326,
'reg_lambda': 0.7755102040816326,
'colsample_bytree': 0.8666666666666667,
'subsample': 1.0}
Data:
https://www.kaggle.com/datasets/uciml/caravan-insurance-challenge
Source Code:
https://github.com/WillKoehrsen/hyperparameter-optimization/blob/master/Bayesian%20Hyperparameter%20Optimization%20of%20Gradient%20Boosting%20Machine.ipynb

Using different classifiers/estimators, I was able to do what I posted the question for. I am sure, the code can be optimized.
Some of the ideas I used came from this stackoverflow link.
Below is my attempt at answering the question I asked using anomaly detection estimators, instead of logistic regression and randomforest.
# modules
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.base import BaseEstimator, TransformerMixin
# import preprocessing and pipeline modules
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# grid search module
from sklearn.model_selection import GridSearchCV
# the anomaly detection estimators
from sklearn.ensemble import IsolationForest
from inne import IsolationNNE
from scorers import scorer_decision # user defined scoring
# define numeric columns
num_columns = list(df1.columns)
class GSP:
def __init__(self):
pass
def mods(self, x):
num_columns # indicates list of numeric columns in dfs
num_transformer = Pipeline([('imputer', IterativeImputer()),
('scaler', StandardScaler())])
# column transformer
ct = ColumnTransformer([('numeric_pipeline', num_transformer, num_columns)])
# classifiers
clf1 = IsolationForest(n_jobs=-1, random_state=None, bootstrap=False)
clf2 = IsolationNNE(random_state=None)
# pipeline
pipe = Pipeline([('ct', ct), ('classifier', clf1)])
# grid search parameters
num_estimators = list(np.linspace(100, 200, num=5, endpoint=True).astype(int))
max_samples = list(np.linspace(0.70, 1.00, num=5))
contamination = list(np.linspace(0.05, 0.10, num=5, endpoint=True))
max_features = [0.25, 0.50, 0.75, 0.80, 0.90, 1.00]
params1 = {# set isolation forest grid parameters
'classifier__n_estimators': num_estimators,
'classifier__max_samples': max_samples,
'classifier__contamination': contamination,
'classifier__max_features': max_features,
'classifier': [clf1]}
params2 = {# set inne grid parameters
'classifier__n_estimators': num_estimators,
'classifier__max_samples': max_samples,
'classifier__contamination': contamination,
'classifier': [clf2]}
params = [params1, params2]
gsresults = pd.DataFrame()
for key in x.keys():
print('running key:', key)
gs = GridSearchCV(estimator=pipe,
param_grid=params,
cv=2,
n_jobs=4,
verbose=1,
scoring=scorer_decision,
error_score='raise',
refit=True)
# fit the model
gs.fit(x[key])
dftemp = pd.DataFrame(gs.cv_results_)
dftemp['dataset'] = key
gsresults = pd.concat([gsresults, dftemp], ignore_index=True)
gsresults = (gsresults.set_index(gsresults['params']
.apply(lambda x: ' '.join(str(val) for val in x.values()))).rename_axis('kernel'))
selected_columns = ['dataset', 'mean_test_score', 'rank_test_score',
'param_classifier', 'param_classifier__contamination',
'param_classifier__max_features', 'param_classifier__max_samples',
'param_classifier__n_estimators']
gsresults2 = (gsresults.loc[:, selected_columns]
.rename(columns={'mean_test_score': 'mean_score',
'rank_test_score': 'rank_score',
'param_classifier': 'classifier',
'param_classifier__contamination': 'contamination',
'param_classifier__max_features': 'max_features',
'param_classifier__max_samples': 'max_samples',
'param_classifier__n_estimators': 'n_estimators'}))
gsresults3 = (gsresults2.sort_values(['rank_score', 'mean_score'], ascending=True)
.groupby(['dataset']))
# check output by dataframes
dfs = {}
for key, df in gsresults3:
dfs[key] = df
return dfs
# running the mods method below returns a dictionary of dataframes
best_params = GSP().mods(X)
Note: the models are fitted on a dictionary of dataframes, X.

Related

Linear Regression Using sklearn issues with reshape code

I've got my data cleaned and prepped. I've done a split test and am now trying to do a linear regression. The issue is, when I first tried it, it say that I needed to create an array and reshape the data. I have done this, but now it's giving me an error " _reshape_dispatcher() missing 1 required positional argument: 'newshape'". All of the methods I've looked up to declare a newshape have not worked.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
df = pd.read_csv('googleplaystore.csv') # 1
df = df.dropna() # 3
df['Size'] = df['Size'].str.extract(r'(\d+\.?\d)', expand=False).astype(float) * df['Size'].str[-1].replace({'M': 1024, 'k': 1}) # 4
df = df.dropna() # remove nan from "Varies with device"
df['Price'] = df['Price'].str.strip('$').astype(float) # 5
df['Installs'] = df['Installs'].str.strip('+')
df['Installs'] = df['Installs'].str.replace(',',"").astype(int)
df['Reviews'] = df['Reviews'].astype(float)
df['Size'] = df['Size'].astype(float)
df = df.loc[df['Rating'].between(1, 5)] # 6
df = df.loc[df['Type'] != 'Free'] # 7
df.drop(df[df['Price'] >= 200].index, inplace = True)
df.drop(df[df['Reviews'] >2000000].index, inplace = True)
df.drop(df[df['Installs'] >10000].index, inplace = True)
inp1 = df.copy()
df_reviewslog=np.log10(df['Reviews'])
df_installslog=np.log10(df['Installs'])
del df['App']
del df['Last Updated']
del df['Current Ver']
del df['Android Ver']
pd.get_dummies(df, columns=['Category', 'Genres', 'Content Rating'], drop_first=True)
inp2 = df.copy()
df_train = X_train,X_test,y_train,y_test=train_test_split(df['Reviews'],df['Installs'], test_size=0.7, random_state=0)
df_test = X_train,X_Test,y_train,y_test=train_test_split(df['Reviews'],df['Installs'], test_size=0.3, random_state=0)
df_train = np.array(df_train)
df_test = np.array(df_test)
df_train = np.reshape(df_train.shape)
df_test = np.reshape(df_test.shape)
lr = LinearRegression()
lr.fit(X_train,y_train)
print(lr.score(X_Test,y_test))

'GridSearchCV' object has no attribute 'estimators_' using dtreeviz

After carrying out a GridSearchCV on a Randomforest classifer, I am attempting to display a tree plot. I tried the code below, but I get this error:
AttributeError: 'GridSearchCV' object has no attribute 'estimators_'
Can you tell me how to fix this error and get a view of a tree?
Here is my code from the classifier:
model = RandomForestClassifier()
parameter_space = {
'n_estimators': [10,50,100],
'criterion': ['gini', 'entropy'],
'max_depth': np.linspace(10,50,11),
}
clf = GridSearchCV(model, parameter_space, cv = 5, scoring = "accuracy", verbose = True) # model
clf.fit(X_train,y_train)
train_pred = clf.predict(X_train) # Train predict
test_pred = clf.predict(X_test) # Test predict
# Load packages
import pandas as pd
from sklearn import tree
from dtreeviz.trees import dtreeviz # will be used for tree visualization
from matplotlib import pyplot as plt
plt.rcParams.update({'figure.figsize': (12.0, 8.0)})
plt.rcParams.update({'font.size': 14})
plt.figure(figsize=(20,20))
_ = tree.plot_tree(clf.n_estimators_[0], feature_names=X_train.columns, filled=True)
You need to select the best Random Forest model from the grid search. You need to change your last line of code :
_ = tree.plot_tree(clf.best_estimator_.estimators_[0], feature_names=X_train.columns, filled=True)

Recovering features names of StandardScaler().fit_transform() with sklearn

Edited from a tutorial in Kaggle, I try to run the code below and data (available to download from here):
Code:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for plotting facilities
from datetime import datetime, date
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math
from sklearn.preprocessing import StandardScaler
df = pd.read_csv("./data/Aquifer_Petrignano.csv")
df['Date'] = pd.to_datetime(df.Date, format = '%d/%m/%Y')
df = df[df.Rainfall_Bastia_Umbra.notna()].reset_index(drop=True)
df = df.interpolate(method ='ffill')
df = df[['Date', 'Rainfall_Bastia_Umbra', 'Depth_to_Groundwater_P24', 'Depth_to_Groundwater_P25', 'Temperature_Bastia_Umbra', 'Temperature_Petrignano', 'Volume_C10_Petrignano', 'Hydrometry_Fiume_Chiascio_Petrignano']].resample('7D', on='Date').mean().reset_index(drop=False)
X = df.drop(['Depth_to_Groundwater_P24','Depth_to_Groundwater_P25','Date'], axis=1)
y1 = df.Depth_to_Groundwater_P24
y2 = df.Depth_to_Groundwater_P25
scaler = StandardScaler()
X = scaler.fit_transform(X)
model = xgb.XGBRegressor()
param_search = {'max_depth': range(1, 2, 2),
'min_child_weight': range(1, 2, 2),
'n_estimators' : [1000],
'learning_rate' : [0.1]}
tscv = TimeSeriesSplit(n_splits=2)
gsearch = GridSearchCV(estimator=model, cv=tscv,
param_grid=param_search)
gsearch.fit(X, y1)
xgb_grid = xgb.XGBRegressor(**gsearch.best_params_)
xgb_grid.fit(X, y1)
ax = xgb.plot_importance(xgb_grid)
ax.figure.tight_layout()
ax.figure.savefig('test.png')
y_val = y1[-80:]
X_val = X[-80:]
y_pred = xgb_grid.predict(X_val)
print(mean_absolute_error(y_val, y_pred))
print(math.sqrt(mean_squared_error(y_val, y_pred)))
I plotted a features importance figure whose original features names are hidden:
If I comment out these two lines:
scaler = StandardScaler()
X = scaler.fit_transform(X)
I get the output:
How could I use scaler.fit_transform() for X and get a feature importance plot with the original feature names?
The reason behind this is that StandardScaler returns a numpy.ndarray of your feature values (same shape as pandas.DataFrame.values, but not normalized) and you need to convert it back to pandas.DataFrame with the same column names.
Here's the part of your code that needs changing.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

Get support and ranking attributes for RFE using Pipeline in Python 3

The code I have so far is below and it works perfectly. However, I would like to print the following RFE attributes for each number of features tested: "rfe.support_[i]", "rfe.ranking_[i]" and the name of the selected features since "i" refers to the index, the first attribute returns True or False (if the columns were selected or not) and the second one returns their respective rankings.
In other words, I would like to print the columns considered in each RFE and that they do not remain as something abstract.
# Explore the number of selected features for RFE
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer
# Get the dataset
def get_dataset(df, target):
X, y = df.drop(columns = target), df[[target]].values.flatten()
return X, y
# Get a list of models to evaluate
def get_models(list_num_cols, list_cat_cols):
num_transformer = Pipeline(steps = [('num_imputer', SimpleImputer(strategy = 'median'))])
cat_transformer = Pipeline(steps = [('cat_imputer', SimpleImputer(strategy = 'most_frequent')),
('one-hot-encoder', OneHotEncoder())])
preprocessor = ColumnTransformer(transformers = [('num', num_transformer, list_num_cols),
('cat', cat_transformer, list_cat_cols)])
models = dict()
for i in range(2, 4):
rfe_dtr = RFE(estimator = DecisionTreeRegressor(), n_features_to_select = i)
model_dtr = DecisionTreeRegressor()
models['DecisionTreeRegressor_' + str(i)] = Pipeline(steps = [('preprocessor', preprocessor),
('s_dtr', rfe_dtr),
('m_dtr', model_dtr)])
return models
# Evaluate a give model using cross-validation
def evaluate_model(model, X, y):
cv = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = 7)
scores = cross_val_score(model, X, y, scoring = 'neg_mean_absolute_error', cv = cv,
n_jobs = -1, error_score = 'raise')
return scores
# Define the dataset
X, y = get_dataset(my_df, 'my_target') # It begins here
# Get the models to evaluate
models = get_models(X.select_dtypes(include = 'number').columns.tolist(),
X.select_dtypes(include = 'object').columns.tolist())
# Evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
scores = evaluate_model(model, X, y)
results.append(scores)
names.append(name)
print('%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
The following is returning errors:
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].support_[0] # Returns: AttributeError: 'RFE' object has no attribute 'support_'
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].ranking_[0] # Returns: AttributeError: 'RFE' object has no attribute 'ranking_'
Point is that you haven't explicitly fitted the 'DecisionTreeRegressor_2' pipeline.
Indeed, though cross_val_score already takes care of fitting the estimator as you might see here, cross_val_score does not return the estimator instance, as .fit() method does. Therefore you're not able to access the RFE instance attributes.
Here's a toy example from your setting:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_regression
X, y = make_regression()
models = dict()
for i in range(2, 4):
rfe_dtr = RFE(estimator = DecisionTreeRegressor(), n_features_to_select = i)
model_dtr = DecisionTreeRegressor()
models['DecisionTreeRegressor_' + str(i)] = Pipeline(
[
('s_dtr', rfe_dtr),
('m_dtr', model_dtr)
])
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].support_ # this does not work
You might see, instead, that after fitting your model, you'll be able to access the support_ and ranking_ attributes:
models['DecisionTreeRegressor_2'].fit(X,y)
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].support_ # this works
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].ranking_ # this works
I answered the question. I'm posting it in case it can help someone. It consists of using "cross_validate", instead of "cross_val_score", with the option "return_estimator = True" to be able to retrieve the pipelines in the different folds and RFE, and access them by index. Then you can use "named_steps".
# Explore the number of selected features for RFE
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer
# Get the dataset
def get_dataset(df, target):
X, y = df.drop(columns = target), df[[target]].values.flatten()
return X, y
# Get a list of models to evaluate
def get_models(list_num_cols, list_cat_cols):
num_transformer = Pipeline(steps = [('num_imputer', SimpleImputer(strategy = 'median'))])
cat_transformer = Pipeline(steps = [('cat_imputer', SimpleImputer(strategy = 'most_frequent')),
('one-hot-encoder', OneHotEncoder())])
preprocessor = ColumnTransformer(transformers = [('num', num_transformer, list_num_cols),
('cat', cat_transformer, list_cat_cols)])
models = dict()
for i in range(2, 4):
rfe_dtr = RFE(estimator = DecisionTreeRegressor(), n_features_to_select = i)
model_dtr = DecisionTreeRegressor()
models['DecisionTreeRegressor_' + str(i)] = Pipeline(steps = [('preprocessor', preprocessor),
('s_dtr', rfe_dtr),
('m_dtr', model_dtr)])
return models
# Evaluate a give model using cross-validation
def evaluate_model(model, X, y):
cv = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = 7)
output = cross_validate(model, X, y, scoring = 'neg_mean_absolute_error', cv = cv,
n_jobs = -1, error_score = 'raise', return_estimator = True)
return output
# Define the dataset
X, y = get_dataset(my_df, 'my_target') # It begins here
# Get the models to evaluate
models = get_models(X.select_dtypes(include = 'number').columns.tolist(),
X.select_dtypes(include = 'object').columns.tolist())
# Evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
output = evaluate_model(model, X, y)
results.append(output['test_score'])
names.append(name)
print('%s %.3f (%.3f)' % (name, mean(output['test_score']), std(output['test_score'])))
print(output)
print(output['estimator'][0].named_steps['s_dtr'].support_)
print(output['estimator'][0].named_steps['s_dtr'].ranking_)
print(output['estimator'][0].named_steps['s_dtr'].support_[2])
print(output['estimator'][0].named_steps['s_dtr'].ranking_[2])

Error saving model in sklearn2pmml using VotingClassifier

I'm new to programming and I'm having a little trouble saving a model in pmml. I have a database and I need to make a selection of attributes, then use the majority vote and finally save in pmml. Even the majority vote part works, but when I save the model on the last line using sklearn2pmml it gives an error.
from pandas import read_csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.metrics import accuracy_score
from sklearn2pmml import make_pmml_pipeline
from sklearn2pmml import sklearn2pmml
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn.ensemble import VotingClassifier
import joblib
url = 'D:/treinamento.CSV'
df = read_csv(url, header=None)
data = df.values
url_test = 'D:/TESTE.CSV'
df_test = read_csv(url_test, header=None)
data_test = df_test.values
X = data[:, :-1]
y = data_test[:, -1]
X_train = data[:, :-1]
X_test = data_test[:, :-1]
y_train = data[:, -1]
y_test = y
#features selection
features1 = [2, 5, 7]
features2 = [0, 1, 4, 5, 7]
features3 = [0, 1, 4, 5, 6]
features4 = [1, 4]
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
preprocessor1 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features1)])
preprocessor2 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features2)])
preprocessor3 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features3)])
preprocessor4 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features4)])
pipe1 = PMMLPipeline(steps=[('preprocessor', preprocessor1),('classifier', DecisionTreeClassifier(min_samples_split = 2))])
pipe2 = PMMLPipeline(steps=[('preprocessor', preprocessor2),('classifier', DecisionTreeClassifier(min_samples_split = 2))])
pipe3 = PMMLPipeline(steps=[('preprocessor', preprocessor3),('classifier', DecisionTreeClassifier(min_samples_split = 2))])
pipe4 = PMMLPipeline(steps=[('preprocessor', preprocessor4),('classifier', DecisionTreeClassifier(min_samples_split = 2))])
eclf = VotingClassifier(estimators=[('pipe1', PMMLPipeline(steps=[('preprocessor', preprocessor1),('classifier', DecisionTreeClassifier(min_samples_split = 2))])),
('pipe2', PMMLPipeline(steps=[('preprocessor', preprocessor2),('classifier', DecisionTreeClassifier(min_samples_split = 2))])),
('pipe3', PMMLPipeline(steps=[('preprocessor', preprocessor3),('classifier', DecisionTreeClassifier(min_samples_split = 2))])),
('pipe4', PMMLPipeline(steps=[('preprocessor', preprocessor4),('classifier', DecisionTreeClassifier(min_samples_split = 2))]))], voting='hard', weights=[1,1,1,1])
eclf.fit(X_train, y_train)
yhat = eclf.predict(X_test)
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.3f' % (accuracy * 100))
sklearn2pmml(eclf, "D:/Mestrado/ARTIGO DRC/dados_pos_revisao/cross validation - dados reavaliados/4 revisao/5 FOLDS/1 FOLD/eclf.pmml", with_repr = True)
Code error
65 sklearn2pmml(eclf, "D:/mest/eclf.pmml", with_repr = True)
~\anaconda3\lib\site-packages\sklearn2pmml\__init__.py in sklearn2pmml(pipeline, pmml, user_classpath, with_repr, debug, java_encoding)
222 print("{0}: {1}".format(java_version[0], java_version[1]))
223 if not isinstance(pipeline, PMMLPipeline):
--> 224 raise TypeError("The pipeline object is not an instance of " + PMMLPipeline.__name__ + ". Use the 'sklearn2pmml.make_pmml_pipeline(obj)' utility function to translate a regular Scikit-Learn estimator or pipeline to a PMML pipeline")
225 estimator = pipeline._final_estimator
226 cmd = ["java", "-cp", os.pathsep.join(_classpath(user_classpath)), "org.jpmml.sklearn.Main"]
TypeError: The pipeline object is not an instance of PMMLPipeline. Use the 'sklearn2pmml.make_pmml_pipeline(obj)' utility function to translate a regular Scikit-Learn estimator or pipeline to a PMML pipeline
The pipeline object is not an instance of PMMLPipeline
Did you read the SkLearn2PMML error message or not? Probably not, because it clearly states what's the issue!
You're using the PMMLPipeline class in completely wrong places. It should be used only as the topmost wrapper to the VotingClassifier estimator.
Please reorganize your code like this:
pipeline = PMMLPipeline([
("classifier", VotingClassifier([
("pipe1", Pipeline(...)),
("pipe2", Pipeline(...)),
("pipe3", Pipeline(...))
]))
])
sklearn2pmml(pipeline, "pipeline.pmml")

Resources