Error saving model in sklearn2pmml using VotingClassifier - python-3.x

I'm new to programming and I'm having a little trouble saving a model in PMML. I have a dataset on which I need to perform feature selection, then combine the resulting classifiers by majority vote, and finally save the ensemble as PMML. Everything up to and including the majority vote works, but when I save the model on the last line using sklearn2pmml it raises an error.
from pandas import read_csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.metrics import accuracy_score
from sklearn2pmml import make_pmml_pipeline
from sklearn2pmml import sklearn2pmml
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn.ensemble import VotingClassifier
import joblib
url = 'D:/treinamento.CSV'
df = read_csv(url, header=None)
data = df.values
url_test = 'D:/TESTE.CSV'
df_test = read_csv(url_test, header=None)
data_test = df_test.values
X = data[:, :-1]
y = data_test[:, -1]
X_train = data[:, :-1]
X_test = data_test[:, :-1]
y_train = data[:, -1]
y_test = y
#features selection
features1 = [2, 5, 7]
features2 = [0, 1, 4, 5, 7]
features3 = [0, 1, 4, 5, 6]
features4 = [1, 4]
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
preprocessor1 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features1)])
preprocessor2 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features2)])
preprocessor3 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features3)])
preprocessor4 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features4)])
pipe1 = PMMLPipeline(steps=[('preprocessor', preprocessor1),('classifier', DecisionTreeClassifier(min_samples_split = 2))])
pipe2 = PMMLPipeline(steps=[('preprocessor', preprocessor2),('classifier', DecisionTreeClassifier(min_samples_split = 2))])
pipe3 = PMMLPipeline(steps=[('preprocessor', preprocessor3),('classifier', DecisionTreeClassifier(min_samples_split = 2))])
pipe4 = PMMLPipeline(steps=[('preprocessor', preprocessor4),('classifier', DecisionTreeClassifier(min_samples_split = 2))])
eclf = VotingClassifier(estimators=[('pipe1', PMMLPipeline(steps=[('preprocessor', preprocessor1),('classifier', DecisionTreeClassifier(min_samples_split = 2))])),
('pipe2', PMMLPipeline(steps=[('preprocessor', preprocessor2),('classifier', DecisionTreeClassifier(min_samples_split = 2))])),
('pipe3', PMMLPipeline(steps=[('preprocessor', preprocessor3),('classifier', DecisionTreeClassifier(min_samples_split = 2))])),
('pipe4', PMMLPipeline(steps=[('preprocessor', preprocessor4),('classifier', DecisionTreeClassifier(min_samples_split = 2))]))], voting='hard', weights=[1,1,1,1])
eclf.fit(X_train, y_train)
yhat = eclf.predict(X_test)
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.3f' % (accuracy * 100))
sklearn2pmml(eclf, "D:/Mestrado/ARTIGO DRC/dados_pos_revisao/cross validation - dados reavaliados/4 revisao/5 FOLDS/1 FOLD/eclf.pmml", with_repr = True)
Code error
65 sklearn2pmml(eclf, "D:/mest/eclf.pmml", with_repr = True)
~\anaconda3\lib\site-packages\sklearn2pmml\__init__.py in sklearn2pmml(pipeline, pmml, user_classpath, with_repr, debug, java_encoding)
222 print("{0}: {1}".format(java_version[0], java_version[1]))
223 if not isinstance(pipeline, PMMLPipeline):
--> 224 raise TypeError("The pipeline object is not an instance of " + PMMLPipeline.__name__ + ". Use the 'sklearn2pmml.make_pmml_pipeline(obj)' utility function to translate a regular Scikit-Learn estimator or pipeline to a PMML pipeline")
225 estimator = pipeline._final_estimator
226 cmd = ["java", "-cp", os.pathsep.join(_classpath(user_classpath)), "org.jpmml.sklearn.Main"]
TypeError: The pipeline object is not an instance of PMMLPipeline. Use the 'sklearn2pmml.make_pmml_pipeline(obj)' utility function to translate a regular Scikit-Learn estimator or pipeline to a PMML pipeline

The pipeline object is not an instance of PMMLPipeline
Did you read the SkLearn2PMML error message or not? Probably not, because it clearly states what's the issue!
You're using the PMMLPipeline class in completely wrong places. It should be used only as the topmost wrapper to the VotingClassifier estimator.
Please reorganize your code like this:
pipeline = PMMLPipeline([
("classifier", VotingClassifier([
("pipe1", Pipeline(...)),
("pipe2", Pipeline(...)),
("pipe3", Pipeline(...))
]))
])
sklearn2pmml(pipeline, "pipeline.pmml")

Related

Linear Regression Using sklearn issues with reshape code

I've got my data cleaned and prepped. I've done a train/test split and am now trying to fit a linear regression. When I first tried it, the error said that I needed to create an array and reshape the data. I have done this, but now it gives me the error "_reshape_dispatcher() missing 1 required positional argument: 'newshape'". None of the ways I've found to pass a newshape have worked.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
df = pd.read_csv('googleplaystore.csv') # 1
df = df.dropna() # 3
df['Size'] = df['Size'].str.extract(r'(\d+\.?\d)', expand=False).astype(float) * df['Size'].str[-1].replace({'M': 1024, 'k': 1}) # 4
df = df.dropna() # remove nan from "Varies with device"
df['Price'] = df['Price'].str.strip('$').astype(float) # 5
df['Installs'] = df['Installs'].str.strip('+')
df['Installs'] = df['Installs'].str.replace(',',"").astype(int)
df['Reviews'] = df['Reviews'].astype(float)
df['Size'] = df['Size'].astype(float)
df = df.loc[df['Rating'].between(1, 5)] # 6
df = df.loc[df['Type'] != 'Free'] # 7
df.drop(df[df['Price'] >= 200].index, inplace = True)
df.drop(df[df['Reviews'] >2000000].index, inplace = True)
df.drop(df[df['Installs'] >10000].index, inplace = True)
inp1 = df.copy()
df_reviewslog=np.log10(df['Reviews'])
df_installslog=np.log10(df['Installs'])
del df['App']
del df['Last Updated']
del df['Current Ver']
del df['Android Ver']
pd.get_dummies(df, columns=['Category', 'Genres', 'Content Rating'], drop_first=True)
inp2 = df.copy()
df_train = X_train,X_test,y_train,y_test=train_test_split(df['Reviews'],df['Installs'], test_size=0.7, random_state=0)
df_test = X_train,X_Test,y_train,y_test=train_test_split(df['Reviews'],df['Installs'], test_size=0.3, random_state=0)
df_train = np.array(df_train)
df_test = np.array(df_test)
df_train = np.reshape(df_train.shape)
df_test = np.reshape(df_test.shape)
lr = LinearRegression()
lr.fit(X_train,y_train)
print(lr.score(X_Test,y_test))

Iterate GridSearchCV Over Multiple Datasets and Classifiers (Python)

I have multiple datasets that I want to estimate parameters for using different classifiers (logistic and randomforest).
I want to run each data for both classifiers using gridsearchcv, and then get the best parameters for each classifier per dataset. I am just a bit stumped on how to go about that. My code is below.
# modules
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.base import BaseEstimator, TransformerMixin
# import preprocessing and pipeline modules
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# grid search module
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# features
X = {'df1': np.random.normal(0, 1, (200, 5)),
'df2': np.random.normal(0, 1, (200, 5))}
# labels
y = {'df1': np.random.choice([0, 1], 200),
'df2': np.random.choice([0, 1], 200)}
num_columns = list(subset_features[1:])
num_transformer = Pipeline([('imputer', IterativeImputer()),
('scaler', StandardScaler())])
# column transformer
ct = ColumnTransformer([('numeric_pipeline', num_transformer, num_columns)])
# the classifiers
clf1 = LogisticRegression(solver='liblinear', random_state=None)
clf2 = RandomForestClassifier(random_state=None)
# pipeline
pipe = Pipeline([('ct', ct), ('classifier', clf1)])
# Grid for the logistic-regression candidate; the 'classifier' entry swaps the
# final pipeline step to clf1 for this sub-grid.
params1 = {'classifier__penalty': ['l1', 'l2'],
           'classifier__C': [0.1, 1, 10],
           'classifier': [clf1]}
# Grid for the random-forest candidate (the original line used '=' instead of
# ':' and never closed the dict, which is a SyntaxError).
params2 = {'classifier__n_estimators': [100, 150, 200],
           'classifier__min_samples_leaf': [1, 2],
           'classifier': [clf2]}
# GridSearchCV accepts a list of grids and explores each one separately.
params = [params1, params2]
gs = GridSearchCV(pipe, params)
gs.fit(X, y)
gs.best_params_
How about this?
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np
# Modeling
import lightgbm as lgb
# Evaluation of the model
from sklearn.model_selection import KFold
MAX_EVALS = 500
N_FOLDS = 10
# Read in data and separate into training and testing sets
data = pd.read_csv('C:\\caravan-insurance-challenge.csv')
train = data[data['ORIGIN'] == 'train']
test = data[data['ORIGIN'] == 'test']
# Extract the labels and format properly
train_labels = np.array(train['CARAVAN'].astype(np.int32)).reshape((-1,))
test_labels = np.array(test['CARAVAN'].astype(np.int32)).reshape((-1,))
# Drop the unneeded columns
train = train.drop(columns = ['ORIGIN', 'CARAVAN'])
test = test.drop(columns = ['ORIGIN', 'CARAVAN'])
# Convert to numpy array for splitting in cross validation
features = np.array(train)
test_features = np.array(test)
labels = train_labels[:]
print('Train shape: ', train.shape)
print('Test shape: ', test.shape)
train.head()
import matplotlib.pyplot as plt
import seaborn as sns
plt.hist(labels, edgecolor = 'k');
plt.xlabel('Label'); plt.ylabel('Count'); plt.title('Counts of Labels')
# Model with default hyperparameters
model = lgb.LGBMClassifier()
model
from sklearn.metrics import roc_auc_score
from timeit import default_timer as timer
start = timer()
model.fit(features, labels)
train_time = timer() - start
predictions = model.predict_proba(test_features)[:, 1]
auc = roc_auc_score(test_labels, predictions)
print('The baseline score on the test set is {:.4f}.'.format(auc))
print('The baseline training time is {:.4f} seconds'.format(train_time))
import random
lgb.LGBMClassifier()
# Hyperparameter grid
param_grid = {
'class_weight': [None, 'balanced'],
'boosting_type': ['gbdt', 'goss', 'dart'],
'num_leaves': list(range(30, 150)),
'learning_rate': list(np.logspace(np.log(0.005), np.log(0.2), base = np.exp(1), num = 1000)),
'subsample_for_bin': list(range(20000, 300000, 20000)),
'min_child_samples': list(range(20, 500, 5)),
'reg_alpha': list(np.linspace(0, 1)),
'reg_lambda': list(np.linspace(0, 1)),
'colsample_bytree': list(np.linspace(0.6, 1, 10))
}
# Subsampling (not applicable with 'goss', which is assigned subsample = 1.0 below)
subsample_dist = list(np.linspace(0.5, 1, 100))
plt.hist(param_grid['learning_rate'], color = 'r', edgecolor = 'k');
plt.xlabel('Learning Rate', size = 14); plt.ylabel('Count', size = 14); plt.title('Learning Rate Distribution', size = 18)
plt.hist(param_grid['num_leaves'], color = 'm', edgecolor = 'k')
plt.xlabel('Learning Number of Leaves', size = 14); plt.ylabel('Count', size = 14); plt.title('Number of Leaves Distribution', size = 18)
# Randomly sample parameters for gbm
params = {key: random.sample(value, 1)[0] for key, value in param_grid.items()}
params
params['subsample'] = random.sample(subsample_dist, 1)[0] if params['boosting_type'] != 'goss' else 1.0
params
Result:
{'class_weight': 'balanced',
'boosting_type': 'goss',
'num_leaves': 58,
'learning_rate': 0.010197109660117238,
'subsample_for_bin': 40000,
'min_child_samples': 230,
'reg_alpha': 0.7755102040816326,
'reg_lambda': 0.7755102040816326,
'colsample_bytree': 0.8666666666666667,
'subsample': 1.0}
Data:
https://www.kaggle.com/datasets/uciml/caravan-insurance-challenge
Source Code:
https://github.com/WillKoehrsen/hyperparameter-optimization/blob/master/Bayesian%20Hyperparameter%20Optimization%20of%20Gradient%20Boosting%20Machine.ipynb
Using different classifiers/estimators, I was able to do what I posted the question for. I am sure, the code can be optimized.
Some of the ideas I used came from this stackoverflow link.
Below is my attempt at answering the question I asked using anomaly detection estimators, instead of logistic regression and randomforest.
# modules
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.base import BaseEstimator, TransformerMixin
# import preprocessing and pipeline modules
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# grid search module
from sklearn.model_selection import GridSearchCV
# the anomaly detection estimators
from sklearn.ensemble import IsolationForest
from inne import IsolationNNE
from scorers import scorer_decision # user defined scoring
# define numeric columns
num_columns = list(df1.columns)
class GSP:
    """Grid-search anomaly-detection pipelines over a dictionary of datasets."""

    def __init__(self):
        pass

    def mods(self, x):
        """Run the grid search on every dataset in *x* and collect the results.

        Parameters
        ----------
        x : dict
            Maps a dataset name to the data handed to ``GridSearchCV.fit``.

        Returns
        -------
        dict
            One DataFrame of grid-search results per group produced by
            grouping on the ``dataset`` column, sorted by ``rank_score`` and
            then ``mean_score`` (ascending).
        """
        # num_columns is the module-level list of numeric columns in the dfs
        num_transformer = Pipeline([('imputer', IterativeImputer()),
                                    ('scaler', StandardScaler())])
        # column transformer
        ct = ColumnTransformer([('numeric_pipeline', num_transformer, num_columns)])
        # classifiers
        clf1 = IsolationForest(n_jobs=-1, random_state=None, bootstrap=False)
        clf2 = IsolationNNE(random_state=None)
        # pipeline; the 'classifier' step is replaced by each parameter grid below
        pipe = Pipeline([('ct', ct), ('classifier', clf1)])
        # grid search parameters
        num_estimators = list(np.linspace(100, 200, num=5, endpoint=True).astype(int))
        max_samples = list(np.linspace(0.70, 1.00, num=5))
        contamination = list(np.linspace(0.05, 0.10, num=5, endpoint=True))
        max_features = [0.25, 0.50, 0.75, 0.80, 0.90, 1.00]
        params1 = {  # set isolation forest grid parameters
            'classifier__n_estimators': num_estimators,
            'classifier__max_samples': max_samples,
            'classifier__contamination': contamination,
            'classifier__max_features': max_features,
            'classifier': [clf1]}
        params2 = {  # set inne grid parameters
            'classifier__n_estimators': num_estimators,
            'classifier__max_samples': max_samples,
            'classifier__contamination': contamination,
            'classifier': [clf2]}
        params = [params1, params2]

        # Collect one result frame per dataset and concatenate once at the end
        # instead of growing a DataFrame inside the loop.
        frames = []
        for key, data in x.items():
            print('running key:', key)
            gs = GridSearchCV(estimator=pipe,
                              param_grid=params,
                              cv=2,
                              n_jobs=4,
                              verbose=1,
                              scoring=scorer_decision,
                              error_score='raise',
                              refit=True)
            # fit the model
            gs.fit(data)
            dftemp = pd.DataFrame(gs.cv_results_)
            dftemp['dataset'] = key
            frames.append(dftemp)
        gsresults = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

        # Index every row by a readable string built from its parameter values.
        kernel_labels = gsresults['params'].apply(
            lambda p: ' '.join(str(val) for val in p.values()))
        gsresults = gsresults.set_index(kernel_labels).rename_axis('kernel')

        selected_columns = ['dataset', 'mean_test_score', 'rank_test_score',
                            'param_classifier', 'param_classifier__contamination',
                            'param_classifier__max_features', 'param_classifier__max_samples',
                            'param_classifier__n_estimators']
        gsresults2 = (gsresults.loc[:, selected_columns]
                      .rename(columns={'mean_test_score': 'mean_score',
                                       'rank_test_score': 'rank_score',
                                       'param_classifier': 'classifier',
                                       'param_classifier__contamination': 'contamination',
                                       'param_classifier__max_features': 'max_features',
                                       'param_classifier__max_samples': 'max_samples',
                                       'param_classifier__n_estimators': 'n_estimators'}))
        gsresults3 = (gsresults2.sort_values(['rank_score', 'mean_score'], ascending=True)
                      .groupby(['dataset']))
        # check output by dataframes
        dfs = {}
        for key, df in gsresults3:
            dfs[key] = df
        return dfs
# running the mods method below returns a dictionary of dataframes
best_params = GSP().mods(X)
Note: the models are fitted on a dictionary of dataframes, X.

Get support and ranking attributes for RFE using Pipeline in Python 3

The code I have so far is below and it works perfectly. However, I would like to print the following RFE attributes for each number of features tested: "rfe.support_[i]", "rfe.ranking_[i]" and the name of the selected features since "i" refers to the index, the first attribute returns True or False (if the columns were selected or not) and the second one returns their respective rankings.
In other words, I would like to print the columns considered in each RFE and that they do not remain as something abstract.
# Explore the number of selected features for RFE
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer
# Get the dataset
def get_dataset(df, target):
    """Split *df* into a feature frame and a flat target array.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the target column.
    target : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is *df* without the target column and ``y`` is
        the target column as a 1-D NumPy array. *df* itself is not modified.
    """
    X = df.drop(columns=target)
    # Select with a list so the result is 2-D, then flatten to a 1-D copy.
    y = df[[target]].to_numpy().flatten()
    return X, y
# Get a list of models to evaluate
def get_models(list_num_cols, list_cat_cols, n_features_range=range(2, 4)):
    """Build one RFE + decision-tree pipeline per candidate feature count.

    Parameters
    ----------
    list_num_cols : list
        Columns routed through the numeric transformer (median imputation).
    list_cat_cols : list
        Columns routed through the categorical transformer (most-frequent
        imputation followed by one-hot encoding).
    n_features_range : iterable of int, optional
        Values tried for ``n_features_to_select``; defaults to ``range(2, 4)``.

    Returns
    -------
    dict
        Maps ``'DecisionTreeRegressor_<n>'`` to the corresponding pipeline.
    """
    num_transformer = Pipeline(steps=[('num_imputer', SimpleImputer(strategy='median'))])
    cat_transformer = Pipeline(steps=[('cat_imputer', SimpleImputer(strategy='most_frequent')),
                                      ('one-hot-encoder', OneHotEncoder())])
    preprocessor = ColumnTransformer(transformers=[('num', num_transformer, list_num_cols),
                                                   ('cat', cat_transformer, list_cat_cols)])
    models = dict()
    for i in n_features_range:
        rfe_dtr = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=i)
        model_dtr = DecisionTreeRegressor()
        models['DecisionTreeRegressor_' + str(i)] = Pipeline(steps=[('preprocessor', preprocessor),
                                                                    ('s_dtr', rfe_dtr),
                                                                    ('m_dtr', model_dtr)])
    return models
# Evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Score *model* with repeated 10-fold cross-validation.

    Uses 3 repeats with a fixed ``random_state`` and the
    ``neg_mean_absolute_error`` scorer; any fitting error is re-raised.

    Returns
    -------
    The array of per-fold scores returned by ``cross_val_score``.
    """
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=7)
    scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv,
                             n_jobs=-1, error_score='raise')
    return scores
# Define the dataset
X, y = get_dataset(my_df, 'my_target') # It begins here
# Get the models to evaluate
models = get_models(X.select_dtypes(include = 'number').columns.tolist(),
X.select_dtypes(include = 'object').columns.tolist())
# Evaluate the models and store results
# Score every candidate pipeline and report mean (std) of its CV scores.
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
The following is returning errors:
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].support_[0] # Returns: AttributeError: 'RFE' object has no attribute 'support_'
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].ranking_[0] # Returns: AttributeError: 'RFE' object has no attribute 'ranking_'
Point is that you haven't explicitly fitted the 'DecisionTreeRegressor_2' pipeline.
Indeed, though cross_val_score already takes care of fitting the estimator as you might see here, cross_val_score does not return the estimator instance, as .fit() method does. Therefore you're not able to access the RFE instance attributes.
Here's a toy example from your setting:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_regression
X, y = make_regression()
# Build (but do not fit) one RFE + tree pipeline per feature count.
models = dict()
for i in range(2, 4):
    rfe_dtr = RFE(estimator = DecisionTreeRegressor(), n_features_to_select = i)
    model_dtr = DecisionTreeRegressor()
    models['DecisionTreeRegressor_' + str(i)] = Pipeline(
        [
            ('s_dtr', rfe_dtr),
            ('m_dtr', model_dtr)
        ])
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].support_ # this does not work
You might see, instead, that after fitting your model, you'll be able to access the support_ and ranking_ attributes:
models['DecisionTreeRegressor_2'].fit(X,y)
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].support_ # this works
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].ranking_ # this works
I answered the question. I'm posting it in case it can help someone. It consists of using "cross_validate", instead of "cross_val_score", with the option "return_estimator = True" to be able to retrieve the pipelines in the different folds and RFE, and access them by index. Then you can use "named_steps".
# Explore the number of selected features for RFE
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer
# Get the dataset
def get_dataset(df, target):
    """Split *df* into a feature frame and a flat target array.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the target column.
    target : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is *df* without the target column and ``y`` is
        the target column as a 1-D NumPy array. *df* itself is not modified.
    """
    X = df.drop(columns=target)
    # Select with a list so the result is 2-D, then flatten to a 1-D copy.
    y = df[[target]].to_numpy().flatten()
    return X, y
# Get a list of models to evaluate
def get_models(list_num_cols, list_cat_cols, n_features_range=range(2, 4)):
    """Build one RFE + decision-tree pipeline per candidate feature count.

    Parameters
    ----------
    list_num_cols : list
        Columns routed through the numeric transformer (median imputation).
    list_cat_cols : list
        Columns routed through the categorical transformer (most-frequent
        imputation followed by one-hot encoding).
    n_features_range : iterable of int, optional
        Values tried for ``n_features_to_select``; defaults to ``range(2, 4)``.

    Returns
    -------
    dict
        Maps ``'DecisionTreeRegressor_<n>'`` to the corresponding pipeline.
    """
    num_transformer = Pipeline(steps=[('num_imputer', SimpleImputer(strategy='median'))])
    cat_transformer = Pipeline(steps=[('cat_imputer', SimpleImputer(strategy='most_frequent')),
                                      ('one-hot-encoder', OneHotEncoder())])
    preprocessor = ColumnTransformer(transformers=[('num', num_transformer, list_num_cols),
                                                   ('cat', cat_transformer, list_cat_cols)])
    models = dict()
    for i in n_features_range:
        rfe_dtr = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=i)
        model_dtr = DecisionTreeRegressor()
        models['DecisionTreeRegressor_' + str(i)] = Pipeline(steps=[('preprocessor', preprocessor),
                                                                    ('s_dtr', rfe_dtr),
                                                                    ('m_dtr', model_dtr)])
    return models
# Evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Cross-validate *model* and keep the estimator fitted on each fold.

    Uses repeated 10-fold cross-validation (3 repeats, fixed ``random_state``)
    with the ``neg_mean_absolute_error`` scorer; fitting errors are re-raised.

    Returns
    -------
    dict
        The ``cross_validate`` result; ``return_estimator=True`` adds an
        ``'estimator'`` entry holding the fitted pipelines, alongside
        ``'test_score'``.
    """
    # The snippet's import line only brings in cross_val_score, so import the
    # function actually used here to avoid a NameError.
    from sklearn.model_selection import cross_validate

    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=7)
    output = cross_validate(model, X, y, scoring='neg_mean_absolute_error', cv=cv,
                            n_jobs=-1, error_score='raise', return_estimator=True)
    return output
# Define the dataset
X, y = get_dataset(my_df, 'my_target') # It begins here
# Get the models to evaluate
models = get_models(X.select_dtypes(include = 'number').columns.tolist(),
X.select_dtypes(include = 'object').columns.tolist())
# Evaluate the models and store results
# Score every candidate pipeline, then inspect the RFE step of the pipeline
# fitted on the first fold.
results, names = list(), list()
for name, model in models.items():
    output = evaluate_model(model, X, y)
    results.append(output['test_score'])
    names.append(name)
    print('%s %.3f (%.3f)' % (name, mean(output['test_score']), std(output['test_score'])))
    print(output)
    # output['estimator'][k] is the pipeline fitted on fold k
    print(output['estimator'][0].named_steps['s_dtr'].support_)
    print(output['estimator'][0].named_steps['s_dtr'].ranking_)
    print(output['estimator'][0].named_steps['s_dtr'].support_[2])
    print(output['estimator'][0].named_steps['s_dtr'].ranking_[2])

DataType of InputField is double although in the PMMLPipeline it is string

I am exporting a PMMLPipeline with a categorical string feature day_of_week as a PMML file. When I open the file in Java and list the InputFields I see that the data type of day_of_week field is double:
InputField{name=day_of_week, fieldName=day_of_week, displayName=null, dataType=double, opType=categorical}
Hence when I evaluate an input I get the error:
org.jpmml.evaluator.InvalidResultException: Field "day_of_week" cannot accept user input value "tuesday"
On the Python side the pipeline works with a string column:
data = pd.DataFrame(data=[{"age": 10, "day_of_week": "tuesday"}])
y = trained_model.predict(X=data)
Minimal example for creating the PMML file:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline
if __name__ == '__main__':
    # Tiny frame with one numeric feature, one string feature and the target.
    data_dict = {
        'age': [1, 2, 3],
        'day_of_week': ['monday', 'tuesday', 'wednesday'],
        'y': [5, 6, 7]
    }
    data = pd.DataFrame(data_dict, columns=data_dict)
    numeric_features = ['age']
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])
    categorical_features = ['day_of_week']
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', categories='auto'))])
    # Route each feature group through its own transformer.
    preprocessor = ColumnTransformer(
        transformers=[
            ('numerical', numeric_transformer, numeric_features),
            ('categorical', categorical_transformer, categorical_features)])
    pipeline = PMMLPipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('classifier', RandomForestRegressor(n_estimators=60))])
    X = data.drop(labels=['y'], axis=1)
    y = data['y']
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=30)
    trained_model = pipeline.fit(X=X_train, y=y_train)
    # Export the fitted pipeline to a PMML file.
    sklearn2pmml(pipeline=pipeline, pmml='RandomForestRegressor2.pmml', with_repr=True)
EDIT:
sklearn2pmml creates a PMML file with A DataDictionary with DataField "day_of_week" that has dataType="double". I think it should be "String". Do I have to set the dataType somewhere to correct this?
<DataDictionary>
<DataField name="day_of_week" optype="categorical" dataType="double">
You can assist SkLearn2PMML by providing "feature type hints" using sklearn2pmml.decoration.CategoricalDomain and sklearn2pmml.decoration.ContinuousDomain decorators (see here for more details).
In the current case, you should prepend a CategoricalDomain step to the pipeline that deals with categorical features:
from sklearn2pmml.decoration import CategoricalDomain
categorical_transformer = Pipeline(steps=[
    # Type hint for the converter: treat the column as a string categorical.
    # The trailing comma matters; without it the tuple is called like a function.
    ('domain', CategoricalDomain(dtype = str)),
    ('onehot', OneHotEncoder(handle_unknown='ignore', categories='auto'))
])
Thanks for your reply #user1808924.
The given solution works. To add to that answer, I would like to note that CategoricalDomain works on a single feature only.
Problem:
So, when you use it in a pipeline like this:
# pipeline creation
categorical_transformer = Pipeline(steps=[
    ('domain', CategoricalDomain(dtype = str)),
    ('onehot', OrdinalEncoder())
])
# fit and transform of `df` with 3 features
categorical_transformer.fit_transform(df)
### >>> ERROR: Expected 1d array, got 2d array of shape (1000, 3)
Which means you will need to use multiple CategoricalDomains in there.
NOTE: We often use it in the ColumnTransformer as well. You need to know how many categorical features are there before hand.
What can we do?
We will simply use the MultiDomain from the same library.
from sklearn2pmml.decoration import MultiDomain
categorical_transformer = Pipeline(steps=[
('domain', MultiDomain([CategoricalDomain(dtype = str) for _ in range(3)])),
('onehot', OrdinalEncoder())
])
Note that 3 is the number of categorical columns here. In general you need one CategoricalDomain per categorical column, i.e. n of them for n columns.
Then performing the transformation will work.

TypeError: __init__() got an unexpected keyword argument 'categorical_features'

Spyder(python 3.7)
I am facing the following error. I have already updated all libraries from the Anaconda prompt, but I can't find a solution to the problem.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(categorical_features = [1])
X = onehotencoder.fit_transform(X).toarray()
Traceback (most recent call last):
File "<ipython-input-4-05deb1f02719>", line 2, in <module>
onehotencoder = OneHotEncoder(categorical_features = [1])
TypeError: __init__() got an unexpected keyword argument 'categorical_features'
So based on your code, you'd have to:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
# Country column
ct = ColumnTransformer([("Country", OneHotEncoder(), [1])], remainder = 'passthrough')
X = ct.fit_transform(X)
# Male/Female
labelencoder_X = LabelEncoder()
X[:, 2] = labelencoder_X.fit_transform(X[:, 2])
Notice how the first LabelEncoder was removed: you no longer need to apply both the label encoder and the one-hot encoder to the column.
(I've kinda assumed your example came from the ML Udemy course, and the first column was a list of countries, while the second one a male/female binary choice)
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [0])], remainder='passthrough')
# np.str was only an alias of the builtin str and has been removed from NumPy,
# so use the builtin directly.
X=np.array(columnTransformer.fit_transform(X),dtype=str)
The latest builds of the sklearn library have removed the categorical_features parameter from the OneHotEncoder class, so it is advised to use the ColumnTransformer class for categorical datasets. Refer to sklearn's official documentation for further clarification.
According to the documentation this is the __init__ line:
class sklearn.preprocessing.OneHotEncoder(categories='auto', drop=None, sparse=True, dtype=<class 'numpy.float64'>, handle_unknown='error')
As you can see the init does not get the variable categorical_features
You have an categories flag:
categories : 'auto' or a list of array-like, default='auto'
Categories (unique values) per feature:
‘auto’ : Determine categories automatically from the training data.
list : categories[i] holds the categories expected in the ith column.
The passed categories should not mix strings and numeric values within
a single feature, and should be sorted in case of numeric values.
The used categories can be found in the categories_ attribute.
Attributes:
categories_ : list of arrays
The categories of each feature determined during fitting (in order of the features in X and corresponding with the output of transform). This includes the category specified in drop (if any).
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
label_encoder_x_1 = LabelEncoder()
X[: , 2] = label_encoder_x_1.fit_transform(X[:,2])
transformer = ColumnTransformer(
transformers=[
("OneHot", # Just a name
OneHotEncoder(), # The transformer class
[1] # The column(s) to be applied on.
)
],
remainder='passthrough' # donot apply anything to the remaining columns
)
X = transformer.fit_transform(X.tolist())
X = X.astype('float64')
working like charm :)
Assuming this is problem from ML course from Udemy
complete code
I replaced label encoder 1 with a column transformer, as suggested by Antoine Jaussoin in the answer above.
Categorical Data
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([("Geography", OneHotEncoder(), [1])], remainder = 'passthrough')
X = ct.fit_transform(X)
Your Gender column will have index 4 now
labelencoder_x_2=LabelEncoder()
X[:,4]=labelencoder_x_2.fit_transform(X[:,4])
to avoid dummy variable trap
X=X[:, 1:]
You need to import one more class from sklearn (ColumnTransformer), and then drop one column to avoid the dummy variable trap.
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer # Here is the one
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
#onehotencoder = OneHotEncoder(categorical_features = [1]) Not this one
# use this instead
ct = ColumnTransformer([("Country", OneHotEncoder(), [1])], remainder = 'passthrough')
X = ct.fit_transform(X)
# Drop the first dummy column to avoid the dummy variable trap.
X = X[:, 1:]
Happy Helping!!!
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([("Geography",OneHotEncoder(),[1])], remainder= 'passthrough')
X = ct.fit_transform(X)
labelencoder_X2 = LabelEncoder()
X[:, 4] = labelencoder_X2.fit_transform(X[:, 4])
X = X[: , 1:]
X = np.array(X, dtype=float)
Just adding an extra line to convert it from array of objects.
Replace the following code
# onehotencoder = OneHotEncoder(categorical_features = [1])
# X = onehotencoder.fit_transform(X).toarray()
# X = X[:, 1:]
with the following chunk and your code should work:
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [1])], remainder = 'passthrough')
X = np.array(columnTransformer.fit_transform(X), dtype = np.float64)
X = X[:, 1:]
Assuming you're learning Deep Learning from udemy.
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
# remove categorical_features and the code runs; note that OneHotEncoder() then encodes every column of X, not just column 1
onehotencoder = OneHotEncoder()
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]
Here is one extension for OneHotEncoder.
If X has a lot of categorical columns, pass the whole list of their indices instead of a single index:
ct = ColumnTransformer([("encoder", OneHotEncoder(), list(categorical_features))], remainder = 'passthrough')
X = ct.fit_transform(X)
Another solution including the transformation of the X object in array type in a float64 type
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('encoder', OneHotEncoder(), [1])], remainder='passthrough')
# np.float was only an alias of the builtin float and has been removed from
# NumPy; the builtin float yields the same float64 array.
X = np.array(ct.fit_transform(X), dtype=float)
one_hot_encode = OneHotEncoder(categorical_features=[0]) works in scikit-learn 0.20.3, but the parameter is no longer available in scikit-learn 0.24.2 (the two versions I checked).
Either Downgrade scikit-learn version
Or Use
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# 2 classes - Known/unknown Face
ct = ColumnTransformer([("Faces", OneHotEncoder(), [0])], remainder = 'passthrough')
X = ct.fit_transform(X)
# Country column
ct = ColumnTransformer([("Country", OneHotEncoder(), [1])], remainder = 'passthrough')
X = ct.fit_transform(X)

Resources