Get support and ranking attributes for RFE using Pipeline in Python 3 - python-3.x

The code I have so far is below and it works perfectly. However, for each number of features tested I would like to print the RFE attributes "rfe.support_[i]" and "rfe.ranking_[i]", together with the names of the selected features: "i" refers to the column index, the first attribute returns True or False (whether the column was selected or not), and the second one returns its ranking.
In other words, I would like to print the columns considered in each RFE run so that they do not remain abstract.
# Explore the number of selected features for RFE
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer
# Get the dataset
def get_dataset(df, target):
    X, y = df.drop(columns = target), df[[target]].values.flatten()
    return X, y
# Get a list of models to evaluate
def get_models(list_num_cols, list_cat_cols):
    num_transformer = Pipeline(steps = [('num_imputer', SimpleImputer(strategy = 'median'))])
    cat_transformer = Pipeline(steps = [('cat_imputer', SimpleImputer(strategy = 'most_frequent')),
                                        ('one-hot-encoder', OneHotEncoder())])
    preprocessor = ColumnTransformer(transformers = [('num', num_transformer, list_num_cols),
                                                     ('cat', cat_transformer, list_cat_cols)])
    models = dict()
    for i in range(2, 4):
        rfe_dtr = RFE(estimator = DecisionTreeRegressor(), n_features_to_select = i)
        model_dtr = DecisionTreeRegressor()
        models['DecisionTreeRegressor_' + str(i)] = Pipeline(steps = [('preprocessor', preprocessor),
                                                                      ('s_dtr', rfe_dtr),
                                                                      ('m_dtr', model_dtr)])
    return models
# Evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = 7)
    scores = cross_val_score(model, X, y, scoring = 'neg_mean_absolute_error', cv = cv,
                             n_jobs = -1, error_score = 'raise')
    return scores
# Define the dataset
X, y = get_dataset(my_df, 'my_target') # It begins here
# Get the models to evaluate
models = get_models(X.select_dtypes(include = 'number').columns.tolist(),
X.select_dtypes(include = 'object').columns.tolist())
# Evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
The following is returning errors:
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].support_[0] # Returns: AttributeError: 'RFE' object has no attribute 'support_'
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].ranking_[0] # Returns: AttributeError: 'RFE' object has no attribute 'ranking_'

The point is that you haven't explicitly fitted the 'DecisionTreeRegressor_2' pipeline.
Although cross_val_score already takes care of fitting the estimator, as you might see here, it does not return the fitted estimator instance the way the .fit() method does. Therefore you're not able to access the RFE instance attributes.
Here's a toy example from your setting:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_regression
X, y = make_regression()
models = dict()
for i in range(2, 4):
    rfe_dtr = RFE(estimator = DecisionTreeRegressor(), n_features_to_select = i)
    model_dtr = DecisionTreeRegressor()
    models['DecisionTreeRegressor_' + str(i)] = Pipeline(
        [
            ('s_dtr', rfe_dtr),
            ('m_dtr', model_dtr)
        ])
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].support_ # this does not work
You might see, instead, that after fitting your model, you'll be able to access the support_ and ranking_ attributes:
models['DecisionTreeRegressor_2'].fit(X,y)
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].support_ # this works
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].ranking_ # this works

I found the answer to my own question and I'm posting it in case it helps someone. The idea is to use "cross_validate" instead of "cross_val_score", with the option "return_estimator = True", so you can retrieve the pipelines fitted in the different folds (and therefore the RFE step) and access them by index. Then you can use "named_steps".
# Explore the number of selected features for RFE
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedKFold, cross_validate, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer
# Get the dataset
def get_dataset(df, target):
    X, y = df.drop(columns = target), df[[target]].values.flatten()
    return X, y
# Get a list of models to evaluate
def get_models(list_num_cols, list_cat_cols):
    num_transformer = Pipeline(steps = [('num_imputer', SimpleImputer(strategy = 'median'))])
    cat_transformer = Pipeline(steps = [('cat_imputer', SimpleImputer(strategy = 'most_frequent')),
                                        ('one-hot-encoder', OneHotEncoder())])
    preprocessor = ColumnTransformer(transformers = [('num', num_transformer, list_num_cols),
                                                     ('cat', cat_transformer, list_cat_cols)])
    models = dict()
    for i in range(2, 4):
        rfe_dtr = RFE(estimator = DecisionTreeRegressor(), n_features_to_select = i)
        model_dtr = DecisionTreeRegressor()
        models['DecisionTreeRegressor_' + str(i)] = Pipeline(steps = [('preprocessor', preprocessor),
                                                                      ('s_dtr', rfe_dtr),
                                                                      ('m_dtr', model_dtr)])
    return models
# Evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = 7)
    output = cross_validate(model, X, y, scoring = 'neg_mean_absolute_error', cv = cv,
                            n_jobs = -1, error_score = 'raise', return_estimator = True)
    return output
# Define the dataset
X, y = get_dataset(my_df, 'my_target') # It begins here
# Get the models to evaluate
models = get_models(X.select_dtypes(include = 'number').columns.tolist(),
X.select_dtypes(include = 'object').columns.tolist())
# Evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    output = evaluate_model(model, X, y)
    results.append(output['test_score'])
    names.append(name)
    print('%s %.3f (%.3f)' % (name, mean(output['test_score']), std(output['test_score'])))
    print(output)
    print(output['estimator'][0].named_steps['s_dtr'].support_)
    print(output['estimator'][0].named_steps['s_dtr'].ranking_)
    print(output['estimator'][0].named_steps['s_dtr'].support_[2])
    print(output['estimator'][0].named_steps['s_dtr'].ranking_[2])
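As a follow-up, if you also want the names of the selected features (as asked in the question) rather than just the boolean mask, you can query the fitted preprocessor for its output column names and filter them with support_. This is a minimal sketch, not part of the original answer: it assumes versions of scikit-learn and category_encoders whose transformers implement get_feature_names_out(), and fitted_pipe is just an illustrative variable name.
# Map the RFE mask of one fold back to column names (sketch, see assumptions above)
fitted_pipe = output['estimator'][0]  # fitted pipeline of the first fold of the last evaluated model
feature_names = fitted_pipe.named_steps['preprocessor'].get_feature_names_out()
support = fitted_pipe.named_steps['s_dtr'].support_
selected_features = [name for name, keep in zip(feature_names, support) if keep]
print(selected_features)  # names of the columns retained by RFE in that fold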

Related

Iterate GridSearchCV Over Multiple Datasets and Classifiers (Python)

I have multiple datasets for which I want to estimate parameters using different classifiers (logistic regression and random forest).
I want to run each dataset through both classifiers using GridSearchCV and then get the best parameters for each classifier per dataset. I am just a bit stumped on how to go about that. My code is below.
# modules
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.base import BaseEstimator, TransformerMixin
# import preprocessing and pipeline modules
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# grid search module
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# features
X = {'df1': np.random.normal(0, 1, (200, 5)),
'df2': np.random.normal(0, 1, (200, 5))}
# labels
y = {'df1': np.random.choice([0, 1], 200),
'df2': np.random.choice([0, 1], 200)}
num_columns = list(subset_features[1:])
num_transformer = Pipeline([('imputer', IterativeImputer()),
('scaler', StandardScaler())])
# column transformer
ct = ColumnTransformer([('numeric_pipeline', num_transformer, num_columns)])
# the classifiers
clf1 = LogisticRegression(solver='liblinear', random_state=None)
clf2 = RandomForestClassifier(random_state=None)
# pipeline
pipe = Pipeline([('ct', ct), ('classifier', clf1)])
params1 = {'classifier__penalty': ['l1', 'l2'],
           'classifier__C': [0.1, 1, 10],
           'classifier': [clf1]}
params2 = {'classifier__n_estimators': [100, 150, 200],
           'classifier__min_samples_leaf': [1, 2],
           'classifier': [clf2]}
params = [params1, params2]
gs = GridSearchCV(pipe, params)
gs.fit(X, y)
gs.best_params_
How about this?
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np
# Modeling
import lightgbm as lgb
# Evaluation of the model
from sklearn.model_selection import KFold
MAX_EVALS = 500
N_FOLDS = 10
# Read in data and separate into training and testing sets
data = pd.read_csv('C:\\caravan-insurance-challenge.csv')
train = data[data['ORIGIN'] == 'train']
test = data[data['ORIGIN'] == 'test']
# Extract the labels and format properly
train_labels = np.array(train['CARAVAN'].astype(np.int32)).reshape((-1,))
test_labels = np.array(test['CARAVAN'].astype(np.int32)).reshape((-1,))
# Drop the unneeded columns
train = train.drop(columns = ['ORIGIN', 'CARAVAN'])
test = test.drop(columns = ['ORIGIN', 'CARAVAN'])
# Convert to numpy array for splitting in cross validation
features = np.array(train)
test_features = np.array(test)
labels = train_labels[:]
print('Train shape: ', train.shape)
print('Test shape: ', test.shape)
train.head()
import matplotlib.pyplot as plt
import seaborn as sns
plt.hist(labels, edgecolor = 'k');
plt.xlabel('Label'); plt.ylabel('Count'); plt.title('Counts of Labels')
# Model with default hyperparameters
model = lgb.LGBMClassifier()
model
from sklearn.metrics import roc_auc_score
from timeit import default_timer as timer
start = timer()
model.fit(features, labels)
train_time = timer() - start
predictions = model.predict_proba(test_features)[:, 1]
auc = roc_auc_score(test_labels, predictions)
print('The baseline score on the test set is {:.4f}.'.format(auc))
print('The baseline training time is {:.4f} seconds'.format(train_time))
import random
lgb.LGBMClassifier()
# Hyperparameter grid
param_grid = {
'class_weight': [None, 'balanced'],
'boosting_type': ['gbdt', 'goss', 'dart'],
'num_leaves': list(range(30, 150)),
'learning_rate': list(np.logspace(np.log(0.005), np.log(0.2), base = np.exp(1), num = 1000)),
'subsample_for_bin': list(range(20000, 300000, 20000)),
'min_child_samples': list(range(20, 500, 5)),
'reg_alpha': list(np.linspace(0, 1)),
'reg_lambda': list(np.linspace(0, 1)),
'colsample_bytree': list(np.linspace(0.6, 1, 10))
}
# Subsampling (only applicable with 'goss')
subsample_dist = list(np.linspace(0.5, 1, 100))
plt.hist(param_grid['learning_rate'], color = 'r', edgecolor = 'k');
plt.xlabel('Learning Rate', size = 14); plt.ylabel('Count', size = 14); plt.title('Learning Rate Distribution', size = 18)
plt.hist(param_grid['num_leaves'], color = 'm', edgecolor = 'k')
plt.xlabel('Learning Number of Leaves', size = 14); plt.ylabel('Count', size = 14); plt.title('Number of Leaves Distribution', size = 18)
# Randomly sample parameters for gbm
params = {key: random.sample(value, 1)[0] for key, value in param_grid.items()}
params
params['subsample'] = random.sample(subsample_dist, 1)[0] if params['boosting_type'] != 'goss' else 1.0
params
Result:
{'class_weight': 'balanced',
'boosting_type': 'goss',
'num_leaves': 58,
'learning_rate': 0.010197109660117238,
'subsample_for_bin': 40000,
'min_child_samples': 230,
'reg_alpha': 0.7755102040816326,
'reg_lambda': 0.7755102040816326,
'colsample_bytree': 0.8666666666666667,
'subsample': 1.0}
Data:
https://www.kaggle.com/datasets/uciml/caravan-insurance-challenge
Source Code:
https://github.com/WillKoehrsen/hyperparameter-optimization/blob/master/Bayesian%20Hyperparameter%20Optimization%20of%20Gradient%20Boosting%20Machine.ipynb
Using different classifiers/estimators, I was able to do what I asked in the question. I'm sure the code can be optimized.
Some of the ideas I used came from this stackoverflow link.
Below is my attempt at answering the question I asked, using anomaly detection estimators instead of logistic regression and random forest.
# modules
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.base import BaseEstimator, TransformerMixin
# import preprocessing and pipeline modules
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# grid search module
from sklearn.model_selection import GridSearchCV
# the anomaly detection estimators
from sklearn.ensemble import IsolationForest
from inne import IsolationNNE
from scorers import scorer_decision # user defined scoring
# define numeric columns
num_columns = list(df1.columns)
class GSP:
    def __init__(self):
        pass

    def mods(self, x):
        num_columns # indicates list of numeric columns in dfs
        num_transformer = Pipeline([('imputer', IterativeImputer()),
                                    ('scaler', StandardScaler())])
        # column transformer
        ct = ColumnTransformer([('numeric_pipeline', num_transformer, num_columns)])
        # classifiers
        clf1 = IsolationForest(n_jobs=-1, random_state=None, bootstrap=False)
        clf2 = IsolationNNE(random_state=None)
        # pipeline
        pipe = Pipeline([('ct', ct), ('classifier', clf1)])
        # grid search parameters
        num_estimators = list(np.linspace(100, 200, num=5, endpoint=True).astype(int))
        max_samples = list(np.linspace(0.70, 1.00, num=5))
        contamination = list(np.linspace(0.05, 0.10, num=5, endpoint=True))
        max_features = [0.25, 0.50, 0.75, 0.80, 0.90, 1.00]
        params1 = {# set isolation forest grid parameters
                   'classifier__n_estimators': num_estimators,
                   'classifier__max_samples': max_samples,
                   'classifier__contamination': contamination,
                   'classifier__max_features': max_features,
                   'classifier': [clf1]}
        params2 = {# set inne grid parameters
                   'classifier__n_estimators': num_estimators,
                   'classifier__max_samples': max_samples,
                   'classifier__contamination': contamination,
                   'classifier': [clf2]}
        params = [params1, params2]
        gsresults = pd.DataFrame()
        for key in x.keys():
            print('running key:', key)
            gs = GridSearchCV(estimator=pipe,
                              param_grid=params,
                              cv=2,
                              n_jobs=4,
                              verbose=1,
                              scoring=scorer_decision,
                              error_score='raise',
                              refit=True)
            # fit the model
            gs.fit(x[key])
            dftemp = pd.DataFrame(gs.cv_results_)
            dftemp['dataset'] = key
            gsresults = pd.concat([gsresults, dftemp], ignore_index=True)
        gsresults = (gsresults.set_index(gsresults['params']
                     .apply(lambda x: ' '.join(str(val) for val in x.values()))).rename_axis('kernel'))
        selected_columns = ['dataset', 'mean_test_score', 'rank_test_score',
                            'param_classifier', 'param_classifier__contamination',
                            'param_classifier__max_features', 'param_classifier__max_samples',
                            'param_classifier__n_estimators']
        gsresults2 = (gsresults.loc[:, selected_columns]
                      .rename(columns={'mean_test_score': 'mean_score',
                                       'rank_test_score': 'rank_score',
                                       'param_classifier': 'classifier',
                                       'param_classifier__contamination': 'contamination',
                                       'param_classifier__max_features': 'max_features',
                                       'param_classifier__max_samples': 'max_samples',
                                       'param_classifier__n_estimators': 'n_estimators'}))
        gsresults3 = (gsresults2.sort_values(['rank_score', 'mean_score'], ascending=True)
                      .groupby(['dataset']))
        # check output by dataframes
        dfs = {}
        for key, df in gsresults3:
            dfs[key] = df
        return dfs
# running the mods method below returns a dictionary of dataframes
best_params = GSP().mods(X)
Note: the models are fitted on a dictionary of dataframes, X.
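For reference, here is a stripped-down sketch of the same per-dataset loop applied to the logistic regression / random forest setup from the original question. It is not taken from either answer; the synthetic X and y dictionaries mirror the question's data, and the variable names are illustrative.
# Loop GridSearchCV over a dict of datasets with two candidate classifiers (sketch)
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
X = {'df1': pd.DataFrame(np.random.normal(0, 1, (200, 5))),
     'df2': pd.DataFrame(np.random.normal(0, 1, (200, 5)))}
y = {'df1': np.random.choice([0, 1], 200),
     'df2': np.random.choice([0, 1], 200)}
pipe = Pipeline([('scaler', StandardScaler()),
                 ('classifier', LogisticRegression(solver='liblinear'))])
params = [{'classifier': [LogisticRegression(solver='liblinear')],
           'classifier__penalty': ['l1', 'l2'],
           'classifier__C': [0.1, 1, 10]},
          {'classifier': [RandomForestClassifier()],
           'classifier__n_estimators': [100, 150, 200],
           'classifier__min_samples_leaf': [1, 2]}]
best_params = {}
for key in X:                               # one grid search per dataset
    gs = GridSearchCV(pipe, params, cv=5)
    gs.fit(X[key], y[key])
    best_params[key] = gs.best_params_      # best classifier and parameters for this dataset
print(best_params)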

Error saving model in sklearn2pmml using VotingClassifier

I'm new to programming and I'm having a little trouble saving a model in PMML. I have a database and I need to perform attribute selection, then use majority voting, and finally save the model in PMML. The majority-voting part works, but when I save the model on the last line using sklearn2pmml it gives an error.
from pandas import read_csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.metrics import accuracy_score
from sklearn2pmml import make_pmml_pipeline
from sklearn2pmml import sklearn2pmml
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn.ensemble import VotingClassifier
import joblib
url = 'D:/treinamento.CSV'
df = read_csv(url, header=None)
data = df.values
url_test = 'D:/TESTE.CSV'
df_test = read_csv(url_test, header=None)
data_test = df_test.values
X = data[:, :-1]
y = data_test[:, -1]
X_train = data[:, :-1]
X_test = data_test[:, :-1]
y_train = data[:, -1]
y_test = y
#features selection
features1 = [2, 5, 7]
features2 = [0, 1, 4, 5, 7]
features3 = [0, 1, 4, 5, 6]
features4 = [1, 4]
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
preprocessor1 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features1)])
preprocessor2 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features2)])
preprocessor3 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features3)])
preprocessor4 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features4)])
pipe1 = PMMLPipeline(steps=[('preprocessor', preprocessor1),('classifier', DecisionTreeClassifier(min_samples_split = 2))])
pipe2 = PMMLPipeline(steps=[('preprocessor', preprocessor2),('classifier', DecisionTreeClassifier(min_samples_split = 2))])
pipe3 = PMMLPipeline(steps=[('preprocessor', preprocessor3),('classifier', DecisionTreeClassifier(min_samples_split = 2))])
pipe4 = PMMLPipeline(steps=[('preprocessor', preprocessor4),('classifier', DecisionTreeClassifier(min_samples_split = 2))])
eclf = VotingClassifier(estimators=[('pipe1', PMMLPipeline(steps=[('preprocessor', preprocessor1),('classifier', DecisionTreeClassifier(min_samples_split = 2))])),
('pipe2', PMMLPipeline(steps=[('preprocessor', preprocessor2),('classifier', DecisionTreeClassifier(min_samples_split = 2))])),
('pipe3', PMMLPipeline(steps=[('preprocessor', preprocessor3),('classifier', DecisionTreeClassifier(min_samples_split = 2))])),
('pipe4', PMMLPipeline(steps=[('preprocessor', preprocessor4),('classifier', DecisionTreeClassifier(min_samples_split = 2))]))], voting='hard', weights=[1,1,1,1])
eclf.fit(X_train, y_train)
yhat = eclf.predict(X_test)
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.3f' % (accuracy * 100))
sklearn2pmml(eclf, "D:/Mestrado/ARTIGO DRC/dados_pos_revisao/cross validation - dados reavaliados/4 revisao/5 FOLDS/1 FOLD/eclf.pmml", with_repr = True)
Code error
65 sklearn2pmml(eclf, "D:/mest/eclf.pmml", with_repr = True)
~\anaconda3\lib\site-packages\sklearn2pmml\__init__.py in sklearn2pmml(pipeline, pmml, user_classpath, with_repr, debug, java_encoding)
222 print("{0}: {1}".format(java_version[0], java_version[1]))
223 if not isinstance(pipeline, PMMLPipeline):
--> 224 raise TypeError("The pipeline object is not an instance of " + PMMLPipeline.__name__ + ". Use the 'sklearn2pmml.make_pmml_pipeline(obj)' utility function to translate a regular Scikit-Learn estimator or pipeline to a PMML pipeline")
225 estimator = pipeline._final_estimator
226 cmd = ["java", "-cp", os.pathsep.join(_classpath(user_classpath)), "org.jpmml.sklearn.Main"]
TypeError: The pipeline object is not an instance of PMMLPipeline. Use the 'sklearn2pmml.make_pmml_pipeline(obj)' utility function to translate a regular Scikit-Learn estimator or pipeline to a PMML pipeline
The pipeline object is not an instance of PMMLPipeline
Did you read the SkLearn2PMML error message or not? Probably not, because it clearly states what the issue is!
You're using the PMMLPipeline class in completely wrong places. It should be used only as the topmost wrapper to the VotingClassifier estimator.
Please reorganize your code like this:
pipeline = PMMLPipeline([
    ("classifier", VotingClassifier([
        ("pipe1", Pipeline(...)),
        ("pipe2", Pipeline(...)),
        ("pipe3", Pipeline(...))
    ]))
])
sklearn2pmml(pipeline, "pipeline.pmml")
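For illustration, here is a sketch of that reorganization applied to the code from the question. It assumes the preprocessor1..preprocessor4, X_train and y_train defined there; the inner pipelines are plain scikit-learn Pipelines, and only the outermost wrapper is a PMMLPipeline.
# Only the topmost wrapper is a PMMLPipeline; the voting members are regular Pipelines (sketch)
pipeline = PMMLPipeline([
    ("classifier", VotingClassifier(estimators=[
        ("pipe1", Pipeline([("preprocessor", preprocessor1), ("classifier", DecisionTreeClassifier(min_samples_split = 2))])),
        ("pipe2", Pipeline([("preprocessor", preprocessor2), ("classifier", DecisionTreeClassifier(min_samples_split = 2))])),
        ("pipe3", Pipeline([("preprocessor", preprocessor3), ("classifier", DecisionTreeClassifier(min_samples_split = 2))])),
        ("pipe4", Pipeline([("preprocessor", preprocessor4), ("classifier", DecisionTreeClassifier(min_samples_split = 2))]))
    ], voting='hard', weights=[1, 1, 1, 1]))
])
pipeline.fit(X_train, y_train)
sklearn2pmml(pipeline, "eclf.pmml", with_repr = True)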

TypeError: __init__() got an unexpected keyword argument 'categorical_features'

Spyder (Python 3.7)
I am facing the following error. I have already updated all libraries from the Anaconda prompt, but I can't find a solution to the problem.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(categorical_features = [1])
X = onehotencoder.fit_transform(X).toarray()
Traceback (most recent call last):
File "<ipython-input-4-05deb1f02719>", line 2, in <module>
onehotencoder = OneHotEncoder(categorical_features = [1])
TypeError: __init__() got an unexpected keyword argument 'categorical_features'
So based on your code, you'd have to:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
# Country column
ct = ColumnTransformer([("Country", OneHotEncoder(), [1])], remainder = 'passthrough')
X = ct.fit_transform(X)
# Male/Female
labelencoder_X = LabelEncoder()
X[:, 2] = labelencoder_X.fit_transform(X[:, 2])
Notice how the first LabelEncoder was removed; you no longer need to apply both the label encoder and the one-hot encoder to that column.
(I've assumed your example comes from the ML Udemy course, where the first column is a list of countries and the second one a male/female binary choice.)
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X=np.array(columnTransformer.fit_transform(X),dtype=np.str)
The latest builds of the sklearn library removed the categorical_features parameter from the OneHotEncoder class, so it is advised to use the ColumnTransformer class for categorical datasets. Refer to sklearn's official documentation for further clarification.
According to the documentation this is the __init__ line:
class sklearn.preprocessing.OneHotEncoder(categories='auto', drop=None, sparse=True, dtype=<class 'numpy.float64'>, handle_unknown='error')
As you can see, __init__ does not take a categorical_features argument.
Instead, there is a categories parameter:
categories : 'auto' or a list of array-like, default='auto'
    Categories (unique values) per feature:
    'auto' : Determine categories automatically from the training data.
    list : categories[i] holds the categories expected in the ith column. The passed categories should not mix strings and numeric values within a single feature, and should be sorted in case of numeric values.
    The used categories can be found in the categories_ attribute.
Attributes:
categories_ : list of arrays
    The categories of each feature determined during fitting (in order of the features in X and corresponding with the output of transform). This includes the category specified in drop (if any).
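To illustrate the quoted parameter, here is a small sketch (not part of the original answer) that passes explicit categories to OneHotEncoder instead of the removed categorical_features argument; the country values are made up for the example.
# OneHotEncoder with an explicit category list for the single encoded column (sketch)
import numpy as np
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(categories=[['France', 'Germany', 'Spain']])
country = np.array([['France'], ['Spain'], ['Germany'], ['France']])
print(enc.fit_transform(country).toarray())  # one column per listed category
print(enc.categories_)                       # the categories actually used, per feature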
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
label_encoder_x_1 = LabelEncoder()
X[: , 2] = label_encoder_x_1.fit_transform(X[:,2])
transformer = ColumnTransformer(
    transformers=[
        ("OneHot",        # Just a name
         OneHotEncoder(), # The transformer class
         [1]              # The column(s) to be applied on.
         )
    ],
    remainder='passthrough' # do not apply anything to the remaining columns
)
X = transformer.fit_transform(X.tolist())
X = X.astype('float64')
Working like a charm :)
Assuming this is the problem from the ML course on Udemy, here is the complete code.
I replaced label encoder 1 with a column transformer, as suggested by Antoine Jaussoin in the comment above.
Categorical Data
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([("Geography", OneHotEncoder(), [1])], remainder = 'passthrough')
X = ct.fit_transform(X)
Your Gender column will have index 4 now
labelencoder_x_2=LabelEncoder()
X[:,4]=labelencoder_x_2.fit_transform(X[:,4])
to avoid dummy variable trap
X=X[:, 1:]
You need to call one more sklearn class and drop one column to avoid the dummy variable trap.
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer # Here is the one
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
#onehotencoder = OneHotEncoder(categorical_features = [1]) Not this one
# use this instead
ct = ColumnTransformer([("Country", OneHotEncoder(), [1])], remainder = 'passthrough')
X = ct.fit_transform(X)
X = X[:, 1:]
Happy Helping!!!
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([("Geography",OneHotEncoder(),[1])], remainder= 'passthrough')
X = ct.fit_transform(X)
labelencoder_X2 = LabelEncoder()
X[:, 4] = labelencoder_X2.fit_transform(X[:, 4])
X = X[: , 1:]
X = np.array(X, dtype=float)
Just adding an extra line to convert it from array of objects.
Replace the following code
# onehotencoder = OneHotEncoder(categorical_features = [1])
# X = onehotencoder.fit_transform(X).toarray()
# X = X[:, 1:]
with the following chunk, and your code should work:
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [1])], remainder = 'passthrough')
X = np.array(columnTransformer.fit_transform(X), dtype = np.float64)
X = X[:, 1:]
Assuming you're learning Deep Learning from Udemy.
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
# remove categorical_features, it works 100% perfectly
onehotencoder = OneHotEncoder()
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]
Here is just one extension for OneHotEncoder, for when X has a lot of columns. Instead use:
ct = ColumnTransformer([("encoder", OneHotEncoder(), list(categorical_features))], remainder = 'passthrough')
X = ct.fit_transform(X)
Another solution, which also converts the X object from an array of objects to float64:
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('encoder', OneHotEncoder(), [1])], remainder='passthrough')
X = np.array(ct.fit_transform(X), dtype=np.float)
one_hot_encode = OneHotEncoder(categorical_features=[0]) works for scikit-learn 0.20.3, and the parameter is removed in scikit-learn 0.24.2 (the versions I am checking).
Either downgrade your scikit-learn version,
or use:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
"""2 classes- Known/unknown Face"""
ct = ColumnTransformer([("Faces", OneHotEncoder(), [0])], remainder = 'passthrough')
X = ct.fit_transform(X)
"""Country column"""
ct = ColumnTransformer([("Country", OneHotEncoder(), [1])], remainder = 'passthrough')
X = ct.fit_transform(X)

How to make a GridSearchCV with a proper FunctionTransformer in a pipeline?

I'm trying to make a Pipeline with GridSearchCV to filter data (with iForest) and perform a regression with StandardScaler + MLPRegressor.
I made a FunctionTransformer to include my iForest filter in the pipeline. I also define a parameter grid for the iForest filter (using the kw_args method).
Everything seems OK, but when I run the fit, nothing happens ... No error message. Nothing.
Afterwards, when I try to predict, I get the message: "This RandomizedSearchCV instance is not fitted yet"
from sklearn.preprocessing import FunctionTransformer
#Definition of the function auto_filter using the iForest algo
def auto_filter(DF, conta=0.1):
    # iForest made on the DF dataframe
    iforest = IsolationForest(behaviour='new', n_estimators=300, max_samples='auto', contamination=conta)
    iforest = iforest.fit(DF)
    # The DF (dataframe in input) is filtered taking into account only the inlier observations
    data_filtered = DF[iforest.predict(DF) == 1]
    # Only few variables are kept for the next step (regression by MLPRegressor)
    # this function delivers X_filtered and y
    X_filtered = data_filtered[['SessionTotalTime','AverageHR','MaxHR','MinHR','EETotal','EECH','EEFat','TRIMP','BeatByBeatRMSSD','BeatByBeatSD','HFAverage','LFAverage','LFHFRatio','Weight']]
    y = data_filtered['MaxVO2']
    return (X_filtered, y)
#Pipeline definition ('auto_filter' --> 'scaler' --> 'MLPRegressor')
pipeline_steps = [('auto_filter', FunctionTransformer(auto_filter)), ('scaler', StandardScaler()), ('MLPR', MLPRegressor(solver='lbfgs', activation='relu', early_stopping=True, n_iter_no_change=20, validation_fraction=0.2, max_iter=10000))]
#Gridsearch Definition with differents values of 'conta' for the first stage of the pipeline ('auto_filter)
parameters = {'auto_filter__kw_args': [{'conta': 0.1}, {'conta': 0.2}, {'conta': 0.3}], 'MLPR__hidden_layer_sizes':[(sp_randint.rvs(1, nb_features, 1),), (sp_randint.rvs(1, nb_features, 1), sp_randint.rvs(1, nb_features, 1))], 'MLPR__alpha':sp_rand.rvs(0, 1, 1)}
pipeline = Pipeline(pipeline_steps)
estimator = RandomizedSearchCV(pipeline, parameters, cv=5, n_iter=10)
estimator.fit(X_train, y_train)
You can try to run step by step manually to find a problem:
auto_filter_transformer = FunctionTransformer(auto_filter)
X_train = auto_filter_transformer.fit_transform(X_train)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
MLPR = MLPRegressor(solver='lbfgs', activation='relu', early_stopping=True, n_iter_no_change=20, validation_fraction=0.2, max_iter=10000)
MLPR.fit(X_train, y_train)
If each of the steps works fine, build a pipeline. Check the pipeline. If it works fine, try to use RandomizedSearchCV.
The func parameter of FunctionTransformer should be a callable that accepts the
same arguments as transform method (array-like X of shape
(n_samples, n_features) and kwargs for func) and returns a transformed X of
the same shape. Your function auto_filter doesn't fit these requirements.
Additionally, anomaly/outlier detection techniques from scikit-learn cannot be
used as intermediate steps in scikit-learn pipelines since a pipeline assembles
one or more transformers and an optional final estimator. IsolationForest or,
say, OneClassSVM is not a transformer: it implements fit and predict.
Thus, a possible solution is to cut off possible outliers separately and build
a pipeline composed of transformers and a regressor:
>>> import warnings
>>> from sklearn.exceptions import ConvergenceWarning
>>> warnings.filterwarnings(category=ConvergenceWarning, action='ignore')
>>> import numpy as np
>>> from scipy import stats
>>> from sklearn.datasets import make_regression
>>> from sklearn.ensemble import IsolationForest
>>> from sklearn.model_selection import RandomizedSearchCV
>>> from sklearn.neural_network import MLPRegressor
>>> from sklearn.pipeline import Pipeline
>>> from sklearn.preprocessing import StandardScaler
>>> X, y = make_regression(n_samples=50, n_features=2, n_informative=2)
>>> detect = IsolationForest(contamination=0.1, behaviour='new')
>>> inliers_mask = detect.fit_predict(X) == 1
>>> pipe = Pipeline([('scale', StandardScaler()),
... ('estimate', MLPRegressor(max_iter=500, tol=1e-5))])
>>> param_distributions = dict(estimate__alpha=stats.uniform(0, 0.1))
>>> search = RandomizedSearchCV(pipe, param_distributions,
... n_iter=2, cv=3, iid=True)
>>> search = search.fit(X[inliers_mask], y[inliers_mask])
The problem is that you won't be able to optimize the hyperparameters of
IsolationForest. One way to handle it is to define hyperparameter space
for the forest, sample hyperparameters with ParameterSampler or
ParameterGrid, predict inliers and fit randomized search:
>>> from sklearn.model_selection import ParameterGrid
>>> forest_param_dict = dict(contamination=[0.1, 0.15, 0.2])
>>> forest_param_grid = ParameterGrid(forest_param_dict)
>>> for sample in forest_param_grid:
... detect = detect.set_params(contamination=sample['contamination'])
... inliers_mask = detect.fit_predict(X) == 1
... search.fit(X[inliers_mask], y[inliers_mask])

AttributeError: 'module' object has no attribute 'cuda'

I was trying to run this repository: https://github.com/WaqasSultani/AnomalyDetectionCVPR2018
In Test_Anomaly_Detector_public.py I am stuck with an error at theano.sandbox.cuda.use('gpu0'):
AttributeError: 'module' object has no attribute 'cuda'
I am using Theano as the backend.
This is Test_Anomaly_Detector_public.py:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.regularizers import l2
from keras.optimizers import SGD ,Adagrad
from scipy.io import loadmat, savemat
from keras.models import model_from_json
import theano.tensor as T
import theano
import csv
import ConfigParser
import collections
import time
import csv
import os
from os import listdir
import skimage.transform
from skimage import color
from os.path import isfile, join
import numpy as np
import numpy
from datetime import datetime
from scipy.spatial.distance import cdist,pdist,squareform
import theano.sandbox
import shutil
theano.sandbox.cuda.use('gpu0')
seed = 7
numpy.random.seed(seed)
def load_model(json_path): # Function to load the model
    model = model_from_json(open(json_path).read())
    return model
def load_weights(model, weight_path): # Function to load the model weights
    dict2 = loadmat(weight_path)
    dict = conv_dict(dict2)
    i = 0
    for layer in model.layers:
        weights = dict[str(i)]
        layer.set_weights(weights)
        i += 1
    return model
def conv_dict(dict2):
    i = 0
    dict = {}
    for i in range(len(dict2)):
        if str(i) in dict2:
            if dict2[str(i)].shape == (0, 0):
                dict[str(i)] = dict2[str(i)]
            else:
                weights = dict2[str(i)][0]
                weights2 = []
                for weight in weights:
                    if weight.shape in [(1, x) for x in range(0, 5000)]:
                        weights2.append(weight[0])
                    else:
                        weights2.append(weight)
                dict[str(i)] = weights2
    return dict
# Load Video
def load_dataset_One_Video_Features(Test_Video_Path):
    VideoPath = Test_Video_Path
    f = open(VideoPath, "r")
    words = f.read().split()
    num_feat = len(words) / 4096
    # Number of features per video to be loaded. In our case num_feat=32, as we divide the video into 32 segments. Note that
    # we have already computed C3D features for the whole video and divided the video features into 32 segments.
    count = -1;
    VideoFeatues = []
    for feat in xrange(0, num_feat):
        feat_row1 = np.float32(words[feat * 4096:feat * 4096 + 4096])
        count = count + 1
        if count == 0:
            VideoFeatues = feat_row1
        if count > 0:
            VideoFeatues = np.vstack((VideoFeatues, feat_row1))
    AllFeatures = VideoFeatues
    return AllFeatures
print("Starting testing...")
AllTest_Video_Path = '/newdata/UCF_Anomaly_Dataset/Dataset/CVPR_Data/C3D_Complete_Video_txt/Test/'
# AllTest_Video_Path contains C3D features (txt file) of each video. Each file contains 32 features, each of 4096 dimensions.
Results_Path = '../Eval_Res/'
# Results_Path is the folder where you can save your results
Model_dir='../Trained_AnomalyModel/'
# Model_dir is the folder where we have placed our trained weights
weights_path = Model_dir + 'weights_L1L2.mat'
# weights_path is Trained model weights
model_path = Model_dir + 'model.json'
if not os.path.exists(Results_Path):
    os.makedirs(Results_Path)
All_Test_files= listdir(AllTest_Video_Path)
All_Test_files.sort()
model=load_model(model_path)
load_weights(model, weights_path)
nVideos=len(All_Test_files)
time_before = datetime.now()
for iv in range(nVideos):
    Test_Video_Path = os.path.join(AllTest_Video_Path, All_Test_files[iv])
    inputs = load_dataset_One_Video_Features(Test_Video_Path) # 32 segments features for one testing video
    predictions = model.predict_on_batch(inputs) # Get anomaly prediction for each of 32 video segments.
    aa = All_Test_files[iv]
    aa = aa[0:-4]
    A_predictions_path = Results_Path + aa + '.mat' # Save array of 1*32, containing anomaly score for each segment. Please see Evaluate Anomaly Detector to compute ROC.
print "Total Time took: " + str(datetime.now() - time_before)
My .theanorc file:
[global]
floatX = float32
device = cuda0
[gpuarray]
preallocate = 1
You can comment out this line. Then, when you run the script, set the Theano flags like this:
THEANO_FLAGS=mode=FAST_RUN,device=cuda0,floatX=float32 python [...]
