Difference between optuna (optuna.samplers.RandomSampler) and sklearn (RandomizedSearchCV)

I would like to use the RandomSampler from Optuna, and I notice that, compared to RandomizedSearchCV from sklearn, there are big differences in computation time.
The model is a LightGBM classifier. Do you understand where the difference can come from?
Optuna code (about 1200 seconds):
def fit_random_search_optuna(trial, data, targets):
    params_fixed = {
        "is_unbalance": trial.set_user_attr("is_unbalance", True),
        "objective": trial.set_user_attr("objective", "binary"),
        "verbosity": trial.set_user_attr("verbosity", 1),
        'bagging_freq': trial.set_user_attr("bagging_freq", 1),
        'n_jobs': trial.set_user_attr("n_jobs", -1),
        'boosting': trial.set_user_attr('boosting', 'gbdt'),
    }
    params_search = {
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 32, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.3, 0.8),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.7, 1),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 4, 50),
        'num_iterations': trial.suggest_int('num_iterations', 8, 250),
        'max_bin': trial.suggest_int('max_bin', 10, 50),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
    }
    params = {**trial.params, **trial.user_attrs}

    score = 0
    cv = 3
    shape = data.shape[0]
    list_indexs = np.arange(shape)
    kfold = KFold(n_splits=cv, shuffle=True, random_state=42)
    # Indices for cross-validation
    indexs = [(indexs_train, indexs_validation)
              for (indexs_train, indexs_validation) in kfold.split(list_indexs)]
    for num_cv, (index_train, index_validation) in enumerate(indexs):
        # Select the rows of this fold
        data_train, data_validation = data.iloc[index_train], data.iloc[index_validation]
        target_train, target_validation = targets.iloc[index_train], targets.iloc[index_validation]
        # Set up LightGBM and train on the training fold
        model = lgb.LGBMClassifier()
        model.set_params(**params)
        model.fit(data_train, target_train)
        # Prediction on the validation fold
        pred_probas = model.predict_proba(data_validation)[:, 1]
        score += sklearn.metrics.roc_auc_score(y_score=pred_probas, y_true=target_validation)
    return score / cv

# Creation of the Optuna study and optimization
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler())
objective = lambda trial: fit_random_search_optuna(trial, data, targets)
n_jobs_for_random_search = 1
study.optimize(objective, n_trials=100, n_jobs=n_jobs_for_random_search)
sklearn code (about 400 seconds on the same dataset):
def fit_random_search_sklearn(data, targets):
    params_fixed = {
        "is_unbalance": [True],
        "objective": ["binary"],
        "verbosity": [1],
        'bagging_freq': [1],
        'n_jobs': [-1],
        'boosting': ['gbdt'],
    }
    params_search = {
        'lambda_l2': loguniform(1e-8, 10.0),
        'num_leaves': np.arange(32, 256),
        'feature_fraction': uniform(0.3, 0.8),
        'bagging_fraction': uniform(0.7, 1),
        'min_data_in_leaf': np.arange(4, 50),
        'num_iterations': np.arange(8, 250),
        'max_bin': np.arange(10, 50),
        'learning_rate': loguniform(0.005, 0.1),
    }
    params = {**params_fixed, **params_search}
    model = lgb.LGBMClassifier()
    n_jobs_for_random_search = 1
    random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=100,
                                       n_jobs=n_jobs_for_random_search, random_state=42,
                                       cv=3, scoring='roc_auc', refit=False)
    random_search.fit(data, targets)
    return random_search.best_params_
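One way to narrow the gap down (a diagnostic sketch of my own, not part of the original post): time a single LightGBM fit with one fixed parameter set. Since num_iterations is sampled between 8 and 250 and the two libraries draw different random configurations, each search trains 100 different models, so the total wall-clock times are only directly comparable once the sampled parameters match.

import time

# Hypothetical fixed configuration (values are assumptions for illustration only)
fixed_params = {"is_unbalance": True, "objective": "binary", "verbosity": 1,
                "bagging_freq": 1, "n_jobs": -1, "boosting": "gbdt",
                "num_iterations": 100, "num_leaves": 64, "learning_rate": 0.05}

start = time.time()
lgb.LGBMClassifier(**fixed_params).fit(data, targets)
print(f"single fit: {time.time() - start:.1f} s")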

Related

tune hyperparameters of XGBRanker

I am trying to optimize the hyperparameters of my XGBoost Ranker model, but I can't.
Here is what my table (df in the code) looks like:
query | relevance | features
------|-----------|---------
1     | 5         | 5.4.7....
1     | 3         | 6........
2     | 5         | 3........
2     | 3         | 8........
3     | 2         | 1........
Then I split my table into train and test sets, with only one query in the test table:
gss = GroupShuffleSplit(test_size=1, n_splits=1).split(df, groups=df['query'])
X_train_inds, X_test_inds = next(gss)
train_data = df.iloc[X_train_inds]
X_train = train_data.drop(columns=["relevance"])
Y_train = train_data.relevance
test_data = df.iloc[X_test_inds]
X_test = test_data.drop(columns=["relevance"])
Y_test = test_data.relevance
and build groups, which holds the number of rows per query:
groups = train_data.groupby('query').size().to_frame('size')['size'].to_numpy()
And then I run my model and try to optimize the hyperparameters with a RandomizedSearchCV:
param_dist = {'n_estimators': randint(40, 1000),
              'learning_rate': uniform(0.01, 0.59),
              'subsample': uniform(0.3, 0.6),
              'max_depth': [3, 4, 5, 6, 7, 8, 9],
              'colsample_bytree': uniform(0.5, 0.4),
              'min_child_weight': [0.05, 0.1, 0.02]
              }
scoring = sklearn.metrics.make_scorer(sklearn.metrics.ndcg_score, k=10,
                                      greater_is_better=True)
model = xgb.XGBRanker(
    tree_method='hist',
    booster='gbtree',
    objective='rank:ndcg')
clf = RandomizedSearchCV(model,
                         param_distributions=param_dist,
                         cv=5,
                         n_iter=5,
                         scoring=scoring,
                         error_score=0,
                         verbose=3,
                         n_jobs=-1)
clf.fit(X_train, Y_train, group=groups)
Then I get the following error message, which seems to be related to my construction of groups, but I don't see why (knowing that without the random search the model works):
Check failed: group_ptr_.back() == num_row_ (11544 vs. 9235) : Invalid group structure. Number of rows obtained from groups doesn't equal to actual number of rows given by data.
Same problem as here: (Tuning XGBRanker produces error for groups)
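A quick check that illustrates the mismatch the error points at (my own sketch, not part of the original post): RandomizedSearchCV re-splits X_train into folds, while group still describes the whole training set, so the sum of the groups no longer equals the number of rows each inner fit actually receives.

# Illustrative check, using the variables defined in the question above
print(groups.sum())               # total rows described by the groups, e.g. 11544
print(int(groups.sum() * 4 / 5))  # rows seen by one fit with cv=5, roughly the 9235 in the error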

Why does a Spark evaluator have an avgMetrics attribute if it only returns 1 value?

I'm using MulticlassClassificationEvaluator to retrieve some metrics like F1-Score or accuracy in a Cross Validation in PySpark:
cross_result = CrossValidator(estimator=RandomForestClassifier(),
                              estimatorParamMaps=ParamGridBuilder().build(),
                              evaluator=MulticlassClassificationEvaluator(metricName='f1'),
                              numFolds=5,
                              parallelism=-1)
f1_score = cross_result.avgMetrics[0]
Now, my question is: why is avgMetrics a list if it only has one value? Shouldn't it be a scalar? Am I missing something about this attribute?
Following the source code, I realized that avgMetrics is a list that holds, for each parameter combination defined in the ParamGrid, the metric averaged over all cross-validation folds. So:
dataset = spark.createDataFrame(
    [(Vectors.dense([0.0]), 0.0),
     (Vectors.dense([0.6]), 1.0),
     (Vectors.dense([1.0]), 1.0)] * 10,
    ["features", "label"])
lr = LogisticRegression()
# Note that there are three values for maxIter: 0, 1 and 5
grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1, 5]).build()
evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    parallelism=2
)
cvModel = cv.fit(dataset)
cvModel.avgMetrics[0]  # Average accuracy for maxIter = 0
cvModel.avgMetrics[1]  # Average accuracy for maxIter = 1
cvModel.avgMetrics[2]  # Average accuracy for maxIter = 5
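As a small follow-up sketch (not from the original answer): since avgMetrics[i] lines up with grid[i], the two can be combined to recover which parameter map scored best.

import numpy as np

# avgMetrics[i] corresponds to the i-th ParamMap in the grid
best_idx = int(np.argmax(cvModel.avgMetrics))
best_params = grid[best_idx]               # winning maxIter setting
best_score = cvModel.avgMetrics[best_idx]  # its average accuracy across folds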

Hyperopt tuning parameters get stuck

I'm trying to tune the parameters of an SVM with the hyperopt library.
Often, when I execute this code, the progress bar stops and the code gets stuck.
I do not understand why. Here is my code:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

X_train = normalize(X_train)

def hyperopt_train_test(params):
    if 'decision_function_shape' in params:
        if params['decision_function_shape'] == "ovo":
            params['break_ties'] = False
    clf = svm.SVC(**params)
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    return precision_recall_fscore_support(y_test, y_pred, average='macro')[0]

space4svm = {
    'C': hp.uniform('C', 0, 20),
    'kernel': hp.choice('kernel', ['linear', 'sigmoid', 'poly', 'rbf']),
    'degree': hp.uniform('degree', 10, 30),
    'gamma': hp.uniform('gamma', 10, 30),
    'coef0': hp.uniform('coef0', 15, 30),
    'shrinking': hp.choice('shrinking', [True, False]),
    'probability': hp.choice('probability', [True, False]),
    'tol': hp.uniform('tol', 0, 3),
    'decision_function_shape': hp.choice('decision_function_shape', ['ovo', 'ovr']),
    'break_ties': hp.choice('break_ties', [True, False])
}

def f(params):
    print(params)
    precision = hyperopt_train_test(params)
    return {'loss': -precision, 'status': STATUS_OK}

trials = Trials()
best = fmin(f, space4svm, algo=tpe.suggest, max_evals=35, trials=trials)
print('best:')
print(best)
I would suggest restricting the space of your parameters and seeing if that works. Fix the probability parameter to False and see if the model trains. Also, according to the documentation, gamma should be 'scale', 'auto', or a positive float.
Also, print out your params at every iteration to better understand which combination is causing the model to get stuck. A restricted space might look like the sketch below.
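A minimal sketch of that suggestion (the exact bounds and the choice of which parameters to drop are my assumptions, not from the original answer): keep probability fixed to False, use the documented named options for gamma, and shrink the remaining ranges.

# Restricted, illustrative search space; reuses f, fmin, tpe and Trials from the question
space4svm_restricted = {
    'C': hp.uniform('C', 0.1, 20),
    'kernel': hp.choice('kernel', ['linear', 'rbf']),
    'gamma': hp.choice('gamma', ['scale', 'auto']),    # named options from the documentation
    'probability': hp.choice('probability', [False]),  # fixed to False while debugging
    'tol': hp.uniform('tol', 1e-4, 1e-2),
}

trials = Trials()
best = fmin(f, space4svm_restricted, algo=tpe.suggest, max_evals=35, trials=trials)
print(best)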

Training a Random Forest on Tensorflow

I am trying to train a TensorFlow-based random forest regression on numerical and continuous data.
When I try to fit my estimator, it starts with the message below:
INFO:tensorflow:Constructing forest with params =
INFO:tensorflow:{'num_trees': 10, 'max_nodes': 1000, 'bagging_fraction': 1.0, 'feature_bagging_fraction': 1.0, 'num_splits_to_consider': 10, 'max_fertile_nodes': 0, 'split_after_samples': 250, 'valid_leaf_threshold': 1, 'dominate_method': 'bootstrap', 'dominate_fraction': 0.99, 'model_name': 'all_dense', 'split_finish_name': 'basic', 'split_pruning_name': 'none', 'collate_examples': False, 'checkpoint_stats': False, 'use_running_stats_method': False, 'initialize_average_splits': False, 'inference_tree_paths': False, 'param_file': None, 'split_name': 'less_or_equal', 'early_finish_check_every_samples': 0, 'prune_every_samples': 0, 'feature_columns': [_NumericColumn(key='Average_Score', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='lat', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='lng', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)], 'num_classes': 1, 'num_features': 2, 'regression': True, 'bagged_num_features': 2, 'bagged_features': None, 'num_outputs': 1, 'num_output_columns': 2, 'base_random_seed': 0, 'leaf_model_type': 2, 'stats_model_type': 2, 'finish_type': 0, 'pruning_type': 0, 'split_type': 0}
Then the process breaks down and I get a value error below:
ValueError: Shape must be at least rank 2 but is rank 1 for 'concat' (op: 'ConcatV2') with input shapes: [?], [?], [?], [] and with computed input tensors: input[3] = <1>.
This is the code I am using:
import tensorflow as tf
from tensorflow.contrib.tensor_forest.python import tensor_forest
from tensorflow.python.ops import resources
import pandas as pd
from tensorflow.contrib.tensor_forest.client import random_forest
from tensorflow.python.estimator.inputs import numpy_io
import numpy as np

def getFeatures():
    Average_Score = tf.feature_column.numeric_column('Average_Score')
    lat = tf.feature_column.numeric_column('lat')
    lng = tf.feature_column.numeric_column('lng')
    return [Average_Score, lat, lng]

# Import hotel data
Hotel_Reviews = pd.read_csv("./DataMining/Hotel_Reviews.csv")
Hotel_Reviews_Filtered = Hotel_Reviews[(Hotel_Reviews.lat.notnull() |
                                        Hotel_Reviews.lng.notnull())]
Hotel_Reviews_Filtered_Target = Hotel_Reviews_Filtered[["Reviewer_Score"]]
Hotel_Reviews_Filtered_Features = Hotel_Reviews_Filtered[["Average_Score", "lat", "lng"]]

# Preprocess the data
x = Hotel_Reviews_Filtered_Features.to_dict('list')
for key in x:
    x[key] = np.array(x[key])
y = Hotel_Reviews_Filtered_Target.values

# Specify params
params = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams(
    feature_colums=getFeatures(),
    num_classes=1,
    num_features=2,
    regression=True,
    num_trees=10,
    max_nodes=1000)

# Build the graph
graph_builder_class = tensor_forest.RandomForestGraphs
est = random_forest.TensorForestEstimator(
    params, graph_builder_class=graph_builder_class)

# Define input function
train_input_fn = numpy_io.numpy_input_fn(
    x=x,
    y=y,
    batch_size=1000,
    num_epochs=1,
    shuffle=True)
est.fit(input_fn=train_input_fn, steps=500)
The variable x is a dict of numpy arrays, each of shape (512470,):
{'Average_Score': array([ 7.7, 7.7, 7.7, ..., 8.1, 8.1, 8.1]),
'lat': array([ 52.3605759, 52.3605759, 52.3605759, ..., 48.2037451,
48.2037451, 48.2037451]),
'lng': array([ 4.9159683, 4.9159683, 4.9159683, ..., 16.3356767,
16.3356767, 16.3356767])}
The variable y is a numpy array of shape (512470, 1):
array([[ 2.9],
[ 7.5],
[ 7.1],
...,
[ 2.5],
[ 8.8],
[ 8.3]])
Force each array in x to be 2-dimensional using ndmin=2. Then the shapes should match and concat should be able to operate; see the sketch below.
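A minimal sketch of that fix applied to the preprocessing loop from the question (whether the estimator wants the extra axis first or last is not stated in the answer, so the column-vector variant is my assumption):

# Preprocess the data, forcing each feature array to be 2-D as suggested above
x = Hotel_Reviews_Filtered_Features.to_dict('list')
for key in x:
    x[key] = np.array(x[key], ndmin=2)          # shape (1, 512470)
    # alternative orientation, if a column vector per feature is needed (assumption):
    # x[key] = np.array(x[key]).reshape(-1, 1)  # shape (512470, 1)
y = Hotel_Reviews_Filtered_Target.values        # already (512470, 1)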

Custom loss function in Keras, how to deal with placeholders

I am trying to write a custom loss function in TF/Keras. The loss function works if it is run in a session and passed constants; however, it stops working when compiled into a Keras model.
The cost function (thanks to Lior for converting it to TF):
def ginicTF(actual, pred):
    n = int(actual.get_shape()[-1])
    inds = K.reverse(tf.nn.top_k(pred, n)[1], axes=[0])
    a_s = K.gather(actual, inds)
    a_c = K.cumsum(a_s)
    giniSum = K.sum(a_c) / K.sum(a_s) - (n + 1) / 2.0
    return giniSum / n

def gini_normalizedTF(a, p):
    return -ginicTF(a, p) / ginicTF(a, a)

# Test the cost function
sess = tf.InteractiveSession()
p = [0.9, 0.3, 0.8, 0.75, 0.65, 0.6, 0.78, 0.7, 0.05, 0.4, 0.4, 0.05, 0.5, 0.1, 0.1]
a = [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
ac = tf.placeholder(shape=(len(a),), dtype=K.floatx())
pr = tf.placeholder(shape=(len(p),), dtype=K.floatx())
print(gini_normalizedTF(ac, pr).eval(feed_dict={ac: a, pr: p}))
This prints -0.62962962963, which is the correct value.
Now let's put this into a Keras MLP:
def makeModel(n_feat):
    model = Sequential()
    # hidden layer #1
    model.add(layers.Dense(12, input_shape=(n_feat,)))
    model.add(layers.Activation('selu'))
    model.add(layers.Dropout(0.2))
    # output layer
    model.add(layers.Dense(1))
    model.add(layers.Activation('softmax'))
    model.compile(loss=gini_normalizedTF, optimizer='sgd', metrics=['binary_accuracy'])
    return model

model = makeModel(n_feats)
model.fit(x=Mout, y=targets, epochs=n_epochs, validation_split=0.2, batch_size=batch_size)
This generates the following error:
<ipython-input-62-6ade7307336f> in ginicTF(actual, pred)
9 def ginicTF(actual,pred):
10
---> 11 n = int(actual.get_shape()[-1])
12
13 inds = K.reverse(tf.nn.top_k(pred,n)[1],axes=[0])
TypeError: __int__ returned non-int (type NoneType)
I tried to work around it by giving n a default value, etc., but this doesn't seem to be going anywhere.
Can someone explain the nature of this problem and how I can remedy it?
Thank you!
Edit:
I updated things to keep n as a tensor and cast it where needed:
def ginicTF(actual, pred):
    nT = K.shape(actual)[-1]
    n = K.cast(nT, dtype='int32')
    inds = K.reverse(tf.nn.top_k(pred, n)[1], axes=[0])
    a_s = K.gather(actual, inds)
    a_c = K.cumsum(a_s)
    n = K.cast(nT, dtype=K.floatx())
    giniSum = K.cast(K.sum(a_c) / K.sum(a_s), dtype=K.floatx()) - (n + 1) / 2.0
    return giniSum / n

def gini_normalizedTF(a, p):
    return ginicTF(a, p) / ginicTF(a, a)
It still has the issue of producing None when used as a cost function.
