from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
classifier = SVC(C=100, # penalty parameter, setting it to a larger value
kernel='rbf', # kernel type, rbf working fine here
degree=3, # default value, not tuned yet
gamma=1, # kernel coefficient, not tuned yet
coef0=1, # changed from the default of 0.0 (only used by poly/sigmoid kernels)
shrinking=True, # using shrinking heuristics
tol=0.001, # stopping criterion tolerance
probability=False, # no need to enable probability estimates
cache_size=200, # 200 MB cache size
class_weight=None, # all classes are treated equally
verbose=False, # do not print training logs
max_iter=-1, # no limit, let it run
decision_function_shape=None, # will use one vs rest explicitly
random_state=None)
model = OneVsRestClassifier(classifier, n_jobs=4)
model.fit(X_train,y_train)
I am getting this error:
ValueError: WRITEBACKIFCOPY base is read-only.
Scale the input data before training the model, i.e. before fitting the OneVsRestClassifier.
For example:
from sklearn.preprocessing import MinMaxScaler  # use MinMaxScaler for a dense matrix; use MaxAbsScaler for a sparse matrix
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
classifier = SVC(C=100, # penalty parameter, setting it to a larger value
kernel='rbf', # kernel type, rbf working fine here
degree=3, # default value, not tuned yet
gamma=1, # kernel coefficient, not tuned yet
coef0=1, # changed from the default of 0.0 (only used by poly/sigmoid kernels)
shrinking=True, # using shrinking heuristics
tol=0.001, # stopping criterion tolerance
probability=False, # no need to enable probability estimates
cache_size=200, # 200 MB cache size
class_weight=None, # all classes are treated equally
verbose=False, # do not print training logs
max_iter=-1, # no limit, let it run
decision_function_shape=None, # will use one vs rest explicitly
random_state=None)
model = OneVsRestClassifier(classifier, n_jobs=-1)
model.fit(X_train,y_train)
I have followed the basic example given below, from https://huggingface.co/transformers/training.html:
from transformers import TFBertForSequenceClassification, TFTrainer, TFTrainingArguments
model = TFBertForSequenceClassification.from_pretrained("bert-large-uncased")
training_args = TFTrainingArguments(
output_dir='./results', # output directory
num_train_epochs=3, # total # of training epochs
per_device_train_batch_size=16, # batch size per device during training
per_device_eval_batch_size=64, # batch size for evaluation
warmup_steps=500, # number of warmup steps for learning rate scheduler
weight_decay=0.01, # strength of weight decay
logging_dir='./logs', # directory for storing logs
)
trainer = TFTrainer(
model=model, # the instantiated 🤗 Transformers model to be trained
args=training_args, # training arguments, defined above
train_dataset=tfds_train_dataset, # tensorflow_datasets training dataset
eval_dataset=tfds_test_dataset # tensorflow_datasets evaluation dataset
)
trainer.train()
But there seems to be no way to specify the loss function for the classifier. For example, if I fine-tune on a binary classification problem, I would use
tf.keras.losses.BinaryCrossentropy(from_logits=True)
else I would use
tf.keras.losses.CategoricalCrossentropy(from_logits=True)
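With a plain Keras model I would simply pass the chosen loss to compile, something like this (illustrative only; the optimizer and learning rate are placeholders):
import tensorflow as tf
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),  # placeholder optimizer settings
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
)
But TFTrainer does not seem to expose an equivalent argument.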
My set up is as follows:
transformers==4.3.2
tensorflow==2.3.1
python==3.6.12
Trainer supports customizing the loss through its compute_loss method.
For more, you can look into the documentation:
https://huggingface.co/docs/transformers/main_classes/trainer#:~:text=passed%20at%20init.-,compute_loss,-%2D%20Computes%20the%20loss
Here is an example of how to customize Trainer to use a weighted loss (useful when you have an unbalanced training set):
import torch
from torch import nn
from transformers import Trainer
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0]))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
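Usage is then the same as with the stock Trainer; as a sketch (the dataset and argument names below are placeholders, and this assumes a PyTorch model rather than the TF classes above):
trainer = CustomTrainer(
    model=model,                  # a PyTorch 🤗 Transformers model
    args=training_args,           # transformers.TrainingArguments defined elsewhere
    train_dataset=train_dataset,  # placeholder dataset names
    eval_dataset=eval_dataset,
)
trainer.train()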
Alternatively, create a class that inherits from PreTrainedModel and build your loss function into its forward method.
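A minimal sketch of that idea, assuming a PyTorch BERT backbone for sequence classification (the class name and the classifier head are illustrative, not part of the library):
from torch import nn
from transformers import BertModel, BertPreTrainedModel
class BertWithCustomLoss(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)  # pretrained backbone
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()
    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.last_hidden_state[:, 0])  # classify on the [CLS] token
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()  # swap in whatever loss you need here
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
        return (loss, logits) if loss is not None else logits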
I am trying to fit a model using a gradient boosted machine (GBM), after selecting some features with ROC-AUC and using a baseline to remove the features I don't need. When I then tried to fit the training set with the GBM, I got an error message.
Here is my implementation:
# let's drop features with ROC-AUC values below the 0.54 baseline
x_train.drop(labels=removed_roc_values, axis=1, inplace=True)
x_test.drop(labels=removed_roc_values, axis=1, inplace=True)
x_train.shape, x_test.shape
The shapes after dropping the baseline features: ((4930, 17), (2113, 23))
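To see which columns the two frames no longer share, a quick diagnostic (a sketch, assuming both are pandas DataFrames):
print(set(x_test.columns) - set(x_train.columns))  # columns only in x_test
print(set(x_train.columns) - set(x_test.columns))  # columns only in x_train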
# using a baseline GBM without tuning
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
baseline = GradientBoostingClassifier(learning_rate=0.1,
n_estimators=100,max_depth=3, min_samples_split=2, min_samples_leaf=1,
subsample=1,max_features='sqrt', random_state=10)
baseline.fit(x_train,y_train)
predictors=list(x_train)
feat_imp = pd.Series(baseline.feature_importances_,
predictors).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Importance of Features')
plt.ylabel('Feature Importance Score')
print('Accuracy of the GBM on test set: {:.3f}'.format(baseline.score(x_test,
y_test)))
pred=baseline.predict(x_test)
print(classification_report(y_test, pred))
I expected to get the classification report; instead, I got the error below:
ValueError: The number of features of the model must match the input. Model n_features is 17 and input n_features is 23
Thanks.
So I found out that StandardScaler() can make my RFECV inside my GridSearchCV, each with a nested 3-fold cross-validation, run faster. Without StandardScaler(), my code ran for more than 2 days, so I canceled it and decided to inject StandardScaler into the process. But now it has been running for more than 4 hours and I am not sure if I have done it right. Here is my code:
# Choose Linear SVM as classifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
LSVM = SVC(kernel='linear')
selector = RFECV(LSVM, step=1, cv=3, scoring='f1')
param_grid = [{'estimator__C': [0.001, 0.01, 0.1, 1, 10, 100]}]
clf = make_pipeline(StandardScaler(),
GridSearchCV(selector,
param_grid,
cv=3,
refit=True,
scoring='f1'))
clf.fit(X, Y)
I don't think I have gotten it right, to be honest, because I think StandardScaler() should be put inside the GridSearchCV() so that it normalizes the data on each fold, not just once (?). Please correct me if I am wrong or if my pipeline is incorrect, and hence why it is still running for such a long time.
I have 8,000 rows of 145 features to be pruned by RFECV, and 6 C-values to be searched by GridSearchCV. So for each C-value, the best feature set is determined by the RFECV.
Thanks!
Update:
So I put the StandardScaler inside the RFECV like this:
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
clf = SVC(kernel='linear')
kf = KFold(n_splits=3, shuffle=True, random_state=0)
estimators = [('standardize' , StandardScaler()),
('clf', clf)]
class Mypipeline(Pipeline):
    @property
    def coef_(self):
        return self._final_estimator.coef_
    @property
    def feature_importances_(self):
        return self._final_estimator.feature_importances_
pipeline = Mypipeline(estimators)
rfecv = RFECV(estimator=pipeline, cv=kf, scoring='f1', verbose=10)
param_grid = [{'estimator__svc__C': [0.001, 0.01, 0.1, 1, 10, 100]}]
clf = GridSearchCV(rfecv, param_grid, cv=3, scoring='f1', verbose=10)
But it still throws out the following error:
ValueError: Invalid parameter C for estimator Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=False))]). Check the list of available parameters with estimator.get_params().keys().
Kumar is right. Also, you might want to turn on verbose in the GridSearchCV. You could also add a limit to the number of iterations of the SVC, starting from a very small number like 5, just to make sure that the problem is not with convergence; a sketch of both suggestions follows.
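A minimal sketch of those two suggestions (the parameter values are examples, not tuned settings):
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
svc = SVC(kernel='linear', max_iter=5)  # tiny iteration cap, only to rule out convergence problems
selector = RFECV(svc, step=1, cv=3, scoring='f1', verbose=1)
param_grid = {'estimator__C': [0.001, 0.01, 0.1, 1, 10, 100]}
search = GridSearchCV(selector, param_grid, cv=3, scoring='f1', verbose=2)  # verbose output shows per-fit progress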
I'm trying to create an InceptionV3 CNN which has previously been trained on ImageNet. While the creation and the loading of the checkpoint seem to work correctly, the result seems to be random: every time I run the script I get a different result, even though I don't change anything. The network is recreated from scratch, the same unchanged checkpoint is loaded, and the same image is classified (which to my understanding should still lead to the same result, even if the network can't decide what the image actually is).
I just noticed that even if I try to classify the same image multiple times within the same execution of the script, I end up with a random result.
I create the CNN like this:
from tensorflow.contrib.slim.nets import inception as nn_architecture
from tensorflow.contrib import slim
with slim.arg_scope([slim.conv2d, slim.fully_connected], normalizer_fn=slim.batch_norm,
                    normalizer_params={'updates_collections': None}):  # fix for an issue where the model doesn't fit the checkpoint: https://github.com/tensorflow/models/issues/2977
    logits, endpoints = nn_architecture.inception_v3(input,  # input
        1001,  # NUM_CLASSES; maybe set to 0 or None to omit the logit layer and return its input instead
        True,  # is_training (dropout is disabled if False, for eval)
        0.8,  # dropout keep rate
        16,  # min depth
        1.0,  # depth multiplier
        layers_lib.softmax,  # prediction function
        True,  # spatial squeeze
        tf.AUTO_REUSE,  # reuse; use get_variable to get variables directly... probably
        'InceptionV3')  # scope
Afterwards I load the ImageNet-trained checkpoint like this:
saver = tf.train.Saver()
saver.restore(sess, CHECKPOINT_PATH)
Then I verify that it is working by classifying an image of a car, which I squish from its original resolution to 299x299, the input size required by the network:
import numpy as np
from scipy.ndimage import zoom
from skimage import io
car = io.imread("data/car.jpg")
car_scaled = zoom(car, [299 / car.shape[0], 299 / car.shape[1], 1])
car_cnnable = np.array([car_scaled])
Then I try to classify the image and print which class it most likely belongs to and with what likelihood.
predictions = sess.run(logits, feed_dict={images: car_cnnable})
predictions = np.squeeze(predictions) #shape (1, 1001) to shape (1001)
print(np.argmax(predictions))
print(predictions[np.argmax(predictions)])
The class is (or seems to be) random and the likelihood varies as well.
My last few executions were:
Class - likelihood
899 - 0.98858
660 - 0.887204
734 - 0.904047
675 - 0.886952
Here is my full code: https://gist.github.com/Syzygy2048/ddb8602652b547a71316ee0febfddbef
Since I set is_training to True, the dropout rate was applied every time the network was used. I was under the impression that this only happened during backpropagation.
To get it to work correctly, the code should be:
logits, endpoints = nn_architecture.inception_v3(input,  # input
    1001,  # NUM_CLASSES; maybe set to 0 or None to omit the logit layer and return its input instead
    False,  # is_training (dropout is disabled if False, for eval)
    0.8,  # dropout keep rate
    16,  # min depth
    1.0,  # depth multiplier
    layers_lib.softmax,  # prediction function
    True,  # spatial squeeze
    tf.AUTO_REUSE,  # reuse; use get_variable to get variables directly... probably
    'InceptionV3')  # scope
I am trying to generate a range of synthetic data sets using make_classification in scikit-learn, with varying sample sizes, prevalences (i.e., proportions of the positive class), and accuracies. Varying the sample size and prevalence is fairly straightforward, but I am having difficulty generating any data sets that have less than 50% accuracy using logistic regression. Playing around with the number of informative columns, the number of clusters per class, and the flip_y parameter (which randomly flips the class of a given proportion of observations) seems to reduce the accuracy, but not as much as I would like. Is there a way to vary the parameters of make_classification to reduce this further (e.g., to 20%)?
Thanks!
Generally, the combination of a fairly low number of n_samples, a high probability of randomly flipping the label flip_y and a large number of n_classes should get you where you want.
You can try the following:
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in newer versions
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
# 2-class problem
X, y = make_classification(n_samples=100, n_informative=2, flip_y=0.8, random_state=42)
cross_val_score(estimator=lr, X=X, y=y, scoring='accuracy', cv=10)
# Output
array([ 0.54545455, 0.27272727, 0.45454545, 0.2 , 0.4 ,
0.5 , 0.7 , 0.55555556, 0.55555556, 0.44444444])
# 8-class problem
X, y = make_classification(n_samples=100, n_classes=8, n_informative=4, n_clusters_per_class=1, flip_y=0.5, random_state=42)
cross_val_score(estimator=lr, X=X, y=y, scoring='accuracy', cv=5)
# Output
array([ 0.16666667, 0.19047619, 0.15 , 0.16666667, 0.29411765])
In case you go with binary classification only, you should choose flip_y carefully. If, for example, you choose flip_y to be high, that means you flip almost every label, which actually makes the problem easier again (the label consistency is preserved).
Hence, in binary classification the effective noise is really min(flip_y, 1 - flip_y), and setting flip_y to 0.5 makes the classification hardest.
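If in doubt, this is easy to check empirically (a sketch; the exact scores depend on the random seed):
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
for flip in (0.2, 0.5, 0.8):
    X, y = make_classification(n_samples=100, n_informative=2, flip_y=flip, random_state=42)
    scores = cross_val_score(estimator=lr, X=X, y=y, scoring='accuracy', cv=10)
    print(flip, scores.mean())  # see where the accuracy bottoms out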
Another thing you can do: after creating the data, reduce its dimensionality with PCA:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
X, y = make_classification(n_samples=10000, n_informative=18,n_features=20, flip_y=0.15, random_state=217)
print(cross_val_score(estimator=clf, X=X, y=y, scoring='accuracy', cv=4))
#prints [ 0.80287885 0.7904 0.796 0.78751501]
pca = PCA(n_components=10)
X = pca.fit_transform(X)
print(cross_val_score(estimator=clf, X=X, y=y, scoring='accuracy', cv=4))
#prints [ 0.76409436 0.7684 0.7628 0.75830332]
You can reduce n_components to get even poorer results while keeping the original number of features:
pca = PCA(n_components=1)
X = pca.fit_transform(X)
X = np.concatenate((X, np.random.rand(X.shape[0],19)),axis=1) #concatenating random features
print(cross_val_score(estimator=clf, X=X, y=y, scoring='accuracy', cv=4))
#prints [ 0.5572 0.566 0.5552 0.5664]
Getting less than 50% accuracy is 'hard' - even when you feed in completely random vectors, the expected accuracy is still 0.5:
X = np.random.rand(10000,20)
print(np.average(cross_val_score(estimator=clf, X=X, y=y, scoring='accuracy', cv=100)))
#prints 0.501489999
So 55% accuracy is considered very low.