"Class label not present" error from sklearn.ensemble.RandomForestClassifier with class_weight - python-3.x

I am using RandomForestClassifier from sklearn.ensemble. It works without class_weight, but when I add class_weight it gives this error:
lr = RandomForestClassifier(n_estimators=22, criterion='entropy',
                            max_depth=5, class_weight={'Sex': 2.})
lr.fit(X_train.values[:,1:],Y_train)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-248-411a1c135d08> in <module>
1 print(X_train)
----> 2 lr.fit(X_train.values[:,1:],Y_train)
/opt/conda/lib/python3.6/site-packages/sklearn/ensemble/forest.py in fit(self, X, y, sample_weight)
273 self.n_outputs_ = y.shape[1]
274
--> 275 y, expanded_class_weight = self._validate_y_class_weight(y)
276
277 if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
/opt/conda/lib/python3.6/site-packages/sklearn/ensemble/forest.py in _validate_y_class_weight(self, y)
519 class_weight = self.class_weight
520 expanded_class_weight = compute_sample_weight(class_weight,
--> 521 y_original)
522
523 return y, expanded_class_weight
/opt/conda/lib/python3.6/site-packages/sklearn/utils/class_weight.py in compute_sample_weight(class_weight, y, indices)
161 weight_k = compute_class_weight(class_weight_k,
162 classes_full,
--> 163 y_full)
164
165 weight_k = weight_k[np.searchsorted(classes_full, y_full)]
/opt/conda/lib/python3.6/site-packages/sklearn/utils/class_weight.py in compute_class_weight(class_weight, classes, y)
63 i = np.searchsorted(classes, c)
64 if i >= len(classes) or classes[i] != c:
---> 65 raise ValueError("Class label {} not present.".format(c))
66 else:
67 weight[i] = class_weight[c]
ValueError: Class label Sex not present.
This is my X_train:
PassengerId Pclass Sex ... Ticket Fare Embarked

How many classes do you have in Y_train?
The class_weight parameter concerns Y_train, i.e. the labels, not the feature columns.
Example:
class_weight={0: 1, 1: 2}
means weight 1 for class 0 and weight 2 for class 1.
Using class_weight={'Sex': 2.} is wrong: 'Sex' is a column of X_train, not a class label in Y_train.
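A minimal sketch of the corrected call, assuming Y_train holds binary labels 0 and 1 (the weights below are illustrative):
from sklearn.ensemble import RandomForestClassifier

# keys of class_weight must be labels that actually occur in Y_train
lr = RandomForestClassifier(n_estimators=22, criterion='entropy', max_depth=5,
                            class_weight={0: 1., 1: 2.})
lr.fit(X_train.values[:, 1:], Y_train)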

Related

Problems getting 'train_score' and 'test_score' from a Random Forest regressor

I tried to run the code below, but I received the error shown underneath. The problem is in getting 'train_score' and 'test_score'. I would be happy if you can advise me how to fix this error.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

mse_train1 = []
mse_test1 = []
num_trees1 = []
train_scores, test_scores = list(), list()
model = RandomForestRegressor(n_estimators=1, min_samples_leaf=7, n_jobs=-1,
                              oob_score=True, random_state=0)
i = 0
for iter in range(10):
    num_trees1.append(iter)
    i += 1
    model.fit(train_set_RF, train_set_pred)
    y_train_predicted = model.predict(train_set_RF)
    train_score = model.score(train_set_pred, y_train_predicted)
    train_scores.append(train_score)
    y_test_predicted = model.predict(test_set_RF)
    test_score = model.score(test_set_pred, y_test_predicted)
    test_scores.append(test_score)
    mse_train = mean_squared_error(train_set_pred, y_train_predicted)
    mse_train1.append(mse_train)
    mse_test = mean_squared_error(test_set_pred, y_test_predicted)
    mse_test1.append(mse_test)
    print("Iteration: {} Train mse: {} Test mse: {}".format(iter, mse_train, mse_test))
    model.n_estimators += 1
print(train_scores)
print(test_scores)
print(mse_train1)
print(mse_test1)
This is more detail about the error I got:
ValueError Traceback (most recent call last)
<ipython-input-17-ff545aa1896c> in <module>
19 y_train_predicted = model.predict(train_set_RF)
20 #y_train_predicted = np.nan_to_num(y_train_predicted.astype(np.float32))
---> 21 train_score = model.score(train_set_pred, y_train_predicted)
22
23 #train_acc = accuracy_score(train_set_pred, y_train_predicted)
~\anaconda3\lib\site-packages\sklearn\base.py in score(self, X, y, sample_weight)
549
550 from .metrics import r2_score
--> 551 y_pred = self.predict(X)
552 return r2_score(y, y_pred, sample_weight=sample_weight)
553
~\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py in predict(self, X)
781 check_is_fitted(self)
782 # Check data
--> 783 X = self._validate_X_predict(X)
784
785 # Assign chunk of trees to jobs
~\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py in _validate_X_predict(self, X)
419 check_is_fitted(self)
420
--> 421 return self.estimators_[0]._validate_X_predict(X, check_input=True)
422
423 @property
~\anaconda3\lib\site-packages\sklearn\tree\_classes.py in _validate_X_predict(self, X, check_input)
386 """Validate X whenever one tries to predict, apply, predict_proba"""
387 if check_input:
--> 388 X = check_array(X, dtype=DTYPE, accept_sparse="csr")
389 if issparse(X) and (X.indices.dtype != np.intc or
390 X.indptr.dtype != np.intc):
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
622 "Reshape your data either using array.reshape(-1, 1) if "
623 "your data has a single feature or array.reshape(1, -1) "
--> 624 "if it contains a single sample.".format(array))
625
626 # in the future np.flexible dtypes will be handled like object dtypes
ValueError: Expected 2D array, got 1D array instead:
array=[0.3119313 0.29728386 0.29309732 ... 0.30558413 0.29317933 0.29755104].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
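For reference, score(X, y) on a scikit-learn regressor expects the feature matrix first and the true targets second; it calls predict(X) internally and computes R^2, which is why passing the 1-D target array as X raises the 2D-array error above. A minimal sketch of the corrected scoring calls, assuming train_set_RF/test_set_RF are the feature matrices and train_set_pred/test_set_pred the true targets:
# score(X, y): features first, true targets second
train_score = model.score(train_set_RF, train_set_pred)
test_score = model.score(test_set_RF, test_set_pred)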

Unable to execute sklearn.naive_bayes GaussianNB on California housing data set

I am getting the error Unknown label type: (array([0.14999, 0.175 , 0.225 , ..., 4.991 , 5. , 5.00001]),) when trying to fit the dataset.
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_iris
cali = fetch_california_housing()
iris = load_iris()
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB() # probabilistic
y_pred_cali = gnb.fit(cali.data, cali.target).predict(cali.data)
Error:
ValueError Traceback (most recent call last)
<ipython-input-23-71ed3304ef0f> in <module>
14
15 gnb = GaussianNB() # probabilistic
---> 16 y_pred_cali = gnb.fit(cali[0], cali[1]).predict(cali[0])
17
~\Anaconda3\lib\site-packages\sklearn\naive_bayes.py in fit(self, X, y, sample_weight)
189 X, y = check_X_y(X, y)
190 return self._partial_fit(X, y, np.unique(y), _refit=True,
--> 191 sample_weight=sample_weight)
192
193 @staticmethod
~\Anaconda3\lib\site-packages\sklearn\naive_bayes.py in _partial_fit(self, X, y, classes, _refit, sample_weight)
351 self.classes_ = None
352
--> 353 if _check_partial_fit_first_call(self, classes):
354 # This is the first call to partial_fit:
355 # initialize various cumulative counters
~\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py in _check_partial_fit_first_call(clf, classes)
318 else:
319 # This is the first call to partial_fit
--> 320 clf.classes_ = unique_labels(classes)
321 return True
322
~\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py in unique_labels(*ys)
92 _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None)
93 if not _unique_labels:
---> 94 raise ValueError("Unknown label type: %s" % repr(ys))
95
96 ys_labels = set(chain.from_iterable(_unique_labels(y) for y in ys))
ValueError: Unknown label type: (array([0.14999, 0.175 , 0.225 , ..., 4.991 , 5. , 5.00001]),)
This data set has a continuous target variable.
GaussianNB is a classification method, not a regression method: y needs to contain discrete class labels, not a continuous variable.
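A minimal sketch of the contrast, reusing the iris data already loaded in the question (its target holds discrete labels) and swapping in a regressor for the continuous housing prices:
# iris.target contains discrete class labels (0, 1, 2), so GaussianNB fits fine:
y_pred_iris = gnb.fit(iris.data, iris.target).predict(iris.data)

# cali.target is continuous, so use a regressor instead, e.g.:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(cali.data, cali.target)
y_pred_cali = reg.predict(cali.data)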

Having problems with dimensionality using scikit-learn LinearRegression.predict()

I'm training polynomial regressions over a series of dimensions, and attempting to use predict() for a list of inputs.
inputs = np.linspace(0, 10, 100).reshape(-1, 1)
for i, deg in enumerate([1, 3, 6, 9]):
    poly = PolynomialFeatures(degree=deg)
    X_poly = poly.fit_transform(X_train.reshape(-1, 1))
    linreg = LinearRegression().fit(X_poly, y_train)
    print(linreg.predict(inputs))
When I call predict(), I get the following traceback:
ValueError Traceback (most recent call last)
<ipython-input-5-4100ae3f3ba3> in <module>()
13 return
14
---> 15 answer_one()
<ipython-input-5-4100ae3f3ba3> in answer_one()
9 X_poly = PolynomialFeatures(degree=deg).fit_transform(X_train.reshape(-1,1))
10 linreg = LinearRegression().fit(X_poly, y_train)
---> 11 print(linreg.predict(inputs))
12 # print(linreg.score(X_poly, y_train))
13 return
/opt/conda/lib/python3.6/site-packages/sklearn/linear_model/base.py in predict(self, X)
266 Returns predicted values.
267 """
--> 268 return self._decision_function(X)
269
270 _preprocess_data = staticmethod(_preprocess_data)
/opt/conda/lib/python3.6/site-packages/sklearn/linear_model/base.py in _decision_function(self, X)
251 X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
252 return safe_sparse_dot(X, self.coef_.T,
--> 253 dense_output=True) + self.intercept_
254
255 def predict(self, X):
/opt/conda/lib/python3.6/site-packages/sklearn/utils/extmath.py in safe_sparse_dot(a, b, dense_output)
187 return ret
188 else:
--> 189 return fast_dot(a, b)
190
191
ValueError: shapes (100,1) and (2,) not aligned: 1 (dim 1) != 2 (dim 0)
The (100,1) shape is clearly for the inputs array, but I'm not sure what object's shape is (2,).
When you train the model on polynomial features:
X_poly = poly.fit_transform(X_train.reshape(-1,1))
you need to make sure that the data you predict on is transformed the same way; in this case inputs must also be expanded by the same poly object:
inputs_poly = poly.transform(inputs)
print(linreg.predict(inputs_poly))
As for the (2,) shape: it is the coefficient vector of the degree-1 model, which expects two polynomial features (the bias column and x) rather than the raw single-column inputs.
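Putting it together, a minimal sketch of the corrected loop (assuming X_train and y_train are the 1-D arrays from the question):
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

inputs = np.linspace(0, 10, 100).reshape(-1, 1)
for i, deg in enumerate([1, 3, 6, 9]):
    poly = PolynomialFeatures(degree=deg)
    X_poly = poly.fit_transform(X_train.reshape(-1, 1))
    linreg = LinearRegression().fit(X_poly, y_train)
    # transform with the current degree's poly object before predicting
    print(linreg.predict(poly.transform(inputs)))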

How to use SHAP with a linear SVC model from sklearn using Pipeline?

I am doing text classification using a linear SVC model from sklearn. Now I want to visualize which words/tokens have the highest impact on the classification decision by using SHAP (https://github.com/slundberg/shap).
Right now this does not work because I am getting an error that seems to originate from the vectorizer step in the pipeline I have defined. What's wrong here?
Is my general approach to using SHAP in this case correct?
x_Train, x_Test, y_Train, y_Test = train_test_split(
    df_all['PDFText'], df_all['class'], test_size=0.2, random_state=1234)

pipeline = Pipeline([
    ('tfidv', TfidfVectorizer(
        ngram_range=(1, 3),
        analyzer='word',
        strip_accents='ascii',
        use_idf=True,
        sublinear_tf=True,
        max_features=6000,
        min_df=2,
        max_df=1.0)),
    ('lin_svc', svm.SVC(
        C=1.0,
        probability=True,
        kernel='linear'))
])

pipeline.fit(x_Train, y_Train)

shap.initjs()
explainer = shap.KernelExplainer(pipeline.predict_proba, x_Train)
shap_values = explainer.shap_values(x_Test, nsamples=100)
shap.force_plot(explainer.expected_value[0], shap_values[0][0, :], x_Test.iloc[0, :])
This is the error message I get:
Provided model function fails when applied to the provided data set.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-81-4bca63616b3b> in <module>
3
4 # use Kernel SHAP to explain test set predictions
----> 5 explainer = shap.KernelExplainer(pipeline.predict_proba, x_Train)
6 shap_values = explainer.shap_values(x_Test, nsamples=100)
7
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\shap\explainers\kernel.py in __init__(self, model, data, link, **kwargs)
95 self.keep_index_ordered = kwargs.get("keep_index_ordered", False)
96 self.data = convert_to_data(data, keep_index=self.keep_index)
---> 97 model_null = match_model_to_data(self.model, self.data)
98
99 # enforce our current input type limitations
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\shap\common.py in match_model_to_data(model, data)
80 out_val = model.f(data.convert_to_df())
81 else:
---> 82 out_val = model.f(data.data)
83 except:
84 print("Provided model function fails when applied to the provided data set.")
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
116
117 # lambda, but not partial, allows help() to work with update_wrapper
--> 118 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
119 # update the docstring of the returned function
120 update_wrapper(out, self.fn)
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\pipeline.py in predict_proba(self, X)
379 for name, transform in self.steps[:-1]:
380 if transform is not None:
--> 381 Xt = transform.transform(Xt)
382 return self.steps[-1][-1].predict_proba(Xt)
383
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in transform(self, raw_documents, copy)
1631 check_is_fitted(self, '_tfidf', 'The tfidf vector is not fitted')
1632
-> 1633 X = super(TfidfVectorizer, self).transform(raw_documents)
1634 return self._tfidf.transform(X, copy=False)
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in transform(self, raw_documents)
1084
1085 # use the same matrix-building strategy as fit_transform
-> 1086 _, X = self._count_vocab(raw_documents, fixed_vocab=True)
1087 if self.binary:
1088 X.data.fill(1)
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
940 for doc in raw_documents:
941 feature_counter = {}
--> 942 for feature in analyze(doc):
943 try:
944 feature_idx = vocabulary[feature]
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(doc)
326 tokenize)
327 return lambda doc: self._word_ngrams(
--> 328 tokenize(preprocess(self.decode(doc))), stop_words)
329
330 else:
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(x)
254
255 if self.lowercase:
--> 256 return lambda x: strip_accents(x.lower())
257 else:
258 return strip_accents
AttributeError: 'numpy.ndarray' object has no attribute 'lower'
KernelExplainer expects a prediction function that operates on the numeric feature matrix, not on raw text, so passing the whole Pipeline's predict_proba fails when SHAP feeds it an array of strings. See the SHAP project (https://github.com/slundberg/shap) for details on using pipelines.
In your case, vectorize the text first and explain only the final estimator:
x_Train = pipeline.named_steps['tfidv'].fit_transform(x_Train)
explainer = shap.KernelExplainer(pipeline.named_steps['lin_svc'].predict_proba, x_Train)
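As a follow-up sketch (assuming the same pipeline object as above): the test set must also go through the already-fitted vectorizer, using transform rather than fit_transform, before computing SHAP values:
x_Train_vec = pipeline.named_steps['tfidv'].fit_transform(x_Train)
x_Test_vec = pipeline.named_steps['tfidv'].transform(x_Test)

explainer = shap.KernelExplainer(pipeline.named_steps['lin_svc'].predict_proba,
                                 x_Train_vec)
# KernelExplainer may require dense input; if so, convert with .toarray()
shap_values = explainer.shap_values(x_Test_vec, nsamples=100)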

FastAI v1 PyTorch Custom Model

I have been trying to use fastai with a custom torch model. My code is as follows:
X_train = np.load(dirpath + 'X_train.npy')
X_valid = np.load(dirpath + 'X_valid.npy')
Y_train = np.load(dirpath + 'Y_train.npy')
Y_valid = np.load(dirpath + 'Y_valid.npy')
X_train's shape is (240, 122, 96) and Y_train's shape is (240, 1).
Then I convert these to torch tensors:
# Converting data to torch tensors
def to_torch_data(x, np_type, tch_type):
    return torch.from_numpy(x.astype(np_type)).to(tch_type)

X_train = to_torch_data(X_train, float, torch.float32)
X_valid = to_torch_data(X_valid, float, torch.float32)
Y_train = to_torch_data(Y_train, float, torch.float32)
Y_valid = to_torch_data(Y_valid, float, torch.float32)
Creating TensorDatasets for the fastai DataBunch wrapper:
# Creating torch tensor datasets so that data can be used
# on ImageDataBunch function for fastai
train_ds = tdatautils.TensorDataset(X_train,Y_train)
valid_ds = tdatautils.TensorDataset(X_valid,Y_valid)
# Creating DataBunch object to be used as data in fastai methods.
batch_size = 24
my_data_bunch = DataBunch.create(train_ds,valid_ds,bs=batch_size)
And this is my custom torch model :
# Creating corresponding torch model
import torch.nn.functional as F
class Net(nn.Module):
def __init__(self,droprate=0,activationF=None):
super(Net, self).__init__()
self.lstm_0 = nn.LSTM(96, 720)
self.activation_0 = nn.ELU()
self.dropout_0 = nn.Dropout(p=droprate)
self.lstm_1 = nn.LSTM(720,480)
self.activation_1 = nn.ELU()
self.batch_norm_1 = nn.BatchNorm1d(122)
self.fc_2 = nn.Linear(480,128)
self.dropout_2 = nn.Dropout(p=droprate)
self.last = nn.Linear(128,1)
self.last_act = nn.ReLU()
def forward(self, x):
out,hid1 = self.lstm_0(x)
out = self.dropout_0(self.activation_0(out))
out,hid2 = self.lstm_1(out)
out = out[:,-1,:]
out = self.batch_norm_1(self.activation_1(out))
out = self.dropout_2(self.fc_2(out))
out = self.last_act(self.last(out))
return out
#create instance of model
net = Net(droprate=train_droprate,activationF=train_activation) #.cuda()
print(net)
After all this, I run the learner's lr_find method and get this error:
Empty Traceback (most recent call last)
C:\Anaconda3\envs\fastai\lib\site-packages\torch\utils\data\dataloader.py in _try_get_batch(self, timeout)
510 try:
--> 511 data = self.data_queue.get(timeout=timeout)
512 return (True, data)
C:\Anaconda3\envs\fastai\lib\queue.py in get(self, block, timeout)
171 if remaining <= 0.0:
--> 172 raise Empty
173 self.not_empty.wait(remaining)
Empty:
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
<ipython-input-35-e4b7603c0a82> in <module>
----> 1 my_learner.lr_find()
~\Desktop\fastai\fastai\fastai\train.py in lr_find(learn, start_lr, end_lr, num_it, stop_div, wd)
30 cb = LRFinder(learn, start_lr, end_lr, num_it, stop_div)
31 epochs = int(np.ceil(num_it/len(learn.data.train_dl)))
---> 32 learn.fit(epochs, start_lr, callbacks=[cb], wd=wd)
33
34 def to_fp16(learn:Learner, loss_scale:float=None, max_noskip:int=1000, dynamic:bool=True, clip:float=None,
~\Desktop\fastai\fastai\fastai\basic_train.py in fit(self, epochs, lr, wd, callbacks)
197 callbacks = [cb(self) for cb in self.callback_fns + listify(defaults.extra_callback_fns)] + listify(callbacks)
198 if defaults.extra_callbacks is not None: callbacks += defaults.extra_callbacks
--> 199 fit(epochs, self, metrics=self.metrics, callbacks=self.callbacks+callbacks)
200
201 def create_opt(self, lr:Floats, wd:Floats=0.)->None:
~\Desktop\fastai\fastai\fastai\basic_train.py in fit(epochs, learn, callbacks, metrics)
97 cb_handler.set_dl(learn.data.train_dl)
98 cb_handler.on_epoch_begin()
---> 99 for xb,yb in progress_bar(learn.data.train_dl, parent=pbar):
100 xb, yb = cb_handler.on_batch_begin(xb, yb)
101 loss = loss_batch(learn.model, xb, yb, learn.loss_func, learn.opt, cb_handler)
C:\Anaconda3\envs\fastai\lib\site-packages\fastprogress\fastprogress.py in __iter__(self)
70 self.update(0)
71 try:
---> 72 for i,o in enumerate(self._gen):
73 if i >= self.total: break
74 yield o
~\Desktop\fastai\fastai\fastai\basic_data.py in __iter__(self)
73 def __iter__(self):
74 "Process and returns items from `DataLoader`."
---> 75 for b in self.dl: yield self.proc_batch(b)
76
77 @classmethod
C:\Anaconda3\envs\fastai\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
574 while True:
575 assert (not self.shutdown and self.batches_outstanding > 0)
--> 576 idx, batch = self._get_batch()
577 self.batches_outstanding -= 1
578 if idx != self.rcvd_idx:
C:\Anaconda3\envs\fastai\lib\site-packages\torch\utils\data\dataloader.py in _get_batch(self)
541 elif self.pin_memory:
542 while self.pin_memory_thread.is_alive():
--> 543 success, data = self._try_get_batch()
544 if success:
545 return data
C:\Anaconda3\envs\fastai\lib\site-packages\torch\utils\data\dataloader.py in _try_get_batch(self, timeout)
517 if not all(w.is_alive() for w in self.workers):
518 pids_str = ', '.join(str(w.pid) for w in self.workers if not w.is_alive())
--> 519 raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str))
520 if isinstance(e, queue.Empty):
521 return (False, None)
RuntimeError: DataLoader worker (pid(s) 9584, 7236, 5108, 932, 13228, 13992, 4576, 13204) exited unexpectedly
I have researched DataLoader but couldn't find anything useful.
Although I didn't understand the error message you posted, I see one problem in your code. After
out = out[:,-1,:]  # batch_size x 480
out = self.batch_norm_1(self.activation_1(out))
the tensor has 480 features, but you declared batch_norm_1 as:
self.batch_norm_1 = nn.BatchNorm1d(122)
which should be:
self.batch_norm_1 = nn.BatchNorm1d(480)
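As a hedged aside on the DataLoader crash itself (an assumption, not part of the answer above): on Windows this error often goes away when worker processes are disabled, which fastai v1's DataBunch.create allows via its num_workers argument:
# num_workers=0 keeps data loading in the main process (Windows-friendly)
my_data_bunch = DataBunch.create(train_ds, valid_ds, bs=batch_size, num_workers=0)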
