Can I use probabilistic labels when training a logistic regression model? - python-3.x

I use sklearn.linear_model.LogisticRegression and would like to use probabilistic labels when training the model.
But, as the following code shows, I get an error when I attempt to train the logistic regression model on data with probability labels.
Is there any way to use probability labels for training a logistic regression model?
import numpy as np
from sklearn.linear_model import LogisticRegression
x = np.array([1966, 1967, 1968, 1969, 1970,
1971, 1972, 1973, 1974, 1975,
1976, 1977, 1978, 1979, 1980,
1981, 1982, 1983, 1984]).reshape(-1, 1)
y = np.array([0.003, 0.016, 0.054, 0.139, 0.263,
0.423, 0.611, 0.758, 0.859, 0.903,
0.937, 0.954, 0.978, 0.978, 0.982,
0.985, 0.989, 0.988, 0.992])
lr = LogisticRegression()
lr.fit(x, y)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-26-6f0a54f18841> in <module>()
13
14 lr = LogisticRegression()
---> 15 lr.fit(x, y) # => ValueError: Unknown label type: 'continuous'
/home/sudot/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py in fit(self, X, y, sample_weight)
1172 X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64,
1173 order="C")
-> 1174 check_classification_targets(y)
1175 self.classes_ = np.unique(y)
1176 n_samples, n_features = X.shape
/home/sudot/anaconda3/lib/python3.6/site-packages/sklearn/utils/multiclass.py in check_classification_targets(y)
170 if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
171 'multilabel-indicator', 'multilabel-sequences']:
--> 172 raise ValueError("Unknown label type: %r" % y_type)
173
174
ValueError: Unknown label type: 'continuous'

Logistic Regression is a binary classification model. You can't pass non-categorical values as target.
Just round values of y before fitting.
y = y.round(0) # Add this line
lr = LogisticRegression()
lr.fit(x, y)

Related

How can I solve svm predict model problem

I'm having a problem with my SVM prediction model.
from sklearn.svm import SVC
svm_model = SVC(kernel='rbf', C=8, gamma=0.1)
svm_model.fit(X_train_std, y_train)
y_pred = svm_model.predict(X_test_std)
/usr/local/lib/python3.8/dist-packages/sklearn/utils/validation.py:993: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-53-398f1caaa8e8> in <module>
3 svm_model = SVC(kernel='rbf', C=8, gamma=0.1)
4
----> 5 svm_model.fit(X_train_std, y_train)
6
7 y_pred = svm_model.predict(X_test_std)
2 frames
/usr/local/lib/python3.8/dist-packages/sklearn/utils/multiclass.py in check_classification_targets(y)
195 "multilabel-sequences",
196 ]:
--> 197 raise ValueError("Unknown label type: %r" % y_type)
198
199
ValueError: Unknown label type: 'continuous'
I thought it was a problem with the type of y
train = pd.get_dummies(train, columns=['LSTAT'], drop_first=True)
So I use that but problem was disappeared
Somebody help me

ValueError: multiclass format is not supported on ROC_Curve for text classification

I am trying to use ROC for evaluating my emotion text classifier model
This is my code for the ROC :
# ROC-AUC Curve
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
fpr, tpr, thresholds = roc_curve(y_test, y_test_hat2)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC CURVE')
plt.legend(loc="lower right")
plt.show()
This is the Error :
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-30-ef4ee0eff994> in <module>()
2 from sklearn.metrics import roc_curve, auc
3 import matplotlib.pyplot as plt
----> 4 fpr, tpr, thresholds = roc_curve(y_test, y_test_hat2)
5 roc_auc = auc(fpr, tpr)
6 plt.figure()
1 frames
/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_ranking.py in roc_curve(y_true, y_score, pos_label, sample_weight, drop_intermediate)
961 """
962 fps, tps, thresholds = _binary_clf_curve(
--> 963 y_true, y_score, pos_label=pos_label, sample_weight=sample_weight
964 )
965
/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_ranking.py in _binary_clf_curve(y_true, y_score, pos_label, sample_weight)
729 y_type = type_of_target(y_true)
730 if not (y_type == "binary" or (y_type == "multiclass" and pos_label is not None)):
--> 731 raise ValueError("{0} format is not supported".format(y_type))
732
733 check_consistent_length(y_true, y_score, sample_weight)
ValueError: multiclass format is not supported
This is the y_test and y_test_hat2 :
y_test = data_test["label"]
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
test_vectors = vectorizer.transform(data_test['tweet'])
classifier_linear2 = LinearSVC(verbose=1)
y_test_hat2=classifier_linear2.predict(test_vectors)
Shape of test_vectors = (1096, 11330)
Shape of y_test = (1096,)
Label in y_test = ['0', '1', '2', '3', '4']
A ROC curve is based on soft predictions, i.e. it uses the predicted probability of an instance to belong to the positive class rather than the predicted class. For example with sklearn one can obtain the probabilities with predict_proba instead of predict (for the classifiers which provide it, example).
Note: OP used the tag multiclass-classification, but it's important to note that ROC curves can only be applied to binary classification problems.
One can find a short explanation of ROC curves here.

sklearn train_test_split - ValueError: Found input variables with inconsistent numbers of samples: [2552, 1] /Linear Regression

I need assistance reshaping my input to match my output.
I wanted to create a model that vectorizes and classifies the 'All information' column so that the label 'Fall' can be divided into 0 and 1.
However, I keep getting the [ValueError: Found input variables with inconsistent numbers of samples: [2552, 1]] error.
The 'shape' looks fine, but I don't know how to fix it.
## Linear Regression
import pandas as pd
import numpy as np
from tqdm import tqdm
#instance->fit->predict
from sklearn.linear_model import LinearRegression
model=LinearRegression(fit_intercept=True)
data=pd.read_csv("Fall_test_0826.csv", encoding='cp949', header=0)
data.head(2)
X=data.drop(["fall"], axis=1)
y= data.fall
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state = 0)
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect=TfidfVectorizer()
tfidf_vect.fit(X_train)#단어사전 만듬
X_train_tfidf_vect = tfidf_vect.fit_transform(X_train['All information']).toarray()
X_test_tfidf_vect = tfidf_vect.transform(X_test)
lr_clf=LinearRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
from sklearn.metrics import accuracy_score
print('Logisitic Regression _ {0:.3f}'.format(accuracy_score(y_test, pred)))
Error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-85-bec6ead862c8> in <module>
----> 1 print('{0:.3f}'.format(accuracy_score(y_test, pred)))
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\anaconda3\lib\site-packages\sklearn\metrics\_classification.py in accuracy_score(y_true, y_pred, normalize, sample_weight)
185
186 # Compute accuracy for each possible representation
--> 187 y_type, y_true, y_pred = _check_targets(y_true, y_pred)
188 check_consistent_length(y_true, y_pred, sample_weight)
189 if y_type.startswith('multilabel'):
~\anaconda3\lib\site-packages\sklearn\metrics\_classification.py in _check_targets(y_true, y_pred)
79 y_pred : array or indicator matrix
80 """
---> 81 check_consistent_length(y_true, y_pred)
82 type_true = type_of_target(y_true)
83 type_pred = type_of_target(y_pred)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
254 uniques = np.unique(lengths)
255 if len(uniques) > 1:
--> 256 raise ValueError("Found input variables with inconsistent numbers of"
257 " samples: %r" % [int(l) for l in lengths])
258
ValueError: Found input variables with inconsistent numbers of samples: [2552, 1]
enter image description here
enter image description here
I think you have to change the following line in your code from
X_test_tfidf_vect = tfidf_vect.transform(X_test)
to
X_test_tfidf_vect = tfidf_vect.transform(X_test['All information'])
But your approach is wrong. You are using Linear Regression but trying to evaluate it with classification metrics (accuracy_score) (Reference)
Doing so should lead to the error ValueError: Classification metrics can't handle a mix of binary and continuous targets
So this will not work, because your array pred will hold float values, so for example 0.5, but for the accuracy_score you need class labels as integers, so for example 0,1,2 or 3 etc.
You need to use regression metrics instead to evaluate your Linear Regression.
Have a look at the available regression metrics here.

Bad input shape while training a prediction model

I used the following code to train a currency exchange rate prediction model using sklearn but get an error:
import numpy as np
x = [[30],[40],[50],[60],[70],[80],[90],[100],[120],[130],[140],[150]]
y = ['jan','febuary,'march','april','may','june','july','august','september','october','november','december']
y_2 = np.reshape(y, (-1, 2))
#reshaping because it throws in an error to reshape
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y_3 = le.fit_transform(y_2)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-144-c98a5b8bd15a> in <module>
----> 1 y_3 = le.fit_transform(y_2)
c:\users\user\appdata\local\programs\python\python37-32\lib\site-packages\sklearn\preprocessing\label.py in fit_transform(self, y)
233 y : array-like of shape [n_samples]
234 """
--> 235 y = column_or_1d(y, warn=True)
236 self.classes_, y = _encode(y, encode=True)
237 return y
c:\users\user\appdata\local\programs\python\python37-32\lib\site-packages\sklearn\utils\validation.py in column_or_1d(y, warn)
795 return np.ravel(y)
796
--> 797 raise ValueError("bad input shape {0}".format(shape))
798
799
ValueError: bad input shape (6, 2)
What do I need to do to fix this error?
There is no need to perform the reshape() on y that you are doing. The following is sufficient.
import numpy as np
x = [[30],[40],[50],[60],[70],[80],[90],[100],[120],[130],[140],[150]]
y = ['jan','febuary','march','april','may','june','july','august','september','october','november','december']
#y_2 = np.reshape(y, (-1, 2)) --> This is not needed
#reshaping because it throws in an error to reshape
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y2 = le.fit_transform(y)
print("LabelEncoder classes =", le.classes_)
# LabelEncoder classes = ['april' 'august' 'december' 'febuary' 'jan' 'july' 'june' 'march' 'may' 'november' 'october' 'september']

Explaining LSTM keras with Eli5 library

I'm trying to use Eli5 for explaining an LSTM keras model for time series prediction. The keras model receives as input an array with shape (nsamples, timesteps, nfeatures).
This is my code:
def baseline_model():
model = Sequential()
model.add(LSTM(32, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(loss='logcosh', optimizer='adam')
return model
from keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
import eli5
from eli5.sklearn import PermutationImportance
my_model = KerasRegressor(build_fn= baseline_model, nb_epoch= 30, batch_size= 32, verbose= False)
history = my_model.fit(X_train, y_train)
So far, everything is OK. The problem is that executing the following line raises an error:
# X_train has a shape equal to (nsamples, timesteps, nfeatures) and y_train has a shape (nsamples)
perm = PermutationImportance(my_model, random_state=1).fit(X_train, y_train)
Error:
ValueError Traceback (most recent call last)
in ()
2 d2_train_dataset = X_train.reshape((nsamples, timesteps * features))
3
----> 4 perm = PermutationImportance(my_model, random_state=1).fit(X_train, y_train)
5 #eli5.show_weights(perm, feature_names = X.columns.tolist())
~/anaconda3/lib/python3.6/site-packages/eli5/sklearn/permutation_importance.py in fit(self, X, y, groups, **fit_params)
183 self.estimator_.fit(X, y, **fit_params)
184
--> 185 X = check_array(X)
186
187 if self.cv not in (None, "prefit"):
~/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
568 if not allow_nd and array.ndim >= 3:
569 raise ValueError("Found array with dim %d. %s expected <= 2."
--> 570 % (array.ndim, estimator_name))
571 if force_all_finite:
572 _assert_all_finite(array,
ValueError: Found array with dim 3. Estimator expected <= 2.
What can I do to fix this error? How can I use eli5 with my LSTM Keras Model?

Resources