sklearn and statsmodels getting very different logistic regression results - scikit-learn

from sklearn.linear_model import LogisticRegression
from io import StringIO
import pandas as pd
import statsmodels.api as sm
TESTDATA = StringIO(""",age,age2,gender,average,hypertension
0,61,3721,0,0.068025807,FALSE 1,52,2704,0,0.066346102,FALSE
2,59,3481,0,0.068163704,FALSE 3,47,2209,0,0.062870186,FALSE
4,57,3249,0,0.065415069,TRUE 5,50,2500,1,0.06260146,FALSE
6,44,1936,0,0.067612307,FALSE 7,60,3600,0,0.062675767,FALSE
8,60,3600,0,0.063555558,TRUE 9,65,4225,0,0.066346102,FALSE
10,61,3721,0,0.068163704,FALSE 11,52,2704,0,0.062870186,FALSE
12,59,3481,0,0.065415069,FALSE 13,47,2209,0,0.06260146,FALSE
14,57,2209,0,0.067612307,TRUE 15,50,3249,1,0.067612307,FALSE
16,44,2500,0,0.067612307,FALSE 17,50,1936,0,0.062675767,FALSE
18,44,3600,0,0.063555558,FALSE 19,60,3600,0,0.066346102,TRUE
20,60,4225,0,0.068163704,TRUE 21,65,3721,0,0.062870186,TRUE
22,61,3600,0,0.065415069,FALSE 23,52,3600,0,0.06260146,FALSE
24,57,4225,0,0.067612307,FALSE 25,50,2209,1,0.066346102,TRUE
26,44,3249,0,0.068163704,FALSE 27,60,2500,0,0.062870186,FALSE
28,60,1936,0,0.065415069,FALSE 29,60,3600,0,0.06260146,FALSE
30,65,3600,0,0.067612307,FALSE 31,61,4225,0,0.066346102,FALSE
32,52,3721,0,0.068163704,TRUE 33,59,2704,0,0.062870186,FALSE
34,47,3249,0,0.065415069,FALSE 35,57,2500,1,0.06260146,TRUE
36,50,1936,0,0.067612307,FALSE 37,60,3600,0,0.062675767,FALSE
38,57,3600,0,0.063555558,FALSE 39,50,4225,0,0.067508574,FALSE
40,44,3721,0,0.068163704,TRUE 41,50,3600,0,0.066346102,FALSE
42,44,3600,0,0.068163704,FALSE 43,60,4225,0,0.062870186,TRUE
44,60,3600,0,0.065415069,TRUE 45,33,4225,1,0.06260146,TRUE
46,44,3721,0,0.067612307,FALSE 47,60,2704,0,0.067508574,FALSE
48,60,3600,0,0.068025807,FALSE 49,65,4225,0,0.066346102,FALSE
50,61,3721,0,0.068163704,FALSE 51,52,3600,0,0.062870186,TRUE
52,60,3600,0,0.065415069,FALSE 53,65,4225,0,0.066346102,FALSE
54,61,2209,0,0.062870186,TRUE 55,52,3600,1,0.065415069,FALSE
56,59,4225,0,0.068163704,FALSE 57,47,3721,0,0.062870186,FALSE
58,57,3600,0,0.065415069,TRUE 59,50,3600,0,0.06260146,FALSE
60,44,4225,0,0.067612307,FALSE 61,60,3721,0,0.066346102,FALSE
62,34,1936,0,0.068163704,FALSE 63,59,3600,0,0.062870186,FALSE
64,47,3600,0,0.065415069,TRUE 65,57,4225,1,0.06260146,FALSE
66,56,1936,0,0.067612307,FALSE 67,56,2209,0,0.062675767,FALSE
68,60,3249,0,0.063555558,FALSE 69,65,2500,0,0.067508574,FALSE""")
df = pd.read_csv(TESTDATA, sep=",")
print(sm.Logit(endog=df["hypertension"], exog=df[[ "age", "age2", "gender","average"]]).fit( disp=False).params)
print(LogisticRegression(fit_intercept = False, C = 1e9).fit( df[[ "age", "age2", "gender","average"]],df["hypertension"]).coef_)
The results are completely different:
age 0.011864
age2 0.000294
gender 1.015793
average -44.285129
[[-2.69997534e-02 8.27509854e-05 7.92208243e-01 -2.28174015e-02]]
Meanwhile, the results are the same for linear regression.
print(sm.OLS(endog=df["a"], exog=df[["b","c"]]).fit( disp=False).params)
print(LinearRegression(fit_intercept = False).fit( df[["b","c"]],df["a"]).coef_)
Results:
age 0.002484
age2 0.000050
gender 0.223877
average -1.235937
[ 2.48380428e-03 4.98449037e-05 2.23877433e-01 -1.23593682e+00]
Why is that? It's really puzzling...

scikit-learn isn't finding the best objective value here. statsmodels does a better job in this particular example. The only difference appears to be the choice of the optimizer, and if statsmodels is forced to use the same choice as SK learn, then the estimated parameter values are the same.
from sklearn.linear_model import LogisticRegression
from io import StringIO
import pandas as pd
import statsmodels.api as sm
TESTDATA = StringIO(""",age,age2,gender,average,hypertension
0,61,3721,0,0.068025807,FALSE 1,52,2704,0,0.066346102,FALSE
2,59,3481,0,0.068163704,FALSE 3,47,2209,0,0.062870186,FALSE
4,57,3249,0,0.065415069,TRUE 5,50,2500,1,0.06260146,FALSE
6,44,1936,0,0.067612307,FALSE 7,60,3600,0,0.062675767,FALSE
8,60,3600,0,0.063555558,TRUE 9,65,4225,0,0.066346102,FALSE
10,61,3721,0,0.068163704,FALSE 11,52,2704,0,0.062870186,FALSE
12,59,3481,0,0.065415069,FALSE 13,47,2209,0,0.06260146,FALSE
14,57,2209,0,0.067612307,TRUE 15,50,3249,1,0.067612307,FALSE
16,44,2500,0,0.067612307,FALSE 17,50,1936,0,0.062675767,FALSE
18,44,3600,0,0.063555558,FALSE 19,60,3600,0,0.066346102,TRUE
20,60,4225,0,0.068163704,TRUE 21,65,3721,0,0.062870186,TRUE
22,61,3600,0,0.065415069,FALSE 23,52,3600,0,0.06260146,FALSE
24,57,4225,0,0.067612307,FALSE 25,50,2209,1,0.066346102,TRUE
26,44,3249,0,0.068163704,FALSE 27,60,2500,0,0.062870186,FALSE
28,60,1936,0,0.065415069,FALSE 29,60,3600,0,0.06260146,FALSE
30,65,3600,0,0.067612307,FALSE 31,61,4225,0,0.066346102,FALSE
32,52,3721,0,0.068163704,TRUE 33,59,2704,0,0.062870186,FALSE
34,47,3249,0,0.065415069,FALSE 35,57,2500,1,0.06260146,TRUE
36,50,1936,0,0.067612307,FALSE 37,60,3600,0,0.062675767,FALSE
38,57,3600,0,0.063555558,FALSE 39,50,4225,0,0.067508574,FALSE
40,44,3721,0,0.068163704,TRUE 41,50,3600,0,0.066346102,FALSE
42,44,3600,0,0.068163704,FALSE 43,60,4225,0,0.062870186,TRUE
44,60,3600,0,0.065415069,TRUE 45,33,4225,1,0.06260146,TRUE
46,44,3721,0,0.067612307,FALSE 47,60,2704,0,0.067508574,FALSE
48,60,3600,0,0.068025807,FALSE 49,65,4225,0,0.066346102,FALSE
50,61,3721,0,0.068163704,FALSE 51,52,3600,0,0.062870186,TRUE
52,60,3600,0,0.065415069,FALSE 53,65,4225,0,0.066346102,FALSE
54,61,2209,0,0.062870186,TRUE 55,52,3600,1,0.065415069,FALSE
56,59,4225,0,0.068163704,FALSE 57,47,3721,0,0.062870186,FALSE
58,57,3600,0,0.065415069,TRUE 59,50,3600,0,0.06260146,FALSE
60,44,4225,0,0.067612307,FALSE 61,60,3721,0,0.066346102,FALSE
62,34,1936,0,0.068163704,FALSE 63,59,3600,0,0.062870186,FALSE
64,47,3600,0,0.065415069,TRUE 65,57,4225,1,0.06260146,FALSE
66,56,1936,0,0.067612307,FALSE 67,56,2209,0,0.062675767,FALSE
68,60,3249,0,0.063555558,FALSE 69,65,2500,0,0.067508574,FALSE""")
# numpy is used below (np.squeeze / np.asarray) but is missing from this
# snippet's import list, so import it here to keep the snippet runnable.
import numpy as np

df = pd.read_csv(TESTDATA, sep=",")
mod = sm.Logit(endog=df["hypertension"], exog=df[[ "age", "age2", "gender","average"]])
sk_mod = LogisticRegression(fit_intercept = False, C = 1e9).fit( df[[ "age", "age2", "gender","average"]],df["hypertension"])
# Fit the statsmodels model twice, both times passing the scikit-learn
# coefficients as the first argument to fit(): once with the default
# optimizer and once with method="lbfgs".
res_default = mod.fit(np.squeeze(sk_mod.coef_), disp=False)
res_lbfgs= mod.fit(np.squeeze(sk_mod.coef_), method="lbfgs", disp=False)
print("The default optimizer produces a larger log-likelihood (the optimization target)")
print(f"Default: {res_default.llf}, LBFGS: {res_lbfgs.llf}")
print("LBFGS is identical to SK Learn")
print(f"SK Learn coef\n {np.squeeze(sk_mod.coef_)}")
print(f"LBFGS coef \n {np.asarray(res_lbfgs.params)}")
print("The default optimizer produces different estimates")
print(f"Default coef \n {np.asarray(res_default.params)}")
# Re-run LBFGS, this time passing the default optimizer's estimates to fit().
res_lbfgs_sv= mod.fit(res_default.params, method="lbfgs", disp=False)
print(f"LBFGS with better starting parameters matches the default\n {np.asarray(res_lbfgs_sv.params)}")
Running the code produces
The default optimizer produces a larger log-likelihood (the optimization target)
Default: -15.853969516447952, LBFGS: -16.30414297615966
LBFGS is identical to SK Learn
SK Learn coef
[-4.42216394e-02 2.23648541e-04 1.19470339e+00 -4.28565669e-03]
LBFGS coef
[-4.42216394e-02 2.23648541e-04 1.19470339e+00 -4.28565669e-03]
The default optimizer produces different estimates
Default coef
[ 1.33419520e-02 4.79332044e-04 1.69742850e+00 -6.53888649e+01]
LBFGS with better starting parameters matches the default
[ 1.33419520e-02 4.79332044e-04 1.69742850e+00 -6.53888649e+01]

Related

Discounted Cumulative Gain dcg_score sklearn

from sklearn.metrics import ndcg_score, dcg_score
import numpy as np
actual= [3,2,0,0,1]
ideal= sorted(actual, reverse=True)
#list to np asarray
actualarr=np.asarray([actual])
idealarr= np.asarray([ideal])
print ("actual score as array", actualarr)
print("ideal score as array", idealarr)
#Discounted Cumulative Gain
dcg= dcg_score(idealarr, actualarr)
print("DCG: ", dcg)
I don't understand why dcg_score takes y_score as a parameter. When I work out DCG longhand (sum relevance/log2(i+1)) I can get the same answer ~4.6, but i can achieve this just with the true scores [3,2,0,0,1], so why does it also require the ideal score [3,2,1,0,0] in the function?
I understood that sklearn.metrics.ndcg computes its sum by taking values from y_true as if it was reordered according to y_score.
As explained inside the code: "Sum the true scores ranked in the order induced by the predicted scores"
This means the metric is computed on the induced ranking, using true relevance values.
A small example:
import numpy as np
from sklearn.metrics import dcg_score
def naive_dcg(y_score):
    """Sum an exponential-gain DCG directly over the first row of ``y_score``.

    The value ``n`` at 0-based position ``i`` contributes
    ``(2**n - 1) / log2(i + 2)``. The values are consumed in the order given:
    nothing is sorted and no separate ground-truth relevance is consulted,
    which is what makes this version "naive".

    Parameters
    ----------
    y_score : sequence of sequences
        Only ``y_score[0]`` is read.

    Returns
    -------
    The accumulated score; the integer ``0`` when the row is empty.
    """
    score = 0
    for i, n in enumerate(y_score[0]):
        num = 2**n - 1
        # Positions are 1-based inside the discount, hence i + 1 + 1.
        den = np.log2(i + 1 + 1)
        score += num / den
    return score
y_true = [[1,0]]
y_score = [[0,1]]
print(f'sklearn: {dcg_score(y_true,y_score):.2}, naive: {naive_dcg(y_score):.2}')
y_score = [[0.1,0.2]]
print(f'sklearn: {dcg_score(y_true,y_score):.2}, naive: {naive_dcg(y_score):.2}')
outputs:
sklearn: 0.63, naive: 0.63
sklearn: 0.63, naive: 0.17
which shows naive produces a different metric for the same ranking order.

How to get the threshold from a specific precision and recall

I'm trying to get the threshold for a specific precision and recall. Let's say I want to get the threshold at the precision of 60% and recall of 40%. Are there any straightforward way to do it using the sklearn package?
precision, recall, threshold = precision_recall_curve(y_val, y_e)
df_pr = pd.DataFrame()
df_pr['precision'] = precision
df_pr['recall'] = recall
df_pr['threshold'] = list(threshold) + [1]
precision recall threshold
0 0.247543 1.000000 0.059483
1 0.247486 0.999692 0.059489
2 0.247504 0.999692 0.059512
3 0.247523 0.999692 0.059542
Provided that I've properly understood your question, imo, the point to highlight is that precision and recall are not necessarily coupled as you seem to imply. Here's a toy example:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
X, y = make_classification(n_samples=1000, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=7)
lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)
y_scores = lr.predict_proba(X_test)
precision, recall, threshold = precision_recall_curve(y_test, y_scores[:, 1])
plt.plot(threshold, precision[:-1], 'b--', label='Precision')
plt.plot(threshold, recall[:-1], 'r--', label='Recall')
plt.xlabel('Threshold')
plt.legend(loc='lower left')
plt.ylim([0,1])
This said, the problem becomes something you can easily solve either with numpy or pandas, depending on your "setting". For instance, here's a toy function returning precision, recall and threshold at the index where the condition is attained.
def prt(arr, value):
    """Return (precision, recall, threshold) at the first index where ``arr`` equals ``value``.

    Reads the module-level ``precision``, ``recall`` and ``threshold`` arrays;
    they must already be defined before this function is called.

    Parameters
    ----------
    arr : array-like
        The curve to search (for example ``precision`` or ``recall``).
    value : float
        The value to look for. The comparison is exact (``==``), so the value
        must actually be attained by ``arr``.

    Raises
    ------
    IndexError
        If no element of ``arr[:-1]`` equals ``value``.
    """
    array = np.asarray(arr)
    # The last element of arr is excluded from the search.
    # NOTE(review): presumably because threshold is one element shorter than
    # precision/recall, as the surrounding snippets slice them with [:-1].
    idx = np.where(array[:-1] == value)[0][0]
    return precision[idx], recall[idx], threshold[idx]
prt(precision, 0.6) # I checked ex-ante that precision=0.6 is attained. Differently you'll have to go with something custom.
(0.6, 0.9622641509433962, 0.052229434776723364)
Otherwise, to resemble your setting with a pandas DataFrame:
df = pd.DataFrame()
df['precision'] = precision[:-1]
df['recall'] = recall[:-1]
df['threshold'] = threshold
df[df.loc[:, 'precision'] == 0.6]
I would suggest reading "sklearn precision_recall_curve and threshold", which tries to explain how .precision_recall_curve() works under the hood, and "Why does precision_recall_curve() return different values than confusion matrix?", which might be somehow related.

probability difference between categorical target and one-hot encoding target using OneVsRestClassifier

I'm a bit confused by the difference in probabilities between a categorical target and a one-hot encoded target when using sklearn's OneVsRestClassifier. Take the iris data with a simple logistic regression as an example. When I use the original iris classes [0,1,2], the OneVsRestClassifier() probabilities for each observation always add up to 1. However, if I convert the target to dummies, this is not the case. I understand that OneVsRestClassifier() compares one vs rest (class 0 vs non class 0, class 1 vs non class 1, etc.), so it makes more sense that the sum of these probabilities has no relation to 1. Then why do I see the difference, and how does it arise?
import numpy as np
import pandas as pd
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
np.set_printoptions(suppress=True)
iris = datasets.load_iris()
rng = np.random.RandomState(0)
perm = rng.permutation(iris.target.size)
X = iris.data[perm]
y = iris.target[perm]
# categorical target with no conversion
X_train, y_train1 = X[:80], y[:80]
X_test, y_test1 = X[80:], y[80:]
m3 = LogisticRegression(random_state=0)
clf1 = OneVsRestClassifier(m3).fit(X_train, y_train1)
y_pred1 = clf1.predict(X_test)
# Count correct predictions against y_test1, the held-out labels defined
# above (the name y_test is never defined in this snippet).
print(np.sum(y_pred1 == y_test1))
y_prob1 = clf1.predict_proba(X_test)
y_prob1[:5]
#output
array([[0.00014508, 0.17238549, 0.82746943],
[0.03850173, 0.79646817, 0.1650301 ],
[0.73981106, 0.26018067, 0.00000827],
[0.00016332, 0.32231163, 0.67752505],
[0.00029197, 0.2495404 , 0.75016763]])
# one hot encoding for categorical target
y2 = pd.get_dummies(y)
y_train2 = y2[:80]
y_test2 = y2[80:]
clf2 = OneVsRestClassifier(m3).fit(X_train, y_train2)
y_pred2 = clf2.predict(X_test)
y_prob2 = clf2.predict_proba(X_test)
y_prob2[:5]
#output
array([[0.00017194, 0.20430011, 0.98066319],
[0.02152246, 0.44522562, 0.09225181],
[0.96277892, 0.3385952 , 0.00001076],
[0.00023024, 0.45436925, 0.95512082],
[0.00036849, 0.31493725, 0.94676348]])
When you encode the targets, sklearn interprets your problem as a multilabel one instead of just multiclass; that is, that it is possible for a point to have more than one true label. And in that case, it is perfectly acceptable for the total sum of probabilities to be greater (or less) than 1. That's generally true for sklearn, but OneVsRestClassifier calls it out specifically in the docstring:
OneVsRestClassifier can also be used for multilabel classification. To use this feature, provide an indicator matrix for the target y when calling .fit.
As for the first approach, there are indeed three independent models, but the predictions are normalized; see the source code. Indeed, that's the only difference:
(y_prob2 / y_prob2.sum(axis=1)[:, None] == y_prob1).all()
# output
True
It's probably worth pointing out that LogisticRegression also natively supports multiclass. In that case, the weights for each class are independent, so it's similar to three separate models, but the resulting probabilities are the result of a softmax application, and the loss function minimizes the loss for each class simultaneously, so that the resulting coefficients and hence predictions can be different from those obtained from OneVsRestClassifier:
m3.fit(X_train, y_train1)
y_prob0 = m3.predict_proba(X_test)
y_prob0[:5]
# output:
array([[0.00000494, 0.01381671, 0.98617835],
[0.02569699, 0.88835451, 0.0859485 ],
[0.95239985, 0.04759984, 0.00000031],
[0.00001338, 0.04195642, 0.9580302 ],
[0.00002815, 0.04230022, 0.95767163]])

Fewer than expected purity scores in PCA analysis

I'm trying to plot the line graph of purity scores against the captured variances in PCA. The goal is to plot the line graph of purity scores against the captured variances of 89% and 99% only. In my code, when the number of components/dimensions is 2 it captures 89% of the variance, and when the number of components/dimensions is 4 it captures 99% of the variance.
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
df = pd.read_csv("clustering.csv")
X10_df = df.drop("Class",axis = 1) #feature matrix
Y10_df = df["Class"] #Target vector
X10_df = np.array(X10_df)
Y10_df = np.array(Y10_df)
scaler = StandardScaler() # Standardizing the data
df_std = scaler.fit_transform(X10_df)
pca = PCA()
pca.fit(df_std)
purity = []
# range(2, 5) yields k = 2, 3, 4, so the loop below runs three times.
n_comp = range(2,5)
for k in n_comp :
    # Project the standardized data onto the first k principal components.
    pca = PCA(n_components = k)
    pca.fit(df_std)
    scores_pca = pca.transform(df_std)
    # Cluster the projected data and score the clustering against the labels.
    kmeans_pca = KMeans(n_clusters=3, init ='k-means++', max_iter=300, n_init=10, random_state=0)
    pred_y12 = kmeans_pca.fit_predict(scores_pca)
    purity13 = purity_score(Y10_df, pred_y12)
    purity.append(purity13)
Below function calculates the purity score :
def purity_score(y_true, y_pred):
    """Return the purity of a clustering with respect to reference labels.

    Builds the contingency matrix of ``y_true`` against ``y_pred``, takes the
    largest count in each column (``axis=0``), and divides the sum of those
    maxima by the total number of counted samples.

    Parameters
    ----------
    y_true : array-like
        Reference class labels.
    y_pred : array-like
        Cluster assignments to evaluate.
    """
    # The surrounding snippet never imports ``metrics``; import it locally so
    # the function is self-contained.
    from sklearn import metrics

    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix)
However, while I have four variance scores, I only have three purity scores. I expected to have four purity scores so that I could create a plot of the variance vs purity.
Why are there only three purity scores?
Here is the link to my dataset file : https://gofile.io/d/3CgFTi
This is simply because, when you use a for loop with a range, the last number in the range is excluded. So with range(2,5), it will go 2, 3, 4 and then quit the loop. Please read up on for loops in Python.

How to interpret the model once a set of coefficient is obtained for Multivariable polynomial regression?

I was solving a multivariable polynomial regression problem, as part of an online course, where one must obtain a model (in polynomial form) for determining the 'price of a car' as a function of 'horsepower', 'curb-weight', 'engine-size' and 'highway-mpg'. The code given in the course slides didn't work for me, so I tried to solve the problem on my own using a slightly different approach and (I'm not sure) I succeeded.
Now I want to determine which coefficient belongs to which variable and to what power.
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
lm=LinearRegression()
pr=PolynomialFeatures(degree=2, include_bias=False)
zi=df[['horsepower','curb-weight','engine-size','highway-mpg']]
y=df["price"]
x_poly=pr.fit_transform(zi)
lm.fit(x_poly,y)
y_poly_pred=lm.predict(x_poly)
print(lm.intercept_)
print(lm.coef_)
The output of the 'print(lm.coef_)' is an array:
[ 3.76158683e+02, 1.09866844e+01, -1.15342835e+02, 2.20081486e+02,
1.67487147e+00, -1.85925420e-01, -1.27963440e+00, -1.97616945e+00,
5.93872420e-04, 1.11397083e-01, -2.12935236e-01, 1.04605018e-01,
2.69312438e-01, 4.36657298e+00]
How can I determine which variable, and which power, each of these coefficients corresponds to?
One way of doing this is to get the PolynomialFeatures column names like this:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
pr.get_feature_names_out(zi.columns)
and
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
pd.DataFrame(zip(pr.get_feature_names_out(zi.columns),lm.coef_),columns=["feature","coef_"])
Above should print the coef for each feature
Working example :
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np
# Small random data set with two columns, 'x' and 'y'.
data = pd.DataFrame.from_dict({
    'x': np.random.randint(low=1, high=10, size=5),
    'y': np.random.randint(low=-1, high=1, size=5),
})
lm=LinearRegression()
p = PolynomialFeatures(degree=2)
p_data = p.fit_transform(data)
lm.fit(p_data,data['y'])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. Fetch the names once and reuse them for printing and the table.
feature_names = p.get_feature_names_out(data.columns)
print (feature_names)
# Pair each generated feature name with its fitted coefficient.
coefmapping = pd.DataFrame(zip(feature_names,lm.coef_),columns=["feature","coef_"])
print(coefmapping)
output:
feature coef_
0 1 -1.204939e-14
1 x -1.165951e-15
2 y 5.000000e-01
3 x^2 -6.938894e-18
4 x y -3.156113e-16
5 y^2 -5.000000e-01

Resources