Discounted Cumulative Gain dcg_score sklearn

from sklearn.metrics import ndcg_score, dcg_score
import numpy as np

actual = [3, 2, 0, 0, 1]
ideal = sorted(actual, reverse=True)

# list to 2D np array (dcg_score expects shape (n_samples, n_labels))
actualarr = np.asarray([actual])
idealarr = np.asarray([ideal])
print("actual score as array", actualarr)
print("ideal score as array", idealarr)

# Discounted Cumulative Gain
dcg = dcg_score(idealarr, actualarr)
print("DCG: ", dcg)
I don't understand why dcg_score takes y_score as a parameter. When I work out DCG longhand (sum of relevance/log2(rank+1)) I get the same answer, ~4.6, and I can get that using just the true scores [3,2,0,0,1], so why does the function also require the ideal score [3,2,1,0,0]?

I understand that sklearn.metrics.ndcg_score (and dcg_score) computes its sum by taking the values from y_true as if they were reordered according to y_score.
As the source code puts it: "Sum the true scores ranked in the order induced by the predicted scores".
This means the metric is computed on the induced ranking, using the true relevance values.
A small example:
import numpy as np
from sklearn.metrics import dcg_score
def naive_dcg(y_score):
    # plugs the raw score values straight into the DCG formula, in the given order,
    # using the exponential gain (2**score - 1) / log2(position + 1)
    score = 0
    for i, n in enumerate(y_score[0]):
        num = 2**n - 1
        den = np.log2(i + 1 + 1)
        score += num / den
    return score
y_true = [[1,0]]
y_score = [[0,1]]
print(f'sklearn: {dcg_score(y_true,y_score):.2}, naive: {naive_dcg(y_score):.2}')
y_score = [[0.1,0.2]]
print(f'sklearn: {dcg_score(y_true,y_score):.2}, naive: {naive_dcg(y_score):.2}')
outputs:
sklearn: 0.63, naive: 0.63
sklearn: 0.63, naive: 0.17
which shows that the naive version produces a different value for the same ranking order, while sklearn's result depends only on the induced ordering.
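To tie this back to the original question: y_true holds the relevance grades and y_score holds the model's predicted scores; dcg_score reorders y_true by y_score and then applies the longhand sum relevance/log2(rank+1). A minimal sketch of that check (the score values below are made up, chosen only so that they induce the ideal ordering and contain no ties):

import numpy as np
from sklearn.metrics import dcg_score

y_true = np.asarray([[3, 2, 0, 0, 1]])               # true relevance grades
y_score = np.asarray([[0.9, 0.8, 0.1, 0.05, 0.3]])   # hypothetical predicted scores, no ties

# order induced by y_score (descending), then sum relevance / log2(rank + 1), rank starting at 1
order = np.argsort(-y_score[0])
longhand = sum(rel / np.log2(rank + 2) for rank, rel in enumerate(y_true[0][order]))

print(longhand)                    # ~4.76
print(dcg_score(y_true, y_score))  # should match the longhand value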

Related

How to get the threshold from a specific precision and recall

I'm trying to get the threshold for a specific precision and recall. Let's say I want the threshold at a precision of 60% and a recall of 40%. Is there any straightforward way to do it using the sklearn package?
from sklearn.metrics import precision_recall_curve
import pandas as pd

# y_val: true labels, y_e: predicted scores/probabilities
precision, recall, threshold = precision_recall_curve(y_val, y_e)
df_pr = pd.DataFrame()
df_pr['precision'] = precision
df_pr['recall'] = recall
# precision/recall have one more element than threshold, so pad the thresholds
df_pr['threshold'] = list(threshold) + [1]
   precision    recall  threshold
0   0.247543  1.000000   0.059483
1   0.247486  0.999692   0.059489
2   0.247504  0.999692   0.059512
3   0.247523  0.999692   0.059542
Provided that I've properly understood your question, the point to highlight, in my opinion, is that precision and recall are not necessarily coupled the way you seem to imply: a given precision does not pin down a single recall or threshold. Here's a toy example:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
X, y = make_classification(n_samples=1000, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=7)
lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)
y_scores = lr.predict_proba(X_test)
precision, recall, threshold = precision_recall_curve(y_test, y_scores[:, 1])
plt.plot(threshold, precision[:-1], 'b--', label='Precision')
plt.plot(threshold, recall[:-1], 'r--', label='Recall')
plt.xlabel('Threshold')
plt.legend(loc='lower left')
plt.ylim([0,1])
This said, the problem becomes something you can easily solve either with numpy or pandas, depending on your "setting". For instance, here's a toy function returning precision, recall and threshold at the index where the condition is attained.
def prt(arr, value):
    array = np.asarray(arr)
    idx = np.where(array[:-1] == value)[0][0]
    return precision[idx], recall[idx], threshold[idx]
prt(precision, 0.6)  # I checked beforehand that precision=0.6 is exactly attained; otherwise you'll need something custom.
(0.6, 0.9622641509433962, 0.052229434776723364)
Otherwise, to resemble your setting with a pandas DataFrame:
df = pd.DataFrame()
df['precision'] = precision[:-1]
df['recall'] = recall[:-1]
df['threshold'] = threshold
df[df.loc[:, 'precision'] == 0.6]
I would also suggest the questions "sklearn precision_recall_curve and threshold", which tries to explain how precision_recall_curve() works under the hood, and "Why does precision_recall_curve() return different values than confusion matrix?", which might be related.
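If the exact precision value you are after is never attained exactly, a nearest-match lookup is one simple custom alternative. A minimal sketch, assuming precision, recall and threshold come from precision_recall_curve as above:

import numpy as np

def prt_nearest(target_precision):
    # index of the curve point whose precision is closest to the target
    idx = np.argmin(np.abs(precision[:-1] - target_precision))
    return precision[idx], recall[idx], threshold[idx]

prt_nearest(0.6)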

sklearn and statsmodels getting very different logistic regression results

from sklearn.linear_model import LogisticRegression
from io import StringIO
import pandas as pd
import statsmodels.api as sm
TESTDATA = StringIO(""",age,age2,gender,average,hypertension
0,61,3721,0,0.068025807,FALSE
1,52,2704,0,0.066346102,FALSE
2,59,3481,0,0.068163704,FALSE
3,47,2209,0,0.062870186,FALSE
4,57,3249,0,0.065415069,TRUE
5,50,2500,1,0.06260146,FALSE
6,44,1936,0,0.067612307,FALSE
7,60,3600,0,0.062675767,FALSE
8,60,3600,0,0.063555558,TRUE
9,65,4225,0,0.066346102,FALSE
10,61,3721,0,0.068163704,FALSE
11,52,2704,0,0.062870186,FALSE
12,59,3481,0,0.065415069,FALSE
13,47,2209,0,0.06260146,FALSE
14,57,2209,0,0.067612307,TRUE
15,50,3249,1,0.067612307,FALSE
16,44,2500,0,0.067612307,FALSE
17,50,1936,0,0.062675767,FALSE
18,44,3600,0,0.063555558,FALSE
19,60,3600,0,0.066346102,TRUE
20,60,4225,0,0.068163704,TRUE
21,65,3721,0,0.062870186,TRUE
22,61,3600,0,0.065415069,FALSE
23,52,3600,0,0.06260146,FALSE
24,57,4225,0,0.067612307,FALSE
25,50,2209,1,0.066346102,TRUE
26,44,3249,0,0.068163704,FALSE
27,60,2500,0,0.062870186,FALSE
28,60,1936,0,0.065415069,FALSE
29,60,3600,0,0.06260146,FALSE
30,65,3600,0,0.067612307,FALSE
31,61,4225,0,0.066346102,FALSE
32,52,3721,0,0.068163704,TRUE
33,59,2704,0,0.062870186,FALSE
34,47,3249,0,0.065415069,FALSE
35,57,2500,1,0.06260146,TRUE
36,50,1936,0,0.067612307,FALSE
37,60,3600,0,0.062675767,FALSE
38,57,3600,0,0.063555558,FALSE
39,50,4225,0,0.067508574,FALSE
40,44,3721,0,0.068163704,TRUE
41,50,3600,0,0.066346102,FALSE
42,44,3600,0,0.068163704,FALSE
43,60,4225,0,0.062870186,TRUE
44,60,3600,0,0.065415069,TRUE
45,33,4225,1,0.06260146,TRUE
46,44,3721,0,0.067612307,FALSE
47,60,2704,0,0.067508574,FALSE
48,60,3600,0,0.068025807,FALSE
49,65,4225,0,0.066346102,FALSE
50,61,3721,0,0.068163704,FALSE
51,52,3600,0,0.062870186,TRUE
52,60,3600,0,0.065415069,FALSE
53,65,4225,0,0.066346102,FALSE
54,61,2209,0,0.062870186,TRUE
55,52,3600,1,0.065415069,FALSE
56,59,4225,0,0.068163704,FALSE
57,47,3721,0,0.062870186,FALSE
58,57,3600,0,0.065415069,TRUE
59,50,3600,0,0.06260146,FALSE
60,44,4225,0,0.067612307,FALSE
61,60,3721,0,0.066346102,FALSE
62,34,1936,0,0.068163704,FALSE
63,59,3600,0,0.062870186,FALSE
64,47,3600,0,0.065415069,TRUE
65,57,4225,1,0.06260146,FALSE
66,56,1936,0,0.067612307,FALSE
67,56,2209,0,0.062675767,FALSE
68,60,3249,0,0.063555558,FALSE
69,65,2500,0,0.067508574,FALSE""")
df = pd.read_csv(TESTDATA, sep=",")
print(sm.Logit(endog=df["hypertension"], exog=df[["age", "age2", "gender", "average"]]).fit(disp=False).params)
print(LogisticRegression(fit_intercept=False, C=1e9).fit(df[["age", "age2", "gender", "average"]], df["hypertension"]).coef_)
The results are completely different:
age 0.011864
age2 0.000294
gender 1.015793
average -44.285129
[[-2.69997534e-02 8.27509854e-05 7.92208243e-01 -2.28174015e-02]]
Meanwhile, the results are the same for linear regression.
from sklearn.linear_model import LinearRegression
print(sm.OLS(endog=df["hypertension"], exog=df[["age", "age2", "gender", "average"]]).fit().params)
print(LinearRegression(fit_intercept=False).fit(df[["age", "age2", "gender", "average"]], df["hypertension"]).coef_)
Results:
age 0.002484
age2 0.000050
gender 0.223877
average -1.235937
[ 2.48380428e-03 4.98449037e-05 2.23877433e-01 -1.23593682e+00]
Why is that? It's really puzzling...
scikit-learn isn't finding the best objective value here. statsmodels does a better job in this particular example. The only difference appears to be the choice of optimizer, and if statsmodels is forced to use the same optimizer as scikit-learn, the estimated parameter values are the same.
from sklearn.linear_model import LogisticRegression
from io import StringIO
import numpy as np
import pandas as pd
import statsmodels.api as sm
TESTDATA = StringIO(""",age,age2,gender,average,hypertension
0,61,3721,0,0.068025807,FALSE
1,52,2704,0,0.066346102,FALSE
2,59,3481,0,0.068163704,FALSE
3,47,2209,0,0.062870186,FALSE
4,57,3249,0,0.065415069,TRUE
5,50,2500,1,0.06260146,FALSE
6,44,1936,0,0.067612307,FALSE
7,60,3600,0,0.062675767,FALSE
8,60,3600,0,0.063555558,TRUE
9,65,4225,0,0.066346102,FALSE
10,61,3721,0,0.068163704,FALSE
11,52,2704,0,0.062870186,FALSE
12,59,3481,0,0.065415069,FALSE
13,47,2209,0,0.06260146,FALSE
14,57,2209,0,0.067612307,TRUE
15,50,3249,1,0.067612307,FALSE
16,44,2500,0,0.067612307,FALSE
17,50,1936,0,0.062675767,FALSE
18,44,3600,0,0.063555558,FALSE
19,60,3600,0,0.066346102,TRUE
20,60,4225,0,0.068163704,TRUE
21,65,3721,0,0.062870186,TRUE
22,61,3600,0,0.065415069,FALSE
23,52,3600,0,0.06260146,FALSE
24,57,4225,0,0.067612307,FALSE
25,50,2209,1,0.066346102,TRUE
26,44,3249,0,0.068163704,FALSE
27,60,2500,0,0.062870186,FALSE
28,60,1936,0,0.065415069,FALSE
29,60,3600,0,0.06260146,FALSE
30,65,3600,0,0.067612307,FALSE
31,61,4225,0,0.066346102,FALSE
32,52,3721,0,0.068163704,TRUE
33,59,2704,0,0.062870186,FALSE
34,47,3249,0,0.065415069,FALSE
35,57,2500,1,0.06260146,TRUE
36,50,1936,0,0.067612307,FALSE
37,60,3600,0,0.062675767,FALSE
38,57,3600,0,0.063555558,FALSE
39,50,4225,0,0.067508574,FALSE
40,44,3721,0,0.068163704,TRUE
41,50,3600,0,0.066346102,FALSE
42,44,3600,0,0.068163704,FALSE
43,60,4225,0,0.062870186,TRUE
44,60,3600,0,0.065415069,TRUE
45,33,4225,1,0.06260146,TRUE
46,44,3721,0,0.067612307,FALSE
47,60,2704,0,0.067508574,FALSE
48,60,3600,0,0.068025807,FALSE
49,65,4225,0,0.066346102,FALSE
50,61,3721,0,0.068163704,FALSE
51,52,3600,0,0.062870186,TRUE
52,60,3600,0,0.065415069,FALSE
53,65,4225,0,0.066346102,FALSE
54,61,2209,0,0.062870186,TRUE
55,52,3600,1,0.065415069,FALSE
56,59,4225,0,0.068163704,FALSE
57,47,3721,0,0.062870186,FALSE
58,57,3600,0,0.065415069,TRUE
59,50,3600,0,0.06260146,FALSE
60,44,4225,0,0.067612307,FALSE
61,60,3721,0,0.066346102,FALSE
62,34,1936,0,0.068163704,FALSE
63,59,3600,0,0.062870186,FALSE
64,47,3600,0,0.065415069,TRUE
65,57,4225,1,0.06260146,FALSE
66,56,1936,0,0.067612307,FALSE
67,56,2209,0,0.062675767,FALSE
68,60,3249,0,0.063555558,FALSE
69,65,2500,0,0.067508574,FALSE""")
df = pd.read_csv(TESTDATA, sep=",")
mod = sm.Logit(endog=df["hypertension"], exog=df[[ "age", "age2", "gender","average"]])
sk_mod = LogisticRegression(fit_intercept=False, C=1e9).fit(df[["age", "age2", "gender", "average"]], df["hypertension"])
# the first positional argument of Logit.fit is start_params
res_default = mod.fit(np.squeeze(sk_mod.coef_), disp=False)
res_lbfgs = mod.fit(np.squeeze(sk_mod.coef_), method="lbfgs", disp=False)
print("The default optimizer produces a larger log-likelihood (the optimization target)")
print(f"Default: {res_default.llf}, LBFGS: {res_lbfgs.llf}")
print("LBFGS is identical to SK Learn")
print(f"SK Learn coef\n {np.squeeze(sk_mod.coef_)}")
print(f"LBFGS coef \n {np.asarray(res_lbfgs.params)}")
print("The default optimizer produces different estimates")
print(f"Default coef \n {np.asarray(res_default.params)}")
res_lbfgs_sv= mod.fit(res_default.params, method="lbfgs", disp=False)
print(f"LBFGS with better starting parameters matches the default\n {np.asarray(res_lbfgs_sv.params)}")
Running the code produces
The default optimizer produces a larger log-likelihood (the optimization target)
Default: -15.853969516447952, LBFGS: -16.30414297615966
LBFGS is identical to SK Learn
SK Learn coef
[-4.42216394e-02 2.23648541e-04 1.19470339e+00 -4.28565669e-03]
LBFGS coef
[-4.42216394e-02 2.23648541e-04 1.19470339e+00 -4.28565669e-03]
The default optimizer produces different estimates
Default coef
[ 1.33419520e-02 4.79332044e-04 1.69742850e+00 -6.53888649e+01]
LBFGS with better starting parameters matches the default
[ 1.33419520e-02 4.79332044e-04 1.69742850e+00 -6.53888649e+01]
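Since the gap comes entirely from the optimizer, another thing to try is pushing scikit-learn itself closer to the unpenalized maximum likelihood estimate by choosing a second-order solver and tightening the stopping criteria. This is only a hedged sketch; whether it actually reaches the same optimum as the statsmodels default on this particular data set has not been verified here:

# hypothetical variation, not from the original answer
sk_newton = LogisticRegression(
    fit_intercept=False,
    C=1e9,               # effectively remove the L2 penalty
    solver="newton-cg",  # second-order solver, closer in spirit to statsmodels' default Newton method
    max_iter=10000,
    tol=1e-10,
).fit(df[["age", "age2", "gender", "average"]], df["hypertension"])
print(np.squeeze(sk_newton.coef_))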

Fewer than expected purity scores in PCA analysis

I'm trying to plot a line graph of purity scores against the captured variance in PCA. The goal is to plot purity against captured variances of 89% and 99% only. In my code, when the number of components/dimensions is 2 it captures 89% of the variance, and when it is 4 it captures 99% of the variance.
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn import metrics

df = pd.read_csv("clustering.csv")
X10_df = df.drop("Class", axis=1)  # feature matrix
Y10_df = df["Class"]               # target vector
X10_df = np.array(X10_df)
Y10_df = np.array(Y10_df)

scaler = StandardScaler()  # standardizing the data
df_std = scaler.fit_transform(X10_df)

pca = PCA()
pca.fit(df_std)

purity = []
n_comp = range(2, 5)
for k in n_comp:
    pca = PCA(n_components=k)
    pca.fit(df_std)
    scores_pca = pca.transform(df_std)
    kmeans_pca = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
    pred_y12 = kmeans_pca.fit_predict(scores_pca)
    purity13 = purity_score(Y10_df, pred_y12)
    purity.append(purity13)
The function below calculates the purity score:
def purity_score(y_true, y_pred):
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix)
However, while I have four variance scores, I only have three purity scores. I expected to have four purity scores so that I could create a plot of the variance vs purity.
Why are there only three purity scores?
Here is the link to my dataset file : https://gofile.io/d/3CgFTi
This is simply because, when you use a for loop over a range, the stop value of the range is excluded. So range(2, 5) yields 2, 3 and 4 and then exits the loop. Please read up on for loops and range in Python.
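A minimal illustration of the off-by-one (not from the original answer): range excludes its stop value, so to get purity scores for four component counts the stop has to be one past the last count you want.

print(list(range(2, 5)))  # [2, 3, 4]    -> three iterations
print(list(range(2, 6)))  # [2, 3, 4, 5] -> four iterations, e.g. n_comp = range(2, 6)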

Are the values of the davies_bouldin_score coefficient in scikit-learn normalized?

Is this measure normalized between 0 and 1?
From https://scikit-learn.org/stable/modules/generated/sklearn.metrics.davies_bouldin_score.html I understand that it is not normalized, but is that specific to scikit-learn, or is it true in general?
The minimum value is 0 but the maximum can be above 1.
From the documentation:
"This index signifies the average ‘similarity’ between clusters, where the similarity is a measure that compares the distance between clusters with the size of the clusters themselves."
Zero is the lowest possible score. Values closer to zero indicate a better partition.
Example where the score is > 1:
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
iris = datasets.load_iris()
X = iris.data
kmeans = KMeans(n_clusters=13, random_state=1).fit(X)
labels = kmeans.labels_
davies_bouldin_score(X, labels)
1.068885319440245
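For contrast, a partition with a more natural number of clusters for this data typically scores closer to zero. A hedged sketch on the same iris data; the exact value is not verified here:

kmeans3 = KMeans(n_clusters=3, random_state=1).fit(X)
print(davies_bouldin_score(X, kmeans3.labels_))  # expected to be well below the 13-cluster score above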

How to interpret the model once a set of coefficients is obtained for multivariable polynomial regression?

I was solving a multivariable polynomial regression problem, as part of an online course, where one must obtain a model (in polynomial form) for the price of a car as a function of 'horsepower', 'curb-weight', 'engine-size' and 'highway-mpg'. The code given in the course slides didn't work for me, so I tried to solve the problem on my own using a slightly different approach and (I'm not sure) I succeeded.
Now I want to determine which coefficient belongs to which variable and to what power.
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

lm = LinearRegression()
pr = PolynomialFeatures(degree=2, include_bias=False)
zi = df[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']]
y = df["price"]
x_poly = pr.fit_transform(zi)
lm.fit(x_poly, y)
y_poly_pred = lm.predict(x_poly)
print(lm.intercept_)
print(lm.coef_)
The output of the 'print(lm.coef_)' is an array:
[ 3.76158683e+02, 1.09866844e+01, -1.15342835e+02, 2.20081486e+02,
1.67487147e+00, -1.85925420e-01, -1.27963440e+00, -1.97616945e+00,
5.93872420e-04, 1.11397083e-01, -2.12935236e-01, 1.04605018e-01,
2.69312438e-01, 4.36657298e+00]
How can I tell which variable and which power each of these coefficients corresponds to?
One way of doing this: you can get the polynomial feature column names like this
pr.get_feature_names(zi.columns)
and
pd.DataFrame(zip(pr.get_feature_names(zi.columns),lm.coef_),columns=["feature","coef_"])
The above should print the coefficient for each feature.
Working example:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np

data = pd.DataFrame.from_dict({
    'x': np.random.randint(low=1, high=10, size=5),
    'y': np.random.randint(low=-1, high=1, size=5),
})
lm = LinearRegression()
p = PolynomialFeatures(degree=2)
p_data = p.fit_transform(data)
lm.fit(p_data, data['y'])
print(p.get_feature_names(data.columns))
coefmapping = pd.DataFrame(zip(p.get_feature_names(data.columns), lm.coef_), columns=["feature", "coef_"])
print(coefmapping)
output:
feature coef_
0 1 -1.204939e-14
1 x -1.165951e-15
2 y 5.000000e-01
3 x^2 -6.938894e-18
4 x y -3.156113e-16
5 y^2 -5.000000e-01
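One caveat (not from the original answer): in recent scikit-learn releases PolynomialFeatures.get_feature_names has been replaced by get_feature_names_out (the old name was removed around version 1.2), so on a newer install the mapping would be built like this:

coefmapping = pd.DataFrame(
    zip(p.get_feature_names_out(data.columns), lm.coef_),
    columns=["feature", "coef_"],
)
print(coefmapping)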
