How to extract the actual values from the shap summary plot - python-3.x

How can I extract the numerical values behind the SHAP summary plot so that the data can be viewed in a DataFrame?
Here is a MWE:
# Minimal working example: fit a random forest on synthetic 10-class data,
# compute SHAP values for the test split and request a bar summary plot.
from sklearn.datasets import make_classification
from shap import Explainer, waterfall_plot, Explanation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Generate noisy Data
X, y = make_classification(n_samples=1000,
n_features=50,
n_informative=9,
n_redundant=0,
n_repeated=0,
n_classes=10,
n_clusters_per_class=1,
class_sep=9,
flip_y=0.2,
random_state=17)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
model = RandomForestClassifier()
model.fit(X_train, y_train)
explainer = Explainer(model)
# SHAP values are computed for the test split and bound to `sv`.
sv = explainer.shap_values(X_test)
# NOTE(review): neither `shap` nor `shap_values` is defined in this snippet
# (only individual names are imported from `shap`, and the result above is
# called `sv`).
shap.summary_plot(shap_values, X_train, plot_type="bar")
I tried
np.abs(shap_values.values).mean(axis=0)
but I get a shape of (50, 10). How do I get just the aggregated value for each feature, so that I can then sort the features by importance?

You've done this:
# Reproduction of the question's setup: fit a random forest on synthetic
# 10-class data, compute SHAP values for the test split and draw the bar
# summary plot.
import numpy as np  # needed by the aggregation snippets that follow
import pandas as pd  # needed to build the DataFrame of aggregated values
from sklearn.datasets import make_classification
from shap import Explainer, waterfall_plot, Explanation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from shap import summary_plot

# Generate noisy Data
X, y = make_classification(
    n_samples=1000,
    n_features=50,
    n_informative=9,
    n_redundant=0,
    n_repeated=0,
    n_classes=10,
    n_clusters_per_class=1,
    class_sep=9,
    flip_y=0.2,
    random_state=17,
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
model = RandomForestClassifier()
model.fit(X_train, y_train)
explainer = Explainer(model)
# SHAP values are computed for the rows of X_test ...
sv = explainer.shap_values(X_test)
# ... so the same rows are passed as the feature matrix of the plot.
summary_plot(sv, X_test, plot_type="bar")
Note, you have features 3, 29, 34 and so on at the top.
If you do:
np.abs(sv).shape
(10, 250, 50)
You'll find out you've got 10 classes for 250 datapoints for 50 features.
If you aggregate, you'll get everything you need:
aggs = np.abs(sv).mean(1)
aggs.shape
(10, 50)
You can draw it:
sv_df = pd.DataFrame(aggs.T)
sv_df.plot(kind="barh",stacked=True)
And if it still doesn't look familiar, you can rearrange and filter:
sv_df.loc[sv_df.sum(1).sort_values(ascending=True).index[-10:]].plot(kind="barh",stacked=True)
Conclusion:
sv_df are aggregated SHAP values, as in summary plot, arranged as features per row and classes per column.
Does it help?

Related

I am using lazy classifier for my dataset but it returns empty frame

# Run LazyClassifier on a train/test split and print the `models` result.
from sklearn.model_selection import train_test_split
import lazypredict
from lazypredict.Supervised import LazyClassifier
# NOTE(review): `np` and `skin_new_df` are not defined in this snippet.
# Target: the 'diagnostic' column; features: every other column.
y = np.array(skin_new_df['diagnostic'])
X = np.array(skin_new_df.drop(['diagnostic'], axis=1))
print(X.shape)
print(y.shape)
# Hold out 20% of the rows as the test set, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)
# NOTE(review): ignore_warnings=True presumably hides per-model failures;
# setting it to False may show why the result table is empty — confirm against
# the lazypredict documentation.
clf = LazyClassifier(verbose=0,
ignore_warnings=True,
custom_metric = None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
print(models)
I run this code and get an empty DataFrame as output:
(2298, 25)
(2298,)
100%|██████████| 29/29 [00:08<00:00, 3.61it/s]
Accuracy Balanced Accuracy ROC AUC F1 Score Time Taken
Model
I want to get the accuracy of all the models.

LogisticRegression classifier

I need to use a Logistic Regression classifier. I have a dataset in which each column has 2000 entries. This is all my code:
# Train a logistic-regression model, request cross-validated predictions and
# print the accuracy on the training set.
from statistics import mode
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
# Importing the datasets
###Social_Network_Ads
datasets = pd.read_csv('C:/Users/n3.csv',header=None)
# Features: columns 0-4; target: column 5.
X = datasets.iloc[:, 0:5].values
Y = datasets.iloc[:, 5].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.25, random_state = 0)
# instantiate the model (using the default parameters)
model = LogisticRegression()
# fit the model with data
model.fit(X_Train, Y_Train)
# NOTE: the first argument here is `mode` (imported from `statistics` above),
# not the fitted `model`.
predicted = cross_val_predict(mode, X_Train, Y_Train, cv=5)
# Accuracy of the fitted model on the data it was trained on.
train_acc = model.score(X_Train, Y_Train)
print("The Accuracy for Training Set is {}".format(train_acc*100))
But I got this error:
TypeError: Cannot clone object '<function mode at 0x000000FD6579B9D0>'
(type <class 'function'>): it does not seem to be a scikit-learn
estimator as it does not implement a 'get_params' method.
How do I solve this?
Change this line
predicted = cross_val_predict(mode, X_Train, Y_Train, cv=5)
to
predicted = cross_val_predict(model, X_Train, Y_Train, cv=5)
You have a simple typo. You want to pass your estimator to the function but instead you passed mode which is imported from statistics. That's why the error tells you that it can not clone an object of type function. You are passing a function but it expects an estimator.

Accuracy of the logistic regression program always differs

# Fit a logistic regression on two min-max-scaled columns of the Cryotherapy
# spreadsheet and print the accuracy on a 40% hold-out split.
import math
import numpy as np
import pandas as pd
#from pandas import DataFrame
from sklearn import preprocessing,cross_validation
from sklearn.linear_model import LogisticRegression
#from sklearn.cross_validation import train_test_split
from numpy import loadtxt, where
from pylab import scatter, show, legend, xlabel, ylabel
# scale each feature into the range [0, 1] (see feature_range below)
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
df = pd.read_excel("Cryotherapy.xlsx", header=0)
# clean up data
df.columns = ["sex","age","Time","Number_of_Warts", "Type",
"Area","Result_of_Treatment"]
# NOTE(review): lowercase `x` is not used again in this snippet.
x = df["Result_of_Treatment"]
# Features: only the "Type" and "Area" columns.
X = df[["Type","Area",]]
X = np.array(X)
X = min_max_scaler.fit_transform(X)
# Target: the "Result_of_Treatment" column.
Y = df["Result_of_Treatment"]
Y = np.array(Y)
# No random_state is passed to the split.
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y,
test_size=0.4)
# train scikit learn model
clf = LogisticRegression()
clf.fit(X_train, Y_train)
accuracy = clf.score(X_test,Y_test)
print(accuracy)
Try passing a random_state to the train_test_split function. If you do not, the data will be shuffled differently on each run, producing different train and test sets and therefore a different accuracy.
Example:
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.4, random_state=1)

Why is my y_pred model only close to zero?

I am new to Python and am also learning machine learning. I got a Titanic dataset and am trying to predict who survived and who did not. But my code seems to have an issue with y_pred, as none of the predicted values is close to 1 or above. The y_test and y_pred images are also attached.
# Load train.csv, impute and encode features, split the data and fit a linear
# regression on the training split.
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('train.csv')
# X: every column except the last; y: the single column at index 3 (1-D).
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values
# Taking care of missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
# Only the column slice 2:3 of X is imputed.
imputer = imputer.fit(X[:, 2:3])
X[:, 2:3] = imputer.transform(X[:, 2:3])
#Encoding Categorical variable
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
onehotencoder = OneHotEncoder(categorical_features = [0])
X = onehotencoder.fit_transform(X).toarray()
# Dummy variable trap
# Drop the first column of the encoded matrix.
X = X[:, 1:]
# Splitting the Dataset into Training Set and Test Set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Split the dataset into training and test set
# NOTE(review): this second split rebinds X_train/X_test/y_test but stores the
# training labels in `y_tratin`, so `y_train` used below still comes from the
# first split; with no random_state here the two splits presumably differ —
# confirm.
from sklearn.model_selection import train_test_split
X_train, X_test, y_tratin, y_test = train_test_split(X, y, test_size = 0.2,)
# Fitting the Multiple Linear Regression to the training set
""" regressor is an object of LinearRegression() class in line 36 """
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
Thanks for the help everyone, I have been able to sort it out.
The problem was that y, in the dataset-import step, was read as a vector rather than a matrix:
# Revised script: load train.csv, impute and encode features, split once, fit
# a linear regression and predict on the test split.
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('train.csv')
# X: every column except the last.
X = dataset.iloc[:, :-1].values
# NOTE(review): the slice `3:` keeps y two-dimensional, but it selects every
# column from index 3 onward, not only column 3 — confirm this is intended.
y = dataset.iloc[:, 3:].values
# Taking care of missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
# Only the column slice 2:3 of X is imputed.
imputer = imputer.fit(X[:, 2:3])
X[:, 2:3] = imputer.transform(X[:, 2:3])
#Encoding Categorical variable
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
onehotencoder = OneHotEncoder(categorical_features = [0])
X = onehotencoder.fit_transform(X).toarray()
# Dummy variable trap
# Drop the first column of the encoded matrix.
X = X[:, 1:]
# Splitting the Dataset into Training Set and Test Set
from sklearn.model_selection import train_test_split
# Single split; all four names are bound consistently here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
# Fitting the Multiple Linear Regression to the training set
""" regressor is an object of LinearRegression() class in line 36 """
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
# Predicting the test set result
y_pred = regressor.predict(X_test)

Data Modelling - SVM

I am currently doing Data modelling and I am getting an error and couldn't find a solution to it. So I am hoping I would get some help from this platform!
Thanks in advance.
My code:-
# Split the data, fit a default SVC and print its score on the held-out part.
from sklearn import cross_validation
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn import svm
# NOTE(review): `np`, `observables` and `df` are not defined in this snippet.
X = np.array(observables) #X are features
y = np.array(df['diagnosis']) # y is label
# NOTE: the four results are unpacked in the order
# X_train, y_train, X_test, y_test.
X_train, y_train, X_test, y_test= cross_validation.train_test_split(X, y, test_size=0.2)
clf= svm.SVC()
clf.fit(X_train, y_train)
accuracy= clf.score(X_test, y_test)
print (accuracy)
However I get this error:
ValueError: bad input shape (114, 8)
It seems like you mixed up the order of the return values of train_test_split, use
X_train, X_test, y_train, y_test= cross_validation.train_test_split(X, y, test_size=0.2)
instead of
X_train, y_train, X_test, y_test= cross_validation.train_test_split(X, y, test_size=0.2)

Resources