I'm running xgboost for machine learning, and after successfully training a model with XGBClassifier, I want to make plots of the results.
A minimal working example of my input data in JSON format:
[{"age":58,"Deceased":"False","sex":"False"},{"Deceased":"False","age":59,"sex":"False"},{"sex":"False","age":"68","Deceased":"False"},{"Deceased":"False","age":"26","sex":"False"},{"Deceased":"False","age":87,"sex":"False"},{"sex":"True","age":31,"Deceased":"False"},{"Deceased":"False","age":"35","sex":"False"},{"sex":"False","Deceased":"False","age":41},{"age":"78","Deceased":"False","sex":"True"},{"Deceased":"False","age":"45","sex":"True"},{"sex":"False","age":56,"Deceased":"False"},{"sex":"False","Deceased":"False","age":"26"},{"sex":"True","age":"64","Deceased":"False"},{"sex":"False","age":"37","Deceased":"False"},{"age":"86","Deceased":"True","sex":"False"},{"age":76,"Deceased":"True","sex":"True"},{"Deceased":"True","age":69,"sex":"False"},{"Deceased":"True","age":79,"sex":"True"}]
Following the advice at https://evgenypogorelov.com/multiclass-xgb-shap.html, my script is:
import mlflow
import sys, os
import json
import mlflow.sklearn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
import xgboost
import shap
from sklearn.metrics import accuracy_score, precision_score, plot_roc_curve
def ref_to_json_file(data, filename):
    json1 = json.dumps(data)
    f = open(filename, "w+")
    print(json1, file=f)

def xgbclassifier_wrapper(json_file, dependent_var, output_stem):
    # https://xgboost.readthedocs.io/en/latest/parameter.html
    pandasDF = pd.read_json(json_file)
    bool_cols = ["Deceased", "sex"]  # , 'Hospitalized', 'Respiratory_Support', 'sex'
    for col in bool_cols:
        pandasDF[col] = pandasDF[col] == 'True'
    Y = pandasDF[dependent_var]
    X = pandasDF.drop([dependent_var], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
    mlflow.sklearn.autolog()
    # With autolog() enabled, all model parameters, a model score, and the fitted model are automatically logged.
    with mlflow.start_run():
        # Set the model parameters.
        n_estimators = 200
        colsample_bytree = 0.3
        learning_rate = 0.05
        max_depth = 6  # default 6; max. depth of a tree. Increasing this value will make the model more complex and more likely to overfit. 0 is only accepted in the lossguided growing policy when tree_method is hist or gpu_hist, and it indicates no limit on depth. Beware that XGBoost aggressively consumes memory when training a deep tree.
        # min_child_rate = 0
        gamma = 0  # default = 0; minimum loss reduction required to make a further partition on a leaf node of the tree. The larger gamma is, the more conservative the algorithm will be.
        # Create and train the model.
        xg_clf = xgboost.XGBClassifier(n_estimators=n_estimators, colsample_bytree=colsample_bytree, learning_rate=learning_rate, max_depth=max_depth)
        xg_clf.fit(X_train, y_train)
        # Use the model to make predictions on the test dataset.
        predictions = xg_clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        pre_score = precision_score(y_test, predictions)
        feature_importances = pd.DataFrame(xg_clf.feature_importances_, index=X.columns, columns=['importance'])
        feature_importances.to_json("data/" + output_stem + '.feature_importances.json')
        kfold = KFold(n_splits=10)
        results = cross_val_score(xg_clf, X, Y, cv=kfold)
        accuracy = results.mean() * 100
        roc = plot_roc_curve(xg_clf, X_test, y_test, name=dependent_var)
    return accuracy
json_file = 'debug.json'#"/home/con/covid_study2065/data/pat.data.array.json"
if not os.path.isfile(json_file):
sys.exit("json file doesn't exist.")
deceased = xgbclassifier_wrapper(json_file, "Deceased", 'debug')
explainer = shap.TreeExplainer(deceased.xg_clf, model_output = "raw", feature_perturbation="interventional", data = deceased.X)
explainer = shap.TreeExplainer(deceased.xg_clf, model_output = "probability", feature_perturbation="interventional", data = deceased.X)
This gives the following error:
Exception ignored in: 'array_dealloc'
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/shap/explainers/_tree.py", line 1353, in __init__
_cext.dense_tree_update_weights(
SystemError: <class 'DeprecationWarning'> returned a result with an error set
Found a NULL input array in _cext_dense_tree_update_weights!
Traceback (most recent call last):
File "debug.py", line 97, in <module>
explainer = shap.TreeExplainer(deceased.xg_clf, model_output = "probability", feature_perturbation="interventional", data = deceased.X)
File "/usr/local/lib/python3.8/dist-packages/shap/explainers/_tree.py", line 147, in __init__
self.model = TreeEnsemble(model, self.data, self.data_missing, model_output)
File "/usr/local/lib/python3.8/dist-packages/shap/explainers/_tree.py", line 827, in __init__
self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)
File "/usr/local/lib/python3.8/dist-packages/shap/explainers/_tree.py", line 1522, in get_trees
trees.append(SingleTree({
File "/usr/local/lib/python3.8/dist-packages/shap/explainers/_tree.py", line 1353, in __init__
_cext.dense_tree_update_weights(
SystemError: <built-in function dense_tree_update_weights> returned NULL without setting an error
When I look at deceased.xg_clf, which is the input to shap.TreeExplainer, I see:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
importance_type='gain', interaction_constraints='',
learning_rate=0.05, max_delta_step=0, max_depth=6,
min_child_weight=1, missing=nan, monotone_constraints='()',
n_estimators=200, n_jobs=1, num_parallel_tree=1, random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=None)
Adjusting the input to XGBClassifier to the same parameters that the tutorial used, viz.
xgboost.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
gamma=0.0, max_delta_step=0.0, min_child_weight=1.0,
missing=None, n_jobs=-1, objective='binary:logistic', random_state=42, reg_alpha=0.0,
reg_lambda=1.0, scale_pos_weight=1.0, tree_method='auto')
also gives the same error as my parameters.
I have literally no idea what's causing this error, and the message isn't helpful: I never call anything like array_dealloc myself, which I thought was a C-level thing to do.
This error also occurs when doing a parameter grid search.
I'm running Python 3.8.0 on Ubuntu 18.04 on a VM, using shap 0.38.1. The error also occurs on Python 3.8.5, and with Ubuntu 20.04.2 LTS (Focal Fossa) 64-bit, Linux kernel 5.8.044-generic x86_64.
Updating to shap version 0.39.0 did not help.
I tried updating to Python 3.8.8, but that made the situation even worse, because one of the dependencies of shap isn't compatible with that version:
Collecting slicer==0.0.7 (from shap)
Could not find a version that satisfies the requirement slicer==0.0.7 (from shap) (from versions: )
No matching distribution found for slicer==0.0.7 (from shap)
I've opened an issue on their GitHub page: https://github.com/slundberg/shap/issues/1844
Also, my versions of xgboost, numpy, and scipy are all up to date:
Requirement already up-to-date: xgboost in /usr/local/lib/python3.8/dist-packages (1.3.3)
Requirement already satisfied, skipping upgrade: numpy in /usr/local/lib/python3.8/dist-packages (from xgboost) (1.19.5)
Requirement already satisfied, skipping upgrade: scipy in /usr/local/lib/python3.8/dist-packages (from xgboost) (1.6.1)
How can I get the shap library to run?
Or is there some alternative to shap that I could use?
In my case, the pandas DataFrame used to train the XGBClassifier had features with the data type 'bool'.
After removing these features or converting them to 'integer', the problem was solved.
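For example, a minimal sketch of that conversion (the DataFrame and column names here are only illustrative, not taken from the original question):

import pandas as pd

df = pd.DataFrame({"age": [58, 76, 31], "sex": [False, True, True]})

# Convert every bool column to int before fitting, so the booster that SHAP
# later inspects is built from purely numeric features
bool_cols = df.select_dtypes(include="bool").columns
df[bool_cols] = df[bool_cols].astype(int)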
The solution was that there was an error in the calls to TreeExplainer; the problem is that the error message was "less than awesome". The working script:
import mlflow
import sys, os
import json
import mlflow.sklearn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
import xgboost
import shap
from sklearn.metrics import accuracy_score, precision_score, plot_roc_curve
def ref_to_json_file(data, filename):
    json1 = json.dumps(data)
    f = open(filename, "w+")
    print(json1, file=f)

class xgb_result:
    def __init__(self, xgb_result, X_test):
        self.xgb_result = xgb_result
        self.X_test = X_test

def xgbclassifier_wrapper(json_file, dependent_var, output_stem):
    # https://xgboost.readthedocs.io/en/latest/parameter.html
    pandasDF = pd.read_json(json_file)
    bool_cols = ["Deceased", "sex"]  # , 'Hospitalized', 'Respiratory_Support', 'sex'
    for col in bool_cols:
        pandasDF[col] = pandasDF[col] == 'True'
    Y = pandasDF[dependent_var]
    X = pandasDF.drop([dependent_var], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
    mlflow.sklearn.autolog()
    # With autolog() enabled, all model parameters, a model score, and the fitted model are automatically logged.
    with mlflow.start_run():
        # Set the model parameters.
        n_estimators = 200
        colsample_bytree = 0.3
        learning_rate = 0.05
        max_depth = 6  # default 6; max. depth of a tree. Increasing this value will make the model more complex and more likely to overfit.
        # min_child_rate = 0
        gamma = 0  # default = 0; minimum loss reduction required to make a further partition on a leaf node of the tree. The larger gamma is, the more conservative the algorithm will be.
        # Create and train the model.
        xg_clf = xgboost.XGBClassifier(n_estimators=n_estimators, colsample_bytree=colsample_bytree, learning_rate=learning_rate, max_depth=max_depth)
        xg_clf.fit(X_train, y_train)
        # Use the model to make predictions on the test dataset.
        predictions = xg_clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        pre_score = precision_score(y_test, predictions)
        feature_importances = pd.DataFrame(xg_clf.feature_importances_, index=X.columns, columns=['importance'])
        feature_importances.to_json("data/" + output_stem + '.feature_importances.json')
        kfold = KFold(n_splits=10)
        results = cross_val_score(xg_clf, X, Y, cv=kfold)
        accuracy = results.mean() * 100
        roc = plot_roc_curve(xg_clf, X_test, y_test, name=dependent_var)
    return_object = xgb_result(xg_clf, X_test)
    return return_object
json_file = 'debug.json'#"/home/con/covid_study2065/data/pat.data.array.json"
if not os.path.isfile(json_file):
sys.exit("json file doesn't exist.")
deceased = xgbclassifier_wrapper(json_file, "Deceased", 'debug')
shap_values = shap.TreeExplainer(deceased.xgb_result).shap_values(deceased.X_test)
shap_interaction_values = shap.TreeExplainer(deceased.xgb_result).shap_interaction_values(deceased.X_test)
#explainer = shap.TreeExplainer(deceased, model_output = "raw", feature_perturbation="interventional", data = deceased.X)
#explainer = shap.TreeExplainer(deceased.xg_clf, model_output = "probability", feature_perturbation="interventional", data = deceased.X)
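Since the original goal was to make plots of the results, here is a minimal sketch of how the shap_values computed above could be plotted; the summary_plot calls are standard shap API, but the output file names are only examples:

# Bar chart of mean |SHAP value| per feature
shap.summary_plot(shap_values, deceased.X_test, plot_type="bar", show=False)
plt.savefig("data/debug.shap_bar.png", bbox_inches="tight")
plt.close()

# Beeswarm plot of per-sample SHAP values
shap.summary_plot(shap_values, deceased.X_test, show=False)
plt.savefig("data/debug.shap_beeswarm.png", bbox_inches="tight")
plt.close()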
I'm trying to do multiple linear regression with sklearn, and I have performed the following steps. However, when predicting y_pred with the trained model, I get a perfect r^2 = 1.0. Does anyone know why this is the case / what's going wrong with my code?
Also, sorry, I'm new to this site, so I'm not fully up to speed with the formatting/etiquette of questions!
import numpy as np
import pandas as pd
# Import and subset data
ml_data_all = pd.read_excel('C:/Users/User/Documents/RSEM/STADM/Coursework/Crime_SF/Machine_learning_collated_data.xlsx')
ml_data_1218 = ml_data_all[ml_data_all['Year'] >= 2012]
ml_data_1218.drop(columns=['Pop_MOE',
'Pop_density_MOE',
'Age_median_MOE',
'Sex_ratio_MOE',
'Income_median_household_MOE',
'Pop_total_pov_status_determ_MOE',
'Pop_total_50percent_pov_MOE',
'Pop_total_125percent_pov_MOE',
'Poverty_percent_below_MOE',
'Total_labourforceMOE',
'Unemployed_total_MOE',
'Unemployed_total_male_MOE'], inplace=True)
# Taking care of missing data
# Delete rows containing any NaNs
ml_data_1218.dropna(axis=0,
how='any',
inplace=True)
# DATA PREPROCESSING
# Defining X and y
X = ml_data_1218.drop(columns=['Year']).values
y = ml_data_1218['Burglaries '].values
# Encoding categorical data
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
transformer = ColumnTransformer(transformers=[("cat", OneHotEncoder(), [0])], remainder='passthrough')
X = transformer.fit_transform(X)
X.toarray()
X = pd.DataFrame.sparse.from_spmatrix(X)
# Split into Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Feature scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train.iloc[:,149:] = sc_X.fit_transform(X_train.iloc[:,149:])
X_test.iloc[:,149:] = sc_X.transform(X_test.iloc[:,149:])
# Fitting multiple linear regression to training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
# Predicting test set results
y_pred = regressor.predict(X_test)
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)
So it turns out it was a silly mistake in the end: I forgot to drop the dependent variable (Burglaries) from the X columns, hence why the linear regression model was making perfect predictions. Now it's working (r2 = 0.56). Thanks everyone!
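For anyone else hitting this, a minimal sketch of the fix, reusing the column names from the question (note the trailing space in 'Burglaries '):

# Drop the year AND the dependent variable from the feature matrix
X = ml_data_1218.drop(columns=['Year', 'Burglaries ']).values
y = ml_data_1218['Burglaries '].values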
With regression, it's often a good idea to run a correlation matrix against all of your variables (IVs and the DV). Regression likes parsimony, so removing IVs that are functionally the same (and leaving just one in the model) is better for the R^2 value (i.e., model fit). Also, if something is correlated at .97 or higher with the DV, it is basically a substitute for the DV, and all the other data is most likely superfluous.
When reading your issue (before I saw your answer), I was thinking: either this person has outrageous correlation issues, or the DV is also in the prediction data.
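A minimal sketch of that check with pandas, assuming the DataFrame and column names from the question:

# Correlation of every numeric variable (IVs and the DV) with every other variable
corr = ml_data_1218.corr()

# Correlations with the DV, sorted; anything near 1.0 is a red flag
print(corr['Burglaries '].sort_values(ascending=False))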
I would like to visualize my decision tree with export_graphviz; however, I keep getting the following error:
File "C:\Users\User\AppData\Local\Continuum\anaconda3\envs\data_science\lib\site-packages\sklearn\utils\validation.py", line 951, in check_is_fitted
raise NotFittedError(msg % {'name': type(estimator).__name__})
NotFittedError: This Pipeline instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.
I am pretty sure my Pipeline is fitted, because I call predict in my code and that works just fine. Here is the code in question:
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz
# Parameters for model building and reproducibility
state = 13
data_age.dropna(inplace=True)
X_age = data_age.iloc[:,0:77]
y_age = data_age.iloc[:,77]
X = X_age
y = y_age
#split between testing and training set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= state)
# Pipeline with the regressor
regressors = [DecisionTreeRegressor(random_state = state)]
for reg in regressors:
    steps = [('regressor', reg)]
    pipeline = Pipeline(steps)  # seed that controls the random grid search

    # Train the model
    pipeline.set_params(regressor__max_depth=5, regressor__min_samples_split=5, regressor__min_samples_leaf=5).fit(X_train, y_train)
    pred = pipeline.predict(X_test)
    pipeline.score(X_test, y_test)
    export_graphviz(pipeline, out_file='tree.dot')
I know I don't really need the Pipeline here, but I would still like to understand what the problem is for future reference, and to be able to plot a decision tree within a pipeline that has been fitted.
So, based on Farseer's answer, the last line has to be:
#Train the model
pipeline.set_params(regressor__max_depth = 5, regressor__min_samples_split =5, regressor__min_samples_leaf = 5).fit(X_train, y_train)
pred = pipeline.predict(X_test)
pipeline.score(X_test, y_test)
#export as a .dot file
export_graphviz(regressors[0], out_file='tree.dot')
And now it works.
The signature of export_graphviz is export_graphviz(decision_tree, ...), as can be seen in the documentation.
So you should pass your decision tree as the argument to the export_graphviz function, not your Pipeline.
You can also see in the source code that export_graphviz calls the check_is_fitted(decision_tree, 'tree_') method.
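If you prefer to pull the tree back out of the fitted Pipeline instead of keeping a separate reference in the regressors list, a minimal sketch (the step name 'regressor' comes from the question's code; feature_names is optional):

# Pull the fitted DecisionTreeRegressor back out of the fitted Pipeline
fitted_tree = pipeline.named_steps['regressor']

# Export the underlying tree, optionally labelling nodes with the original feature names
export_graphviz(fitted_tree, out_file='tree.dot', feature_names=list(X_train.columns))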
I am getting an error while implementing perceptron online training in scikit-learn. I have referred to this Stack Overflow question for reference, but I am unable to figure out my mistake.
The dataset I was experimenting with has 1000 rows and 11 columns: 10 are feature columns and 1 is the class label column. I am attaching the code for your reference:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
df = pd.read_csv(r'C:\Users\sjrk\Desktop\ML\Machine learning practise\d-10.csv')
X = df[['D-0','D-1','D-2','D-3','D-4','D-5','D-6','D-7','D-8','D-9']]
y = df['C']
train_test_split =X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)
scalar_model = StandardScaler()
scalar_model.fit(X_train)
X_train_std = scalar_model.transform(X_train)
X_test_std = scalar_model.transform(X_test)
#perceptron initialization
ppn = Perceptron(n_iter = 100,eta0=0.1,random_state=0)
# Online training
num_samples = X_train_std.shape[0]
classes_y = np.unique(y_train)
X_train_std = X_train_std.reshape(700,10)
y_train = y_train.reshape(700,1)
for i in range(num_samples):
    ppn.partial_fit(X_train_std[i], y_train[i], classes=classes_y)
It throws an error like this:
ValueError: Expected 2D array, got 1D array instead:
array=[ 1.6540008 -0.09311816 -0.17325239 -1.21276374 -1.27102032 -0.51813835
1.74932495 -1.49606596 0.61310441 -0.66910947].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
There is something I am doing wrong with the reshaping in the online training.
Please help me out.
Thanks
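For what it's worth, the error message itself points at the fix: partial_fit expects a 2-D feature array even for a single sample, so each row has to be reshaped to (1, -1) and the label passed as a 1-D array. A minimal sketch of that change, reusing the variables from the question (with y_train flattened to 1-D first):

y_train = np.asarray(y_train).ravel()  # 1-D array of class labels

for i in range(num_samples):
    xi = X_train_std[i].reshape(1, -1)  # shape (1, n_features)
    yi = y_train[i:i + 1]               # 1-D array holding one label
    ppn.partial_fit(xi, yi, classes=classes_y)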
I'm trying to recompute the grid.best_score_ I obtained on my own data, without success...
So I tried it using a standard dataset, but with no more success. Here is the code:
from sklearn import datasets
from sklearn import linear_model
from sklearn.cross_validation import ShuffleSplit
from sklearn import grid_search
from sklearn.metrics import r2_score
import numpy as np
lr = linear_model.LinearRegression()
boston = datasets.load_boston()
target = boston.target
param_grid = {'fit_intercept':[False]}
cv = ShuffleSplit(target.size, n_iter=5, test_size=0.30, random_state=0)
grid = grid_search.GridSearchCV(lr, param_grid, cv=cv)
grid.fit(boston.data, target)
# got cv score computed by gridSearchCV :
print grid.best_score_
0.677708680059
# now try a custom computation of cv score
cv_scores = []
for (train, test) in cv:
    y_true = target[test]
    y_pred = grid.best_estimator_.predict(boston.data[test, :])
    cv_scores.append(r2_score(y_true, y_pred))
print np.mean(cv_scores)
0.703865991851
I can't see why they're different; GridSearchCV is supposed to use the scorer from LinearRegression, which is the r2 score. Maybe the way I compute the cv score is not the way best_score_ is computed... I'm asking here before going through the GridSearchCV code.
Unless refit=False is passed to the GridSearchCV constructor, the winning estimator is refit on the entire dataset at the end of fit. best_score_ is the winning configuration's average score over the cross-validation splits, whereas best_estimator_ is an estimator with that configuration fit on all the data. Your loop therefore scores a model that has already seen the test folds, which is why its number comes out higher. Refitting a fresh estimator per split reproduces best_score_:
lr2 = linear_model.LinearRegression(fit_intercept=False)
scores2 = [lr2.fit(boston.data[train,:], target[train]).score(boston.data[test,:], target[test])
for train, test in cv]
print np.mean(scores2)
Will print 0.67770868005943297.