xgboost feature importance high but doesn't produce a better model - python-3.x

I am using XGboost for a binary prediction problem. I tested my model with several features and had some good results.
After adding one more feature to the model and calculating the feature importance, the importance of this feature turned out to be very high, far above that of the other features.
However, when testing the model the test score drops considerably.
Is there an explanation for this kind of behaviour ?

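There are at least a few ways to run feature importance experiments. Keep in mind that the built-in (impurity/gain-based) importance is computed on the training data, so a feature the model overfits to (for example an ID-like or very high-cardinality column) can rank highest while hurting the test score; permutation importance on held-out data (method 2 below) is a more reliable check.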
# Let's load the packages
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
import shap
from matplotlib import pyplot as plt
# Compare three feature-importance methods on a random forest.
# Global plot styling shared by every figure below.
plt.rcParams.update({'figure.figsize': (12.0, 8.0)})
plt.rcParams.update({'font.size': 14})

# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer versions substitute another regression dataset here.
boston = load_boston()
X = pd.DataFrame(boston.data, columns=boston.feature_names)
y = boston.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12)

rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)

# 1: built-in (impurity-based) importance.
print(rf.feature_importances_)

# Each chart gets its own figure: drawing several barh() calls on the same
# axes overlays them, and the categorical axis keeps the label order of the
# first call, so the sorted chart would not actually appear sorted.
plt.figure()
plt.barh(boston.feature_names, rf.feature_importances_)
plt.xlabel("Random Forest Feature Importance")
plt.show()

sorted_idx = rf.feature_importances_.argsort()
plt.figure()
plt.barh(boston.feature_names[sorted_idx], rf.feature_importances_[sorted_idx])
plt.xlabel("Random Forest Feature Importance")
plt.show()

# 2: permutation importance, evaluated on the held-out test split.
perm_importance = permutation_importance(rf, X_test, y_test)
sorted_idx = perm_importance.importances_mean.argsort()
plt.figure()
plt.barh(boston.feature_names[sorted_idx], perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")
plt.show()

# 3: SHAP values for the test split, summarised as a bar chart.
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, plot_type="bar")
Also, you can certainly add more data into your model. Models, almost without exception, produce more accurate results when they 'see' more data. Finally, you can always test other models on your dataset and see how they perform. Today at work I tested an XGboost model and a RandomForestRegressor model. I expected the former to perform better, but the latter actually performed much better. It's almost impossible to guess which model will perform better over any given dataset, you have to try multiple models, check the predictive capabilities of each, and pick the one (or maybe two) that performs the best. Having said that, you can try something like this.
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Fit several scikit-learn clustering algorithms on the same standardized
# feature matrix and draw one scatter panel per algorithm, annotated with
# the fit time.
np.random.seed(0)
pd.set_option('display.max_columns', 500)

# Load your own data here; `df` must exist before the column selection below.
#df = pd.read_csv('C:\\your_path_here\\test.csv')
#print('done!')
#df = df[:10000]
#df = df.fillna(0)
#df = df.dropna()
X = df[['RatingScore',
        'Par',
        'Term',
        'TimeToMaturity',
        'LRMScore',
        'Coupon',
        'Price']]
#select your target variable
y = df[['Spread']]
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# One colour letter per cluster label, repeated so large label values still
# index into the array.
colors = np.array(list('bgrcmykbgrcmykbgrcmykbgrcmyk'))
colors = np.hstack([colors] * 20)

clustering_names = [
    'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift',
    'SpectralClustering', 'Ward', 'AgglomerativeClustering',
    'DBSCAN', 'Birch']

plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
                    hspace=.01)
plot_num = 1

# n_samples was referenced but never defined in the original snippet.
# The blobs themselves are not used further down.
n_samples = 1500
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)

# normalize dataset for easier parameter selection
X = StandardScaler().fit_transform(X)
# estimate bandwidth for mean shift
bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
# connectivity matrix for structured Ward
connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)

# create clustering estimators
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=2)
ward = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward',
                                       connectivity=connectivity)
spectral = cluster.SpectralClustering(n_clusters=2,
                                      eigen_solver='arpack',
                                      affinity="nearest_neighbors")
dbscan = cluster.DBSCAN(eps=.2)
affinity_propagation = cluster.AffinityPropagation(damping=.9,
                                                   preference=-200)
# NOTE(review): newer scikit-learn releases rename AgglomerativeClustering's
# `affinity` argument to `metric` -- confirm against the installed version.
average_linkage = cluster.AgglomerativeClustering(
    linkage="average", affinity="cityblock", n_clusters=2,
    connectivity=connectivity)
birch = cluster.Birch(n_clusters=2)
clustering_algorithms = [
    two_means, affinity_propagation, ms, spectral, ward, average_linkage,
    dbscan, birch]

for name, algorithm in zip(clustering_names, clustering_algorithms):
    # predict cluster memberships
    t0 = time.time()
    algorithm.fit(X)
    t1 = time.time()
    if hasattr(algorithm, 'labels_'):
        # np.int was removed from NumPy; the builtin int is the replacement.
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(X)

    # plot
    plt.subplot(4, len(clustering_algorithms), plot_num)
    # Only one dataset is plotted, so every panel is titled. The original
    # guarded this with an undefined `i_dataset` variable.
    plt.title(name, size=18)
    plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)

    if hasattr(algorithm, 'cluster_centers_'):
        centers = algorithm.cluster_centers_
        center_colors = colors[:len(centers)]
        plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
    plt.xlim(-2, 2)
    plt.ylim(-2, 2)
    plt.xticks(())
    plt.yticks(())
    # Fit time in seconds, bottom-right corner of the panel.
    plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
             transform=plt.gca().transAxes, size=15,
             horizontalalignment='right')
    plot_num += 1

plt.show()
Finally, consider looping through several regression, or classification, models in one go, and getting the results for each.
# Baseline: an ordinary least-squares fit with scikit-learn, then the same
# model with statsmodels to obtain the full regression statistics.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
from sklearn import linear_model
import statsmodels.api as sm
# Note the difference in argument order
# NOTE(review): sm.OLS fits exactly the columns it is given; if an intercept
# is wanted, X presumably needs sm.add_constant first -- confirm.
model = sm.OLS(y, X).fit()
predictions = model.predict(X) # make the predictions by the model
# Print out the statistics
print(model.summary())
# Fresh split used by the model-comparison loop that follows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import TweedieRegressor
from sklearn.linear_model import PoissonRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
# Fit a list of regressors on the same split, print R^2 / RMSE / MAE for
# each, and chart the RMSE values side by side.
regressors = [
    LinearRegression(),
    SGDRegressor(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    TweedieRegressor(),
    PoissonRegressor(),
    Ridge(),
    Lasso(),
]
import pandas as pd
# Logging for Visual Comparison
log_cols = ["Regressor", "RMSE", "MAE"]
# Rows are collected in a list and turned into a frame once at the end;
# DataFrame.append was removed in pandas 2.0.
log_rows = []
for reg in regressors:
    reg.fit(X_train, y_train)
    name = reg.__class__.__name__
    print(reg.score(X_test, y_test))
    y_pred = reg.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    lr_mse = mean_squared_error(y_test, y_pred)
    lr_rmse = np.sqrt(lr_mse)
    print(name + ' RMSE: %.4f' % lr_rmse)
    lin_mae = mean_absolute_error(y_test, y_pred)
    print(name + ' MAE: %.4f' % lin_mae)
    log_rows.append([name, lr_rmse, lin_mae])
log = pd.DataFrame(log_rows, columns=log_cols)
print("="*30)
import seaborn as sns
# pyplot, not the top-level matplotlib package, is what `plt` must refer to.
import matplotlib.pyplot as plt
sns.set_color_codes("muted")
sns.barplot(x='RMSE', y='Regressor', data=log, color="b")
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.datasets import load_iris
# Fit a list of classifiers on the iris data, print the test accuracy of
# each, and chart the accuracies side by side.
iris = load_iris()
# Step 2: Separating the data into dependent and independent variables
X = iris.data[:, :2] # we only take the first two features.
y = iris.target
# Step 3: Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
classifiers = [
    GaussianNB(),
    MLPClassifier(),
    KNeighborsClassifier(),
    GaussianProcessClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    QuadraticDiscriminantAnalysis(),
]
import pandas as pd
# Logging for Visual Comparison
log_cols = ["Classifier", "Accuracy"]
# Rows are collected in a list and turned into a frame once at the end;
# DataFrame.append was removed in pandas 2.0.
log_rows = []
for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    print("="*30)
    print(name)
    print('****Results****')
    # Predictions are made on the held-out test split.
    test_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, test_predictions)
    print("Accuracy: {:.4%}".format(acc))
    log_rows.append([name, acc*100])
log = pd.DataFrame(log_rows, columns=log_cols)
print("="*30)
import seaborn as sns
# pyplot, not the top-level matplotlib package, is what `plt` must refer to.
import matplotlib.pyplot as plt
sns.set_color_codes("muted")
sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")

Related

AttributeError: 'numpy.ndarray' object has no attribute 'lower' - how to fix it?

The full error is shown in the screenshot below. I am not sure how to fix it. I'm trying to predict the link between gender and aggressiveness in tweets.
(https://i.stack.imgur.com/T4Ual.png)
This is the whole script
# Question script: vectorize the tweet dataset with CountVectorizer and train
# a multinomial Naive Bayes classifier to predict the 'gender' column.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# The specific ones we know we are going to use
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB #Makes it possible to have more than 2 classes
data = pd.read_csv('/work/90301/Individual project/TheClimateChangeTwitterDataset.csv')
#corpus=data['text']
#corpus=text.loc[:,['aggressiveness', 'gender']]
cv=CountVectorizer() #Take some text and turn it into a matrix
# NOTE(review): data.values is the whole table as a 2-D array, so each
# "document" handed to the vectorizer is a row array rather than a string.
# That is the likely source of the "'numpy.ndarray' object has no attribute
# 'lower'" error; presumably a single text column was intended -- confirm which.
X = cv.fit_transform(data.values).toarray()
#x = X['aggressiveness'].values
#y = X['gender'].values
# Target labels: the 'gender' column as an array.
y=data['gender'].values
print(X.shape)
print(y.shape)
# Hold out 20% of the rows for testing (no random_state, so the split varies per run).
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
#Instantiate and train Naive Bayes
classifier = MultinomialNB(fit_prior=True)
classifier.fit(X_train, y_train)
#test model
y_pred = classifier.predict(X_test)
# NOTE(review): confusion_matrix and accuracy_score are not imported in this
# script as shown.
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f'Relative accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Accuracy in instances: {accuracy_score(y_test, y_pred, normalize=False)}')
#Infer the label (spam/ham) of a message
# NOTE(review): corpus is only assigned in the commented-out lines above.
aggressiveness=[corpus]
#print(email)
# Reuse the fitted vocabulary to vectorize the new input, then classify it.
aggressiveness_array = cv.transform(aggressiveness).toarray()
print(classifier.predict(aggressiveness_array))

Error in Grid search CV - RidgeClassifierCV as the constructor either does not set or modifies parameter alphas

I am performing gridsearchcv on ridgeclassifiercv to obtain hyper-parameters for my model.
So i imported the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore')
np.random.seed(27)
Then i imported the dataset and split, scaled and label encoded the target variable
!wget -O ChurnData.csv https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/ChurnData.csv
churn = pd.read_csv("ChurnData.csv")
X = churn.drop(['churn'], axis='columns')
y1 = churn[['churn']]
y1['churn']=y1['churn'].astype('int')
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(churn['churn'].unique())
y = le.transform(y1)
# split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2)
Then i performed gridsearchcv
# Question code: grid search over RidgeClassifierCV parameters, including alphas.
# The list holds a single candidate: the whole tuple of alpha values.
alphas = [(0.1, 1, 2, 5 , 10)]
# NOTE(review): solver_churn is not referenced again in this snippet.
solver_churn = ['auto', 'svd','cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
fit_intercept = [True, False]
# Two candidate class-weight mappings for labels 0 and 1.
class_weight = [{0:0.5,1:0.5},{0:0.6,1:0.4}]
param_grid_churn = dict(alphas=alphas, fit_intercept=fit_intercept,class_weight=class_weight)
ridgecv = linear_model.RidgeClassifierCV()
grids_churn = GridSearchCV(estimator=ridgecv, param_grid=param_grid_churn, scoring='roc_auc', verbose=1, n_jobs=-1)
# This is the call the question reports as failing with "the constructor
# either does not set or modifies parameter alphas".
grid_result_churn = grids_churn.fit(X_train, y_train)
alphas is given in docs as a parameter still i get
Error in Grid search CV - RidgeClassifierCV as the constructor either does not set or modifies parameter alphas
How to resolve this?
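RidgeClassifierCV already picks the best alpha from `alphas` by its own internal cross-validation, so pass the candidates to its constructor instead of putting them in the GridSearchCV parameter grid. Adjust your code like this: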
# Answer code: the alpha candidates go to the RidgeClassifierCV constructor;
# the grid search only covers fit_intercept and class_weight.
alphas = (0.1, 1, 2, 5, 10)
solver_churn = ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
fit_intercept = [True, False]
class_weight = [{0: 0.5, 1: 0.5}, {0: 0.6, 1: 0.4}]
param_grid_churn = {
    'fit_intercept': fit_intercept,
    'class_weight': class_weight,
}
ridgecv = linear_model.RidgeClassifierCV(alphas=alphas)
grids_churn = GridSearchCV(
    estimator=ridgecv,
    param_grid=param_grid_churn,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1,
)
grid_result_churn = grids_churn.fit(X_train, y_train)

Support vector regression

After executing this code, y_pred is way too high
I have tried my code
# Question code: fit an RBF-kernel SVR on standardized data and predict one value.
# NOTE(review): numpy is bound to the alias `py` here rather than the usual
# `np`; nothing in this snippet uses it.
import numpy as py
import matplotlib.pyplot as plt
import pandas as pd
dataset = pd.read_csv('Position_Salaries.csv')
# Column 1 as a 2-D feature array (the 1:2 slice keeps the column axis).
X = dataset.iloc[:,1:2].values
# Column 2 as the 1-D target array.
y= dataset.iloc[:, 2].values
from sklearn.preprocessing import StandardScaler
# Separate scalers so features and target can be transformed independently.
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
# The target is reshaped to a single column before scaling.
y= sc_y.fit_transform(y.reshape(-1,1))
# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X, y)
# Predicting a new result
# NOTE(review): 6.5 is passed unscaled although the model was fitted on
# sc_X-scaled inputs; this is the problem the question asks about.
y_pred=regressor.predict([[6.5]])
y_pred = sc_y.inverse_transform(y_pred)
Why is the value of y_pred so high? is there some mistake in my code
I found the solution:
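Instead of the last two lines of my script (the `regressor.predict` and `sc_y.inverse_transform` calls), I need to scale the input with `sc_X` before predicting: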
# Scale the query with the scaler fitted on X, predict in scaled space, then
# map the prediction back to the original target scale. A plain nested list
# is enough for the 2-D query. reshape(-1, 1) is needed because predict
# returns a 1-D array while inverse_transform expects a 2-D one.
y_pred = sc_y.inverse_transform(regressor.predict(sc_X.transform([[6.5]])).reshape(-1, 1))

I am trying to run Gradient Boosting Classifier

# Fit a small gradient-boosting model, then pick the number of trees that
# minimises validation error and refit with that many trees.
# NOTE(review): per the question, X / y still contain string values such as
# '<=50K', which fit() cannot convert -- they must be encoded first.
from sklearn.ensemble import GradientBoostingRegressor
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
X_train, X_val, y_train, y_val = train_test_split(X, y)
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)
# One validation error per boosting stage (stage i uses i + 1 trees).
errors = [mean_squared_error(y_val, y_pred)
          for y_pred in gbrt.staged_predict(X_val)]
# argmin is a 0-based stage index, so add 1 to get the number of trees;
# without it the best model is one tree short (and 0 trees is invalid).
bst_n_estimators = int(np.argmin(errors)) + 1
gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)
When I run this code I get the following error
ValueError: could not convert string to float: '<=50K'
I am using the following data
https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
After the boosting classifier I want to check the performance boost on area under the curve, but the above error needs to be fixed first
Based on your provided code and data preview, ValueError occurs because you're feeding in the string values/categorical data to the GBM model. Recommend doing one-hot encoding (http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) or pd.get_dummies first (https://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html), then fit the model.
For ROC curve, please check out: http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#sphx-glr-auto-examples-model-selection-plot-roc-py. The example should be fairly straightforward for what you need.
# Answer code: one-hot encode the categorical columns, then fit the model.
# Placeholder: replace the bracketed text with the path to the data file.
# NOTE(review): the argument is a one-element list rather than a string.
df = pd.read_csv(['PLEASE SPECIFY YOUR FILE PATH'], thousands = ',')
# Prefix every column label with 'V'.
# NOTE(review): the names used below (V0, V1, ...) presume integer column
# labels, e.g. a file read with header=None -- confirm.
df.columns = ['V' + str(col) for col in df.columns]
# Columns to one-hot encode.
list_cat = ['V1', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V13', 'V14']
# Column used as the prediction target.
list_target = ['V0']
# Replace each categorical column with indicator columns, dropping the first
# level of each.
df = pd.get_dummies(df, columns = list_cat, drop_first = True)
# Features: every column except the target, as an array.
X = df.loc[:, df.columns != list_target[0]].values
# Target: selected with a list, so this is a 2-D single-column array.
y = df[list_target].values
print(df.shape)
df.head()
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Three depth-2 trees with learning rate 1.0, fitted on the encoded data.
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)

accuracy of the logistic regression program always differ

# Question script: train a logistic regression on two min-max-scaled features
# of the cryotherapy data and print the accuracy on a 40% hold-out split.
import math
import numpy as np
import pandas as pd
#from pandas import DataFrame
from sklearn import preprocessing
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from numpy import loadtxt, where
from pylab import scatter, show, legend, xlabel, ylabel

# scale each feature to the range 0..1 based on its min and max in the data
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
df = pd.read_excel("Cryotherapy.xlsx", header=0)
# clean up data
df.columns = ["sex", "age", "Time", "Number_of_Warts", "Type",
              "Area", "Result_of_Treatment"]
X = df[["Type", "Area"]]
X = np.array(X)
X = min_max_scaler.fit_transform(X)
Y = df["Result_of_Treatment"]
Y = np.array(Y)
# No random_state is given, so the split (and the accuracy) changes per run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4)
# train scikit learn model
clf = LogisticRegression()
clf.fit(X_train, Y_train)
accuracy = clf.score(X_test, Y_test)
print(accuracy)
Try passing a random_state to the train_test_split function. If you do not, the data is shuffled differently on each run, which produces different train and test sets and therefore a different accuracy.
Example:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# now lives in sklearn.model_selection. A fixed random_state makes the split
# identical on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=1)

Resources