xgboost feature importance high but doesn't produce a better model - python-3.x
I am using XGBoost for a binary prediction problem. I tested my model with several features and got good results.
After adding one more feature and recalculating the feature importances, that feature's importance came out very high, far above all the other features.
However, when I test the model, the test score drops considerably.
Is there an explanation for this kind of behaviour?
Yes, there is a plausible explanation: importance scores only measure how heavily the trees rely on a feature, not whether that reliance generalizes. A feature that fits noise or leaks information from the training data can dominate the importance ranking while dragging the test score down. It is therefore worth cross-checking the ranking with more than one method; there are at least a few ways to run feature importance experiments.
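For the XGBoost model itself, a quick first check is to compare its different built-in importance types, since 'weight', 'gain' and 'cover' often rank features quite differently. The snippet below is only a minimal sketch: it assumes xgboost is installed and that X_train, X_test, y_train, y_test already come from your own split.
# minimal sketch: assumes xgboost is installed and that X_train, X_test,
# y_train, y_test already exist from your own train/test split
import xgboost as xgb
from sklearn.metrics import roc_auc_score
model = xgb.XGBClassifier(n_estimators=200, max_depth=4)
model.fit(X_train, y_train)
booster = model.get_booster()
for imp_type in ('weight', 'gain', 'cover'):
    print(imp_type, booster.get_score(importance_type=imp_type))
# always read the importances next to a held-out metric, never on their own
print('test AUC:', roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
The examples that follow use a Random Forest regressor on the Boston toy data purely for illustration; the same three approaches (built-in importances, permutation importance, SHAP) apply unchanged to an XGBoost model.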
# Let's load the packages
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
import shap
from matplotlib import pyplot as plt
plt.rcParams.update({'figure.figsize': (12.0, 8.0)})
plt.rcParams.update({'font.size': 14})
# note: load_boston is deprecated/removed in recent scikit-learn releases; it is used here only as a small example dataset
boston = load_boston()
X = pd.DataFrame(boston.data, columns=boston.feature_names)
y = boston.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12)
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)
# 1) Impurity-based importances stored on the fitted model
sorted_idx = rf.feature_importances_.argsort()
plt.barh(boston.feature_names[sorted_idx], rf.feature_importances_[sorted_idx])
plt.xlabel("Random Forest Feature Importance")
plt.show()
# 2) Permutation importance, measured on the held-out test set
perm_importance = permutation_importance(rf, X_test, y_test)
sorted_idx = perm_importance.importances_mean.argsort()
plt.barh(boston.feature_names[sorted_idx], perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")
plt.show()
# 3) SHAP values (mean absolute SHAP value per feature)
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, plot_type="bar")
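The most direct experiment for your situation, though, is to train the same model with and without the newly added feature and compare held-out or cross-validated scores rather than importances. Here is a small sketch on the toy data above, where the arbitrary column 'LSTAT' stands in for the feature you added:
from sklearn.model_selection import cross_val_score
# 'LSTAT' is only an illustrative column name; substitute the feature you added
with_feature = cross_val_score(RandomForestRegressor(n_estimators=100),
                               X_train, y_train, cv=5).mean()
without_feature = cross_val_score(RandomForestRegressor(n_estimators=100),
                                  X_train.drop(columns=['LSTAT']), y_train, cv=5).mean()
print('CV score with the feature:    %.3f' % with_feature)
print('CV score without the feature: %.3f' % without_feature)
If the score without the feature is clearly higher, the feature is probably overfitting or leaking information, no matter how important it looks.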
Also, you can usually add more data to your model; models generally become more accurate as they see more data. Finally, you can always test other models on your dataset and see how they perform. Today at work I tested an XGBoost model and a RandomForestRegressor side by side; I expected the former to perform better, but the latter actually performed much better. It is almost impossible to guess which model will perform best on a given dataset, so try several, check the predictive quality of each, and keep the one (or maybe two) that performs best. Having said that, you can try something like the following, which fits and times a handful of clustering algorithms on the same data:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
np.random.seed(0)
pd.set_option('display.max_columns', 500)
#df = pd.read_csv('C:\\your_path_here\\test.csv')
#print('done!')
#df = df[:10000]
#df = df.fillna(0)
#df = df.dropna()
X = df[['RatingScore',
        'Par',
        'Term',
        'TimeToMaturity',
        'LRMScore',
        'Coupon',
        'Price']]
# select your target variable
y = df[['Spread']]
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)
clustering_names = [
    'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift',
    'SpectralClustering', 'Ward', 'AgglomerativeClustering',
    'DBSCAN', 'Birch']
plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
                    hspace=.01)
plot_num = 1
# normalize dataset for easier parameter selection
X = StandardScaler().fit_transform(X)
# estimate bandwidth for mean shift
bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
# connectivity matrix for structured Ward
connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)
# create clustering estimators
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=2)
ward = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward',
                                       connectivity=connectivity)
spectral = cluster.SpectralClustering(n_clusters=2,
                                      eigen_solver='arpack',
                                      affinity="nearest_neighbors")
dbscan = cluster.DBSCAN(eps=.2)
affinity_propagation = cluster.AffinityPropagation(damping=.9,
                                                   preference=-200)
average_linkage = cluster.AgglomerativeClustering(
    linkage="average", affinity="cityblock", n_clusters=2,
    connectivity=connectivity)
birch = cluster.Birch(n_clusters=2)
clustering_algorithms = [
    two_means, affinity_propagation, ms, spectral, ward, average_linkage,
    dbscan, birch]
for name, algorithm in zip(clustering_names, clustering_algorithms):
    # fit the estimator and time it
    t0 = time.time()
    algorithm.fit(X)
    t1 = time.time()
    # predict cluster memberships
    if hasattr(algorithm, 'labels_'):
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(X)
    # plot the first two (standardized) features, coloured by cluster label
    plt.subplot(1, len(clustering_algorithms), plot_num)
    plt.title(name, size=18)
    plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)
    if hasattr(algorithm, 'cluster_centers_'):
        centers = algorithm.cluster_centers_
        center_colors = colors[:len(centers)]
        plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
    plt.xlim(-2, 2)
    plt.ylim(-2, 2)
    plt.xticks(())
    plt.yticks(())
    plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
             transform=plt.gca().transAxes, size=15,
             horizontalalignment='right')
    plot_num += 1
plt.show()
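The loop above only times each fit; if you also want a rough quantitative comparison of the clusterings, a silhouette score per algorithm is one reasonable add-on (a sketch, reusing the estimators fitted above):
from sklearn.metrics import silhouette_score
for name, algorithm in zip(clustering_names, clustering_algorithms):
    labels = algorithm.labels_ if hasattr(algorithm, 'labels_') else algorithm.predict(X)
    if len(set(labels)) > 1:  # silhouette needs at least two distinct labels
        print('%-24s silhouette: %.3f' % (name, silhouette_score(X, labels)))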
Finally, consider looping through several regression or classification models in one go and collecting the results for each.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
import statsmodels.api as sm
# Note the difference in argument order: sm.OLS(endog, exog)
# statsmodels does not add an intercept by default, so add one explicitly
X_const = sm.add_constant(X)
model = sm.OLS(y, X_const).fit()
predictions = model.predict(X_const)  # make the predictions with the fitted model
# Print out the statistics
print(model.summary())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import TweedieRegressor
from sklearn.linear_model import PoissonRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
regressors = [
    LinearRegression(),
    SGDRegressor(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    TweedieRegressor(),
    PoissonRegressor(),
    Ridge(),
    Lasso()
]
import pandas as pd
# Logging for Visual Comparison
log_cols = ["Regressor", "RMSE", "MAE"]
log = pd.DataFrame(columns=log_cols)
for reg in regressors:
    reg.fit(X_train, y_train)
    name = reg.__class__.__name__
    print(name + ' R^2: %.4f' % reg.score(X_test, y_test))
    y_pred = reg.predict(X_test)
    reg_mse = mean_squared_error(y_test, y_pred)
    reg_rmse = np.sqrt(reg_mse)
    print(name + ' RMSE: %.4f' % reg_rmse)
    reg_mae = mean_absolute_error(y_test, y_pred)
    print(name + ' MAE: %.4f' % reg_mae)
    log_entry = pd.DataFrame([[name, reg_rmse, reg_mae]], columns=log_cols)
    # DataFrame.append was removed in pandas 2.0, so use pd.concat instead
    log = pd.concat([log, log_entry], ignore_index=True)
    print("=" * 30)
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_color_codes("muted")
sns.barplot(x='RMSE', y='Regressor', data=log, color="b")
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.datasets import load_iris
iris = load_iris()
iris
# Step 2: Separating the data into dependent and independent variables
X = iris.data[:, :2] # we only take the first two features.
y = iris.target
# Step 3: Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
classifiers = [
    GaussianNB(),
    MLPClassifier(),
    KNeighborsClassifier(),
    GaussianProcessClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    QuadraticDiscriminantAnalysis()]
import pandas as pd
# Logging for Visual Comparison
log_cols = ["Classifier", "Accuracy"]
log = pd.DataFrame(columns=log_cols)
for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    print("=" * 30)
    print(name)
    print('****Results****')
    test_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, test_predictions)
    print("Accuracy: {:.4%}".format(acc))
    log_entry = pd.DataFrame([[name, acc * 100]], columns=log_cols)
    # DataFrame.append was removed in pandas 2.0, so use pd.concat instead
    log = pd.concat([log, log_entry], ignore_index=True)
print("=" * 30)
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_color_codes("muted")
sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")
plt.show()
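A single train/test split can be noisy, which is also one reason a newly added feature may look great on one split and hurt on another. For a more stable comparison you could cross-validate each model instead; here is a short sketch reusing the classifiers list above:
from sklearn.model_selection import cross_val_score
for clf in classifiers:
    scores = cross_val_score(clf, X, y, cv=5)
    print('%-32s %.3f +/- %.3f' % (clf.__class__.__name__, scores.mean(), scores.std()))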