Get features for maximum value of Scikit-learn estimator - python-3.x

I have the following very simple code trying to model a simple dataset:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
data = {'Feature_A': [1, 2, 3, 4], 'Feature_B': [7, 8, 9, 10], 'Feature_C': [2, 3, 4, 5], 'Label': [7, 7, 8, 9]}
data = pd.DataFrame(data)
data_labels = data['Label']
data = data.drop(columns=['Label'])
pipeline = Pipeline([('imputer', SimpleImputer()),
('std_scaler', StandardScaler())])
data_prepared = pipeline.fit_transform(data)
lin_reg = LinearRegression()
lin_grid = {"n_jobs": [20, 50]}
error = "max_error"
grid_search = GridSearchCV(lin_reg, param_grid=lin_grid, verbose=3, cv=2, refit=True, scoring=error, return_train_score=True)
grid_search.fit(data_prepared, data_labels)
print(grid_search.best_estimator_.coef_)
print(grid_search.best_estimator_.intercept_)
print(list(data_labels))
print(list(grid_search.best_estimator_.predict(data_prepared)))
That gives me the following results:
[0.2608746 0.2608746 0.2608746]
7.75
[7, 7, 8, 9]
[6.7, 7.4, 8.1, 8.799999999999999]
From there, is there a way of computing the values of the features that would give me the maximum label, within the boundaries of the dataset?

If I understand your question correctly, this should work:
import numpy as np
id_max = np.argmax(grid_search.predict(data)) # find id of the maximum predicted label
print(data.loc[id_max])

Related

How come Verbose=True does not show any output with VotingClassifier?

It's in the documentation that verbose=True will output time elapsed, but it is not doing so for me:
from sklearn.ensemble import VotingClassifier
voting_c_all = VotingClassifier(
estimators=[
('random_forest', gs_forest2),
('grid_search', gs),
],
voting='soft',
verbose=True,
n_jobs=-1
)
voting_c_all.fit(X_res, y_res)
Using the example from the manual:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)], voting='soft',verbose=True)
eclf1 = eclf1.fit(X, y)
[Voting] ....................... (1 of 2) Processing lr, total= 0.0s
[Voting] ....................... (2 of 2) Processing rf, total= 0.1s
But once you set n_jobs to be more than 1, the job should be sent to other cores and you don't see the print, hence doesn't keep track of the time:
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)], voting='soft',verbose=True,n_jobs=2)
eclf1 = eclf1.fit(X, y)

I try imputing in sklearn but I have an error

I try below code but I have some error.
imp=SimpleImputer(missing_values='NaN',strategy="mean")
col = veriler.iloc[:,1:4].values
type(col) ##numpy.ndarray
imp=imp.fit(col)
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
You need to convert the infinity values to a bounded value to apply imputation. np.nan_to_num clips nan, inf and -inf to workable values.
For example:
import numpy as np
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X = [[7, np.inf, 3], [4, np.nan, 6], [10, 5, 9]]
X = np.nan_to_num(X, nan=-9999, posinf=33333333, neginf=-33333333)
imp_mean.fit(X)
>>> SimpleImputer(add_indicator=False, copy=True, fill_value=None,
missing_values=nan, strategy='mean', verbose=0)
For transform also, this can be applied:
X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9], [np.nan, np.inf, -np.inf]]
X = np.nan_to_num(X, nan=-9999, posinf=33333333, neginf=-33333333)
print(imp_mean.transform(X))
>>>
[[-9.9990000e+03 2.0000000e+00 3.0000000e+00]
[ 4.0000000e+00 -9.9990000e+03 6.0000000e+00]
[ 1.0000000e+01 -9.9990000e+03 9.0000000e+00]
[-9.9990000e+03 3.3333333e+07 -3.3333333e+07]]

sklearn train_test_split returns some elements in both test/train

I have a data-set X with 260 unique observations.
when running x_train,x_test,_,_=test_train_split(X,y,test_size=0.2) I would assume that
[p for p in x_test if p in x_train] would be empty, but it is not. Actually it turns out that only two observations in x_test is not in x_train.
Is that intended or...?
EDIT (posted the data I am using):
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split as split
import numpy as np
DATA=load_breast_cancer()
X=DATA.data
y= DATA.target
y=np.array([1 if p==0 else 0 for p in DATA.target])
x_train,x_test,y_train,y_test=split(X,y,test_size=0.2,stratify=y,random_state=42)
len([p for p in x_test if p in x_train]) #is not 0
EDIT 2.0: Showing that the test works
a=np.array([[1,2,3],[4,5,6]])
b=np.array([[1,2,3],[11,12,13]])
len([p for p in a if p in b]) #1
This is not a bug with the implementation of train_test_split in sklearn, but a weird peculiarity of how the in operator works on numpy arrays. The in operator first does an elementwise comparison between two arrays, and returns True if ANY of the elements match.
import numpy as np
a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.array([[6, 7, 8], [5, 5, 5]])
a in b # True
The correct way to test for this kind of overlap is using the equality operator and np.all and np.any. As a bonus, you also get the indices that overlap for free.
import numpy as np
a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.array([[6, 7, 8], [5, 5, 5], [7, 8, 9]])
a in b # True
z = np.any(np.all(a == b[:, None, :], -1)) # False
a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.array([[6, 7, 8], [1, 2, 3], [7, 8, 9]])
a in b # True
overlap = np.all(a == b[:, None, :], -1)
z = np.any(overlap) # True
indices = np.nonzero(overlap) # (1, 0)
You need to check using the following:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split as split
import numpy as np
DATA=load_breast_cancer()
X=DATA.data
y= DATA.target
y=np.array([1 if p==0 else 0 for p in DATA.target])
x_train,x_test,y_train,y_test=split(X,y,test_size=0.2,stratify=y,random_state=42)
len([p for p in x_test.tolist() if p in x_train.tolist()])
0
Using x_test.tolist() the in operator will work as intended.
Reference: testing whether a Numpy array contains a given row

How to convert a grouped pandas dataframe into a numpy 3d array and apply right-padding?

In order to feed data into a LSTM network to predict remaining-useful-life (RUL) I need to create a 3D numpy array (No of machines, No of sequences, No of variables).
I already tried to combine solutions from stackoverflow and managed to create a prototype (which you can see below).
import numpy as np
import tensorflow as tf
import pandas as pd
df = pd.DataFrame({'ID': [1, 1, 2, 3, 3, 3, 3],
'V1': [1, 2, 2, 3, 3, 4, 2],
'V2': [4, 2, 3, 2, 1, 5, 1],
})
df_desired_result = np.array([[[1, 4], [2, 2], [-99, -99]],
[[2, 3], [-99, -99], [-99, -99]],
[[3, 2], [3, 1], [4, 5]]])
max_len = df['ID'].value_counts().max()
def pad_df(df, cols, max_seq, group_col= 'ID'):
array_for_pad = np.array(list(df[cols].groupby(df[group_col]).apply(pd.DataFrame.as_matrix)))
padded_array = tf.keras.preprocessing.sequence.pad_sequences(array_for_pad,
padding='post',
maxlen=max_seq,
value=-99
)
return padded_array
#testing prototype
pad_df(df, ['V1', 'V2'], max_len)
But when I apply the code above to my data, it applies the right-padding correctly but all values are set to 0.0.
I can't fully figure out this behaviour, I noticed that in the first line of my function, I get returned an array with nested arrays for 'array_for_pad'.
Here is a screenshot of the result:
result padding

Selecting Samples in Scikit-Learn

Is there any way of automatically selecting the 'training samples' from the collection of features for better fit of the model (DT or SVM)? I know about selecting the 'features'. But I am talking about selecting the 'samples' after selecting the features.
There are a couple different ways to split your set into training, testing, and cross validation sets. Check out sklearn.cross_validation.train_test_split. But also take a look at the plethora of advanced splitting methods that are also available in SK-Learn.
Here's an example with test_train_split:
In:
import numpy as np
from sklearn.cross_validation import train_test_split
a, b = np.arange(10).reshape((5, 2)), range(5)
a
Out:
array([[0, 1],
[2, 3],
[4, 5],
[6, 7],
[8, 9]])
In:
list(b)
Out:
[0, 1, 2, 3, 4]
In:
a_train, a_test, b_train, b_test = train_test_split(a, b, test_size=0.33, random_state=42)
a_train
Out:
array([[4, 5],
[0, 1],
[6, 7]])
In:
b_train
Out:
[2, 0, 3]
In:
a_test
Out:
array([[2, 3],
[8, 9]])
In:
b_test
Out:
[1, 4]
There are generally two ways to do feature selections: Univariate Feature Selection and L1-based Sparse Feature Selection.
from sklearn.datasets import make_classification
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
import numpy as np
# simulate some artificial data: 2000 obs, features: 1000-dim
# but only 2 out 1000 features are informative, the rest 998 features are noises
X, y = make_classification(n_samples=2000, n_features=1000, n_informative=2, random_state=0)
X.shape
Out[153]: (2000, 1000)
# Univariate Feature Selection: select 20 best from 1000 features
# ==========================================================================
# classification F-test
X_selected = SelectKBest(f_classif, k=20).fit_transform(X, y)
X_selected.shape
# or to visualize each f-score/p-value of 1000 features
X_f_scores, X_f_pval = f_classif(X, y)
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(X_f_scores)
ax.set_title('Univariate Feature Selection: Classification F-Score')
ax.set_xlabel('features')
ax.set_ylabel('F-score')
# which features are most important: top 10
np.argsort(X_f_scores)[-10:] # argsort is from smallest to largest
Out[154]: array([940, 163, 574, 969, 994, 977, 360, 291, 838, 524])
# L1-based Sparse Feature Selection: any algo implementation penalty 'l1'
# ==========================================================================
# use LinearSVC for example here
# other popular choices: logistic regression, Lasso (for regression)
feature_selector = LinearSVC(C=0.01, penalty='l1', dual=False)
feature_selector.fit(X, y)
# get features with non-zero coefficients: exactly 2
(feature_selector.coef_ != 0.0).sum()
Out[155]: 2
X_selected_l1 = feature_selector.transform(X)
# or X[:, feature_selector.coef_ != 0.0]

Resources