Error when making a test and train data split in sklearn - python-3.x

I am trying to make a test and train data split with "train_test_split".
Why do I get the error "At least one array required as input"?
The input of "train_test_split" can be an array or a DataFrame, right?
import pandas as pd
import numpy as np
from rpy2.robjects.packages import importr
import rpy2.robjects as ro
import pandas.rpy.common as rpy_common
from sklearn.model_selection import train_test_split

def la():
    ro.r('library(MASS)')
    pydf = rpy_common.load_data(name='Boston', package=None, convert=True)
    pddf = pd.DataFrame(pydf)
    targetIndex = pddf.columns.get_loc("medv")
    # make train and test data
    rowNum = pddf.shape[0]
    colNum = pddf.shape[1]
    print(type(pddf.as_matrix()))
    print(pddf.as_matrix().shape)
    m = np.asarray(pddf.as_matrix()).reshape(rowNum, colNum)
    print(type(m))
    x_train, x_test, y_train, y_test = train_test_split(x = m[:, 0:rowNum-2], \
                                                        y = m[:, -1], \
                                                        test_size = 0.5)
    # error: raise ValueError("At least one array required as input")

ValueError: At least one array required as input

From the sklearn docs, the arrays are handled with positional argument unpacking ("*args").
You are passing them as keyword arguments, "x=" and "y=", which train_test_split tries to interpret as its special keyword options, so it sees no arrays at all.
Try:
train_test_split(m[:, 0:rowNum-2], m[:, -1], test_size=0.5)
(removing the keyword argument names from the arrays).
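As a side note (an observation beyond the fix above): the slice m[:, 0:rowNum-2] indexes columns with the row count, so numpy clamps the out-of-range slice and the medv target leaks into the features. Assuming medv is the last column, as m[:, -1] implies, a slice based on colNum seems intended:

x_train, x_test, y_train, y_test = train_test_split(
    m[:, 0:colNum-1],  # all feature columns, excluding the last (medv) column
    m[:, -1],          # the medv target column
    test_size=0.5)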

Related

Mismatch in number of cases despite the same number of cases in sktime

I am learning classification in sktime:
from sklearn.model_selection import train_test_split

X = AUDCHF_h1_model[['Open','High','Low','Close','Volume','VWMA',
                     'Minute','Hour','Day','Week','Month','Year']].values
y = AUDCHF_h1_model[['is_beg_leg']].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
(53250, 12) (53250, 1) (13313, 12) (13313, 1)
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sktime.classification.compose import ColumnEnsembleClassifier
from sktime.classification.dictionary_based import BOSSEnsemble
from sktime.classification.interval_based import TimeSeriesForestClassifier
#from sktime.classification.shapelet_based import MrSEQLClassifier
from sktime.datasets import load_basic_motions
from sktime.transformations.panel.compose import ColumnConcatenator

steps = [
    ("concatenate", ColumnConcatenator()),
    ("classify", TimeSeriesForestClassifier(n_estimators=100)),
]
clf = Pipeline(steps)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
I receive:
ValueError: Mismatch in number of cases. Number in X = 639000 nos in y = 53250
but
X_train.shape
(53250, 12)
y_train.shape
(53250, 1)
Does anyone know why?
Based on the information you provided, I can't say anything for sure, but I suspect that the problem is the ColumnConcatenator in your pipeline, which stacks all the columns of X to create a new univariate time series with 53250 * 12 = 639000 rows. This concatenated time series is then passed to the TimeSeriesForestClassifier and has a different shape than your original input.
Depending on your use case, you can now either delete the "concatenate" step, as sketched below, or provide target values for the newly created univariate time series.
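A minimal sketch of the first option, assuming each row of X_train should remain one case (so no concatenation is wanted):

from sklearn.pipeline import Pipeline
from sktime.classification.interval_based import TimeSeriesForestClassifier

# Without the ColumnConcatenator, each of the 53250 rows stays one case,
# treated as a univariate series of length 12.
steps = [("classify", TimeSeriesForestClassifier(n_estimators=100))]
clf = Pipeline(steps)
clf.fit(X_train, y_train.ravel())  # ravel() gives the 1-D target vector sktime expects
clf.score(X_test, y_test.ravel())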

Issues with One Hot Encoding for model with values not in training data

I would like to use one-hot encoding for my simple model, yet it seems to trigger an error no matter how I set it up. First, OneHotEncoder was not converting strings to floats even though I have version 1.0.2 of sklearn. Now the issue is that the values in my training data do not cover the same categories as my test data: training only has two values, testing has all three. How do I fix that? The exact error is "The truth value of a Series is ambiguous". My other attempt instead raised an error telling me to reshape the data.
import lightgbm as lgbm
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

X = [['apple',5],['banana',1],['apple',6],['banana',2]]
X = pd.DataFrame(X).to_numpy()
test = [['pineapple',0],['banana',1],['apple',7],['banana,2']]
y = [1,0,1,0]
y = pd.DataFrame(y).to_numpy()
labels = ['apples','bananas','pineapple']
ohc = OneHotEncoder(categories=labels)
pp = ColumnTransformer(
    transformers=[('ohc', ohc, [0])],
    remainder='passthrough')
model = lgbm.LGBMClassifier()
mymodel = Pipeline(steps=[('preprocessor', pp),
                          ('model', model)])
params = {'model__learning_rate':[0.1],
          'model__n_estimators':[2]}
lgbm_gs = GridSearchCV(
    estimator=mymodel, param_grid=params, n_jobs=-1,
    cv=2, scoring='accuracy', verbose=-1)
lgbm_gs.fit(X, y)
The issue should be related to the fact that you're passing categories as a list of strings rather than as a list of array-likes (e.g. a list of lists), as the docs state. Therefore, the following adjustment should fix it.
import lightgbm as lgbm
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

X = [['apple',5],['banana',1],['apple',6],['banana',2]]
X = pd.DataFrame(X).to_numpy()
test = [['pineapple',0],['banana',1],['apple',7],['banana',2]]
y = [1,0,1,0]
y = pd.DataFrame(y).to_numpy()
labels = [['apple', 'banana', 'pineapple']]  # note you were also misspelling the categories ('apples' --> 'apple'; 'bananas' --> 'banana')
ohc = OneHotEncoder(categories=labels)
pp = ColumnTransformer(transformers=[('ohc', ohc, [0])], remainder='passthrough')
model = lgbm.LGBMClassifier()
mymodel = Pipeline(steps=[('preprocessor', pp),
                          ('model', model)])
params = {'model__learning_rate':[0.1], 'model__n_estimators':[2]}
lgbm_gs = GridSearchCV(
    estimator=mymodel, param_grid=params, n_jobs=-1,
    cv=2, scoring='accuracy', verbose=-1)
lgbm_gs.fit(X, y.ravel())
As a further remark, observe what the user guide suggests for cases where the test data has categories that cannot be found in the training set:
If there is a possibility that the training data might have missing categorical features, it can often be better to specify handle_unknown='ignore' instead of setting the categories manually as above. When handle_unknown='ignore' is specified and unknown categories are encountered during transform, no error will be raised but the resulting one-hot encoded columns for this feature will be all zeros (handle_unknown='ignore' is only supported for one-hot encoding):
Finally, you can observe that the attribute categories_ (which holds the categories of each feature determined during fitting) is likewise a list of arrays (a single array here, as you're one-hot encoding one column only). Example with the default categories='auto':
ohc = OneHotEncoder(handle_unknown='ignore')
ohc.fit(X[:, 0].reshape(-1, 1)).categories_
# Output: [array(['apple', 'banana'], dtype=object)]
Example with your custom categories:
ohc = OneHotEncoder(categories=labels)
ohc.fit(X[:, 0].reshape(-1, 1)).categories_
# Output: [array(['apple', 'banana', 'pineapple'], dtype=object)]
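And as a quick check of the handle_unknown='ignore' behavior quoted from the guide above (continuing the first example, where only 'apple' and 'banana' were seen during fitting):

ohc = OneHotEncoder(handle_unknown='ignore')
ohc.fit(X[:, 0].reshape(-1, 1))
print(ohc.transform([['pineapple']]).toarray())
# Output: [[0. 0.]] -- the unknown category encodes as all zeros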

Python 3 and Sklearn: Difficulty using a non-sklearn model as a sklearn model

The code below is working. It is just a routine to run a cross-validation scheme using a linear model previously defined in sklearn; I have no problem with that part. My problem is that if I replace model=linear_model.LinearRegression() with model=RBF('multiquadric') (see the commented-out line in __main__), it does not work anymore. So my problem is actually in the class RBF, where I try to mimic a sklearn model.
If I make that replacement, I get the following error:
/home/daniel/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details:
ValueError: All arrays must be equal length.
  FitFailedWarning)
1) Should I define a score function in the class RBF?
2) How do I do that? I am lost; since I inherit from BaseEstimator and RegressorMixin, I expected this to be solved internally.
3) Is there something else missing?
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.interpolate import Rbf
np.random.seed(0)
from sklearn.base import BaseEstimator, RegressorMixin

class RBF(BaseEstimator, RegressorMixin):
    def __init__(self, function):
        self.function = function
    def fit(self, x, y):
        self.rbf = Rbf(x, y, function=self.function)
    def predict(self, x):
        return self.rbf(x)

if __name__ == "__main__":
    # Load Data
    targetName = 'HousePrice'
    data = datasets.load_boston()
    featuresNames = list(data.feature_names)
    featuresData = data.data
    targetData = data.target
    df = pd.DataFrame(featuresData, columns=featuresNames)
    df[targetName] = targetData
    independent_variable_list = featuresNames
    dependent_variable = targetName
    X = df[independent_variable_list].values
    y = np.squeeze(df[[dependent_variable]].values)
    # Model Definition
    model = linear_model.LinearRegression()
    #model = RBF('multiquadric')
    # Cross validation routine
    number_splits = 5
    score_list = ['neg_mean_squared_error','neg_mean_absolute_error','r2']
    kfold = model_selection.KFold(n_splits=number_splits, shuffle=True, random_state=0)
    scalar = StandardScaler()
    pipeline = Pipeline([('transformer', scalar), ('estimator', model)])
    results = model_selection.cross_validate(pipeline, X, y, cv=kfold, scoring=score_list, return_train_score=True)
    for score in score_list:
        print(score+':')
        print('Train: '+'Mean', np.mean(results['train_'+score]), 'Standard Error', np.std(results['train_'+score]))
        print('Test: '+'Mean', np.mean(results['test_'+score]), 'Standard Error', np.std(results['test_'+score]))
Let's look at the documentation here:
*args : arrays
    x, y, z, …, d, where x, y, z, … are the coordinates of the nodes and d is the array of values at the nodes
So Rbf takes a variable-length argument list, with the last argument being the array of values, which is y in your case. The k-th positional argument holds the k-th coordinate of all the data points (the same goes for every other coordinate argument).
Following the documentation, your code should be
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.interpolate import Rbf
np.random.seed(0)
from sklearn.base import BaseEstimator, RegressorMixin

class RBF(BaseEstimator, RegressorMixin):
    def __init__(self, function):
        self.function = function
    def fit(self, X, y):
        self.rbf = Rbf(*X.T, y, function=self.function)
    def predict(self, X):
        return self.rbf(*X.T)

# Load Data
data = datasets.load_boston()
X = data.data
y = data.target

number_splits = 5
score_list = ['neg_mean_squared_error','neg_mean_absolute_error','r2']
kfold = model_selection.KFold(n_splits=number_splits, shuffle=True, random_state=0)
scalar = StandardScaler()
model = RBF(function='multiquadric')
pipeline = Pipeline([('transformer', scalar), ('estimator', model)])
results = model_selection.cross_validate(pipeline, X, y, cv=kfold, scoring=score_list, return_train_score=True)
for score in score_list:
    print(score+':')
    print('Train: '+'Mean', np.mean(results['train_'+score]), 'Standard Error', np.std(results['train_'+score]))
    print('Test: '+'Mean', np.mean(results['test_'+score]), 'Standard Error', np.std(results['test_'+score]))
Output
neg_mean_squared_error:
Train: Mean -1.552450953914355e-20 Standard Error 7.932530906290208e-21
Test: Mean -23.007377210596463 Standard Error 4.254629143836107
neg_mean_absolute_error:
Train: Mean -9.398502208736061e-11 Standard Error 2.4673749061941226e-11
Test: Mean -3.1319779583728673 Standard Error 0.2162343985534446
r2:
Train: Mean 1.0 Standard Error 0.0
Test: Mean 0.7144217179633185 Standard Error 0.08526294242760363
Why *X.T: as we saw, each positional argument corresponds to one coordinate axis of all the data points, so we transpose X and then use the * operator to expand and pass each of the sub-arrays (one per feature) as a separate argument to the variable-length function.
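For illustration, a minimal sketch of what the unpacking does (toy arrays, not the question's data):

import numpy as np
from scipy.interpolate import Rbf

X = np.array([[1., 2.], [3., 4.], [5., 6.]])  # 3 samples, 2 features
y = np.array([10., 20., 30.])
# X.T has shape (2, 3): one row per coordinate axis, so
# Rbf(*X.T, y) is equivalent to Rbf(X[:, 0], X[:, 1], y)
rbf = Rbf(*X.T, y, function='multiquadric')
print(rbf(*X.T))  # evaluates the interpolant at the training points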
It looks like the latest implementation also has a mode parameter that lets us pass the N-D array directly.

Swap .has_key for something else in Python 3

I know that in Python 3 ".has_key" is replaced by "in", but in my example I didn't manage to make it work.
The whole code for execution:
from sklearn import model_selection
import pandas as pd
import numpy as np
from sklearn import neighbors, metrics
from matplotlib import pyplot as plt

data = pd.read_csv('your_path/winequality-red.csv', sep=";")
X = data.as_matrix([data.columns[:-1]])
y = data.as_matrix([data.columns[-1]])
y.flatten()
X_train, X_test, y_train, y_test = \
    model_selection.train_test_split(X, y, test_size=0.3)
knn = neighbors.KNeighborsRegressor(n_neighbors=12)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
The part which returns the error:
sizes = {}
for (yt, yp) in zip(list(y_test), list(y_pred)):
    if sizes.has_key((yt, yp)):
        sizes[(yt, yp)] += 1
    else:
        sizes[(yt, yp)] = 1
keys = sizes.keys()
plt.scatter([k[0] for k in keys], [k[1] for k in keys], s=[sizes[k] for k in keys], color='coral')
When I try to swap if sizes.has_key((yt, yp)): for if (yt, yp) in sizes:,
I get the error TypeError: unhashable type: 'numpy.ndarray'.
You can download the wine database here.
Thanks in advance for any help.
The result I'm looking for: a scatter plot where the point size reflects the count (see the linked .ipynb or .py file).
I don't think the code you show can actually produce the error you report; possibly you have redefined some variable in the notebook outside of that code?
In any case, concerning the question, you would want to replace if sizes.has_key((yt, yp)): with
if (yt, yp) in sizes.keys():
This should give you the desired plot.
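As a side note, a minimal sketch of the same counting in Python 3, assuming y_test and y_pred are flattened to 1-D first (which also avoids unhashable numpy.ndarray keys when y_test is 2-D, as data.as_matrix(...) returns):

import numpy as np
from collections import Counter

# hypothetical toy arrays standing in for the question's y_test / y_pred
y_test = np.array([[5.], [6.], [5.]])  # 2-D, like the output of as_matrix
y_pred = np.array([5.2, 6.1, 5.2])

# ravel() yields scalar elements, so the (yt, yp) tuples are hashable dict keys
sizes = Counter(zip(y_test.ravel(), y_pred.ravel()))
print(sizes)  # e.g. Counter({(5.0, 5.2): 2, (6.0, 6.1): 1})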

Unorderable types: str() > float() error with KNN model

I have read quite a bit about this particular error and haven't been able to find an answer that addresses my issue. I have a data set that I have split into train and test sets and am looking to run a KNeighborsClassifier. My code is below. My problem is that when I look at the dtypes of my X_train, I don't see any string-formatted columns at all. My y_train is a single categorical variable. This is my first Stack Overflow post, so my apologies if I've overlooked any formalities, and thanks for the help! :)
Error:
TypeError: unorderable types: str() > float()
Dtypes:
X_train.dtypes.value_counts()
Out[54]:
int64 2035
float64 178
dtype: int64
Code:
# Import Packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.dummy import DummyRegressor
from sklearn.cross_validation import train_test_split, KFold
from matplotlib.ticker import FormatStrFormatter
from sklearn import cross_validation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import pdb
# Set Directory Path
path = "file_path"
os.chdir(path)
#Select Import File
data = 'RawData2.csv'
delim = ','
#Import Data File
df = pd.read_csv(data, sep = delim)
print (df.head())
df.columns.get_loc('Categories')
#Model
#Select/Update Features
X = df[df.columns[14:2215]]
#Get Column Index for Target Variable
df.columns.get_loc('Categories')
#Select Target and fill na's with "Small" label
y = df[df.columns[21]]
print(y.values)
y.fillna('Small')
#Training/Test Set
X_sample = X.loc[X.Var1 <1279]
X_valid = X.loc[X.Var1 > 1278]
y_sample = y.head(len(X_sample))
y_test = y.head(len(y)-len(X_sample))
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size = 0.2)
cv = KFold(n = X_train.shape[0], n_folds = 5, random_state = 17)
print(X_train.shape, y_train.shape)
X_train.dtypes.value_counts()
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)  # <-- This is where the error is flagged
accuracy_score(y_test, knn.predict(X_test))
Everything in sklearn is based on numpy, which only handles numbers, hence categorical X and y need to be encoded as numbers. For X you can use pd.get_dummies; for y you can use LabelEncoder.
http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
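A minimal sketch of both encodings, with hypothetical column names (not the question's data set):

import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame({'color': ['red', 'blue', 'red', 'blue'],
                   'size': [1, 2, 3, 4],
                   'Categories': ['Small', 'Big', 'Small', 'Big']})

# get_dummies one-hot encodes the string feature columns into indicator columns
X = pd.get_dummies(df[['color', 'size']])

# LabelEncoder maps the categorical target to integers
le = LabelEncoder()
y = le.fit_transform(df['Categories'])  # array([1, 0, 1, 0]): Big -> 0, Small -> 1

print(X.columns.tolist())  # ['size', 'color_blue', 'color_red']
print(y)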
