Saving a polynomial model doesn't save the polynomial degree - python-3.x

How can I deal with the polynomial degree when I want to save a polynomial model, since this information is not being saved?
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Synthetic data set: columns "a" and "b" are the features, "c" is the target.
df = pd.DataFrame({
    "a": np.random.uniform(0.0, 1.0, 1000),
    "b": np.random.uniform(10.0, 14.0, 1000),
    "c": np.random.uniform(100.0, 1000.0, 1000),
})


def data():
    """Split ``df`` into training and validation arrays.

    The first two columns of ``df`` are used as features and the third as the
    target; 20% of the rows are held out with a fixed ``random_state``.

    Returns:
        Tuple ``(X_train, X_val, y_train, y_val)``.
    """
    features = df.iloc[:, :2].values
    target = df.iloc[:, 2].values
    splits = train_test_split(features, target,
                              test_size=0.2,
                              random_state=1340)
    return tuple(splits)


X_train, X_val, y_train, y_val = data()

# Expand the two raw features into degree-2 polynomial terms, then fit a
# linear model on the expanded features.
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(X_train)
poly_reg_model = LinearRegression().fit(X_poly, y_train)

# Only the linear model is written to disk here; ``poly_reg`` is not saved.
poly_model = joblib.dump(poly_reg_model, 'themodel')

y_pred = poly_reg_model.predict(poly_reg.fit_transform(X_val))

themodel = joblib.load('themodel')
Now, if I try to predict:
themodel.predict(X_val), I am receiving:
ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 6 is different from 2)
I have to do:
pol_feat = PolynomialFeatures(degree=2)
themodel.predict(pol_feat.fit_transform(X_val))
in order to work.
So, how can I store this information so that the loaded model can be used for prediction directly?

You also have to pickle the fitted PolynomialFeatures transformer:
# --- training side: fit the feature expander and the regressor ---
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(X_train)
poly_reg_model = LinearRegression().fit(X_poly, y_train)

# Persist both fitted objects, each under its own file name.
for fitted_obj, filename in (
    (poly_reg_model, 'themodel'),
    (poly_reg, 'poilynomia_features_model'),
):
    joblib.dump(fitted_obj, filename)

# --- prediction side: restore both objects and apply them in order ---
poilynomia_features_model = joblib.load('poilynomia_features_model')
themodel = joblib.load('themodel')

# Expand the raw validation features first, then predict on the expansion.
X_val_prep = poilynomia_features_model.transform(X_val)
predictions = themodel.predict(X_val_prep)
But it is better to wrap all the steps in a single pipeline:
from sklearn.pipeline import Pipeline

# Chain feature expansion and regression so they are fitted, saved and loaded
# as a single object; the polynomial degree is stored inside the pipeline.
pipeline = Pipeline(steps=[
    ('poilynomia', PolynomialFeatures(degree=2)),
    ('lr', LinearRegression()),
])
pipeline.fit(X_train, y_train)
pipeline.predict(X_val)

# Persist the whole pipeline; after loading, raw (unexpanded) features can be
# passed straight to predict().
joblib.dump(pipeline, 'themodel_pipeline')
loaded_pipeline = joblib.load('themodel_pipeline')
predictions = loaded_pipeline.predict(X_val)

Related

List object not callable in SVM

I'm trying to run this SVM using stratified K-fold in Python; however, I keep getting the error shown below.
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, zero_one_loss, confusion_matrix
import pandas as pd
import numpy as np
# Load the data: every column except the last is a feature, the last is y.
z = pd.read_csv('/home/User/datasets/gtzan.csv', header=0)
X = np.array(z.iloc[:, :-1])
y = np.array(z.iloc[:, -1:])

# Rescale the features with MinMaxScaler.
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# SVM classifier with an RBF kernel.
svc = SVC(kernel='rbf', C=100, random_state=50)

skf = StratifiedKFold(n_splits=10, shuffle=True)

# NOTE: this rebinds the name of the imported ``accuracy_score`` function to a
# list, which is what triggers the TypeError inside the loop below.
accuracy_score = []

for train_index, test_index in skf.split(X, y):
    X_train, y_train = X_scaled[train_index], y[train_index]
    X_test, y_test = X_scaled[test_index], y[test_index]

    # Train on this fold and predict its held-out part.
    svc.fit(X_train, np.ravel(y_train))
    y_pred = svc.predict(X_test)

    # Score the fold and collect the result.
    score = accuracy_score(y_test, y_pred)
    accuracy_score.append(score)

# Report the mean accuracy over all folds.
print('accuracy score: %0.3f' % np.mean(accuracy_score))
however, it gives me an error like below
Traceback (most recent call last):
File "/home/User/Test_SVM.py", line 55, in <module>
score = accuracy_score(y_test, y_pred)
TypeError: 'list' object is not callable
What makes this score list uncallable and how do I fix this error?
accuracy_score is a list in my code, and I was also calling that same name as a function: the list shadows the imported accuracy_score function. Renaming the list to acc_score solved the problem.

error while performing RFECV through SVR The classifier does not expose "coef_" or "feature_importances_" attributes

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
# Load the housing data: columns 0-12 are the features, column 13 the target.
housing = pd.read_csv('boston.csv')
x = housing.iloc[:, 0:13].values
y = np.ravel(housing.iloc[:, 13:14].values)  # flatten the (n, 1) slice to 1-D

from sklearn.model_selection import train_test_split

# Hold out 33% of the rows, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=0
)
y_train = np.ravel(y_train)

# Support vector regressor with a degree-2 polynomial kernel.
regressor = SVR(kernel='poly', degree=2)
regressor.fit(x_train, y_train)

# Recursive feature elimination with 5-fold cross-validation around it.
rfecv = RFECV(estimator=regressor, cv=5, scoring='accuracy')
After executing above line (i.e. rfecv) I get the following error:
"RuntimeError: The classifier does not expose "coef_" or "feature_importances_" attributes"
What am I doing wrong ???
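You need to fit the RFECV object itself rather than the estimator. Note also that RFECV ranks features through the estimator's coef_ or feature_importances_ attribute, and SVR only exposes coef_ when kernel='linear'. Change it to: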
# RFECV ranks features by the estimator's ``coef_`` (or
# ``feature_importances_``); SVR only exposes ``coef_`` with a linear kernel.
regressor = SVR(kernel='linear')
# 'accuracy' is a classification metric, so score the regressor with R^2.
rfecv = RFECV(estimator=regressor, cv=5, scoring='r2')
# fit() returns the fitted selector, so rebinding ``rfecv`` keeps the same object.
rfecv = rfecv.fit(x_train, y_train)

How to use .fit when the X value is in time format

# Hold out 20% of the samples as a test set; random_state=10 fixes the shuffle.
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,Y,test_size=0.2, random_state = 10)
You have to preprocess data before feeding your model. Here is a complete working example. First, let's import the required modules:
from datetime import datetime
import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
Then, define the training data:
# Six time-of-day strings arranged as one feature column (one row per sample).
X = np.array(
    ['17:00', '17:05', '17:10', '17:15', '17:20', '17:25']
).reshape(-1, 1)
# One class label (0 or 1) per row of X.
y = [1, 0, 1, 1, 0, 1]
Note that X must be a 2D array. You also have to convert the time strings to a numerical format. One way to do this is to parse them with the built-in datetime module. Here is a function that will be used to transform the data:
def transform(X, y=None):
    """Convert a column of ``'HH:MM'`` strings to seconds since midnight.

    Args:
        X: 2-D array-like whose first column holds time strings in
            ``'%H:%M'`` format; only that first column is read.
        y: Ignored. Accepted so the function can be called with an optional
            target argument.

    Returns:
        Float ``numpy.ndarray`` of shape ``(n_samples, 1)``.

    Raises:
        ValueError: If a value does not match the ``'%H:%M'`` format.
    """
    rows = np.asarray(X)
    seconds = []
    for value in rows[:, 0]:
        parsed = datetime.strptime(str(value), '%H:%M')
        # Use seconds since midnight rather than ``.timestamp()``: the parsed
        # datetime is naive and dated 1900-01-01, so ``.timestamp()`` depends
        # on the local time zone and can raise on platforms that reject
        # pre-epoch dates.
        seconds.append(parsed.hour * 3600 + parsed.minute * 60)
    return np.array(seconds, dtype=float).reshape(-1, 1)
Don't forget to scale your data since SVC models require data normalization. One can easily combine all the preprocessing steps using the Pipeline:
# Three stages applied in order: convert the time strings with ``transform``,
# rescale with MinMaxScaler, then classify with a linear-kernel SVC.
pipeline_steps = [
    ('transformer', FunctionTransformer(transform, validate=False)),
    ('scaler', MinMaxScaler()),
    ('predictor', SVC(kernel='linear')),
]
pipeline = Pipeline(steps=pipeline_steps)
Finally, let's fit the model:
print('Build and fit a model...')

# Keep 20% of the samples aside for scoring.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# fit() returns the fitted pipeline, so scoring can be chained onto it.
score = pipeline.fit(X_train, y_train).score(X_test, y_test)

print('Done. Score', score)

accuracy of the logistic regression program always differ

import math
import numpy as np
import pandas as pd
#from pandas import DataFrame
from sklearn import preprocessing,cross_validation
from sklearn.linear_model import LogisticRegression
#from sklearn.cross_validation import train_test_split
from numpy import loadtxt, where
from pylab import scatter, show, legend, xlabel, ylabel
# Scaler configured to map each feature column onto the range (0, 1).
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
df = pd.read_excel("Cryotherapy.xlsx", header=0)

# Give the columns explicit names.
df.columns = [
    "sex", "age", "Time", "Number_of_Warts", "Type",
    "Area", "Result_of_Treatment",
]
x = df["Result_of_Treatment"]

# Features: the "Type" and "Area" columns, min-max scaled.
X = min_max_scaler.fit_transform(np.array(df[["Type", "Area"]]))
# Target: the treatment outcome column.
Y = np.array(df["Result_of_Treatment"])

# 60/40 train/test split; no random_state is passed to the split.
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
    X, Y, test_size=0.4
)

# Fit a logistic regression and report its accuracy on the held-out part.
clf = LogisticRegression()
clf.fit(X_train, Y_train)
accuracy = clf.score(X_test, Y_test)
print(accuracy)
Try passing a random_state to the train_test_split function. If you do not, the data is shuffled differently on every run, which produces different train and test sets and therefore a different accuracy each time.
Example:
# Fixing random_state makes the shuffle, and therefore the split, the same on every run.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20; on newer
# versions train_test_split is imported from sklearn.model_selection — confirm the version.
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.4, random_state=1)

Why is my y_pred model only close to zero?

I am new to Python and am also learning machine learning. I got a Titanic data set and am trying to predict who survived and who did not. But my code seems to have an issue with y_pred, as none of the predicted values is close to 1 or above 1. The y_test and y_pred images are attached.
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('train.csv')
# Features: every column except the last one.
X = dataset.iloc[:, :-1].values
# Target: column index 3, selected as a 1-D array.
# NOTE(review): X also spans column 3 if the file has more than four columns,
# so the target would be part of the features — confirm against the CSV layout.
y = dataset.iloc[:, 3].values
# Taking care of missing data: replace missing values in column 2 with the column mean
# NOTE(review): Imputer exists only in older scikit-learn releases — confirm the installed version.
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
imputer = imputer.fit(X[:, 2:3])
X[:, 2:3] = imputer.transform(X[:, 2:3])
# Encoding the categorical variable in column 0: label-encode, then one-hot encode
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# NOTE(review): the categorical_features argument exists only in older scikit-learn releases — confirm.
onehotencoder = OneHotEncoder(categorical_features = [0])
X = onehotencoder.fit_transform(X).toarray()
# Dummy variable trap: drop the first column of the encoded matrix
X = X[:, 1:]
# Splitting the Dataset into Training Set and Test Set (80/20, fixed seed)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Split the dataset into training and test set a second time, without a seed.
# NOTE: the third target is spelled `y_tratin`, so X_train, X_test and y_test are
# replaced by this split while y_train still holds the values from the first split.
from sklearn.model_selection import train_test_split
X_train, X_test, y_tratin, y_test = train_test_split(X, y, test_size = 0.2,)
# Fitting the Multiple Linear Regression to the training set
""" regressor is an object of LinearRegression() class in line 36 """
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
Thanks for the help, everyone; I have been able to sort it out.
The problem was that y, as imported from the dataset, was treated as a vector and not as a matrix:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('train.csv')
# Features: every column except the last one.
X = dataset.iloc[:, :-1].values
# Target: the slice `3:` keeps y two-dimensional (column 3 and any columns after it).
y = dataset.iloc[:, 3:].values
# Taking care of missing data: replace missing values in column 2 with the column mean
# NOTE(review): Imputer exists only in older scikit-learn releases — confirm the installed version.
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
imputer = imputer.fit(X[:, 2:3])
X[:, 2:3] = imputer.transform(X[:, 2:3])
# Encoding the categorical variable in column 0: label-encode, then one-hot encode
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# NOTE(review): the categorical_features argument exists only in older scikit-learn releases — confirm.
onehotencoder = OneHotEncoder(categorical_features = [0])
X = onehotencoder.fit_transform(X).toarray()
# Dummy variable trap: drop the first column of the encoded matrix
X = X[:, 1:]
# Splitting the Dataset into Training Set and Test Set (80/20, single split, no seed)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
# Fitting the Multiple Linear Regression to the training set
""" regressor is an object of LinearRegression() class in line 36 """
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
# Predicting the test set result
y_pred = regressor.predict(X_test)

Resources