error as function of dataset size - keras - keras

I am trying to show a simple thing (allegedly). I want to show that the more examples I have in a range the better the performance is the test should be. I manipulate the number of data points in dataset [10,20,40,200,2000]. I use the val_loss, but it doesn't seem that the more data points there are the lower the loss should be. My question is why is the loss not getting lower the more points I add to the dataset (is keras do what I expect?).
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD,Adam
from keras import regularizers
import numpy as np
#np.random.seed(seed=1)
import matplotlib.pyplot as plt
%matplotlib inline
import random
import math
from numpy.random import seed
import random
import sklearn
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
def xrange(start_point,end_point,N,base):
temp = np.logspace(0.1, 1, N,base=base,endpoint=False)
temp=temp-temp.min()
temp=(0.0+temp)/(0.0+temp.max()) #this is between 0 and 1
return (end_point-start_point)*temp +start_point #this is the range
def func(x):
function=np.sin(3*x)/2 + np.cos(5*x)/2 + np.sin(20*x)
return function
def train_model(x_train,y_train,x_test):
#seed(5)
model=Sequential()
num_units=100
act='relu'
model.add(Dense(num_units,input_shape=(1,),activation=act,kernel_initializer='random_uniform'))
model.add(Dense(num_units,activation=act,kernel_initializer='random_uniform'))
model.add(Dense(num_units,activation=act,kernel_initializer='random_uniform'))
model.add(Dense(num_units,activation=act,kernel_initializer='random_uniform'))
model.add(Dense(1,activation='tanh')) #output layer 1 unit ; activation='tanh'
model.compile(Adam(),'mean_squared_error',metrics=['mse'])
history=model.fit(x_train,y_train,batch_size=32,epochs=500,verbose=1,validation_split = 0.2 ) #train on the noise (not moshe)
fit=model.predict(x_test)
loss = history.history['loss']
val_loss = history.history['val_loss']
return loss,val_loss,fit
Ns=[10,20,40,200,2000]
start_point=-5.25 #-1
end_point=5.25 #1
#base=550#[0.001,1.5,5,40,400]#545 np.arange(0.05,1,0.05).tolist()#[450,0.75]#[0.5,2,5,10,100,300]#Final:[0.001,500
test_step=0.0007
x_test=np.arange(start_point,end_point,test_step)
y_test=func(x_test)
loss=[]
val_loss=[]
fit=[]
list_y_train=[]
list_x_train=[]
for N in Ns:
x_train=np.linspace(start_point, end_point, num=N, endpoint=True)#xrange(start_point,end_point,N,b)
func_train=func(x_train)#np.sin(3*x_train)/2 + np.cos(5*x_train)/2 + np.sin(7*x_train) #### write a functino
noise=np.random.uniform(-0.2,0.2,len(x_train))
y_train=func_train+noise
l,v,f=train_model(x_train,y_train,x_test)
loss.append(y_train)
val_loss.append(v)
fit.append(f)
list_x_train.append(x_train)
list_y_train.append(y_train)
y_ideal=func(x_test)#func np.sin(3*x_train)/2
k=1
for f in fit:
p=plt.subplot(len(fit), 1, k)
#plt.plot(x_train,y_ideal,'k')
plt.scatter(list_x_train[k-1], list_y_train[k-1], facecolors='none', edgecolors='g') #plt.plot(x_value,sample,'bo')
plt.scatter(x_test, f, facecolors='none', edgecolors='b') #plt.plot(x_value,sample,'bo')
plt.plot(x_test,y_ideal,'k')
k=k+1
plt.plot(np.asarray(val_loss[0]),label=str(Ns[0]))
plt.plot(np.asarray(val_loss[1]),label=str(Ns[1]))
plt.plot(np.asarray(val_loss[2]),label=str(Ns[2]))
plt.plot(np.asarray(val_loss[3]),label=str(Ns[3]))
plt.plot(np.asarray(val_loss[4]),label=str(Ns[4]))
plt.legend()

Related

RandomAdjustSharpness gives IndexError: tuple index out of range

While using RandomAdjustSharpness, my code throws the following error - IndexError: tuple index out of range. I followed the instructions given over here - https://pytorch.org/vision/stable/transforms.html and therefore am confused with this error.
Here is my code -
import math, random
from sklearn.datasets import load_sample_images
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
def random_crop(imgs):
imgs = torch.tensor(imgs)
change = torch.nn.Sequential(
transforms.RandomCrop(427),
transforms.RandomAdjustSharpness(1, p=1)
)
imgs = change(imgs).numpy()
return imgs
###Obtaining a random image and preprocessing it!##
dataset = load_sample_images()
first_img_data = dataset.images[0]
first_img_data = first_img_data.reshape(-1, 427, 640)
first_img_data = first_img_data[1, :, :]
#first_img_data = first_img_data[0:84, 0:84].reshape(-1, 84,84)
# first_img_data = torch.tensor(first_img_data)
plt.figure()
plt.imshow(np.squeeze(first_img_data))
foo = random_crop(first_img_data)
plt.figure()
plt.imshow(np.squeeze(foo))
plt.show()
you need to a dimension to your tensor like this
torch.tensor([imgs])

Python 3 and Sklearn: Difficulty to use a NOT-sklearn model as a sklearn model

The code below is working. I have just a routine to run a cross validation scheme using a linear model previous defined in sklearn. I do not have a problem with this. My problem is that: if I replace the code model=linear_model.LinearRegression() by the model=RBF('multiquadric') (please see line 14 and 15 in the __main__, it does not work anymore. So my problem is actually in the class RBF where I try to mimic a sklearn model.
If I replace the code described above, I get the following error:
FitFailedWarning)
/home/daniel/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details:
ValueError: All arrays must be equal length.
FitFailedWarning)
1) Should I define a score function in the Class RBF?
2) How to do that? I am lost. Since I am inherit BaseEstimator and RegressorMixin, I expected that this was internally solved.
3) Is there something else missing?
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.interpolate import Rbf
np.random.seed(0)
from sklearn.base import BaseEstimator, RegressorMixin
class RBF(BaseEstimator, RegressorMixin):
def __init__(self,function):
self.function=function
def fit(self,x,y):
self.rbf = Rbf(x, y,function=self.function)
def predict(self,x):
return self.rbf(x)
if __name__ == "__main__":
# Load Data
targetName='HousePrice'
data=datasets.load_boston()
featuresNames=list(data.feature_names)
featuresData=data.data
targetData = data.target
df=pd.DataFrame(featuresData,columns=featuresNames)
df[targetName]=targetData
independent_variable_list=featuresNames
dependent_variable=targetName
X=df[independent_variable_list].values
y=np.squeeze(df[[dependent_variable]].values)
# Model Definition
model=linear_model.LinearRegression()
#model=RBF('multiquadric')
# Cross validation routine
number_splits=5
score_list=['neg_mean_squared_error','neg_mean_absolute_error','r2']
kfold = model_selection.KFold(n_splits=number_splits,shuffle=True, random_state=0)
scalar = StandardScaler()
pipeline = Pipeline([('transformer', scalar), ('estimator', model)])
results = model_selection.cross_validate(pipeline, X, y, cv=kfold, scoring=score_list,return_train_score=True)
for score in score_list:
print(score+':')
print('Train: '+'Mean',np.mean(results['train_'+score]),'Standard Error',np.std(results['train_'+score]))
print('Test: '+'Mean',np.mean(results['test_'+score]),'Standard Error',np.std(results['test_'+score]))
Lets look at the documentation here
*args : arrays
x, y, z, …, d, where x, y, z, … are the coordinates of the nodes and d is the array of values at the nodes
So it takes variable length argument with the last argument being the value which is y in your case. Argument k is the kth coordinates of all the data point (same for all other argument z, y, z, ….
Following the documentation, your code should be
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.interpolate import Rbf
np.random.seed(0)
from sklearn.base import BaseEstimator, RegressorMixin
class RBF(BaseEstimator, RegressorMixin):
def __init__(self,function):
self.function=function
def fit(self,X,y):
self.rbf = Rbf(*X.T, y,function=self.function)
def predict(self,X):
return self.rbf(*X.T)
# Load Data
data=datasets.load_boston()
X = data.data
y = data.target
number_splits=5
score_list=['neg_mean_squared_error','neg_mean_absolute_error','r2']
kfold = model_selection.KFold(n_splits=number_splits,shuffle=True, random_state=0)
scalar = StandardScaler()
model = RBF(function='multiquadric')
pipeline = Pipeline([('transformer', scalar), ('estimator', model)])
results = model_selection.cross_validate(pipeline, X, y, cv=kfold, scoring=score_list,return_train_score=True)
for score in score_list:
print(score+':')
print('Train: '+'Mean',np.mean(results['train_'+score]),'Standard Error',np.std(results['train_'+score]))
print('Test: '+'Mean',np.mean(results['test_'+score]),'Standard Error',np.std(results['test_'+score]))
Output
neg_mean_squared_error:
Train: Mean -1.552450953914355e-20 Standard Error 7.932530906290208e-21
Test: Mean -23.007377210596463 Standard Error 4.254629143836107
neg_mean_absolute_error:
Train: Mean -9.398502208736061e-11 Standard Error 2.4673749061941226e-11
Test: Mean -3.1319779583728673 Standard Error 0.2162343985534446
r2:
Train: Mean 1.0 Standard Error 0.0
Test: Mean 0.7144217179633185 Standard Error 0.08526294242760363
Why *X.T : As we saw, each argument correspond to an axis of all the data points, so we transpose them and then use * operator to expand and pass each of the sub array as an argument to the variable length function.
Looks like the latest implementation has a mode parameter where we can pass the N-D array directly.

When I applied RandomForest in Python, ValueError: Found input variables with inconsistent numbers of samples: [2883, 1236]

File "D:\Users\Watson Rockstar\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 205, in check_consistent_length
" samples: %r" % [int(l) for l in lengths])
ValueError:
Found input variables with inconsistent numbers of samples: [2883, 1236]
This dataset totally has 4119 data, and the Xtrain volum= (2883,18), Xtest volum = (1236,18)
I have tried to use LabelEncoder and OneHotEncoder to sovle the problems, but it is not helpful:
# Ignore the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import missingno as msno
#configure
# sets matplotlib to inline and displays graphs below the corressponding cell.
#import the necessary modelling algos.
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
#preprocessing
from sklearn.preprocessing import LabelEncoder
telebanking = pd.read_csv('bank-additional.csv')
telebank = telebanking.drop(['duration','default'],axis =1)
def transform(feature):
le = LabelEncoder()
telebank[feature] = le.fit_transform(telebank[feature])
print(le.classes_)
cat_telebank=telebank.select_dtypes(include='object')
cat_telebank.columns
for col in cat_telebank.columns:
transform(col)
scaler=StandardScaler()
scaled_telebank=scaler.fit_transform(telebank.drop('y',axis=1))
X=scaled_telebank
Y=telebank['y'].as_matrix()
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,Y,test_size=0.3)
def compare(model):
clf = model
clf.fit(Xtrain,Ytrain)
pred = clf.predict(Xtrain)
acc.append(accuracy_score(pred,Ytest))
prec.append(precision_score(pred,Ytest))
rec.append(recall_score(pred,Ytest))
auroc.append(roc_auc_score(pred,Ytest))
acc=[]
prec=[]
rec=[]
auroc=[]
models=[RandomForestClassifier(),DecisionTreeClassifier()]
model_names=['RandomForestClassifier','DecisionTreeClassifier']
for model in range(len(models)):
compare(models[model])
d={'Modelling Algo':model_names,'Accuracy':acc,'Precision':prec,'Recall':rec,'Area Under ROC Curve':auroc}
met_telebank=pd.DataFrame(d)
met_telebank
It is the first warning's detail.
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,Y,test_size=0.3)
should be
Xtrain,Ytrain,Xtest,Ytest = train_test_split(X,Y,test_size=0.3)
This is causing the error, because it wants to use Xtest as the Ytrain values.

Plotting Grid Search CV grid parameter results on one graph

In sklearn 0.17.1 there was-->> grid_scores_ : list of named tuples (https://scikit-learn.org/0.17/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV)
Now in sklearn 0.21.2 it is replaced with-->> cv_results_ : dict of numpy (masked) ndarrays (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
Previously with sklearn 0.17.1, I was able to plot all grid parameters on a single plot using grid_scores_ but now I am unable to aggregate the values obtained from cv_results_ as there is no "mean_validation_score" in newer version.
I have an existing code which plotted all the parameters score in sklearn 0.17.1 (https://scikit-learn.org/0.17/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV) where grid_scores_ was used and it perfectly plotted all the values on one plot.
In newer version of slearn cv_results_ has been replaced with grid_scores_. I have tried to append all the values in want to plot all the parameters on one plot, currently I am unable to add the correct values to plot on the graph.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics.ranking import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from sklearn.metrics import accuracy_score
import sklearn
import itertools
from pandas.tools.plotting import scatter_matrix
import os
import datetime as dt
from operator import itemgetter
from itertools import chain
import graphviz
from sklearn.metrics import precision_recall_fscore_support
import scikitplot as skplt
X_train = np.random.randint(0,1, size=[500,5000])
y_train = np.random.randint(0,1, size=500)
print(X_train.shape, y_train.shape)
# (500, 5000) (500,)
#grid_search = GridSearchCV(clf, param_grid, cv=3) # 10 fold cross validation
### hyperparameter estimator
param_grid = {"criterion": ["gini", "entropy"],
"splitter": ["best", "random"],
"max_depth": np.arange(1,9,7),
"min_samples_split": np.arange(2,150,90),
"min_samples_leaf": np.arange(1,60,45),
"min_weight_fraction_leaf": np.arange(0.1,0.4, 0.3),
"max_features": [1000, 500, 5000],
"max_leaf_nodes": np.arange(2,60,45),
"min_impurity_decrease": [0.0, 0.5],
}
def evaluate_param(parameter, param_range, index):
grid_search = GridSearchCV(clf, param_grid = {parameter: param_range}, cv=3) # 3 fold cross validation
grid_search.fit(X_train, y_train) ### grid_search.fit(X_train[features], y_train)
df = {}
#for i, score in enumerate(grid_search.grid_scores_): # previously used methods
for i, score in enumerate(grid_search.cv_results_["params"]):
## How do we save the correct values here for plotting
df[parameter] = grid_search.cv_results_["params"][i][parameter]
#df[parameter].update(grid_search.cv_results_["params"][i][parameter])
#print("df : ", df)
#df[parameter].append(grid_search.cv_results_["params"][i][parameter])
#print("df : ", df) # the values are not appended to the keys
df = pd.DataFrame.from_dict(df, orient='index')
df.reset_index(level=0, inplace=True)
df = df.sort_values(by='index')
plt.subplot(5,2,index) # Change here according to the number of parameters
plt.xlabel(parameter, color = "red")
plt.ylabel("GridSearchCV Score", color= "blue")
plot = plt.plot(df['index'], df[0])
plt.title(parameter.capitalize(), color = "red")
plt.savefig('DT_GridSearchCV_Score_Hyperparameter.png')
return plot, df
clf = tree.DecisionTreeClassifier(random_state=99) # verbose=True, n_jobs=-1 :: Dt does not support it
### hyperparameter estimator
index = 1
plt.figure(figsize=(30,30))
for parameter, param_range in dict.items(param_grid):
evaluate_param(parameter, param_range, index) ## 120 features
index += 1
This image is not filled as there is no "mean_validation_score" which can be filled for each subplot now:
https://ibb.co/Z6jwnMr
## Keys() gives the list of keys that gridsearchcv has:
grid_search.cv_results_.keys()
# output
# dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_criterion', 'param_max_depth', 'param_max_features', 'param_max_leaf_nodes', 'param_min_impurity_decrease', 'param_min_samples_leaf', 'param_min_samples_split', 'param_min_weight_fraction_leaf', 'param_splitter', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'mean_train_score', 'std_train_score'])
grid_search.best_estimator_
# output
# DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
#max_features=1000, max_leaf_nodes=2, min_impurity_decrease=0.0,
#min_impurity_split=None, min_samples_leaf=1,
#min_samples_split=2, min_weight_fraction_leaf=0.1,
#presort=False, random_state=99, splitter='best')
Expected Result (should be filled): https://ibb.co/Z6jwnMr
However each subplot on the plot should have a curve depicting best value for the parameter. The keys do not have a "mean_validation_score" to plot the actual test score which was there in sklearn 0.17.1 but not in sklearn 0.20.2
Kindly let me know if there is still a way to plot all test scores on subplots of a single plot. Thanks in advance!!

Trying to run sklearn text classification on Apache Spark..GETTING Expected sequence or array-like, got PythonRDD[1] at RDD at PythonRDD.scala:43

I am trying to run sklearn SDG classifier on twitter data which is manually labelled into two classes 0 and 1.
I am pretty new to spark and would like your help on this.
I saw some code online and tried to simulate for my example but unfortunately it doesnt seem to work and I dont know why.
Your help would be greatly appreciated.
import sys
sys.path.append('/home/userName/Downloads/spark-1.2.1/python')
from pyspark import SparkContext
import numpy as np
from sklearn.cross_validation import train_test_split, Bootstrap
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.metrics import hamming_loss
from sklearn import cross_validation
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing
import pandas as pd;
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from time import time
from sklearn.externals import joblib
import re
from HTMLParser import HTMLParser
from sklearn.grid_search import GridSearchCV
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
%matplotlib inline
def run(sc):
u_cols = ['CLASS','USER_RATING', 'REVIEW_TEXT']
df =
pd.read_csv('/home/userName/Desktop/input_file.csv',header=1,names=u_cols)
#Cleaning the data
lenn = len(df['REVIEW_TEXT'])
tag_remove = re.compile(r'<[^>]+>')
for i in range(0,lenn):
#Removing code block
df['REVIEW_TEXT'][i] = re.sub('<code>.*?</code>', '', df['REVIEW_TEXT'][i])
#Removeing html tags
df['REVIEW_TEXT'][i] = tag_remove.sub('', df['REVIEW_TEXT'][i])
X_train = df['REVIEW_TEXT']
y_train = df['CLASS']
X_train_final = X_train
y_train_final = y_train
#Validation Set Approach
X_train_final, X_test_final, y_train_final, y_test_final = cross_validation.train_test_split(
X_train_final, y_train_final, test_size=0.05, random_state=15)
vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 20,
non_negative=True, stop_words = 'english', ngram_range = (1,2))
X_train_final = vectorizer.transform(X_train_final)
X_test_final = vectorizer.transform(X_test_final)
model = (SGDClassifier(alpha=1e-05, class_weight=None, epsilon=0.1, eta0=0.0,fit_intercept=True,
l1_ratio=0.15, learning_rate='optimal',loss='hinge', n_iter=5, n_jobs=1,
penalty='l1', power_t=0.5,random_state=None, shuffle=False, verbose=0,
warm_start=False))
samples = sc.parallelize(Bootstrap(y_train_final.shape[0]))
vote_tally = samples.map(lambda (index, _):
model.fit(X[index], y[index]).predict(X_test)
)
return accuracy_score(y_test_final, vote_tally)
if __name__ == '__main__':
print run(SparkContext("local", "Boost"))
getting the following ERROR:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-1-be25c966218e> in <module>()
107
108 if __name__ == '__main__':
--> 109 print run(SparkContext("local", "Boost"))
110
<ipython-input-1-be25c966218e> in run(sc)
102 )
103
--> 104 return accuracy_score(y_test_final, vote_tally)
105 #print vote_tally.count()
106 #return vote_tally
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/metrics.pyc in accuracy_score(y_true, y_pred, normalize, sample_weight)
1295
1296 # Compute accuracy for each possible representation
-> 1297 y_type, y_true, y_pred = _check_clf_targets(y_true, y_pred)
1298 if y_type == 'multilabel-indicator':
1299 score = (y_pred != y_true).sum(axis=1) == 0
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/metrics.pyc in _check_clf_targets(y_true, y_pred)
107 y_pred : array or indicator matrix
108 """
--> 109 y_true, y_pred = check_arrays(y_true, y_pred, allow_lists=True)
110 type_true = type_of_target(y_true)
111 type_pred = type_of_target(y_pred)
/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.pyc in check_arrays(*arrays, **options)
248 checked_arrays.append(array)
249 continue
--> 250 size = _num_samples(array)
251
252 if size != n_samples:
/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.pyc in _num_samples(x)
172 x = np.asarray(x)
173 else:
--> 174 raise TypeError("Expected sequence or array-like, got %r" % x)
175 return x.shape[0] if hasattr(x, 'shape') else len(x)
176
**TypeError: Expected sequence or array-like, got PythonRDD[1] at RDD at PythonRDD.scala:43**
The problem is that sklearn components expects sequences/array-like/sparse/etc. data to work on, but you work with RDDs in pyspark.
We have a library which can help you solve your problem. It's called sparkit-learn.
Give it a try.

Resources