'ascii' codec can't encode characters - python-3.x

I use Python 3.8 on Windows. Here is my code:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from matplotlib import pyplot
# get a list of models to evaluate
def get_models():
    models = dict()
    for i in range(2, 29):
        rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=i)
        model = DecisionTreeClassifier()
        models[str(i)] = Pipeline(steps=[('s', rfe), ('m', model)])
    return models
# evaluate a given model using cross-validation
def evaluate_model(model, x, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=7)
    scores = cross_val_score(model, x, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
# (x and y are the feature matrix and labels of my dataset; their definition is not shown here)
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, x, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
The problem is:
D:\software\anaconda3\lib\site-packages\joblib\externals\loky\backend\resource_tracker.py in _send(self, cmd, name, rtype)
202
203 def _send(self, cmd, name, rtype):
--> 204 msg = '{0}:{1}:{2}\n'.format(cmd, name, rtype).encode('ascii')
205 if len(name) > 512:
206 # posix guarantees that writes to a pipe of less than PIPE_BUF
UnicodeEncodeError: 'ascii' codec can't encode characters in position 18-19: ordinal not in range(128)
I have tried many solutions, such as adding:
import sys
import imp
imp.reload(sys)
And I don't think there are any Chinese characters in my dataset.
The thing is, if I remove the n_jobs=-1 parameter from the cross_val_score call, the code works, but it runs really slowly because only one core is used. Is there a solution for that?
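For reference, a hedged workaround sketch (both ideas are assumptions about this setup rather than a confirmed fix): the resource tracker is ASCII-encoding a resource name that apparently contains non-ASCII characters (often from a Windows user or temp path), so you can either point joblib at an ASCII-only temp folder, or keep the parallelism but run it on the threading backend, which bypasses the loky resource tracker. The folder D:\joblib_temp is a made-up example path you would have to create yourself.

import os
os.environ['JOBLIB_TEMP_FOLDER'] = r'D:\joblib_temp'  # ASCII-only temp folder for joblib; set before running in parallel

# Alternative: keep all cores but use threads instead of loky worker processes.
# This may give a smaller speed-up for CPU-bound estimators because of the GIL.
from joblib import parallel_backend

with parallel_backend('threading', n_jobs=-1):
    scores = cross_val_score(model, x, y, scoring='accuracy', cv=cv, error_score='raise')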

Related

Why is sklearn RandomForestClassifier root node different from the most important feature?

How is feature importance calculated in RandomForestClassifier in scikit-learn?
Here's reproducible code. I run the classifier once with criterion set to gini and once to entropy. For each of them, I print the feature importances and plot the tree.
In neither instance is the root node split on the most important feature. Why is that?
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from IPython.display import Image, display
from subprocess import call
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_wine
from sklearn.datasets import load_iris
wines = load_wine()
iris = load_iris()
def create_and_fit(clf, model_name):
    print(clf)
    # define dataset
    X, y = make_classification(n_samples=1000, n_features=10, n_informative=3, n_redundant=5, random_state=seed)
    # X, y = iris.data, iris.target
    # X, y = wines.data, wines.target
    # fit the model
    clf.fit(X, y)
    # get importances
    importance = clf.feature_importances_
    indices = np.argsort(importance)[::-1]
    for f in range(X.shape[1]):
        print("feature {}: ({})".format(indices[f], importance[indices[f]]))
    filename = model_name + model.criterion
    if model_name == 'forest_':
        print('forest')
        export_graphviz(clf.estimators_[0], out_file=filename + '.dot')
    else:
        export_graphviz(clf, out_file=filename + '.dot')
    f = 'tree_' + model.criterion + '.png'
    call(['dot', '-Tpng', filename + '.dot', '-o', filename + '.png', '-Gdpi=600'])
seed = 0
models = [
    RandomForestClassifier(criterion='gini', max_depth=5, random_state=seed),
    RandomForestClassifier(criterion='entropy', max_depth=5, random_state=seed),
]
names = ['forest_', 'forest_']
for name, model in zip(names, models):
    create_and_fit(model, name)
Here's the snippet to load the image:
Image(filename = 'forest_gini'+'.png')
and for the entropy
Image(filename = 'forest_entropy'+'.png')
This behaviour seems to happen only with ensembles, not single trees (I'm generalizing, as I have only tried RandomForestClassifier and DecisionTreeClassifier).
Here's the snippet for decision trees
models = [
    DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=seed),
    DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=seed)
]
names = ['tree_', 'tree_']
for name, model in zip(names, models):
    create_and_fit(model, name)
Here's the snippet to load the image:
Image(filename = 'tree_gini'+'.png')
and for the entropy
Image(filename = 'tree_entropy'+'.png')
I think I found the answer, which is related to the max_features parameter of RandomForestClassifier. Here's the scikit-learn documentation:
max_features : {"sqrt", "log2", None}, int or float, default="sqrt"
The number of features to consider when looking for the best split:
If int, then consider max_features features at each split.
If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.
If "auto", then max_features=sqrt(n_features).
If "sqrt", then max_features=sqrt(n_features).
If "log2", then max_features=log2(n_features).
If None, then max_features=n_features.
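To connect this back to the question, here is a minimal check one could run (a hedged sketch, not part of the original answer; it assumes the same make_classification data as above, and bootstrapping can still make an individual tree's root differ from the forest-level importances even with max_features=None):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=1000, n_features=10, n_informative=3,
                           n_redundant=5, random_state=0)

# With max_features=None every split considers all features, so the
# per-split feature restriction described above no longer applies.
rf = RandomForestClassifier(criterion='gini', max_depth=5, max_features=None,
                            random_state=0)
rf.fit(X, y)

print("most important feature:", np.argmax(rf.feature_importances_))
# feature index used at the root node of the first tree in the ensemble
print("root split of first tree:", rf.estimators_[0].tree_.feature[0])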

Python 3 and Sklearn: Difficulty to use a NOT-sklearn model as a sklearn model

The code below is working. I just have a routine to run a cross-validation scheme using a linear model previously defined in sklearn. I do not have a problem with this. My problem is that if I replace model=linear_model.LinearRegression() with model=RBF('multiquadric') (see the Model Definition lines in __main__), it does not work anymore. So my problem is actually in the class RBF, where I try to mimic a sklearn model.
If I replace the code as described above, I get the following error:
/home/daniel/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details:
ValueError: All arrays must be equal length.
FitFailedWarning)
1) Should I define a score function in the class RBF?
2) How do I do that? I am lost. Since I inherit from BaseEstimator and RegressorMixin, I expected this to be handled internally.
3) Is there something else missing?
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.interpolate import Rbf
np.random.seed(0)
from sklearn.base import BaseEstimator, RegressorMixin

class RBF(BaseEstimator, RegressorMixin):
    def __init__(self, function):
        self.function = function

    def fit(self, x, y):
        self.rbf = Rbf(x, y, function=self.function)

    def predict(self, x):
        return self.rbf(x)

if __name__ == "__main__":
    # Load Data
    targetName = 'HousePrice'
    data = datasets.load_boston()
    featuresNames = list(data.feature_names)
    featuresData = data.data
    targetData = data.target
    df = pd.DataFrame(featuresData, columns=featuresNames)
    df[targetName] = targetData
    independent_variable_list = featuresNames
    dependent_variable = targetName
    X = df[independent_variable_list].values
    y = np.squeeze(df[[dependent_variable]].values)

    # Model Definition
    model = linear_model.LinearRegression()
    #model = RBF('multiquadric')

    # Cross validation routine
    number_splits = 5
    score_list = ['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2']
    kfold = model_selection.KFold(n_splits=number_splits, shuffle=True, random_state=0)
    scalar = StandardScaler()
    pipeline = Pipeline([('transformer', scalar), ('estimator', model)])
    results = model_selection.cross_validate(pipeline, X, y, cv=kfold, scoring=score_list, return_train_score=True)
    for score in score_list:
        print(score + ':')
        print('Train: ' + 'Mean', np.mean(results['train_' + score]), 'Standard Error', np.std(results['train_' + score]))
        print('Test: ' + 'Mean', np.mean(results['test_' + score]), 'Standard Error', np.std(results['test_' + score]))
Let's look at the Rbf documentation:
*args : arrays
    x, y, z, ..., d, where x, y, z, ... are the coordinates of the nodes and d is the array of values at the nodes
So it takes a variable-length argument list, with the last argument being the values, which is y in your case. The k-th argument holds the k-th coordinate of every data point (the same goes for the other coordinate arguments).
Following the documentation, your code should be:
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.interpolate import Rbf
np.random.seed(0)
from sklearn.base import BaseEstimator, RegressorMixin

class RBF(BaseEstimator, RegressorMixin):
    def __init__(self, function):
        self.function = function

    def fit(self, X, y):
        # unpack each column of X (i.e. each coordinate axis) as a separate argument
        self.rbf = Rbf(*X.T, y, function=self.function)

    def predict(self, X):
        return self.rbf(*X.T)

# Load Data
data = datasets.load_boston()
X = data.data
y = data.target

number_splits = 5
score_list = ['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2']
kfold = model_selection.KFold(n_splits=number_splits, shuffle=True, random_state=0)
scalar = StandardScaler()
model = RBF(function='multiquadric')
pipeline = Pipeline([('transformer', scalar), ('estimator', model)])
results = model_selection.cross_validate(pipeline, X, y, cv=kfold, scoring=score_list, return_train_score=True)
for score in score_list:
    print(score + ':')
    print('Train: ' + 'Mean', np.mean(results['train_' + score]), 'Standard Error', np.std(results['train_' + score]))
    print('Test: ' + 'Mean', np.mean(results['test_' + score]), 'Standard Error', np.std(results['test_' + score]))
Output
neg_mean_squared_error:
Train: Mean -1.552450953914355e-20 Standard Error 7.932530906290208e-21
Test: Mean -23.007377210596463 Standard Error 4.254629143836107
neg_mean_absolute_error:
Train: Mean -9.398502208736061e-11 Standard Error 2.4673749061941226e-11
Test: Mean -3.1319779583728673 Standard Error 0.2162343985534446
r2:
Train: Mean 1.0 Standard Error 0.0
Test: Mean 0.7144217179633185 Standard Error 0.08526294242760363
Why *X.T? As we saw, each argument corresponds to one coordinate axis of all the data points, so we transpose X and then use the * operator to unpack each sub-array (each row of X.T) as a separate argument to the variable-length function.
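A tiny illustration of that unpacking (toy numbers, just for intuition):

import numpy as np

X = np.array([[1.0, 10.0],
              [2.0, 20.0],
              [3.0, 30.0]])   # 3 samples, 2 coordinates each
print(X.T[0])                 # all first coordinates: [1. 2. 3.]
print(X.T[1])                 # all second coordinates: [10. 20. 30.]
# so Rbf(*X.T, d) is equivalent to Rbf(X[:, 0], X[:, 1], d)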
It also looks like the latest implementation has a mode parameter where we can pass an N-D array directly.
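Separately, and as an assumption about newer SciPy versions (1.7+) rather than something from the original answer: scipy.interpolate.RBFInterpolator accepts the whole (n_samples, n_features) coordinate array directly, so no *X.T unpacking is needed. A minimal sketch of the same kind of wrapper on top of it (the class name RBFInterp and the epsilon=1.0 default are my own choices; multiquadric-type kernels require an explicit epsilon there):

import numpy as np
from scipy.interpolate import RBFInterpolator
from sklearn.base import BaseEstimator, RegressorMixin

class RBFInterp(BaseEstimator, RegressorMixin):
    def __init__(self, kernel='multiquadric', epsilon=1.0):
        self.kernel = kernel
        self.epsilon = epsilon

    def fit(self, X, y):
        # coordinates go in as a single (n_samples, n_dims) array
        self.rbf_ = RBFInterpolator(X, y, kernel=self.kernel, epsilon=self.epsilon)
        return self

    def predict(self, X):
        return self.rbf_(X)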

Loading pickle NotFittedError: TfidfVectorizer - Vocabulary wasn't fitted

multilabel classification
I am trying to do multilabel classification using scikit-learn/pandas/OneVsRestClassifier/logistic regression. Building and evaluating the model works, but attempting to classify new sample text does not.
Scenario: I build and evaluate the model and save it as sample.pkl. After restarting my kernel, I load the saved model (sample.pkl) and try to predict on sample text, and I get the error:
NotFittedError: TfidfVectorizer - Vocabulary wasn't fitted.
Inference code:
import pickle, os
import collections
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import Counter
from nltk.corpus import stopwords
import json, nltk, re, csv
from sklearn.metrics import f1_score  # performance metric
from sklearn.multiclass import OneVsRestClassifier  # binary relevance
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

stop_words = set(stopwords.words('english'))

def cleanHtml(sentence):
    """ remove html tags """
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext

def cleanPunc(sentence):
    """ clean the sentence of any punctuation or special characters """
    cleaned = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]', r' ', cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n", " ")
    return cleaned

def keepAlpha(sentence):
    """ keep only alphabetic characters """
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

def remove_stopwords(text):
    """ remove stop words """
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)
test1 = pd.read_csv("C:\\Users\\abc\\Downloads\\test1.csv")
test1.columns
test1.head()

siNo  plot                                movie_name        genre_new
1     The story begins with Hannah...     sing              [drama, teen]
2     Debbie's favorite band is Dream..   the bigeest fan   [drama]
3     This story of a Zulu family is ..   come back,africa  [drama, Documentary]
Getting the error
I get the error when I run inference on sample text:
def infer_tags(q):
    q = cleanHtml(q)
    q = cleanPunc(q)
    q = keepAlpha(q)
    q = remove_stopwords(q)
    multilabel_binarizer = MultiLabelBinarizer()
    tfidf_vectorizer = TfidfVectorizer()
    q_vec = tfidf_vectorizer.transform([q])
    q_pred = clf.predict(q_vec)
    return multilabel_binarizer.inverse_transform(q_pred)

for i in range(5):
    print(i)
    k = test1.sample(1).index[0]
    print("Movie: ", test1['movie_name'][k], "\nPredicted genre: ", infer_tags(test1['plot'][k])), print("Actual genre: ", test1['genre_new'][k], "\n")
Solved
I solved it by saving the TfidfVectorizer and MultiLabelBinarizer to pickle files as well:
from sklearn.externals import joblib  # note: in recent scikit-learn versions, use `import joblib` instead

pickle.dump(tfidf_vectorizer, open("tfidf_vectorizer.pickle", "wb"))
pickle.dump(multilabel_binarizer, open("multibinirizer_vectorizer.pickle", "wb"))

vectorizer = joblib.load('/abc/downloads/tfidf_vectorizer.pickle')
multilabel_binarizer = joblib.load('/abc/downloads/multibinirizer_vectorizer.pickle')

def infer_tags(q):
    q = cleanHtml(q)
    q = cleanPunc(q)
    q = keepAlpha(q)
    q = remove_stopwords(q)
    q_vec = vectorizer.transform([q])
    q_pred = rf_model.predict(q_vec)
    return multilabel_binarizer.inverse_transform(q_pred)
I went through the link below and that gave me the solution:
How do I store a TfidfVectorizer for future use in scikit-learn?
This happens because you are only dumping the classifier into the pickle, not the vectorizer.
During inference, when you call tfidf_vectorizer = TfidfVectorizer(), your vectorizer is not fitted on the training vocabulary, which is what causes the error.
What you should do is dump both the classifier and the vectorizer to pickle, and load them both during inference.
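A minimal sketch of that advice (the names tfidf_vectorizer, multilabel_binarizer and clf are taken over from the question; the file name and sample text are assumptions):

import pickle

# at training time, after fitting: persist every fitted object
with open("model_artifacts.pickle", "wb") as f:
    pickle.dump({"vectorizer": tfidf_vectorizer,
                 "binarizer": multilabel_binarizer,
                 "classifier": clf}, f)

# at inference time, after restarting the kernel: load them all back
with open("model_artifacts.pickle", "rb") as f:
    artifacts = pickle.load(f)

q_vec = artifacts["vectorizer"].transform(["some cleaned plot text"])
q_pred = artifacts["classifier"].predict(q_vec)
print(artifacts["binarizer"].inverse_transform(q_pred))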

error as function of dataset size - keras

I am trying to show a simple thing (allegedly): the more examples I have in a range, the better the test performance should be. I vary the number of data points in the dataset over [10, 20, 40, 200, 2000] and look at val_loss, but it doesn't seem that the loss gets lower the more data points there are. My question is: why does the loss not get lower as I add more points to the dataset (is Keras doing what I expect)?
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD, Adam
from keras import regularizers
import numpy as np
#np.random.seed(seed=1)
import matplotlib.pyplot as plt
%matplotlib inline
import random
import math
from numpy.random import seed
import sklearn
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

def xrange(start_point, end_point, N, base):
    temp = np.logspace(0.1, 1, N, base=base, endpoint=False)
    temp = temp - temp.min()
    temp = (0.0 + temp) / (0.0 + temp.max())  # scale to [0, 1]
    return (end_point - start_point) * temp + start_point  # scale to the requested range

def func(x):
    return np.sin(3*x)/2 + np.cos(5*x)/2 + np.sin(20*x)

def train_model(x_train, y_train, x_test):
    #seed(5)
    model = Sequential()
    num_units = 100
    act = 'relu'
    model.add(Dense(num_units, input_shape=(1,), activation=act, kernel_initializer='random_uniform'))
    model.add(Dense(num_units, activation=act, kernel_initializer='random_uniform'))
    model.add(Dense(num_units, activation=act, kernel_initializer='random_uniform'))
    model.add(Dense(num_units, activation=act, kernel_initializer='random_uniform'))
    model.add(Dense(1, activation='tanh'))  # output layer: 1 unit, activation='tanh'
    model.compile(Adam(), 'mean_squared_error', metrics=['mse'])
    history = model.fit(x_train, y_train, batch_size=32, epochs=500, verbose=1, validation_split=0.2)  # train on the noisy targets
    fit = model.predict(x_test)
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    return loss, val_loss, fit
Ns = [10, 20, 40, 200, 2000]
start_point = -5.25
end_point = 5.25
#base = 550
test_step = 0.0007
x_test = np.arange(start_point, end_point, test_step)
y_test = func(x_test)

loss = []
val_loss = []
fit = []
list_y_train = []
list_x_train = []
for N in Ns:
    x_train = np.linspace(start_point, end_point, num=N, endpoint=True)  # alternatively: xrange(start_point, end_point, N, base)
    func_train = func(x_train)
    noise = np.random.uniform(-0.2, 0.2, len(x_train))
    y_train = func_train + noise
    l, v, f = train_model(x_train, y_train, x_test)
    loss.append(l)  # append the training-loss history (the original code appended y_train here by mistake)
    val_loss.append(v)
    fit.append(f)
    list_x_train.append(x_train)
    list_y_train.append(y_train)
y_ideal = func(x_test)

k = 1
for f in fit:
    p = plt.subplot(len(fit), 1, k)
    #plt.plot(x_train, y_ideal, 'k')
    plt.scatter(list_x_train[k-1], list_y_train[k-1], facecolors='none', edgecolors='g')
    plt.scatter(x_test, f, facecolors='none', edgecolors='b')
    plt.plot(x_test, y_ideal, 'k')
    k = k + 1

plt.plot(np.asarray(val_loss[0]), label=str(Ns[0]))
plt.plot(np.asarray(val_loss[1]), label=str(Ns[1]))
plt.plot(np.asarray(val_loss[2]), label=str(Ns[2]))
plt.plot(np.asarray(val_loss[3]), label=str(Ns[3]))
plt.plot(np.asarray(val_loss[4]), label=str(Ns[4]))
plt.legend()
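A small hedged addition (my own suggestion, not from the question): the epoch-by-epoch val_loss curves of runs with different N are hard to compare directly, so one simple way to look at the trend is to summarise each run by its best validation loss.

# compare the best validation loss reached for each dataset size
best_val = [min(v) for v in val_loss]
for n, b in zip(Ns, best_val):
    print("N = %5d  best val_loss = %.4f" % (n, b))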

Trying to run sklearn text classification on Apache Spark: getting "Expected sequence or array-like, got PythonRDD[1] at RDD at PythonRDD.scala:43"

I am trying to run the sklearn SGD classifier on Twitter data which is manually labelled into two classes, 0 and 1.
I am pretty new to Spark and would like your help on this.
I saw some code online and tried to adapt it for my example, but unfortunately it doesn't seem to work and I don't know why.
Your help would be greatly appreciated.
import sys
sys.path.append('/home/userName/Downloads/spark-1.2.1/python')

from pyspark import SparkContext
import numpy as np
import pandas as pd
import re
import pickle
import matplotlib.pyplot as plt
from time import time
from HTMLParser import HTMLParser
from sklearn import cross_validation, metrics, preprocessing
from sklearn.cross_validation import train_test_split, Bootstrap
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, hamming_loss, classification_report, recall_score, precision_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier, RidgeClassifier, Perceptron, PassiveAggressiveClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.utils import shuffle
from sklearn.externals import joblib
from sklearn.grid_search import GridSearchCV
%matplotlib inline

def run(sc):
    u_cols = ['CLASS', 'USER_RATING', 'REVIEW_TEXT']
    df = pd.read_csv('/home/userName/Desktop/input_file.csv', header=1, names=u_cols)

    # Cleaning the data
    lenn = len(df['REVIEW_TEXT'])
    tag_remove = re.compile(r'<[^>]+>')
    for i in range(0, lenn):
        # Removing code blocks
        df['REVIEW_TEXT'][i] = re.sub('<code>.*?</code>', '', df['REVIEW_TEXT'][i])
        # Removing html tags
        df['REVIEW_TEXT'][i] = tag_remove.sub('', df['REVIEW_TEXT'][i])

    X_train = df['REVIEW_TEXT']
    y_train = df['CLASS']
    X_train_final = X_train
    y_train_final = y_train

    # Validation set approach
    X_train_final, X_test_final, y_train_final, y_test_final = cross_validation.train_test_split(
        X_train_final, y_train_final, test_size=0.05, random_state=15)

    vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 20,
                                   non_negative=True, stop_words='english', ngram_range=(1, 2))
    X_train_final = vectorizer.transform(X_train_final)
    X_test_final = vectorizer.transform(X_test_final)

    model = SGDClassifier(alpha=1e-05, class_weight=None, epsilon=0.1, eta0=0.0, fit_intercept=True,
                          l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
                          penalty='l1', power_t=0.5, random_state=None, shuffle=False, verbose=0,
                          warm_start=False)

    samples = sc.parallelize(Bootstrap(y_train_final.shape[0]))
    vote_tally = samples.map(lambda (index, _):
        model.fit(X[index], y[index]).predict(X_test)
    )
    return accuracy_score(y_test_final, vote_tally)

if __name__ == '__main__':
    print run(SparkContext("local", "Boost"))
I am getting the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-1-be25c966218e> in <module>()
107
108 if __name__ == '__main__':
--> 109 print run(SparkContext("local", "Boost"))
110
<ipython-input-1-be25c966218e> in run(sc)
102 )
103
--> 104 return accuracy_score(y_test_final, vote_tally)
105 #print vote_tally.count()
106 #return vote_tally
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/metrics.pyc in accuracy_score(y_true, y_pred, normalize, sample_weight)
1295
1296 # Compute accuracy for each possible representation
-> 1297 y_type, y_true, y_pred = _check_clf_targets(y_true, y_pred)
1298 if y_type == 'multilabel-indicator':
1299 score = (y_pred != y_true).sum(axis=1) == 0
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/metrics.pyc in _check_clf_targets(y_true, y_pred)
107 y_pred : array or indicator matrix
108 """
--> 109 y_true, y_pred = check_arrays(y_true, y_pred, allow_lists=True)
110 type_true = type_of_target(y_true)
111 type_pred = type_of_target(y_pred)
/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.pyc in check_arrays(*arrays, **options)
248 checked_arrays.append(array)
249 continue
--> 250 size = _num_samples(array)
251
252 if size != n_samples:
/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.pyc in _num_samples(x)
172 x = np.asarray(x)
173 else:
--> 174 raise TypeError("Expected sequence or array-like, got %r" % x)
175 return x.shape[0] if hasattr(x, 'shape') else len(x)
176
TypeError: Expected sequence or array-like, got PythonRDD[1] at RDD at PythonRDD.scala:43
The problem is that sklearn components expect sequences/array-like/sparse data to work on, whereas in pyspark you are working with RDDs.
We have a library which can help you solve your problem. It's called sparkit-learn.
Give it a try.
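To give a flavour of what that looks like, here is a rough sketch adapted from my recollection of the sparkit-learn README; treat every class name (DictRDD, SparkHashingVectorizer, SparkTfidfTransformer, SparkLinearSVC, SparkPipeline) and the fit signature as assumptions to double-check against the project's own documentation.

import numpy as np
from splearn.rdd import DictRDD
from splearn.feature_extraction.text import SparkHashingVectorizer
from splearn.feature_extraction.text import SparkTfidfTransformer
from splearn.svm import SparkLinearSVC
from splearn.pipeline import SparkPipeline

X = df['REVIEW_TEXT'].values            # raw tweet texts
y = df['CLASS'].values                  # manual 0/1 labels
X_rdd = sc.parallelize(X, 4)            # distribute the data over 4 partitions
y_rdd = sc.parallelize(y, 4)
Z = DictRDD((X_rdd, y_rdd), columns=('X', 'y'), dtype=[np.ndarray, np.ndarray])

dist_pipeline = SparkPipeline((
    ('vect', SparkHashingVectorizer()),   # distributed counterpart of HashingVectorizer
    ('tfidf', SparkTfidfTransformer()),
    ('clf', SparkLinearSVC()),
))
dist_pipeline.fit(Z, clf__classes=np.unique(y))
predictions = dist_pipeline.predict(Z[:, 'X'])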

Resources