I was hoping to reproduce the following python k-fold target encoding strategy in pure Spark:
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
import numpy as np
dataset = pd.read_csv("dataset.csv")
cols_to_encode = ["cat_1", "cat_2"]
X_train, X_test, y_train, y_test = train_test_split(dataset[cols_to_encode], dataset["target"], test_size=0.25, random_state=0)
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
encoder = ce.WOEEncoder(cols=cols_to_encode)
X_train_fitted = pd.DataFrame([], columns = X_train.columns)
kf = KFold(n_splits = 5, shuffle = False, random_state=0)
for tr_ind, val_ind in kf.split(X_train, y_train):
encoder.fit(X_train.loc[tr_ind], y_train.loc[tr_ind])
X_train_fitted = X_train_fitted.append(encoder.transform(X_train.loc[val_ind]))
encoder.fit(X_train, y_train)
X_test_fitted = encoder.transform(X_test)
C = np.logspace(0, 4, num = 10)
penalty = ['l1', 'l2', 'elasticnet', 'none']
solver = ['liblinear', 'saga', "sag", "lbfgs"]
params = dict(C=C, penalty=penalty, solver=solver)
param_comb = 10
lr = LogisticRegression(random_state=0)
#preserving same cv
random_search = RandomizedSearchCV(lr, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=2, verbose=3, random_state=0, cv = kf.split(X_train, y_train))
random_search.fit(X_train_fitted, y_train)
As I didn't find any current implementation, I'm trying to write my own.
So far, for the mean encoding strategy all I could come with was using a group by in the training set, creating a dataframe that I can join with the validation set. This approach looks rather manual and hard to reproduce for more complex encodings.
I expect to reproduce it in a more spark-like way, in the same way the encoder worked in the above python code.
Edit: An attempt of the code in Spark:
import spark.implicits._
import org.apache.spark.ml.Pipeline
val simpleData = Seq(("James","Sales","NY",90000,34,10000),
val df = simpleData.toDF("employee_name","department","salary","state","age","bonus")
val splitDF = df.randomSplit(Array(1,1,1,1,1))
val (df1,df2,df3,df4,df5) = (splitDF(0),splitDF(1),splitDF(2),splitDF(3),splitDF(4))
val df1_encoded_train = df2.union(df3).union(df4).union(df5).groupBy("department").mean("bonus")
val df1_encoded_val = df1.join(df1_encoded_train, Seq("department"), "left")
val df2_encoded_train = df1.union(df3).union(df4).union(df5).groupBy("department").mean("bonus")
val df2_encoded_val = df2.join(df2_encoded_train, Seq("department"), "left")
val df3_encoded_train = df1.union(df2).union(df4).union(df5).groupBy("department").mean("bonus")
val df3_encoded_val = df3.join(df3_encoded_train, Seq("department"), "left")
val df4_encoded_train = df1.union(df2).union(df3).union(df5).groupBy("department").mean("bonus")
val df4_encoded_val = df4.join(df4_encoded_train, Seq("department"), "left")
val df5_encoded_train = df1.union(df2).union(df3).union(df4).groupBy("department").mean("bonus")
val df5_encoded_val = df5.join(d5_encoded_train, Seq("department"), "left")
val df_encoded = df1_encoded_val.union(df2_encoded_val).union(df3_encoded_val).union(df4_encoded_val).union(df5_encoded_val)


Linear Regression Using sklearn issues with reshape code

I've got my data cleaned and prepped. I've done a split test and am now trying to do a linear regression. The issue is, when I first tried it, it say that I needed to create an array and reshape the data. I have done this, but now it's giving me an error " _reshape_dispatcher() missing 1 required positional argument: 'newshape'". All of the methods I've looked up to declare a newshape have not worked.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
df = pd.read_csv('googleplaystore.csv') # 1
df = df.dropna() # 3
df['Size'] = df['Size'].str.extract(r'(\d+\.?\d)', expand=False).astype(float) * df['Size'].str[-1].replace({'M': 1024, 'k': 1}) # 4
df = df.dropna() # remove nan from "Varies with device"
df['Price'] = df['Price'].str.strip('$').astype(float) # 5
df['Installs'] = df['Installs'].str.strip('+')
df['Installs'] = df['Installs'].str.replace(',',"").astype(int)
df['Reviews'] = df['Reviews'].astype(float)
df['Size'] = df['Size'].astype(float)
df = df.loc[df['Rating'].between(1, 5)] # 6
df = df.loc[df['Type'] != 'Free'] # 7
df.drop(df[df['Price'] >= 200].index, inplace = True)
df.drop(df[df['Reviews'] >2000000].index, inplace = True)
df.drop(df[df['Installs'] >10000].index, inplace = True)
inp1 = df.copy()
del df['App']
del df['Last Updated']
del df['Current Ver']
del df['Android Ver']
pd.get_dummies(df, columns=['Category', 'Genres', 'Content Rating'], drop_first=True)
inp2 = df.copy()
df_train = X_train,X_test,y_train,y_test=train_test_split(df['Reviews'],df['Installs'], test_size=0.7, random_state=0)
df_test = X_train,X_Test,y_train,y_test=train_test_split(df['Reviews'],df['Installs'], test_size=0.3, random_state=0)
df_train = np.array(df_train)
df_test = np.array(df_test)
df_train = np.reshape(df_train.shape)
df_test = np.reshape(df_test.shape)
lr = LinearRegression()

Error saving model in sklearn2pmml using VotingClassifier

I'm new to programming and I'm having a little trouble saving a model in pmml. I have a database and I need to make a selection of attributes, then use the majority vote and finally save in pmml. Even the majority vote part works, but when I save the model on the last line using sklearn2pmml it gives an error.
from pandas import read_csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.metrics import accuracy_score
from sklearn2pmml import make_pmml_pipeline
from sklearn2pmml import sklearn2pmml
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn.ensemble import VotingClassifier
import joblib
url = 'D:/treinamento.CSV'
df = read_csv(url, header=None)
data = df.values
url_test = 'D:/TESTE.CSV'
df_test = read_csv(url_test, header=None)
data_test = df_test.values
X = data[:, :-1]
y = data_test[:, -1]
X_train = data[:, :-1]
X_test = data_test[:, :-1]
y_train = data[:, -1]
y_test = y
#features selection
features1 = [2, 5, 7]
features2 = [0, 1, 4, 5, 7]
features3 = [0, 1, 4, 5, 6]
features4 = [1, 4]
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
preprocessor1 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features1)])
preprocessor2 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features2)])
preprocessor3 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features3)])
preprocessor4 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features4)])
pipe1 = PMMLPipeline(steps=[('preprocessor', preprocessor1),('classifier', DecisionTreeClassifier(min_samples_split = 2))])
pipe2 = PMMLPipeline(steps=[('preprocessor', preprocessor2),('classifier', DecisionTreeClassifier(min_samples_split = 2))])
pipe3 = PMMLPipeline(steps=[('preprocessor', preprocessor3),('classifier', DecisionTreeClassifier(min_samples_split = 2))])
pipe4 = PMMLPipeline(steps=[('preprocessor', preprocessor4),('classifier', DecisionTreeClassifier(min_samples_split = 2))])
eclf = VotingClassifier(estimators=[('pipe1', PMMLPipeline(steps=[('preprocessor', preprocessor1),('classifier', DecisionTreeClassifier(min_samples_split = 2))])),
('pipe2', PMMLPipeline(steps=[('preprocessor', preprocessor2),('classifier', DecisionTreeClassifier(min_samples_split = 2))])),
('pipe3', PMMLPipeline(steps=[('preprocessor', preprocessor3),('classifier', DecisionTreeClassifier(min_samples_split = 2))])),
('pipe4', PMMLPipeline(steps=[('preprocessor', preprocessor4),('classifier', DecisionTreeClassifier(min_samples_split = 2))]))], voting='hard', weights=[1,1,1,1])
eclf.fit(X_train, y_train)
yhat = eclf.predict(X_test)
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.3f' % (accuracy * 100))
sklearn2pmml(eclf, "D:/Mestrado/ARTIGO DRC/dados_pos_revisao/cross validation - dados reavaliados/4 revisao/5 FOLDS/1 FOLD/eclf.pmml", with_repr = True)
Code error
65 sklearn2pmml(eclf, "D:/mest/eclf.pmml", with_repr = True)
~\anaconda3\lib\site-packages\sklearn2pmml\__init__.py in sklearn2pmml(pipeline, pmml, user_classpath, with_repr, debug, java_encoding)
222 print("{0}: {1}".format(java_version[0], java_version[1]))
223 if not isinstance(pipeline, PMMLPipeline):
--> 224 raise TypeError("The pipeline object is not an instance of " + PMMLPipeline.__name__ + ". Use the 'sklearn2pmml.make_pmml_pipeline(obj)' utility function to translate a regular Scikit-Learn estimator or pipeline to a PMML pipeline")
225 estimator = pipeline._final_estimator
226 cmd = ["java", "-cp", os.pathsep.join(_classpath(user_classpath)), "org.jpmml.sklearn.Main"]
TypeError: The pipeline object is not an instance of PMMLPipeline. Use the 'sklearn2pmml.make_pmml_pipeline(obj)' utility function to translate a regular Scikit-Learn estimator or pipeline to a PMML pipeline
The pipeline object is not an instance of PMMLPipeline
Did you read the SkLearn2PMML error message or not? Probably not, because it clearly states what's the issue!
You're using the PMMLPipeline class in completely wrong places. It should be used only as the topmost wrapper to the VotingClassifier estimator.
Please reorganize your code like this:
pipeline = PMMLPipeline([
("classifier", VotingClassifier([
("pipe1", Pipeline(...)),
("pipe2", Pipeline(...)),
("pipe3", Pipeline(...))
sklearn2pmml(pipeline, "pipeline.pmml")

Confusion Matrix to get precsion,recall, f1score

I have a dataframe df. I have performed decisionTree classification algorithm on the dataframe. The two columns are label and features when algorithm is performed. The model is called dtc. How can I create a confusion matrix in pyspark?
dtc = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label')
dtcModel = dtc.fit(train)
predictions = dtcModel.transform(test)
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import MulticlassMetrics
preds = df.select(['label', 'features']) \
.df.map(lambda line: (line[1], line[0]))
metrics = MulticlassMetrics(preds)
# Confusion Matrix
You need to cast to an rdd and map to tuple before calling metrics.confusionMatrix().toArray().
From the official documentation,
class pyspark.mllib.evaluation.MulticlassMetrics(predictionAndLabels)[source]
Evaluator for multiclass classification.
Parameters: predictionAndLabels – an RDD of (prediction, label) pairs.
Here is an example to guide you.
ML part
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType
#Note the differences between ml and mllib, they are two different libraries.
#create a sample data frame
data = [(1.54,3.45,2.56,0),(9.39,8.31,1.34,0),(1.25,3.31,9.87,1),(9.35,5.67,2.49,2),\
cols = ('a','b','c','d')
df = spark.createDataFrame(data, cols)
assembler = VectorAssembler(inputCols=['a','b','c'], outputCol='features')
df_features = assembler.transform(df)
train_data, test_data = df_features.randomSplit([0.6,0.4])
dtc = DecisionTreeClassifier(featuresCol='features',labelCol='d')
dtcModel = dtc.fit(train_data)
predictions = dtcModel.transform(test_data)
Evaluation part
#important: need to cast to float type, and order by prediction, else it won't work
preds_and_labels = predictions.select(['predictions','d']).withColumn('label', F.col('d').cast(FloatType())).orderBy('prediction')
#select only prediction and label columns
preds_and_labels = preds_and_labels.select(['prediction','label'])
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
Use this:
import sklearn
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label', numTrees=500)
rfModel = rf.fit(train)
predictions_train = rfModel.transform(train)
y_true = predictions_train.select(['label']).collect()
y_pred = predictions_train.select(['prediction']).collect()
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))
where train is your training data.

Why am I getting the same value for the accuracy and recall when using spark's mllib's MulticlassClassificationEvaluator?

So, I am having a play around with some tree based algorithms from Spark's mllib. The code I have is here;
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier, DecisionTreeClassifier)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
conf = SparkConf()
conf.set('spark.logConf', 'true').set("spark.ui.port", "4060")
spark = SparkSession.builder.config(conf=conf).appName("Gradient Boosted Tree").getOrCreate()
data = spark.read.parquet('/mydata/location)
def yt_func(x):
if x <= 10:
yt = 0
yt = 1
return yt
yt_udf = udf(yt_func, IntegerType())
data = data.withColumn('yt_1',yt_udf(data['count']))
datasub = data.select('feature1', 'feature2',
'feature3', 'feature4',
'feature5', 'feature6',
'feature7', 'feature8',
'feature9', 'feature10',
datasub = datasub.na.fill(0)
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols = ['feature1', 'feature2',
'feature3', 'feature4',
'feature5', 'feature6',
'feature7', 'feature8',
'feature9', 'feature10',
'feature13'], outputCol = 'features')
output = assembler.transform(datasub)
finaldata = output.select('features','yt_1')
train_data,test_data = finaldata.randomSplit([0.7,0.3])
dtc = DecisionTreeClassifier(featuresCol='features',labelCol='yt_1')
rfc = RandomForestClassifier(featuresCol='features',labelCol='yt_1', numTrees=70)
gbt = GBTClassifier(featuresCol='features',labelCol='yt_1')
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)
accuracy_eval = MulticlassClassificationEvaluator(metricName = 'accuracy', labelCol='yt_1')
recall_eval = MulticlassClassificationEvaluator(metricName = 'weightedRecall', labelCol='yt_1')
print 'dtc accuracy:', accuracy_eval.evaluate(dtc_preds)
print 'dtc recall', recall_eval.evaluate(dtc_preds)
print 'rfc accuracy:', accuracy_eval.evaluate(rfc_preds)
print 'rfc recall', recall_eval.evaluate(rfc_preds)
print 'gbt accuracy:', accuracy_eval.evaluate(gbt_preds)
print 'gbt recall', recall_eval.evaluate(gbt_preds)
When I run this I get the following;
dtc accuracy: 0.98596755767033761
dtc recall: 0.98596755767033761
rfc accuracy: 0.98551077243825225
rfc recall: 0.98551077243825225
gbt accuracy: 0.98624595624862965
gbt recall: 0.98624595624862965
What is confusing me here is why I am getting the same values for the accuracy and the recall.... they are EXACTLY the same. Surely this isn't correct....??
Any ideas?
An answer to this question can be found where I posted the same question on the Data Science Stack Exchange

How to cross validate RandomForest model?

I want to evaluate a random forest being trained on some data. Is there any utility in Apache Spark to do the same or do I have to perform cross validation manually?
ML provides CrossValidator class which can be used to perform cross-validation and parameter search. Assuming your data is already preprocessed you can add cross-validation as follows:
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
// [label: double, features: vector]
trainingData org.apache.spark.sql.DataFrame = ???
val nFolds: Int = ???
val numTrees: Int = ???
val metric: String = ???
val rf = new RandomForestClassifier()
val pipeline = new Pipeline().setStages(Array(rf))
val paramGrid = new ParamGridBuilder().build() // No parameter search
val evaluator = new MulticlassClassificationEvaluator()
// "f1" (default), "weightedPrecision", "weightedRecall", "accuracy"
val cv = new CrossValidator()
// ml.Pipeline with ml.classification.RandomForestClassifier
// ml.evaluation.MulticlassClassificationEvaluator
val model = cv.fit(trainingData) // trainingData: DataFrame
Using PySpark:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
trainingData = ... # DataFrame[label: double, features: vector]
numFolds = ... # Integer
rf = RandomForestClassifier(labelCol="label", featuresCol="features")
evaluator = MulticlassClassificationEvaluator() # + other params as in Scala
pipeline = Pipeline(stages=[rf])
paramGrid = (ParamGridBuilder.
.addGrid(rf.numTrees, [3, 10])
.addGrid(...) # Add other parameters
crossval = CrossValidator(
model = crossval.fit(trainingData)
To build on zero323's great answer using Random Forest Classifier, here is a similar example for Random Forest Regressor:
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.ml.regression.RandomForestRegressor // CHANGED
import org.apache.spark.ml.evaluation.RegressionEvaluator // CHANGED
import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
val numFolds = ??? // Integer
val data = ??? // DataFrame
// Training (80%) and test data (20%)
val Array(train, test) = data.randomSplit(Array(0.8,0.2))
val featuresCols = data.columns
val va = new VectorAssembler()
val vi = new VectorIndexer()
val regressor = new RandomForestRegressor()
val metric = "rmse"
val evaluator = new RegressionEvaluator()
// "rmse" (default): root mean squared error
// "mse": mean squared error
// "r2": R2 metric
// "mae": mean absolute error
val paramGrid = new ParamGridBuilder().build()
val cv = new CrossValidator()
val model = cv.fit(train) // train: DataFrame
val predictions = model.transform(test)
val rmse = evaluator.evaluate(predictions)
Evaluator metric source:
