I was hoping to reproduce the following python k-fold target encoding strategy in pure Spark:
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
import numpy as np
# Load the data and keep only the categorical columns that will be encoded.
dataset = pd.read_csv("dataset.csv")
cols_to_encode = ["cat_1", "cat_2"]
X_train, X_test, y_train, y_test = train_test_split(
    dataset[cols_to_encode], dataset["target"], test_size=0.25, random_state=0
)
# Reset to a 0..n-1 index so the positional indices yielded by KFold.split
# can be used directly with .loc below.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

encoder = ce.WOEEncoder(cols=cols_to_encode)

# random_state is only meaningful when shuffle=True; recent scikit-learn
# versions raise a ValueError if it is set while shuffle=False.
kf = KFold(n_splits=5, shuffle=False)
# Materialise the folds once so the encoding loop and the hyper-parameter
# search below are guaranteed to use exactly the same splits.
folds = list(kf.split(X_train, y_train))

# Out-of-fold encoding: fit the encoder on the training part of each fold and
# transform only the held-out part, so no row is encoded with its own target.
encoded_folds = []
for tr_ind, val_ind in folds:
    encoder.fit(X_train.loc[tr_ind], y_train.loc[tr_ind])
    encoded_folds.append(encoder.transform(X_train.loc[val_ind]))
# DataFrame.append was removed in pandas 2.0; concatenate the pieces instead.
X_train_fitted = pd.concat(encoded_folds)

# For the test set the encoder is refitted on the whole training set.
encoder.fit(X_train, y_train)
X_test_fitted = encoder.transform(X_test)

C = np.logspace(0, 4, num=10)
penalty = ['l1', 'l2', 'elasticnet', 'none']
solver = ['liblinear', 'saga', "sag", "lbfgs"]
params = dict(C=C, penalty=penalty, solver=solver)
param_comb = 10
lr = LogisticRegression(random_state=0)
# preserving same cv
random_search = RandomizedSearchCV(
    lr,
    param_distributions=params,
    n_iter=param_comb,
    scoring='roc_auc',
    n_jobs=2,
    verbose=3,
    random_state=0,
    cv=folds,
)
random_search.fit(X_train_fitted, y_train)
As I didn't find any current implementation, I'm trying to write my own.
So far, for the mean encoding strategy, all I could come up with was a group-by on the training set, which produces a DataFrame that I can join with the validation set. This approach is rather manual and hard to reproduce for more complex encodings.
I expect to reproduce it in a more spark-like way, in the same way the encoder worked in the above python code.
Edit: An attempt of the code in Spark:
import spark.implicits._
import org.apache.spark.ml.Pipeline
val simpleData = Seq(("James","Sales","NY",90000,34,10000),
// Column names follow the order of the tuple fields: the third field is the
// state ("NY") and the fourth is the salary (90000).
val df = simpleData.toDF("employee_name","department","state","salary","age","bonus")

// Five roughly equal random folds (equal weights).
val splitDF = df.randomSplit(Array(1,1,1,1,1))
val (df1,df2,df3,df4,df5) = (splitDF(0),splitDF(1),splitDF(2),splitDF(3),splitDF(4))

// For each fold: compute the mean bonus per department on the other four
// folds, then left-join that encoding onto the held-out fold. Departments
// that do not occur in the other four folds get a null encoding.
val df1_encoded_train = df2.union(df3).union(df4).union(df5).groupBy("department").mean("bonus")
val df1_encoded_val = df1.join(df1_encoded_train, Seq("department"), "left")
val df2_encoded_train = df1.union(df3).union(df4).union(df5).groupBy("department").mean("bonus")
val df2_encoded_val = df2.join(df2_encoded_train, Seq("department"), "left")
val df3_encoded_train = df1.union(df2).union(df4).union(df5).groupBy("department").mean("bonus")
val df3_encoded_val = df3.join(df3_encoded_train, Seq("department"), "left")
val df4_encoded_train = df1.union(df2).union(df3).union(df5).groupBy("department").mean("bonus")
val df4_encoded_val = df4.join(df4_encoded_train, Seq("department"), "left")
val df5_encoded_train = df1.union(df2).union(df3).union(df4).groupBy("department").mean("bonus")
// Fixed: the original referenced the undefined name `d5_encoded_train`.
val df5_encoded_val = df5.join(df5_encoded_train, Seq("department"), "left")

// Stack the five out-of-fold encoded pieces back into one DataFrame.
val df_encoded = df1_encoded_val.union(df2_encoded_val).union(df3_encoded_val).union(df4_encoded_val).union(df5_encoded_val)
I have created a KMeans model using Spark ML methods.
// Create a KMeans estimator; no setters are called, so all parameters keep their defaults.
val kmeans = new KMeans()
// Fit the estimator on `df` to obtain the trained model.
val model = kmeans.fit(df)
My model is ready, but how do I predict which cluster new data points will fall into? In MLlib, model.predict(Vector) predicts the cluster for a new data point. I saw the transform method on the model, but it is not working for me.
Thanks Jacek Laskowski for clarifying Oli. It's working fine for me now. It was a simple mistake. Below is the whole code.
val conf = new SparkConf().setMaster("local").setAppName("ml Kmeans")
val spark = SparkSession.builder().config(conf).getOrCreate()
import spark.implicits._

// Read the training JSON and flatten the nested sensor readings into top-level columns.
val trainingData = spark.read.json(spark.sparkContext.wholeTextFiles("file:/home/iot/data/traingJson.json").values)
val parsedData = trainingData.select("value.humidity", "value.speed", "value.temperature", "value.vibration")

// Assemble the four numeric columns into the single "features" vector column.
// The same assembler is reused for the test data below, so it is defined only
// once (the original declared `assembler` and `df` twice in the same scope,
// which does not compile outside the REPL).
val assembler = new VectorAssembler().setInputCols(Array("humidity", "speed", "temperature", "vibration")).setOutputCol("features")
val df = assembler.transform(parsedData)
val kmeans = new KMeans()
val model = kmeans.fit(df)
//--------------------------------Testing the Model------------------------
// NOTE(review): this loads a model from disk; the snippet never saves `model`
// to this path, so it must have been written there beforehand — confirm.
val uploadModel=KMeansModel.load("file:/home/iot/data/model1")
val testData = spark.read.json(spark.sparkContext.wholeTextFiles("file:/home/iot/data/testJson.json").values).select("value.humidity", "value.speed", "value.temperature", "value.vibration")
val testDf = assembler.transform(testData)
// With Spark ML, new points are scored through transform, which adds the
// model's prediction column to the input DataFrame.
val predictions = uploadModel.transform(testDf)
I am new to Spark and Machine Learning. I am trying to cluster some text data using KMeans, for example:
1::Hi How are you
2::I am fine, how about you
In the data, the separator is :: and the actual text to cluster is in the second column.
After reading on the spark official page and numerous articles I have written following code but I am not able to generate the vector to provide as input to KMeans.train step.
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
// Local SparkContext plus an SQLContext so RDDs can be converted to DataFrames.
val sc = new SparkContext("local", "test")
val sqlContext= new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
// Keep only the second "::"-separated field of every line (the text to cluster).
// NOTE(review): a line without "::" has no index 1 and would throw here — confirm input is clean.
val rawData = sc.textFile("data/mllib/KM.txt").map(line => line.split("::")(1))
val sentenceData = rawData.toDF("sentence")
// Split each sentence into words.
val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
val wordsData = tokenizer.transform(sentenceData)
// Hash the words into a 20-dimensional term-frequency vector column "rawFeatures".
val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
val featurizedData = hashingTF.transform(wordsData)
// This is the line that fails to compile: the mllib KMeans.train imported above
// requires an RDD[Vector], but featurizedData is a DataFrame (see the error below).
val clusters = KMeans.train(featurizedData, 2, 10)
I am getting following error
<console>:27: error: type mismatch;
found : org.apache.spark.sql.DataFrame
(which expands to) org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]
required: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector]
val clusters = KMeans.train(featurizedData, 2, 10)
Please suggest how to process input data for KMeans
Thanks in advance.
Finally I got it working by replacing the following code (the first three lines below) with the three lines that come after it.
// Before: the code from the question. The last line does not compile because
// the mllib KMeans.train expects an RDD[Vector] and is given a DataFrame.
val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
val featurizedData = hashingTF.transform(wordsData)
val clusters = KMeans.train(featurizedData, 2, 10)
// After: hash into a "features" column and chain tokenizer, hashingTF and
// KMeans as stages of a single Pipeline.
// NOTE(review): setFeaturesCol/setPredictionCol suggest this KMeans is
// org.apache.spark.ml.clustering.KMeans rather than the mllib one — confirm the import.
val hashingTF = new HashingTF().setNumFeatures(1000).setInputCol(tokenizer.getOutputCol).setOutputCol("features")
val kmeans = new KMeans().setK(2).setFeaturesCol("features").setPredictionCol("prediction")
val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, kmeans))
I want to evaluate a random forest being trained on some data. Is there any utility in Apache Spark to do the same or do I have to perform cross validation manually?
ML provides CrossValidator class which can be used to perform cross-validation and parameter search. Assuming your data is already preprocessed you can add cross-validation as follows:
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
// [label: double, features: vector]
// Inputs to fill in before running.
// [label: double, features: vector]
val trainingData: org.apache.spark.sql.DataFrame = ???
val nFolds: Int = ???
val numTrees: Int = ???
val metric: String = ???

// Classifier reading the "label"/"features" columns described above.
val rf = new RandomForestClassifier()
  .setLabelCol("label")
  .setFeaturesCol("features")
  .setNumTrees(numTrees)

val pipeline = new Pipeline().setStages(Array(rf))

val paramGrid = new ParamGridBuilder().build() // No parameter search

val evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  // "f1" (default), "weightedPrecision", "weightedRecall", "accuracy"
  .setMetricName(metric)

// CrossValidator needs an estimator, an evaluator and the parameter maps to
// be set before fit can be called; nFolds controls the number of folds.
val cv = new CrossValidator()
  // ml.Pipeline with ml.classification.RandomForestClassifier
  .setEstimator(pipeline)
  // ml.evaluation.MulticlassClassificationEvaluator
  .setEvaluator(evaluator)
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(nFolds)

val model = cv.fit(trainingData) // trainingData: DataFrame
Using PySpark:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Inputs to fill in before running.
trainingData = ...  # DataFrame[label: double, features: vector]
numFolds = ...  # Integer

rf = RandomForestClassifier(labelCol="label", featuresCol="features")
evaluator = MulticlassClassificationEvaluator()  # + other params as in Scala

pipeline = Pipeline(stages=[rf])

# ParamGridBuilder must be instantiated, and build() turns the accumulated
# grids into the list of parameter maps that CrossValidator expects.
paramGrid = (ParamGridBuilder()
    .addGrid(rf.numTrees, [3, 10])
    # .addGrid(...)  # Add other parameters
    .build())

# Cross-validate the pipeline over the parameter grid with numFolds folds.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=numFolds)

model = crossval.fit(trainingData)
To build on zero323's great answer using Random Forest Classifier, here is a similar example for Random Forest Regressor:
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.ml.regression.RandomForestRegressor // CHANGED
import org.apache.spark.ml.evaluation.RegressionEvaluator // CHANGED
import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
// Inputs to fill in before running. Explicit types are required: an
// un-annotated `???` infers Nothing and `data.randomSplit` would not compile.
val numFolds: Int = ??? // Integer
val data: org.apache.spark.sql.DataFrame = ??? // DataFrame

// NOTE(review): the name of the target column is not shown in this snippet;
// Spark's default "label" is assumed here — adjust to the real column name.
val labelCol = "label"

// Training (80%) and test data (20%)
val Array(train, test) = data.randomSplit(Array(0.8,0.2))

// Every column except the target is used as a feature, so the label does not
// leak into the assembled feature vector.
val featuresCols = data.columns.filterNot(_ == labelCol)

// Assemble raw columns into a vector, then index categorical features.
val va = new VectorAssembler()
  .setInputCols(featuresCols)
  .setOutputCol("rawFeatures")
val vi = new VectorIndexer()
  .setInputCol("rawFeatures")
  .setOutputCol("features")
val regressor = new RandomForestRegressor()
  .setLabelCol(labelCol)
  .setFeaturesCol("features")

val metric = "rmse"
val evaluator = new RegressionEvaluator()
  .setLabelCol(labelCol)
  .setPredictionCol("prediction")
  // "rmse" (default): root mean squared error
  // "mse": mean squared error
  // "r2": R2 metric
  // "mae": mean absolute error
  .setMetricName(metric)

// Chain feature preparation and the regressor so each fold refits all stages.
val pipeline = new Pipeline().setStages(Array(va, vi, regressor))

val paramGrid = new ParamGridBuilder().build()

// CrossValidator needs an estimator, an evaluator and the parameter maps to
// be set before fit can be called.
val cv = new CrossValidator()
  .setEstimator(pipeline)
  .setEvaluator(evaluator)
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(numFolds)

val model = cv.fit(train) // train: DataFrame
val predictions = model.transform(test)
val rmse = evaluator.evaluate(predictions)
Evaluator metric source:
I want to run a SVM Regression, but have problems with input format. Right now my train and test set for one customer looks like this:
1 '12262064 |f offer_quantity:1
has_bought_brand_company:1 has_bought_brand_a:6.79 has_bought_brand_q_60:1.0
has_bought_brand:2.0 has_bought_company_a:1.95 has_bought_brand_180:1.0
has_bought_brand_q_180:1.0 total_spend:218.37 has_bought_brand_q:3.0 offer_value:1.5
has_bought_brand_a_60:2.79 has_bought_brand_60:1.0 has_bought_brand_q_90:1.0
has_bought_brand_a_90:2.79 has_bought_company_q:1.0 has_bought_brand_90:1.0
has_bought_company:1.0 never_bought_category:1 has_bought_brand_a_180:2.79
I tried to read this text file into Spark, but without success. What am I missing? Do I have to delete the feature names? Right now it is in Vowpal Wabbit format.
My code looks like this:
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils
// Load training data in LIBSVM format.
val data = MLUtils.loadLibSVMFile(sc, "mllib/data/train.txt")

// Split data into training (60%) and test (40%).
val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
val training = splits(0).cache()
val test = splits(1)

// Run training algorithm to build the model
val numIterations = 100
val model = SVMWithSGD.train(training, numIterations)

// Clear the default threshold so that predict returns raw scores instead of
// 0/1 labels; the ROC curve needs scores to be meaningful.
model.clearThreshold()

// Compute raw scores on the test set, paired with the true labels.
val scoreAndLabels = test.map { point =>
  val score = model.predict(point.features)
  (score, point.label)
}

// Get evaluation metrics.
val metrics = new BinaryClassificationMetrics(scoreAndLabels)
val auROC = metrics.areaUnderROC()

println("Area under ROC = " + auROC)
I get an answer, but my AUC value is 1, which shouldn't be the case.
scala> println("Area under ROC = " + auROC)
Area under ROC = 1.0
I think your file is not in LIBSVM format. Either convert the file to LIBSVM format,
or load it as a normal text file and then create the labeled points yourself.
This is what I did for my file.
import org.apache.spark.mllib.feature.HashingTF
// Hash terms into a 2-dimensional term-frequency vector.
val tf = new HashingTF(2)
val tweets = sc.textFile(tweetInput)

// Each line is "<label> <token> <token> ...": the first field is the numeric
// label and the remaining tokens are turned into bigram features.
val labelPoint = tweets.map { line =>
  val parts = line.split(' ')
  // Join each two-token window into a single string before hashing: hashing
  // the Array[String] windows themselves is not a supported term type.
  val bigrams = parts.tail.sliding(2).map(_.mkString(" ")).toSeq
  LabeledPoint(parts(0).toDouble, tf.transform(bigrams))
}

val model = LinearRegressionWithSGD.train(labelPoint, numIterations)