I am using MLlib of spark to perform a regression random forest.
I am using the python code here:
It works but now I would like to get the predicted values as well as the R or R² of the prediction model.
How to get that?

Here is how to load a CSV file into an RDD (Spark's distributed data format):
# Imports
import csv
try:
    # Python 2: StringIO lives in its own module.
    from StringIO import StringIO
except ImportError:
    # Python 3: the StringIO module is gone; io provides the class.
    from io import StringIO
from collections import namedtuple
from operator import add, itemgetter
from pyspark import SparkConf, SparkContext
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint
import shutil
import numpy
def parse(row):
    """Convert one split CSV row into a LabeledPoint.

    The fifth field (``row[4]``) becomes the label and the first four
    fields (``row[:4]``) become the features.  The input row is left
    untouched.

    :param row: sequence with at least five fields.
    :return: ``LabeledPoint(label, features)``.
    :raises IndexError: if the row has fewer than five fields.
    :raises ValueError: if fields 1-4 cannot be converted to float.
    """
    # NOTE(review): field 0 is only converted to str, not float, yet it is
    # passed on as a feature -- confirm it always holds a numeric string.
    features = [str(row[0])] + [float(value) for value in row[1:4]]
    label = float(row[4])
    return LabeledPoint(label, features)
def split(line):
    """Split one line of ';'-delimited text into its fields.

    The csv module is used so that quoted fields containing the delimiter
    are kept intact.

    :param line: a single line of text.
    :return: list of field strings; an empty list for an empty line.
    """
    reader = csv.reader(StringIO(line), delimiter=';')
    # An empty line yields no record; return [] rather than letting
    # StopIteration escape into whatever is iterating over the lines.
    return next(reader, [])
#save csv file on a spark cluster (RDD format)
data = sc.textFile("datafile").map(split).map(parse)
Here is how to perform the random forest algorithm and how to get the predicted values:
def random_forest_regression(data):
    """Train a random forest regressor on ``data`` and predict every point.

    The data is split 70/30 into training and test sets and the model is
    trained on the training part only.  Predictions are then computed for
    all points of ``data``, collected to the driver and printed.

    :param data: RDD of LabeledPoint.
    :return: ``(model, real_and_predicted)`` where ``real_and_predicted``
        is a list of ``(label, prediction)`` pairs for all points.
    """
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    # Increase numTrees to get a better prediction.
    model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={}, numTrees=100, featureSubsetStrategy="auto", impurity='variance', maxDepth=10, maxBins=32)
    # Predictions on the TEST instances only (not used further below).
    predictions_test = model.predict(testData.map(lambda x: x.features))
    real_and_predicted_test = testData.map(lambda lp: lp.label).zip(predictions_test)
    # Real and predicted values FOR ALL THE POINTS.
    predictions = model.predict(data.map(lambda x: x.features))
    # An RDD cannot be iterated directly; bring the pairs to the driver as a list.
    real_and_predicted = data.map(lambda lp: lp.label).zip(predictions).collect()
    print("real and predicted values")
    for value in real_and_predicted:
        print(value)
    return model, real_and_predicted
To get the correlation coefficient (R value), I used numpy:
def compute_correlation_coefficient(real_and_predicted):
    """Compute and display the correlation coefficient (R).

    :param real_and_predicted: iterable of ``(real, predicted)`` pairs.
    :return: None; the coefficient is printed.
    """
    pairs = list(real_and_predicted)
    real_values = [pair[0] for pair in pairs]
    predicted_values = [pair[1] for pair in pairs]
    print("correlation coefficient")
    # corrcoef returns the 2x2 correlation matrix; [0, 1] is R between the two series.
    print(numpy.corrcoef(real_values, predicted_values)[0, 1])
To get the R², take the square value of the correlation coefficient.
Voilà !


Mlflow log_model, not able to predict with spark_udf but with python works

I want to log a model with MLflow. Once it is logged, I am able to predict probabilities with the model loaded in Python, but not with spark_udf. The catch is that I still need to have a preprocessing function inside the model. Here is a toy reproducible example showing where it fails:
import mlflow
from mlflow.models.signature import infer_signature
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
X, y = make_classification(n_samples=1000, n_features=10, n_informative=2, n_classes=2, shuffle=True, random_state=1995)
X, y = pd.DataFrame(X), pd.DataFrame(y,columns=["target"])
# geerate column names
X.columns = [f"col_{idx}" for idx in range(len(X.columns))]
X["categorical_column"] = np.random.choice(["a","b","c"], size=len(X) )
def encode_catcolumn(X):
    """Return a copy of ``X`` with ``categorical_column`` encoded numerically.

    The values "a", "b" and "c" are replaced by -10, 0 and 35 respectively.
    The input frame is not modified.

    :param X: DataFrame containing a ``categorical_column`` column.
    :return: encoded copy of ``X``.
    :raises KeyError: if ``categorical_column`` is missing.
    """
    X = X.copy()
    # replace cat values [a,b,c] for [-10,0,35] respectively
    # NOTE: np.select falls back to its default (0) for any other value.
    X['categorical_column'] = np.select([X["categorical_column"] == "a", X["categorical_column"] == "b", X["categorical_column"] == "c"], [-10, 0, 35])
    return X
# with catcolumn encoded; i need to use custom encoding , we'll do this within mlflow later
X_encoded = encode_catcolumn(X)
Now let's create a wrapper for the model to encode the function within the model. Please see that the function encode_catcolumn within the class and the one outside the class presented before are the same.
class SklearnModelWrapper(mlflow.pyfunc.PythonModel):
    """pyfunc model that encodes ``categorical_column`` before predicting."""

    def __init__(self, model):
        # Wrapped estimator; predict() calls its predict_proba method.
        self.model = model

    def encode_catcolumn(self, X):
        """Return a copy of ``X`` with a/b/c mapped to -10/0/35."""
        encoded = X.copy()
        category = encoded["categorical_column"]
        conditions = [category == "a", category == "b", category == "c"]
        encoded['categorical_column'] = np.select(conditions, [-10, 0, 35])
        return encoded

    def predict(self, context, model_input):
        """Encode ``model_input`` and return column 1 of ``predict_proba``."""
        prepared = self.encode_catcolumn(model_input)
        return self.model.predict_proba(prepared)[:, 1]
Now let's log the model
with mlflow.start_run(run_name="reproductible_example") as run:
    clf = RandomForestClassifier()
    # Train on the encoded features: the wrapper applies the same encoding
    # at predict time, so the wrapped estimator must already be fitted.
    clf.fit(X_encoded, y["target"])
    # wrap the model with pyfunc; the encoding is done inside the class
    wrappedModel = SklearnModelWrapper(clf)
    # NOTE: no signature is passed to log_model here.
    mlflow.pyfunc.log_model("reproductible_example_model", python_model=wrappedModel)
# URI under which the logged model can be loaded again.
model_uuid = run.info.run_uuid
model_path = f'runs:/{model_uuid}/reproductible_example_model'
Do the inference without spark and works perfectly:
model_uuid = run.info.run_uuid
model_path = f'runs:/{model_uuid}/reproductible_example_model'
# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(model_path)
# predictions without spark , encodes the variables INSIDE; this WORKS
# The raw (un-encoded) frame is passed because the wrapper encodes it itself.
loaded_model.predict(X)
Now do the inference with spark_udf and get an error:
# create spark dataframe to test it on spark
X_spark = spark.createDataFrame(X)
# Load model as a Spark UDF.
loaded_model_spark = mlflow.pyfunc.spark_udf(spark, model_uri=model_path)
# Predict on a Spark DataFrame.
columns = list(X_spark.columns)
# this does not work
X_spark.withColumn('predictions', loaded_model_spark(*columns)).collect()
The error is:
PythonException: An exception was thrown from a UDF: 'KeyError: 'categorical_column'', from <command-908038>, line 7. Full traceback below:
I need to somehow encode the variables and preprocess them within the class. Is there any solution or workaround to make this code work with Spark?
What I've tried so far:
Incorporate the encode_catcolumn within a sklearn Pipeline (with a custom encoder sklearn) -> Fails;
Create a function within the sklearn wrapper class (this example) -> Fails
3 ) Use the log_model and then create a pandas_udf in order to do it with spark as well --> works but that's not what I want. I would like to be able to run the model on spark with just calling .predict() method or something like that.
When I remove the preprocessing function and do it outside the class --> this actually works, but this is not what I want.
I solved this by changing only the last chunk of my question, where I load the spark_udf model and perform inference. This is a possible answer to the problem: just pass an F.struct() to the spark_udf instead of a list of columns, as in the chunk below:
import pyspark.sql.functions as F
# create spark dataframe to test it on spark
X_spark = spark.createDataFrame(X)
# Load model as a Spark UDF.
loaded_model_spark = mlflow.pyfunc.spark_udf(spark, model_uri=model_path)
# Predict on a Spark DataFrame.
# columns = list(X_spark.columns) --> delete this
columns = F.struct(X_spark.columns) # use struct
# this does work
X_spark.withColumn('predictions', loaded_model_spark(columns)).collect()

Pyspark retrieve metrics (AUC ROC) from each submodel in CrossValidator

How do I return the individual AUC-ROC score for each fold/submodel when using CrossValidator?
The documentation indicates that collectSubModels=True should save all models rather than just the best or average, but after inspecting model.subModels I can't find how to print their scores.
The example below works; it is just missing something like model.subModels.aucScore.
The desired result would be each fold with its corresponding score, like [fold1:0.85, fold2:0.07, fold3:0.55]
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
#Creating test dataframe
training = spark.createDataFrame([
(0,1,0)], ["label", "feature1", "feature2"])
#Vectorizing features for modelling
assembler = VectorAssembler(inputCols=['feature1','feature2'],outputCol="features")
prepped = assembler.transform(training).select('label','features')
#setting variables and configuring CrossValidator
rf = RandomForestClassifier(labelCol="label", featuresCol="features")
params = ParamGridBuilder().build()
evaluator = BinaryClassificationEvaluator()
folds = 3
# collectSubModels=True keeps the model fitted on every fold so that
# model.subModels is populated after fitting.
cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=params,
    evaluator=evaluator,
    numFolds=folds,
    collectSubModels=True,
)
#Fitting model
model = cv.fit(prepped)
#Print Metrics
print(model.subModels)
# Output:
# [[RandomForestClassificationModel (uid=RandomForestClassifier_95da3a68af93) with 20 trees],
#  [RandomForestClassificationModel (uid=RandomForestClassifier_95da3a68af93) with 20 trees],
#  [RandomForestClassificationModel (uid=RandomForestClassifier_95da3a68af93) with 20 trees]]

Confusion Matrix to get precsion,recall, f1score

I have a dataframe df on which I have run a decision tree classification algorithm. The two columns used by the algorithm are label and features. The model is called dtc. How can I create a confusion matrix in pyspark?
dtc = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label')
dtcModel = dtc.fit(train)
predictions = dtcModel.transform(test)
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import MulticlassMetrics
preds = df.select(['label', 'features']) \
.df.map(lambda line: (line[1], line[0]))
metrics = MulticlassMetrics(preds)
# Confusion Matrix
You need to cast to an rdd and map to tuple before calling metrics.confusionMatrix().toArray().
From the official documentation,
class pyspark.mllib.evaluation.MulticlassMetrics(predictionAndLabels)[source]
Evaluator for multiclass classification.
Parameters: predictionAndLabels – an RDD of (prediction, label) pairs.
Here is an example to guide you.
ML part
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType
#Note the differences between ml and mllib, they are two different libraries.
#create a sample data frame
# NOTE(review): the original listing continued onto further lines that are
# not available here; only the rows shown are kept and the list is closed.
data = [(1.54,3.45,2.56,0),(9.39,8.31,1.34,0),(1.25,3.31,9.87,1),(9.35,5.67,2.49,2)]
cols = ('a','b','c','d')
df = spark.createDataFrame(data, cols)
assembler = VectorAssembler(inputCols=['a','b','c'], outputCol='features')
df_features = assembler.transform(df)
train_data, test_data = df_features.randomSplit([0.6,0.4])
dtc = DecisionTreeClassifier(featuresCol='features',labelCol='d')
dtcModel = dtc.fit(train_data)
predictions = dtcModel.transform(test_data)
Evaluation part
#important: need to cast to float type, and order by prediction, else it won't work
# The column is named 'prediction' (singular), as used by orderBy and the
# select below; selecting 'predictions' fails because no such column exists.
preds_and_labels = predictions.select(['prediction','d']).withColumn('label', F.col('d').cast(FloatType())).orderBy('prediction')
#select only prediction and label columns
preds_and_labels = preds_and_labels.select(['prediction','label'])
# MulticlassMetrics expects an RDD of (prediction, label) tuples.
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
print(metrics.confusionMatrix().toArray())
Use this:
import sklearn
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label', numTrees=500)
rfModel = rf.fit(train)
predictions_train = rfModel.transform(train)
y_true = predictions_train.select(['label']).collect()
y_pred = predictions_train.select(['prediction']).collect()
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))
where train is your training data.

Transform RDD to valid input for kmeans

I am calculating TF and IDF using spark mllib algorithm of a directory that contains csv files with the following code:
import argparse
from os import system
### args parsing
parser = argparse.ArgumentParser(
    description='runs TF/IDF on a directory of text docs')
parser.add_argument("-i", "--input", help="the input in HDFS",
                    required=True)
parser.add_argument("-o", '--output', help="the output in HDFS",
                    required=True)
parser.add_argument("-mdf", '--min_document_frequency', default=1)
args = parser.parse_args()
docs_dir = args.input
d_out = "hdfs://master:54310/" + args.output
# The option arrives as a string when given on the command line.
min_df = int(args.min_document_frequency)
# import spark-realated stuff
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
sc = SparkContext(appName="TF-IDF")
# Load documents (one per line) and tokenize each line on single spaces.
# textFile yields plain strings, so each line is split directly; indexing
# it with [1] would select only the second character of the line.
documents = sc.textFile(docs_dir).map(lambda line: line.split(" "))
hashingTF = HashingTF()
tf = hashingTF.transform(documents)
# tf is consumed twice (fit and transform), so keep it cached.
tf.cache()
# Apply the minimum document frequency read from the command line.
idf = IDF(minDocFreq=min_df).fit(tf)
tfidf = idf.transform(tf)
I get this output:
[SparseVector(1048576, {812399: 4.3307}), SparseVector(1048576, {411697:
0.0066}), SparseVector(1048576, {411697: 0.0066}), SparseVector(1048576,
{411697: 0.0066}), SparseVector(1048576, {411697: 0.0066}), ....
I have also tested the KMeans mllib algorithm :
from __future__ import print_function
import sys
import numpy as np
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans
def parseVector(line):
    """Parse a whitespace-separated line of numbers into a float array.

    Runs of whitespace and leading/trailing whitespace are ignored.

    :param line: text such as ``"0.1 0.2 0.3"``.
    :return: 1-D numpy array of floats.
    :raises ValueError: if a token is not a valid float.
    """
    # split() without an argument drops the empty tokens that split(' ')
    # produces for repeated or trailing spaces.
    return np.array([float(x) for x in line.split()])
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: kmeans <file> <k>", file=sys.stderr)
        # Stop here: the arguments read below are not available.
        sys.exit(-1)
    sc = SparkContext(appName="KMeans")
    lines = sc.textFile(sys.argv[1])
    data = lines.map(parseVector)
    k = int(sys.argv[2])
    # 'runs' was never defined in this script; train with the defaults.
    model = KMeans.train(data, k)
    print("Final centers: " + str(model.clusterCenters))
    print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()
with this sample test case
0.0 0.0 0.0
0.1 0.1 0.1
0.2 0.2 0.2
9.0 9.0 9.0
9.1 9.1 9.1
9.2 9.2 9.2
and it works fine.
Now I want to use the RDD output from tfidf above in the KMeans algorithm, but I don't know how to transform the RDD into something like the sample text above, or how to split the RDD properly so that the KMeans algorithm works.
I really need some help with this one.
My real question is: how can I read the input from a text file like this and apply it to KMeans mllib?
I am not sure at all, but I think I need to go from the vectors above to the array below so that I can apply it directly to the KMeans mllib algorithm:
1.75642010278 2.41857747478 1.97365255252
2.98856378408 1.63863706713 2.44956728334
1.42412015238 1.58759872958 2.01237484818
The output of IDF is a dataframe of SparseVector. KMeans takes a vector as input (sparse or dense), hence, there should be no need to make any transformations. You should be able to use the output column from IDF directly as input to KMeans.
If you need to save the data to disk in between running the TFIDF and KMeans, I would recommend saving it as a Parquet file through the dataframe API.
First convert to a dataframe using Row:
from pyspark.sql import Row
row = Row("features") # column name
df = tfidf.map(row).toDF()
An alternative way to convert without import:
df = tfidf.map(lambda x: (x, )).toDF(["features"])
After the conversion save the dataframe as a parquet file:
df.write.parquet('/path/to/file')
To read the data, simply use:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
df = sqlContext.read.parquet('/path/to/file')
# converting from dataframe into an RDD[Vector]
# Each Row holds the vector in its single column; take that element so the
# RDD contains the vectors themselves rather than one-element lists.
data = df.rdd.map(lambda row: row[0])
If you in any case need to convert from a vector saved as a string, that is also possible. Here is some example code:
from pyspark.mllib.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf
df = sc.parallelize(["(7,[1,2,4],[1,1,1])"]).toDF(["features"])
parse = udf(lambda s: Vectors.parse(s), VectorUDT())
First an example dataframe is created with the same formatting. Then an UDF is used to parse the string into a vector. If you want an rdd instead of the dataframe, use the code above at the "reading from parquet" part to convert.
However, the output from IDF is very sparse. The vectors have a length of 1048576 and only one of these have a values over 1. KMeans would not give you any interesting results.
I would recommend you to look into word2vec instead. It will give you a more compact vector for each word and clustering these vectors would make more sense. Using this method you can receive a map of words to their vector representations which can be used for clustering.

Computing precision and recall for two sets of keywords in NLTK and Scikit for sets of different sizes

I am trying to compute precision and recall for two sets of keywords. The gold_standard has 823 terms and the test has 1497 terms.
Using nltk.metrics's version of precision and recall, I am able to provide the two sets just fine. But doing the same for Scikit is throwing me an error:
ValueError: Found arrays with inconsistent numbers of samples: [ 823 1497]
How do I resolve this?
from nltk.metrics import precision, recall
from sklearn.metrics import precision_score
from sys import argv
from time import time
import numpy
import csv
def readCSVFile(filename):
    """Read a UTF-8 CSV file and return the set of terms it contains.

    :param filename: path of the CSV file.
    :return: set of cell values found in the file.
    """
    termList = set()
    # newline='' is what the csv module expects for files it reads.
    with open(filename, 'rt', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            # NOTE(review): the loop body was missing from the listing;
            # every cell of a row is taken as a term -- confirm for
            # files with more than one column.
            termList.update(row)
    return termList
def readDocuments(gs_file, fileToProcess):
    """Load the gold-standard file and the file to evaluate.

    Both paths are read with readCSVFile, gold standard first.

    :return: tuple ``(gold_standard, test)``.
    """
    print("Reading CSV files...")
    term_sets = tuple(readCSVFile(path) for path in (gs_file, fileToProcess))
    print("All files successfully read!")
    return term_sets
def calcPrecisionScipy(gs, test):
    """Print the scikit-learn precision of the ``test`` terms against ``gs``.

    ``precision_score`` compares two equally long label arrays position by
    position, so the two term collections cannot be passed to it directly.
    Both are converted into membership indicators over the same list of
    terms (the union of both collections), which gives arrays of equal
    length regardless of how many terms each collection holds.

    :param gs: gold-standard terms.
    :param test: terms to evaluate.
    """
    gs = set(gs)
    test = set(test)
    universe = list(gs | test)
    # True where the term belongs to the gold standard / to the test set.
    y_true = numpy.array([term in gs for term in universe])
    y_pred = numpy.array([term in test for term in universe])
    # Precision of the positive ("is a term") class only.
    print("Precision Scipy: ", precision_score(y_true, y_pred))
def process(dataset):
    """Print the precision of the test terms against the gold standard.

    :param dataset: pair ``(gold_standard, test)`` of term sets.
    """
    print("Processing input...")
    # The parameter was misspelled 'datasest', so this name was undefined.
    gs, test = dataset
    print("Precision: ", precision(gs, test))
    calcPrecisionScipy(gs, test)
def usage():
print("Usage: python3 generate_stats.py gold_standard.csv termlist_to_process.csv")
if __name__ == '__main__':
    # Two file arguments are required; otherwise only show the usage text.
    if len(argv) != 3:
        usage()
    else:
        t0 = time()
        process(readDocuments(argv[1], argv[2]))
        print("Total runtime: %0.3fs" % (time() - t0))
I referred to the following pages for coding:
Okay, so I tried to add 'non-sensical' data to the list to make them equal length:
def calcPrecisionScipy(gs, test):
if len(gs) < len(test):
gs = numpy.array(list(gs))
test = numpy.array(list(test))
print("Precision Scipy: ",precision_score(gs, test, average=None))
Now I have another error:
UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.
It does not seem meaningful to compute precision or recall element by element on two sequences of different lengths.
I guess what nltk must do is truncate the sets to the same length; you can do the same in your script.
import numpy as np
import sklearn.metrics
set1 = [True,True]
set2 = [True,False,False]
# Cut both lists down to the length of the shorter one.
length = np.amin([len(set1),len(set2)])
set1 = set1[:length]
set2 = set2[:length]
# print as a function call; the original statement had an unbalanced ')'.
print(sklearn.metrics.precision_score(set1, set2))
