value toDF is not a member of org.apache.spark.rdd.RDD[(Long, org.apache.spark.ml.linalg.Vector)] - apache-spark

I am getting a compilation error when converting the pre-LDA transformation to a data frame using Scala in Spark 2.0. The specific code that throws the error is below:
val documents = PreLDAmodel.transform(mp_listing_lda_df)
.select("docId","features")
.rdd
.map{ case Row(row_num: Long, features: MLVector) => (row_num, features) }
.toDF()
The complete compilation error is:
Error:(132, 8) value toDF is not a member of org.apache.spark.rdd.RDD[(Long, org.apache.spark.ml.linalg.Vector)]
possible cause: maybe a semicolon is missing before `value toDF'?
.toDF()
Here is the complete code:
import java.io.FileInputStream
import java.sql.{DriverManager, ResultSet}
import java.util.Properties
import org.apache.spark.SparkConf
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel, RegexTokenizer, StopWordsRemover}
import org.apache.spark.ml.linalg.{Vector => MLVector}
import org.apache.spark.mllib.clustering.{LDA => oldLDA}
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}
object MPClassificationLDA {
/*Start: Configuration variable initialization*/
val props = new Properties
val fileStream = new FileInputStream("U:\\JIRA\\MP_Classification\\target\\classes\\mpclassification.properties")
props.load(fileStream)
val mpExtract = props.getProperty("mpExtract").toString
val shard6_db_server_name = props.getProperty("shard6_db_server_name").toString
val shard6_db_user_id = props.getProperty("shard6_db_user_id").toString
val shard6_db_user_pwd = props.getProperty("shard6_db_user_pwd").toString
val mp_output_file = props.getProperty("mp_output_file").toString
val spark_warehouse_path = props.getProperty("spark_warehouse_path").toString
val rf_model_file_path = props.getProperty("rf_model_file_path").toString
val windows_hadoop_home = props.getProperty("windows_hadoop_home").toString
val lda_vocabulary_size = props.getProperty("lda_vocabulary_size").toInt
val pre_lda_model_file_path = props.getProperty("pre_lda_model_file_path").toString
val lda_model_file_path = props.getProperty("lda_model_file_path").toString
fileStream.close()
/*End: Configuration variable initialization*/
val conf = new SparkConf().set("spark.sql.warehouse.dir", spark_warehouse_path)
def main(arg: Array[String]): Unit = {
//SQL Query definition and parameter values as parameter upon executing the Object
val cont_id = "14211599"
val top = "100000"
val start_date = "2016-05-01"
val end_date = "2016-06-01"
val mp_spark = SparkSession
.builder()
.master("local[*]")
.appName("MPClassificationLoadLDA")
.config(conf)
.getOrCreate()
MPClassificationLDACalculation(mp_spark, cont_id, top, start_date, end_date)
mp_spark.stop()
}
private def MPClassificationLDACalculation
(mp_spark: SparkSession
,cont_id: String
,top: String
,start_date: String
,end_date: String
): Unit = {
//DB connection definition
def createConnection() = {
Class.forName("com.microsoft.sqlserver.jdbc.SQLServerDriver").newInstance();
DriverManager.getConnection("jdbc:sqlserver://" + shard6_db_server_name + ";user=" + shard6_db_user_id + ";password=" + shard6_db_user_pwd);
}
//DB Field Names definition
def extractvalues(r: ResultSet) = {
Row(r.getString(1),r.getString(2))
}
//Prepare SQL Statement with parameter value replacement
val query = """SELECT docId = audt_id, text = auction_title FROM brands6.dbo.uf_ds_marketplace_classification_listing(#cont_id, #top, '#start_date', '#end_date') WHERE ? < ? OPTION(RECOMPILE);"""
.replaceAll("#cont_id", cont_id)
.replaceAll("#top", top)
.replaceAll("#start_date", start_date)
.replaceAll("#end_date", end_date)
.stripMargin
//Connect to Source DB and execute the Prepared SQL Steatement
val mpDataRDD = new JdbcRDD(mp_spark.sparkContext
,createConnection
,query
,lowerBound = 0
,upperBound = 10000000
,numPartitions = 1
,mapRow = extractvalues)
val schema_string = "docId,text"
val fields = StructType(schema_string.split(",")
.map(fieldname => StructField(fieldname, StringType, true)))
//Create Data Frame using format identified through schema_string
val mpDF = mp_spark.createDataFrame(mpDataRDD, fields)
mpDF.collect()
val mp_listing_tmp = mpDF.selectExpr("cast(docId as long) docId", "text")
mp_listing_tmp.printSchema()
println(mp_listing_tmp.first)
val mp_listing_lda_df = mp_listing_tmp.withColumn("docId", mp_listing_tmp("docId"))
mp_listing_lda_df.printSchema()
val tokenizer = new RegexTokenizer()
.setInputCol("text")
.setOutputCol("rawTokens")
.setMinTokenLength(2)
val stopWordsRemover = new StopWordsRemover()
.setInputCol("rawTokens")
.setOutputCol("tokens")
val vocabSize = 4000
val countVectorizer = new CountVectorizer()
.setVocabSize(vocabSize)
.setInputCol("tokens")
.setOutputCol("features")
val PreLDApipeline = new Pipeline()
.setStages(Array(tokenizer, stopWordsRemover, countVectorizer))
val PreLDAmodel = PreLDApipeline.fit(mp_listing_lda_df)
//comment out after saving it the first time
PreLDAmodel.write.overwrite().save(pre_lda_model_file_path)
val documents = PreLDAmodel.transform(mp_listing_lda_df)
.select("docId","features")
.rdd
.map{ case Row(row_num: Long, features: MLVector) => (row_num, features) }
.toDF()
//documents.printSchema()
val numTopics: Int = 20
val maxIterations: Int = 100
//note the FeaturesCol need to be set
val lda = new LDA()
.setOptimizer("em")
.setK(numTopics)
.setMaxIter(maxIterations)
.setFeaturesCol(("_2"))
val vocabArray = PreLDAmodel.stages(2).asInstanceOf[CountVectorizerModel].vocabulary
}
}
I am thinking that it is related to conflicts in the imports section of the code. I would appreciate any help.

Two things needed to be done:
Import implicits: Note that this should be done only after an instance of org.apache.spark.sql.SQLContext is created. It should be written as:
val sqlContext= new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
Move the case class outside of the method: the case class, by means of which you define the schema of the DataFrame, should be defined outside of the method that needs it. You can read more about it here: https://issues.scala-lang.org/browse/SI-6649
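For the Spark 2.0 code in the question, a minimal sketch of the first fix using the existing SparkSession (names as in the question; this assumes the import is placed inside MPClassificationLDACalculation, after mp_spark is in scope):
import mp_spark.implicits._ // brings toDF() into scope for RDDs of tuples and case classes
val documents = PreLDAmodel.transform(mp_listing_lda_df)
  .select("docId", "features")
  .rdd
  .map { case Row(row_num: Long, features: MLVector) => (row_num, features) }
  .toDF("docId", "features")
If a case class is used to name the columns instead of a tuple, define it at the top level, outside the method, as the linked Scala issue explains.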

Related

How to incrementally load , fit with new data , save the pipeline model in using spark?

I am looking for pointers on how to incrementally train and build the model, and how to get a prediction on a single element.
I am trying to run a web application that writes data to CSV in a shared path, while the ML application reads that data, loads the model, fits the data, saves the model, and transforms the test data. (This is supposed to happen in a loop.)
But when loading the saved model the second time, I am facing the following exception
(I am using a min-max scaler to normalize the data):
Exception in thread "main" java.lang.IllegalArgumentException: Output column features_intermediate already exists.
Any pointers would be much appreciated. Thank you.
object RunAppPooling {
def main(args: Array[String]): Unit = { // start the spark session
val conf = new SparkConf().setMaster("local[2]").set("deploy-mode", "client").set("spark.driver.bindAddress", "127.0.0.1")
.set("spark.broadcast.compress", "false")
.setAppName("local-spark")
val spark = SparkSession
.builder()
.config(conf)
.getOrCreate()
val filePath = "src/main/resources/train.csv"
val modelPath = "file:///home/vagrant/custom.model"
val schema = StructType(
Array(
StructField("IDLE_COUNT", IntegerType),
StructField("TIMEOUTS", IntegerType),
StructField("ACTIVE_COUNT", IntegerType),
StructField("FACTOR_LOAD", DoubleType)))
while(true){
// read the raw data
val df_raw = spark
.read
.option("header", "true")
.schema(schema)
.csv(filePath)
df_raw.show()
println(df_raw.count())
// fill all na values with 0
val df = df_raw.na.fill(0)
df.printSchema()
// create the feature vector
val vectorAssembler = new VectorAssembler()
.setInputCols(Array("IDLE_COUNT", "TIMEOUTS", "ACTIVE_COUNT" ))
.setOutputCol("features_intermediate")
var lr1: PipelineModel = null
try {
lr1 = PipelineModel.load(modelPath)
} catch {
case ie: InvalidInputException => println(ie.getMessage)
}
import org.apache.spark.ml.feature.StandardScaler
val scaler = new StandardScaler().setWithMean(true).setWithStd(true).setInputCol("features_intermediate").setOutputCol("features")
var pipeline: Pipeline = null
if (lr1 == null) {
val lr =
new LinearRegression()
.setMaxIter(100)
.setRegParam(0.1)
.setElasticNetParam(0.8)
.setLabelCol("FACTOR_LOAD") // setting label column
// create the pipeline with the steps
pipeline = new Pipeline().setStages(Array( vectorAssembler, scaler, lr))
} else {
pipeline = new Pipeline().setStages(Array(vectorAssembler, scaler, lr1))
}
// create the model following the pipeline steps
val cvModel = pipeline.fit(df)
// save the model
cvModel.write.overwrite.save(modelPath)
var testschema = StructType(
Array(
StructField("PACKAGE_KEY", StringType),
StructField("IDLE_COUNT", IntegerType),
StructField("TIMEOUTS", IntegerType),
StructField("ACTIVE_COUNT", IntegerType)
))
val df_raw1 = spark
.read
.option("header", "true")
.schema(testschema)
.csv("src/main/resources/test_pooling.csv")
// fill all na values with 0
val df1 = df_raw1.na.fill(0)
val extracted = cvModel.transform(df1) //.toDF("prediction")
import org.apache.spark.sql.functions._
val test = extracted.select(mean(df("FACTOR_LOAD"))).collect()
println(test.apply(0))
}
}
}
I figured out a way, at least, to get past the exception; I am not sure whether it is the right approach or not. Here it goes: while creating the pipeline after loading the model, set the stages to only the model, because the model has already been defined with the respective schema. I am not sure whether this will normalize the new data or not.
pipeline = new Pipeline().setStages(Array( lr1))
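For illustration only, a rough sketch of that idea (assuming lr, vectorAssembler, scaler, df, df1 and modelPath are defined as in the question, with lr moved above the null check): reuse the loaded PipelineModel only for transforming, and refit the original unfitted stages when retraining, so the already-fitted stages are never wrapped in a second Pipeline.
val freshPipeline = new Pipeline().setStages(Array(vectorAssembler, scaler, lr))
val model: PipelineModel =
  if (lr1 != null) lr1               // a saved model was loaded: reuse it as-is
  else freshPipeline.fit(df)         // first run: fit all stages from scratch
model.write.overwrite.save(modelPath)
val extracted = model.transform(df1) // apply the fitted stages to the test data
Wrapping the loaded model inside another Pipeline together with vectorAssembler appears to be what triggers the "Output column features_intermediate already exists" error, since the loaded model already contains its own assembler stage.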

Spark Find the Occurrences of Matched Strings

How can I find the occurrences of the matched string with the code snippet below? I'm able to get the filtered strings as output, but not the occurrence counts.
import org.apache.spark._
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
object WordCount {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("wordCount")
val sc = new SparkContext(conf)
// Load our input data.
val input = sc.textFile("file:///tmp/ganesh/*")
val matched_pattern = input.filter(line => line.contains("Title"))
// Split it up into words.
val words = matched_pattern.flatMap(line => line.split(" "))
// Transform into pairs and count.
val counts = words.map(word => (word, 1)).reduceByKey{case (x, y) => x + y}
// Save the word count back out to a text file, causing evaluation.
counts.saveAsTextFile("file:///tmp/sparkout")
}
}
Here is an example with broadcast variable usage. Note that stopWords here is in fact a list of words to include.
val dfsFilename = "/FileStore/tables/7dxa9btd1477497663691/Text_File_01-880f5.txt"
val readFileRDD = spark.sparkContext.textFile(dfsFilename)
// res4: Array[String] = Array(The the is Is a A to To OK ok I) //stopWords
val stopWordsInput = spark.sparkContext.textFile("/FileStore/tables/filter_words.txt")
val stopWords = stopWordsInput.flatMap(x => x.split(" ")).map(_.trim).collect.toSet
val broadcasted = spark.sparkContext.broadcast(stopWords)
val wcounts1 = readFileRDD.map(x => (x.replaceAll("[^A-Za-z0-9]", " ")
.trim.toLowerCase))
.flatMap(line=>line.split(" "))
.filter(broadcasted.value.contains(_))
.map(word=>(word, 1))
.reduceByKey(_ + _)
wcounts1.collect
returns:
res2: Array[(String, Int)] = Array((The,1), (I,3), (to,1), (the,1))
You can embellish this with a broadcast of the stopWords, which is what I did.
I saw your XML input and the replaceAll. You can fiddle with that to your liking. I also added a clause to put everything in lower case.
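If what is needed is just the number of occurrences of the matched string itself rather than a per-word count, a minimal sketch on top of the question's input RDD (counting tokens that contain "Title"; adjust the predicate as needed):
val titleOccurrences = input
  .flatMap(line => line.split(" "))
  .filter(word => word.contains("Title"))
  .count()
println(s"Occurrences of 'Title': $titleOccurrences")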

Creating a stream from a text file in Pyspark

I'm getting the following error when I try to create a stream from a text file in Pyspark:
TypeError: unbound method textFileStream() must be called with StreamingContext instance as first argument (got str instance instead)
I don't want to use SparkContext because I get another error, so to remove that error I have to use SparkSession.
My code:
import sys
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.mllib.stat import Statistics
if __name__ == "__main__":
    spark = SparkSession.builder.appName("CrossCorrelation").getOrCreate()
    ssc = StreamingContext(spark.sparkContext, 5)
    input_path1 = sys.argv[1]
    input_path2 = sys.argv[2]
    ds1 = ssc.textFileStream(input_path1)
    lines1 = ds1.map(lambda x1: x1[1])
    windowedds1 = lines1.flatMap(lambda line1: line1.strip().split("\n")).map(lambda strelem1: float(strelem1)).window(5, 10)
    ds2 = ssc.textFileStream(input_path2)
    lines2 = ds2.map(lambda x2: x2[1])
    windowedds2 = lines2.flatMap(lambda line2: line2.strip().split("\n")).map(lambda strelem2: float(strelem2)).window(5, 10)
    result = Statistics.corr(windowedds1, windowedds2, method="pearson")
    if result > 0.7:
        print("ds1 and ds2 are correlated!!!")
    spark.stop()
Thank you!
You have to first create a StreamingContext object and then use it to call textFileStream.
spark =
SparkSession.builder.appName("CrossCorrelation").getOrCreate()
ssc = StreamingContext(spark.sparkContext, 1)
ds = ssc.textFileStream(input_path)

Add extra column for child data frame from parent data frame in nested XML in Spark

I am creating data after loading many XML files.
Each XML file has one unique field, fun:DataPartitionId.
I am creating many rows from one XML file.
Now I want to add this fun:DataPartitionId to each of the rows resulting from the XML.
For example, suppose the first XML produces 100 rows; then each of those 100 rows will have the same fun:DataPartitionId field.
So fun:DataPartitionId is a header field in each XML.
This is what I am doing:
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
import org.apache.spark.{ SparkConf, SparkContext }
import java.sql.{Date, Timestamp}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.udf
val getDataPartition = udf { (DataPartition: String) =>
if (DataPartition=="1") "SelfSourcedPublic"
else if (DataPartition=="2") "Japan"
else if (DataPartition=="3") "SelfSourcedPrivate"
else "ThirdPartyPrivate"
}
val getFFActionParent = udf { (FFAction: String) =>
if (FFAction=="Insert") "I|!|"
else if (FFAction=="Overwrite") "I|!|"
else "D|!|"
}
val getFFActionChild = udf { (FFAction: String) =>
if (FFAction=="Insert") "I|!|"
else if (FFAction=="Overwrite") "O|!|"
else "D|!|"
}
val dfContentEnvelope = sqlContext.read.format("com.databricks.spark.xml").option("rowTag", "env:ContentEnvelope").load("s3://trfsmallfffile/XML")
val dfDataPartition=getDataPartition(dfContentEnvelope("env:Header.fun:DataPartitionId"))
val dfContentItem = dfContentEnvelope.withColumn("column1", explode(dfContentEnvelope("env:Body.env:ContentItem"))).select("column1.*")
val df =dfContentItem.withColumn("DataPartition",dfDataPartition)
df.show()
When you read your xml file using
val dfContentEnvelope = sqlContext.read.format("com.databricks.spark.xml").option("rowTag", "env:ContentEnvelope").load("s3://trfsmallfffile/XML")
the DataPartitionId column is read as Long:
fun:DataPartitionId: long (nullable = true)
so you should change the udf function as
val getDataPartition = udf { (DataPartition: Long) =>
if (DataPartition== 1) "SelfSourcedPublic"
else if (DataPartition== 2) "Japan"
else if (DataPartition== 3) "SelfSourcedPrivate"
else "ThirdPartyPrivate"
}
If possible, you should use the when function instead of a udf function to improve processing speed and memory usage; a sketch is shown below.
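For illustration only, a minimal sketch of the when-based alternative (assuming DataPartitionId has been selected as shown further below, and sqlContext.implicits._ is imported for the $ syntax):
import org.apache.spark.sql.functions.when
val withPartition = dfContentItem.withColumn("DataPartition",
  when($"DataPartitionId" === 1, "SelfSourcedPublic")
    .when($"DataPartitionId" === 2, "Japan")
    .when($"DataPartitionId" === 3, "SelfSourcedPrivate")
    .otherwise("ThirdPartyPrivate"))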
Now I want to add this fun:DataPartitionId for each row in the resulting rows from the xml .
Your mistake is that you forgot to select that particular column, so the following code
val dfContentItem = dfContentEnvelope.withColumn("column1", explode(dfContentEnvelope("env:Body.env:ContentItem"))).select("column1.*")
should be
val dfContentItem = dfContentEnvelope.withColumn("column1", explode(dfContentEnvelope("env:Body.env:ContentItem"))).select($"env:Header.fun:DataPartitionId".as("DataPartitionId"),$"column1.*")
Then you can apply the udf function
val df = dfContentItem.select(getDataPartition($"DataPartitionId"), $"env:Data.sr:Source.*", $"_action".as("FFAction|!|"))
So the working code as a whole should be:
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
import org.apache.spark.{ SparkConf, SparkContext }
import java.sql.{Date, Timestamp}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.udf
val getDataPartition = udf { (DataPartition: Long) =>
if (DataPartition == 1) "SelfSourcedPublic"
else if (DataPartition == 2) "Japan"
else if (DataPartition == 3) "SelfSourcedPrivate"
else "ThirdPartyPrivate"
}
val dfContentEnvelope = sqlContext.read.format("com.databricks.spark.xml").option("rowTag", "env:ContentEnvelope").load("s3://trfsmallfffile/XML")
val dfContentItem = dfContentEnvelope.withColumn("column1", explode(dfContentEnvelope("env:Body.env:ContentItem"))).select($"env:Header.fun:DataPartitionId".as("DataPartitionId"),$"column1.*")
val df = dfContentItem.select(getDataPartition($"DataPartitionId"), $"env:Data.sr:Source.*", $"_action".as("FFAction|!|"))
df.show(false)
And you can proceed with the rest of the code.

Spark can not serialize the BufferedImage class

I have a Not Serializable Class exception in Spark 2.2.0.
The following procedure is what I am trying to do in Scala:
To read from HDFS a set of JPEG images.
To build an array of java.awt.image.BufferedImage objects.
To extract each java.awt.image.BufferedImage buffer and store it in a 2D array per image, building an array of two-dimensional arrays, Array[Array[Int]], containing the image buffer information.
To transform the Array[Array[Int]] into an org.apache.spark.rdd.RDD[Array[Array[Int]]] using the sc.parallelize method.
To perform image processing operations in a distributed way by transforming the initial org.apache.spark.rdd.RDD[Array[Array[Int]]].
This is the code:
import org.apache.spark.sql.SparkSession
import javax.imageio.ImageIO
import java.io.ByteArrayInputStream
def binarize(image: Array[Array[Int]], threshold: Int) : Array[Array[Int]] = {
val height = image.size
val width = image(0).size
val result = Array.ofDim[Int](height, width)
for (i <- 0 until height) {
for (j <- 0 until width){
result(i)(j) = if (image(i)(j) <= threshold) 0 else 255
}
}
result
}
object imageTestObj {
def main(args: Array[String]) {
val spark = SparkSession.builder().appName("imageTest2").getOrCreate()
val sc = spark.sparkContext
val saveToHDFS = false
val threshold: Int = 128
val partitions = 32
val inPathStr = "hdfs://192.168.239.218:9000/vitrion/input"
val outPathStr = if (saveToHDFS) "hdfs://192.168.239.54:9000/vitrion/output/" else "/home/vitrion/IdeaProjects/imageTest2/output/"
val files = sc.binaryFiles(inPathStr).collect
val AWTImageArray = files.map { binFile =>
val input = binFile._2.open()
val name = binFile._1
var buffer: Array[Byte] = Array.fill(input.available)(0)
input.readFully(buffer)
ImageIO.read(new ByteArrayInputStream(buffer))
}
val ImgBuffers = AWTImageArray.map { image =>
val height = image.getHeight
val width = image.getWidth
val buffer = Array.ofDim[Int](height, width)
for (i <- 0 until height) {
for (j <- 0 until width){
buffer(i)(j) = image.getRaster.getDataBuffer.getElem(0, i * width + j)
}
}
buffer
}
val inputImages = sc.parallelize(ImgBuffers, partitions).cache()
val op1 = inputImages.map(image => binarize(image, threshold))
}
}
This algorithm gets a very well-known exception:
org.apache.spark.SparkException: Task not serializable
...
Caused by: java.io.NotSerializableException: java.awt.image.BufferedImage
Serialization stack:
- object not serializable (class: java.awt.image.BufferedImage, ...
I do not understand why Spark attempts to serialize the BufferedImage class when it is only used before creating the first RDD in the application. Isn't the BufferedImage class only supposed to need serialization if I try to create an RDD[BufferedImage]?
Can somebody explain to me what is going on?
Thank you in advance...
Actually, you are serializing a function in Spark. This function cannot contain references to non-serializable classes. You can instantiate non-serializable classes inside the function (that is OK), but you can NOT refer to instances of non-serializable classes from inside the function.
Most probably, in one of the functions you use, you are referencing an instance of a BufferedImage.
Check your code and see whether you are referencing a BufferedImage object from inside a function.
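To make the rule concrete, here is a small standalone sketch (hypothetical class Heavy and a local master, not taken from the original code):
import org.apache.spark.sql.SparkSession
class Heavy // deliberately NOT Serializable
object ClosureDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("closure-demo").getOrCreate()
    val sc = spark.sparkContext
    val rdd = sc.parallelize(1 to 10)
    // Fails with "Task not serializable": the closure captures the driver-side instance.
    // val outer = new Heavy
    // rdd.map(x => { outer.hashCode; x }).count()
    // Works: the non-serializable object is created inside the closure, on the executors.
    rdd.map { x => val local = new Heavy; local.hashCode + x }.count()
    spark.stop()
  }
}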
By inlining some code and not serializing BufferedImage objects, I guess you can overcome the exception. Can you try out this code (I did not execute it myself)?
object imageTestObj {
def main(args: Array[String]) {
val spark = SparkSession.builder().appName("imageTest2").getOrCreate()
val sc = spark.sparkContext
val saveToHDFS = false
val threshold: Int = 128
val partitions = 32
val inPathStr = "hdfs://192.168.239.218:9000/vitrion/input"
val outPathStr = if (saveToHDFS) "hdfs://192.168.239.54:9000/vitrion/output/" else "/home/vitrion/IdeaProjects/imageTest2/output/"
val ImgBuffers = sc.binaryFiles(inPathStr).collect.map { binFile =>
val input = binFile._2.open()
val name = binFile._1
var bytes: Array[Byte] = Array.fill(input.available)(0)
input.readFully(bytes)
val image = ImageIO.read(new ByteArrayInputStream(bytes))
// Inlining must be here, so that BufferedImage is not serialized.
val height = image.getHeight
val width = image.getWidth
val pixels = Array.ofDim[Int](height, width) // renamed: buffer was defined twice in this scope
for (i <- 0 until height) {
for (j <- 0 until width){
pixels(i)(j) = image.getRaster.getDataBuffer.getElem(0, i * width + j)
}
}
pixels
}
val inputImages = sc.parallelize(ImgBuffers, partitions).cache()
val op1 = inputImages.map(image => binarize(image, threshold))
}
}
